In [44]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
data_df = pd.read_csv("aug_train.csv")
data_df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [45]:
# Drop the non-beneficial ID columns, 'enrollee_id' and 'city'.
data_df=data_df.drop(columns=["enrollee_id","city","last_new_job","enrolled_university"])
data_df

Unnamed: 0,city_development_index,gender,relevent_experience,education_level,major_discipline,experience,company_size,company_type,training_hours,target
0,0.920,Male,Has relevent experience,Graduate,STEM,>20,,,36,1.0
1,0.776,Male,No relevent experience,Graduate,STEM,15,50-99,Pvt Ltd,47,0.0
2,0.624,,No relevent experience,Graduate,STEM,5,,,83,0.0
3,0.789,,No relevent experience,Graduate,Business Degree,<1,,Pvt Ltd,52,1.0
4,0.767,Male,Has relevent experience,Masters,STEM,>20,50-99,Funded Startup,8,0.0
...,...,...,...,...,...,...,...,...,...,...
19153,0.878,Male,No relevent experience,Graduate,Humanities,14,,,42,1.0
19154,0.920,Male,Has relevent experience,Graduate,STEM,14,,,52,1.0
19155,0.920,Male,Has relevent experience,Graduate,STEM,>20,50-99,Pvt Ltd,44,0.0
19156,0.802,Male,Has relevent experience,High School,,<1,500-999,Pvt Ltd,97,0.0


In [47]:
# Check the number of null values in data_df
data_df.isnull().sum()   # we could have two solutions: we can either drop all the null values or drop the entire columns/features, 
                          # it depends on how important the features are

city_development_index       0
gender                    4508
relevent_experience          0
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
training_hours               0
target                       0
dtype: int64

In [48]:
#drop the the rows has at least two null values
data_clean_df=data_df.dropna(thresh=8)
data_clean_df.isnull().sum()

city_development_index       0
gender                    2900
relevent_experience          0
education_level             79
major_discipline          1107
experience                  28
company_size              3373
company_type              3580
training_hours               0
target                       0
dtype: int64

In [49]:
# drop allnull values in company_size and company_type
data_clean_v2_df=data_clean_df.dropna(subset=["company_size","company_type","gender"])
data_clean_v2_df.isnull().sum()

city_development_index      0
gender                      0
relevent_experience         0
education_level            79
major_discipline          745
experience                  6
company_size                0
company_type                0
training_hours              0
target                      0
dtype: int64

In [53]:
clean_df=data_clean_v2_df.dropna(subset=["education_level","major_discipline","experience"])
clean_df.isnull().sum()

city_development_index    0
gender                    0
relevent_experience       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
training_hours            0
target                    0
dtype: int64

In [54]:
clean_df.count()

city_development_index    9040
gender                    9040
relevent_experience       9040
education_level           9040
major_discipline          9040
experience                9040
company_size              9040
company_type              9040
training_hours            9040
target                    9040
dtype: int64

In [56]:
clean_df

Unnamed: 0,city_development_index,gender,relevent_experience,education_level,major_discipline,experience,company_size,company_type,training_hours,target
1,0.776,Male,No relevent experience,Graduate,STEM,15,50-99,Pvt Ltd,47,0.0
4,0.767,Male,Has relevent experience,Masters,STEM,>20,50-99,Funded Startup,8,0.0
7,0.762,Male,Has relevent experience,Graduate,STEM,13,<10,Pvt Ltd,18,1.0
8,0.920,Male,Has relevent experience,Graduate,STEM,7,50-99,Pvt Ltd,46,1.0
11,0.920,Male,Has relevent experience,Graduate,STEM,5,5000-9999,Pvt Ltd,108,0.0
...,...,...,...,...,...,...,...,...,...,...
19147,0.624,Male,No relevent experience,Graduate,STEM,1,100-500,Pvt Ltd,52,1.0
19149,0.920,Male,Has relevent experience,Masters,STEM,9,50-99,Pvt Ltd,36,1.0
19150,0.920,Female,Has relevent experience,Graduate,STEM,10,100-500,Public Sector,23,0.0
19152,0.920,Female,Has relevent experience,Graduate,Humanities,7,10/49,Funded Startup,25,0.0


In [57]:
# Generate our categorical variable list
data_cat = clean_df.dtypes[clean_df.dtypes == "object"].index.tolist()
# Check the number of unique values in each column
clean_df[data_cat].nunique()

gender                  3
relevent_experience     2
education_level         3
major_discipline        6
experience             22
company_size            8
company_type            6
dtype: int64

In [58]:
# Look at experience value counts for binning
experience=clean_df.experience.value_counts()
experience

>20    1889
5       579
10      555
6       545
9       541
7       492
4       487
3       433
8       403
15      402
11      372
14      338
16      305
2       302
12      295
13      236
17      207
19      180
18      169
1       125
<1       99
20       86
Name: experience, dtype: int64

In [76]:
# Generate our categorical variable lists
data_cat = clean_df.dtypes[clean_df.dtypes == "object"].index.tolist()
data_cat

['gender',
 'relevent_experience',
 'education_level',
 'major_discipline',
 'experience',
 'company_size',
 'company_type']

In [77]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(clean_df[data_cat]))


# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(data_cat)
encode_df.head()

Unnamed: 0,gender_Female,gender_Male,gender_Other,relevent_experience_Has relevent experience,relevent_experience_No relevent experience,education_level_Graduate,education_level_Masters,education_level_Phd,major_discipline_Arts,major_discipline_Business Degree,...,company_size_50-99,company_size_500-999,company_size_5000-9999,company_size_<10,company_type_Early Stage Startup,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd
0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [78]:
# Merge one-hot encoded features and drop the originals
clean_data_df = clean_df.merge(encode_df,left_index=True, right_index=True)
clean_data_df = clean_data_df.drop(data_cat,1)
clean_data_df.head()

Unnamed: 0,city_development_index,training_hours,target,gender_Female,gender_Male,gender_Other,relevent_experience_Has relevent experience,relevent_experience_No relevent experience,education_level_Graduate,education_level_Masters,...,company_size_50-99,company_size_500-999,company_size_5000-9999,company_size_<10,company_type_Early Stage Startup,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd
1,0.776,47,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.767,8,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.762,18,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.92,46,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11,0.92,108,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [81]:
# Split our preprocessed data into our features and target arrays
y = clean_data_df["target"].values
X = clean_data_df.drop(["target"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [82]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [83]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 80)                4240      
_________________________________________________________________
dense_1 (Dense)              (None, 30)                2430      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 31        
Total params: 6,701
Trainable params: 6,701
Non-trainable params: 0
_________________________________________________________________


In [84]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [85]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [86]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

34/34 - 0s - loss: 0.8655 - accuracy: 0.8162
Loss: 0.8654520511627197, Accuracy: 0.816231369972229


# # randomforest

In [88]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [107]:
# define the  feaetures set
X=clean_data_df.copy()
X=X.drop("target",axis=1)
X.head()

Unnamed: 0,city_development_index,training_hours,gender_Female,gender_Male,gender_Other,relevent_experience_Has relevent experience,relevent_experience_No relevent experience,education_level_Graduate,education_level_Masters,education_level_Phd,...,company_size_50-99,company_size_500-999,company_size_5000-9999,company_size_<10,company_type_Early Stage Startup,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd
1,0.776,47,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.767,8,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.762,18,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.92,46,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11,0.92,108,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [108]:
# define target set
y=clean_data_df["target"].ravel()
y[:5]

array([0., 0., 1., 1., 0.])

In [109]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [110]:
#Create a StandardScaler instance
scaler=StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [111]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78) 

In [112]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [113]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [114]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actually staying", "Actually leaving"], columns=["Predicted staying", "Predicted leaving"])

cm_df

Unnamed: 0,Predicted staying,Predicted leaving
Actually staying,838,47
Actually leaving,114,73


In [115]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8498134328358209

In [116]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted staying,Predicted leaving
Actually staying,838,47
Actually leaving,114,73


Accuracy Score : 0.8498134328358209
Classification Report
              precision    recall  f1-score   support

         0.0       0.88      0.95      0.91       885
         1.0       0.61      0.39      0.48       187

    accuracy                           0.85      1072
   macro avg       0.74      0.67      0.69      1072
weighted avg       0.83      0.85      0.84      1072



In [117]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.3234272 , 0.23264954, 0.00952855, 0.00949946, 0.0007093 ,
       0.00876596, 0.00840455, 0.01835688, 0.01784184, 0.00571869,
       0.00232251, 0.00458153, 0.00724765, 0.00285945, 0.00408082,
       0.01007452, 0.00325377, 0.01047801, 0.00896307, 0.00876184,
       0.00567134, 0.00660665, 0.00895078, 0.00848005, 0.00357902,
       0.0026575 , 0.00363733, 0.00687577, 0.00412466, 0.00978347,
       0.0105783 , 0.01164368, 0.01101772, 0.01057707, 0.01006292,
       0.01011947, 0.00382393, 0.01937346, 0.01432292, 0.01619605,
       0.0132932 , 0.01319224, 0.01738221, 0.00933642, 0.00732519,
       0.01026235, 0.00648461, 0.01025569, 0.00675546, 0.00320924,
       0.009533  , 0.01736319])

In [118]:
# We can sort the features by their importance.
x=zip(importances,X.columns)
sorted(x,reverse=True)

[(0.3234271981423889, 'city_development_index'),
 (0.23264953820985826, 'training_hours'),
 (0.019373456345316512, 'experience_>20'),
 (0.018356876045668175, 'education_level_Graduate'),
 (0.017841835771203622, 'education_level_Masters'),
 (0.01738221041514945, 'company_size_50-99'),
 (0.017363191193304026, 'company_type_Pvt Ltd'),
 (0.0161960496527129, 'company_size_100-500'),
 (0.01432291990376579, 'company_size_10/49'),
 (0.01329319937760612, 'company_size_1000-4999'),
 (0.01319223625852343, 'company_size_10000+'),
 (0.011643677746365851, 'experience_5'),
 (0.01101772204902146, 'experience_6'),
 (0.010578304272496104, 'experience_4'),
 (0.010577070247486333, 'experience_7'),
 (0.010478013838595753, 'experience_10'),
 (0.010262345813859989, 'company_size_<10'),
 (0.010255692251979427, 'company_type_Funded Startup'),
 (0.01011947293815743, 'experience_9'),
 (0.010074518945708523, 'major_discipline_STEM'),
 (0.01006291596610182, 'experience_8'),
 (0.00978347403772865, 'experience_3'),
