In [81]:
import pandas as pd 

In [82]:
# Read the employee table from the zipped CSV and preview its first five rows.
TP = pd.read_csv('archive.zip')
TP.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [83]:
# Balance the classes by drawing 1600 rows from each LeaveOrNot group.
# The groups are sampled explicitly and concatenated instead of going through
# DataFrameGroupBy.apply: recent pandas releases deprecate apply() operating on
# the grouping column and later exclude it from each group, which would drop
# LeaveOrNot from the result. Every group is sampled with its own
# random_state=0, so the drawn rows and their order match the apply() version.
balanced_data = pd.concat(
    [
        group.sample(n=1600, random_state=0)
        for _, group in TP.groupby('LeaveOrNot')
    ]
)

In [84]:
# Store the target as strings so the fitted classifiers expose string class
# labels (dt.classes_ is later passed to export_graphviz as class_names).
balanced_data = balanced_data.assign(
    LeaveOrNot=balanced_data['LeaveOrNot'].astype('str')
)
balanced_data.head(n=2)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
697,Masters,2017,Pune,2,26,Female,No,4,0
3530,Bachelors,2017,New Delhi,2,37,Male,No,0,0


In [85]:
# Split the balanced frame into the feature matrix and the target vector.
# The target is removed by name rather than by position, so a change in column
# order can neither leak LeaveOrNot into the features nor drop a real feature.
y_Leave = balanced_data['LeaveOrNot']
X_Leave = balanced_data.drop(columns='LeaveOrNot')

In [86]:
# One-hot encode the categorical feature columns (Education, City, Gender,
# EverBenched); the numeric columns are carried over unchanged.
X_dummies = pd.get_dummies(data=X_Leave)
X_dummies.head(n=10)

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,Education_Bachelors,Education_Masters,Education_PHD,City_Bangalore,City_New Delhi,City_Pune,Gender_Female,Gender_Male,EverBenched_No,EverBenched_Yes
697,2017,2,26,4,0,1,0,0,0,1,1,0,1,0
3530,2017,2,37,0,1,0,0,0,1,0,0,1,1,0
3221,2012,3,38,5,1,0,0,1,0,0,1,0,1,0
3391,2017,3,41,4,1,0,0,0,0,1,0,1,1,0
3986,2014,3,35,5,1,0,0,1,0,0,0,1,0,1
2183,2014,3,28,2,1,0,0,1,0,0,0,1,1,0
3603,2017,1,36,0,0,1,0,0,1,0,1,0,1,0
1856,2017,3,26,4,0,1,0,1,0,0,1,0,1,0
1985,2014,3,25,3,1,0,0,1,0,0,0,1,1,0
3500,2017,3,41,1,1,0,0,1,0,0,0,1,1,0


In [87]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve

In [88]:
# Hold out part of the encoded data for the final evaluation; the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummies,
    y_Leave,
    random_state=0,
)

# Decision Tree

In [89]:
from sklearn.tree import DecisionTreeClassifier

In [90]:
# 5-fold validation curve for the decision tree over candidate max_leaf_nodes
# values. Each returned array has one row per candidate and one column per fold.
train_scores_dt, test_score_dt = validation_curve(
    DecisionTreeClassifier(random_state=0),
    X_train,
    y_train,
    param_name='max_leaf_nodes',
    param_range=[6, 8, 10, 15, 20, 30, 35],
    cv=5,
)

In [91]:
# Mean training-fold score over the 5 CV folds, one value per max_leaf_nodes candidate.
train_scores_dt.mean(axis=1)

array([0.75302083, 0.776875  , 0.78614583, 0.81208333, 0.82354167,
       0.83614583, 0.83989583])

In [92]:
# Mean held-out-fold score over the 5 CV folds, one value per max_leaf_nodes candidate.
test_score_dt.mean(axis=1)

array([0.74625   , 0.77041667, 0.77291667, 0.80708333, 0.81333333,
       0.82208333, 0.82      ])

In [93]:
# Final decision tree, fitted on the full training split.
# NOTE(review): the validation curve peaks at max_leaf_nodes=30 (mean CV score
# 0.8221) and is slightly lower at 35 (0.8200); confirm 35 is the intended choice.
dt = DecisionTreeClassifier(random_state=0, max_leaf_nodes=35)
dt.fit(X_train, y_train)

DecisionTreeClassifier(max_leaf_nodes=35, random_state=0)

In [94]:
# Report the tree's accuracy on the training split and on the held-out test split.
for template, features, labels in (
    ('dt acc on train {:.2%}', X_train, y_train),
    ('dt acc on test {:.2%}', X_test, y_test),
):
    print(template.format(dt.score(features, labels)))

dt acc on train 84.21%
dt acc on test 80.88%


In [95]:
# Hand-built employee profiles for spot-checking the fitted models. Each value
# list follows the X_dummies column order:
#   JoiningYear, PaymentTier, Age, ExperienceInCurrentDomain,
#   Education_Bachelors, Education_Masters, Education_PHD,
#   City_Bangalore, City_New Delhi, City_Pune,
#   Gender_Female, Gender_Male, EverBenched_No, EverBenched_Yes
# Each profile is a one-row DataFrame labelled with X_dummies.columns, so pandas
# rejects a profile with the wrong number of values and the models receive the
# same feature names they were fitted with (recent scikit-learn versions warn
# when a model fitted on a DataFrame is given unnamed input).
p2 = pd.DataFrame([[2013, 1, 25, 3, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0]], columns=X_dummies.columns)
p3 = pd.DataFrame([[2014, 3, 29, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0]], columns=X_dummies.columns)
p4 = pd.DataFrame([[2018, 3, 40, 3, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1]], columns=X_dummies.columns)
p5 = pd.DataFrame([[2013, 3, 27, 5, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0]], columns=X_dummies.columns)

In [96]:
# Decision-tree class probabilities for profile p2; columns are ordered as dt.classes_.
dt.predict_proba(p2)

array([[0.63888889, 0.36111111]])

In [97]:
# Decision-tree class probabilities for profile p3; columns are ordered as dt.classes_.
dt.predict_proba(p3)

array([[0.6954023, 0.3045977]])

In [98]:
# Decision-tree class probabilities for profile p4; columns are ordered as dt.classes_.
dt.predict_proba(p4)

array([[0.00374532, 0.99625468]])

In [99]:
# Decision-tree class probabilities for profile p5; columns are ordered as dt.classes_.
dt.predict_proba(p5)

array([[0.0952381, 0.9047619]])

In [100]:
from sklearn.tree import export_graphviz

In [101]:
# Write the fitted tree to a Graphviz .dot file for rendering.
# Feature and class names are passed as plain lists of str: that form is accepted
# by every scikit-learn version, whereas some releases validate these parameters
# strictly and reject a pandas Index or NumPy array.
export_graphviz(
    dt,
    out_file='tree1_vis.dot',
    class_names=dt.classes_.tolist(),
    feature_names=X_dummies.columns.tolist(),
    impurity=False,
    filled=True,
)

In [102]:
# Per-feature importances of the fitted tree, in X_dummies column order.
dt.feature_importances_

array([0.27263237, 0.23556055, 0.05836223, 0.04010389, 0.02171301,
       0.12482622, 0.        , 0.02640901, 0.00553213, 0.15642265,
       0.02649253, 0.0319454 , 0.        , 0.        ])

In [103]:
# Label each decision-tree importance with its feature name and list the
# features from most to least important.
feature_imp = pd.DataFrame(
    {'importance': dt.feature_importances_},
    index=X_dummies.columns,
)
feature_imp.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
JoiningYear,0.272632
PaymentTier,0.235561
City_Pune,0.156423
Education_Masters,0.124826
Age,0.058362
ExperienceInCurrentDomain,0.040104
Gender_Male,0.031945
Gender_Female,0.026493
City_Bangalore,0.026409
Education_Bachelors,0.021713


# Random Forest

In [104]:
from sklearn.ensemble import RandomForestClassifier

In [105]:
# 5-fold validation curve for the random forest over candidate n_estimators
# values. Each returned array has one row per candidate and one column per fold.
train_score_rf, test_score_rf = validation_curve(
    RandomForestClassifier(random_state=0),
    X_train,
    y_train,
    param_name='n_estimators',
    param_range=[10, 20, 30, 50, 70, 100, 200],
    cv=5,
)

In [106]:
# Mean training-fold score over the 5 CV folds, one value per n_estimators candidate.
train_score_rf.mean(axis=1)

array([0.92479167, 0.93145833, 0.93395833, 0.93489583, 0.93489583,
       0.93510417, 0.93510417])

In [107]:
# Mean held-out-fold score over the 5 CV folds, one value per n_estimators candidate.
test_score_rf.mean(axis=1)

array([0.78875   , 0.79541667, 0.79375   , 0.79333333, 0.79583333,
       0.80375   , 0.8       ])

In [108]:
# Final random forest, fitted on the full training split. n_estimators=100 is the
# candidate with the highest mean CV score on the validation curve (0.80375).
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [109]:
# Report the forest's accuracy on the training split and on the held-out test split.
for template, features, labels in (
    ('rf acc on train {:.2%}', X_train, y_train),
    ('rf acc on test {:.2%}', X_test, y_test),
):
    print(template.format(rf.score(features, labels)))

rf acc on train 93.04%
rf acc on test 77.75%


In [110]:
# Random-forest class probabilities for profile p2; columns are ordered as rf.classes_.
rf.predict_proba(p2)

array([[0.22, 0.78]])

In [111]:
# Random-forest class probabilities for profile p3; columns are ordered as rf.classes_.
rf.predict_proba(p3)

array([[0.94916667, 0.05083333]])

In [112]:
# Label each random-forest importance with its feature name and list the
# features from most to least important.
feature_imp1 = pd.DataFrame(
    {'importance': rf.feature_importances_},
    index=X_dummies.columns,
)
feature_imp1.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
JoiningYear,0.317894
Age,0.184355
ExperienceInCurrentDomain,0.097845
PaymentTier,0.089596
City_Pune,0.063593
Education_Masters,0.055374
Gender_Male,0.039825
Gender_Female,0.038787
Education_Bachelors,0.038513
City_New Delhi,0.026342
