In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
%matplotlib inline

In [27]:
def LR_tune(X_tr,y_tr,reg):
    """Select the logistic-regression C by cross-validated grid search.

    Runs a 5-fold cross-validation over a fixed grid of C values on the
    training data (no separate validation set is available) and scores every
    candidate with F1.

    Parameters
    ----------
    X_tr : training feature matrix, passed straight to ``GridSearchCV.fit``.
    y_tr : training labels (``scoring='f1'`` expects a binary target).
    reg : penalty name forwarded to ``LogisticRegression``
        (this notebook uses 'l1' and 'l2').

    Returns
    -------
    The C value from the grid with the best mean cross-validated F1.
    """
    # Candidate values for C, the inverse regularization strength.
    C = [0.001,0.01,0.1,1,10,100]
    penalty = [reg]

    # scikit-learn >= 0.22 defaults to the lbfgs solver, which raises on
    # penalty='l1'. liblinear accepts both 'l1' and 'l2'; any other penalty
    # keeps the library's default solver, exactly as before.
    solver_kwargs = {'solver': 'liblinear'} if reg in ('l1', 'l2') else {}
    clf = LogisticRegression(**solver_kwargs)

    # Hyper-parameter grid: every C combined with the single requested penalty.
    parameters = dict(C=C,penalty=penalty)

    # Train-fold scores are not requested: only best_params_ is used below,
    # and it is chosen from the held-out fold scores.
    grid_clf = GridSearchCV(clf,parameters, scoring = 'f1', cv=5, n_jobs=-1)

    grid_clf.fit(X_tr,y_tr)

    return grid_clf.best_params_['C']

In [28]:
def LR_final(X_tr,y_tr,C,reg):
    """Fit a logistic-regression model with the given C and penalty.

    Parameters
    ----------
    X_tr : training feature matrix.
    y_tr : training labels.
    C : inverse regularization strength (e.g. the value returned by LR_tune).
    reg : penalty name forwarded to ``LogisticRegression``
        (this notebook uses 'l1' and 'l2').

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    # scikit-learn >= 0.22 defaults to the lbfgs solver, which raises on
    # penalty='l1'. liblinear accepts both 'l1' and 'l2'; any other penalty
    # keeps the library's default solver, exactly as before.
    solver_kwargs = {'solver': 'liblinear'} if reg in ('l1', 'l2') else {}
    clf = LogisticRegression(C=C,penalty = reg, **solver_kwargs)
    clf.fit(X_tr, y_tr)

    return clf

In [101]:
def rbfsvm_tune(X_tr,y_tr):
    """Select the SVC regularization parameter C by cross-validated grid search.

    Runs a 5-fold cross-validation over a fixed grid of C values on the
    training data (no separate validation set is available) and scores every
    candidate with F1. The classifier is ``SVC()`` with its default kernel
    (RBF in scikit-learn).

    Parameters
    ----------
    X_tr : training feature matrix, passed straight to ``GridSearchCV.fit``.
    y_tr : training labels (``scoring='f1'`` expects a binary target).

    Returns
    -------
    The C value from the grid with the best mean cross-validated F1.
    """
    # Candidate values for C.
    C = [0.001,0.01,0.1,1,10,100]

    clf = SVC()

    # Hyper-parameter grid: only C is searched.
    parameters = dict(C = C)

    # Train-fold scores are not requested: they were never read, and computing
    # them makes every SVC fit also predict on its training folds. The best C
    # is chosen from the held-out fold scores, so the result is unchanged.
    grid_clf = GridSearchCV(clf,parameters, scoring = 'f1', cv=5, n_jobs=-1)

    grid_clf.fit(X_tr,y_tr)

    return grid_clf.best_params_['C']

In [102]:
def rbfsvm_final(X_tr,y_tr,C):
    """Train an SVC (default kernel) on the training data with the chosen C.

    Parameters
    ----------
    X_tr : training feature matrix.
    y_tr : training labels.
    C : regularization parameter passed to ``SVC``.

    Returns
    -------
    The fitted ``SVC`` estimator.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return SVC(C = C).fit(X_tr, y_tr)

In [2]:
# Load the labelled training set; the path is relative to the working directory.
data = pd.read_csv("train.csv")

In [60]:
# Preview the first rows of the training frame.
# NOTE(review): the stored output already shows `education` as integers, i.e.
# this cell was last run after the encoding step further down.
data.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,3,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,2,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,2,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,2,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,2,m,other,1,45,3.0,2,0,0,73,0


In [81]:
# Look at the first 20 rows of the negative class (is_promoted == 0).
data[data['is_promoted'] == 0].head(20)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,3,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,2,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,2,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,2,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,2,m,other,1,45,3.0,2,0,0,73,0
5,58896,Analytics,region_2,2,m,sourcing,2,31,3.0,7,0,0,85,0
6,20379,Operations,region_20,2,f,other,1,31,3.0,5,0,0,59,0
7,16290,Operations,region_34,3,m,sourcing,1,33,3.0,6,0,0,63,0
8,73202,Analytics,region_20,2,m,other,1,28,4.0,5,0,0,83,0
9,28911,Sales & Marketing,region_1,3,m,sourcing,1,32,5.0,5,1,0,54,0


In [4]:
# Count missing values per column to see which features need imputation.
data.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [8]:
# Fill missing previous-year ratings with 0.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is a chained in-place call, which under pandas
# Copy-on-Write leaves `data` unchanged.
data['previous_year_rating'] = data['previous_year_rating'].fillna(0)
# Re-check the per-column null counts after the fill.
data.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating       0
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [9]:
# List the distinct education levels (including NaN) before imputing them.
data['education'].unique()

array(["Master's & above", "Bachelor's", nan, 'Below Secondary'],
      dtype=object)

In [57]:
# List the distinct department names.
data['department'].unique()

array(['Sales & Marketing', 'Operations', 'Technology', 'Analytics',
       'R&D', 'Procurement', 'Finance', 'HR', 'Legal'], dtype=object)

In [54]:
# Class balance of the target: number of rows per is_promoted value.
data['is_promoted'].value_counts()

0    50140
1     4668
Name: is_promoted, dtype: int64

In [10]:
# Treat a missing education level as the lowest category.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is a chained in-place call, which under pandas
# Copy-on-Write leaves `data` unchanged.
data['education'] = data['education'].fillna('Below Secondary')
# Re-check the per-column null counts after the fill.
data.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [11]:
# Confirm that only the three named education levels remain after the fill.
data['education'].unique()

array(["Master's & above", "Bachelor's", 'Below Secondary'], dtype=object)

In [17]:
# Encode education as an ordinal integer: 1 = Below Secondary, 2 = Bachelor's,
# 3 = Master's & above.
# NOTE(review): this cell is not idempotent. Series.map returns NaN for values
# that are not keys of the dict, so re-running it on the already-encoded
# column would turn every value into NaN.
data['education'] = data['education'].map({"Master's & above":3,"Bachelor's":2,'Below Secondary':1})
# Check the resulting codes.
data['education'].unique()

array([3, 2, 1], dtype=int64)

In [19]:
# Preview the frame again after imputation and education encoding.
data.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,3,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,2,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,2,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,2,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,2,m,other,1,45,3.0,2,0,0,73,0


In [91]:
# Feature matrix: the ordinal/numeric columns only. employee_id, department,
# region, gender and recruitment_channel are left out.
X = data[[
    'education',
    'no_of_trainings',
    'age',
    'previous_year_rating',
    'length_of_service',
    'KPIs_met >80%',
    'awards_won?',
    'avg_training_score',
]]
# Alternative, smaller feature subset (disabled):
#X = data[['education','previous_year_rating','age','length_of_service','KPIs_met >80%','avg_training_score']]

# Binary target: 1 if the employee was promoted, else 0.
y = data['is_promoted']

In [92]:
# Preview the selected feature columns.
X.head()

Unnamed: 0,education,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,3,1,35,5.0,8,1,0,49
1,2,1,30,5.0,4,0,0,60
2,2,1,34,3.0,7,0,0,50
3,2,2,39,1.0,10,0,0,50
4,2,1,45,3.0,2,0,0,73


In [93]:
# Preview the target column.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: is_promoted, dtype: int64

In [103]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed makes the
# split repeatable.
X_tr, X_test, y_tr, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [95]:
# Cross-validated choice of C for L1-penalized logistic regression.
opt_c_l1 = LR_tune(X_tr,y_tr,'l1')



In [96]:
# Fit the L1 model on the training split with the selected C.
clf1 = LR_final(X_tr,y_tr,opt_c_l1,'l1')



In [97]:
# Predict labels for the held-out split with the L1 model.
y_pred1 = clf1.predict(X_test)

In [98]:
# F1 of the L1 model on the held-out split.
f1_score1 = f1_score(y_test,y_pred1)

In [99]:
# Display the held-out F1 of the L1 model.
f1_score1

0.10256410256410256

In [75]:
# Cross-validated choice of C for L2-penalized logistic regression.
opt_c_l2 = LR_tune(X_tr,y_tr,'l2')



In [76]:
# Fit the L2 model on the training split with the selected C.
clf2 = LR_final(X_tr,y_tr,opt_c_l2,'l2')



In [77]:
# Predict labels for the held-out split with the L2 model.
y_pred2 = clf2.predict(X_test)

In [78]:
# F1 of the L2 model on the held-out split.
f1_score2 = f1_score(y_test,y_pred2)

In [79]:
# confusion_matrix puts true labels on rows and predicted labels on columns,
# so for binary labels: TN is at (0,0), FP at (0,1), FN at (1,0), TP at (1,1).
confusion_matrix(y_test,y_pred2)

array([[15058,     0],
       [ 1377,     8]], dtype=int64)

In [80]:
# Display the held-out F1 of the L2 model.
f1_score2

0.011486001435750178

In [40]:
# Load the unlabelled test set; the path is relative to the working directory.
test_data  = pd.read_csv("test.csv")

In [41]:
# Preview the first rows of the test frame.
test_data.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [42]:
# Count missing values per column in the test frame.
test_data.isnull().sum()

employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

In [43]:
# Apply the same cleaning to the test frame as to the training frame:
# rating NaN -> 0, education NaN -> 'Below Secondary', then ordinal encoding.
# Results are assigned back to the columns: calling fillna(..., inplace=True)
# on a column selection is a chained in-place call, which under pandas
# Copy-on-Write leaves `test_data` unchanged.
test_data['previous_year_rating'] = test_data['previous_year_rating'].fillna(0)
test_data['education'] = test_data['education'].fillna('Below Secondary')
test_data['education'] = test_data['education'].map({"Master's & above":3,"Bachelor's":2,'Below Secondary':1})
# Confirm that no nulls remain.
test_data.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

In [44]:
# Test-set feature matrix; same columns, in the same order, as the training X.
X_check = test_data[[
    'education',
    'no_of_trainings',
    'age',
    'previous_year_rating',
    'length_of_service',
    'KPIs_met >80%',
    'awards_won?',
    'avg_training_score',
]]
# Alternative, smaller feature subset (disabled):
#X_check = test_data[['education','previous_year_rating','length_of_service','KPIs_met >80%','avg_training_score']]

In [45]:
# Predict promotions for the unlabelled test set with the L2 logistic-regression model.
y_pred3 = clf2.predict(X_check)

In [48]:
# Build the submission table: one row per test employee with the predicted label.
d = dict(
    employee_id=test_data['employee_id'],
    is_promoted=y_pred3,
)
upload = pd.DataFrame(data=d)

In [49]:
# Write the submission file without the index column.
upload.to_csv("upload.csv",index=False)

In [104]:
# Cross-validated choice of C for the SVC.
opt_h = rbfsvm_tune(X_tr,y_tr)



In [105]:
# Fit the SVC on the training split with the selected C.
clf3 = rbfsvm_final(X_tr,y_tr,opt_h)



In [106]:
# Predict labels for the held-out split with the SVC.
y_pred4 = clf3.predict(X_test)

In [107]:
# F1 of the SVC on the held-out split.
f1_score3 = f1_score(y_test,y_pred4)

In [108]:
# Display the held-out F1 of the SVC.
f1_score3

0.2504173622704507

In [109]:
# Predict promotions for the unlabelled test set with the SVC.
y_pred5 = clf3.predict(X_check)

In [110]:
# Build the submission table from the SVC predictions.
d = dict(
    employee_id=test_data['employee_id'],
    is_promoted=y_pred5,
)
upload = pd.DataFrame(data=d)

In [111]:
# Write the SVC submission file without the index column.
# NOTE(review): this is the same filename the logistic-regression export wrote
# to, so that file is overwritten — confirm that is intended.
upload.to_csv("upload.csv",index=False)