In [1697]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm
import seaborn as sns
#Importing homemade functions
from Healthcare_EDA_functions import *

In [1698]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import joblib

In [1699]:
#Modify the supervisor variable and add the Supervisor Rating
class Rate_Supervisor(TransformerMixin):
    
    def __init__(self,d_supervisors):
        self.goodL1 = d_supervisors['goodL1']
        self.badL1 = d_supervisors['badL1']
        self.goodL2 = d_supervisors['goodL2']
        self.badL2 = d_supervisors['badL2']
        pass
    
    def transform(self,X,y=None):
        #Fix L1 Supervisor Ratings
        badL1_lst=badL1_grp.index
        goodL1_lst = goodL1_grp.index
        X['L1_Supervisor_Rating'] = X.L1Supervisor.apply(lambda x: sortL1(x,badL1_lst,goodL1_lst))
        #Fix L2 Supervisor Ratings
        goodL2_grp = fixL2_grp[fixL2_grp.percent_resigned<26]
        badL2_lst=badL2_grp.index
        goodL2_lst = goodL2_grp.index
        X['L2_Supervisor_Rating'] = X.L2Supervisor.apply(lambda x: sortL2(x,badL2_lst,goodL2_lst))
        return X
    
    def fit(self,X,y):
        return self  

In [1700]:
#Split the Designation variable into three component pieces
class Split_Designation(TransformerMixin):
    
    def __init__(self):
        pass
    
    def transform(self,X,y=None):
        X = pd.DataFrame(X)
        X['primary_designation'] = X.Designation.apply(split_desig_prim)
        X['secondary_designation'] = X.Designation.apply(split_desig_sec)
        X['descriptor_designation'] = X.Designation.apply(split_desig_des)
        X['secondary_designation'] = X.secondary_designation.fillna('not_assigned')
        return X
    
    def fit(self,X,y):
        return self

In [1701]:
#Add the Client Rating variable
class Rate_Clients(TransformerMixin):
    
    def __init__(self):
        pass
    
    def transform(self,X,y=None):
        clients_bad = ['ALN', 'Athena', 'Brightree', 'Cymetrix', 'HAP', 'PPM', 'Source Medical', 'TPX']
        clients_good = ['Altruis', 'Continuum', 'MRG', 'Shared Services', 'T-Systems', 'Zotec', 'abeo']
        bad_clients = clients_bad
        good_clients = clients_good
        X['Client_Rating'] = X.Client.apply(lambda x: sortClients(x,bad_clients,good_clients))
        return X
    
    def fit(self,X,y):
        return self

In [1702]:
#Split the Functionality Variable
class Split_Functionality(TransformerMixin):
    
    def __init__(self):
        pass
    
    def transform(self,X,y=None):
        X['secondary_functionality'] = X.Functionality.apply(split_functionality)
        X['primary_functionality']=X.Functionality.apply(split_functionality2)
        X['secondary_functionality']=X.secondary_functionality.fillna('none')
        return X
    
    def fit(self,X,y):
        return self

In [1722]:
#Create the variables relating to age_at_join
class Calc_Age(TransformerMixin):
    
    def __init__(self):
        pass
    
    def transform(self,X,y=None):
        X['age_at_join'] = (X['DOJ'] - X['DOB']) / np.timedelta64(1, 'D')
        return X
    
    def fit(self,X,y):
        return self

In [1704]:
#Modify the Pincode Variable and add Districts and Moved variable
class Add_District(TransformerMixin):
    
    def __init__(self):
        pass
    
    def import_districts(district_file):
        pin_tab = pd.read_csv(district_file,encoding='ISO-8859-1')
        pin_tab = pin_tab.loc[0::,['pincode','Districtname','statename']]
        pin_tab=pin_tab.drop_duplicates(subset='pincode',keep='first')
        return pin_tab
    
    
    def add_moved(X):
        X['Per_Pincode'] = np.where(X['Per_Pincode'].isnull(),X['Pre_Pincode'],X['Per_Pincode'])
        X['Moved'] = np.where(X['Pre_Pincode']==X['Per_Pincode'],'no','yes')
        return X
    
    def transform(self,X,y=None):
        pin_tab = Add_District.import_districts('all_india_pin_code.csv')
        X = Add_District.add_districts(X,pin_tab)
        X = Add_District.add_moved(X)
        return X

    def add_districts(X,pin_tab):
        X = pd.merge(left=X,right=pin_tab[['Districtname']],left_on=['Pre_Pincode'],right_on=pin_tab['pincode'],how='left')
        X['Districts'] = X.Districtname.apply(district_group)
        X.Districts.fillna('Unknown',inplace=True)
        return X        
    
    def fit(self,X,y):
        return self

In [1705]:
#Cleanup the Highest Degree field
class Clean_HighestDegree(TransformerMixin):
    
    def __init__(self):
        pass
    
    def transform(self,X,y=None):
        X.HighestDegree.fillna('None',inplace=True)
        X['HighestDegree'] = X.HighestDegree.astype(str)
        X['HighestDegree'] = X.HighestDegree.apply(clean_column)
        X['HighestDegree'] = X.HighestDegree.apply(RepBSC)
        X['HighestDegree'] = X.HighestDegree.apply(RepBCOM)
        X['HighestDegree'] = X.HighestDegree.apply(RepBE)
        X['HighestDegree'] = X.HighestDegree.apply(RepBCA)
        X['HighestDegree'] = X.HighestDegree.apply(RepBPHARM)
        X['HighestDegree'] = X.HighestDegree.apply(RepBTECH)
        X['HighestDegree'] = X.HighestDegree.apply(RepBA)
        X['HighestDegree'] = X.HighestDegree.apply(RepMSC)
        X['HighestDegree'] = X.HighestDegree.apply(RepMBA)
        X['HighestDegree'] = X.HighestDegree.apply(RepBCS)
        X['HighestDegree'] = X.HighestDegree.apply(RepBBA)
        X['HighestDegree'] = X.HighestDegree.apply(RepBBM)
        X['HighestDegree'] = X.HighestDegree.apply(RepMCA)
        X['HighestDegree'] = X.HighestDegree.apply(RepBE2)
        X['HighestDegree'] = X.HighestDegree.apply(RepDip)
        X['HighestDegree'] = X.HighestDegree.apply(MPharm)
        X['HighestDegree'] = X.HighestDegree.apply(clean_desig)
        X['degree_title'] = X.HighestDegree.apply(split_degree)
        return X
    
    def fit(self,X,y):
        return self

In [1706]:
#Create the total leaves variable
class Add_Leaves(TransformerMixin):
    
    def __init__(self):
        self.sick = X.NoOfSickLeaveAvailed
        self.casual = X.NoOfCLAvailed
        pass
    
    def transform(self,X,y=None):
        X['total_leaves'] = self.casual + self.sick
        return X
    
    def fit(self,X,y):
        return self

In [1707]:
#Add misc. variables
class Add_Vars(TransformerMixin):
    
    def __init__(self):
        pass
    
    def transform(self,X,y=None):
        X['Operation_Assistant'] = np.where(((X['primary_functionality']=='operations') & 
                                       (X['primary_designation']=='assistant')),1,0)
        X['Shared_Software'] = np.where(((X['primary_functionality']=='shared services') & 
                                       (X['primary_designation']=='software')),1,0)
        X=X.rename(columns={'Productivity%':'Productivity','Quality%':'Quality'})
        #Variable for when Productivity is null and Quality isn't
        X['NullProd'] = np.where((X.Productivity.isnull() & X.Quality.notnull()),1,0)
        #Variable for when Quality is null and Productivity isn't
        X['NullQual'] = np.where((X.Quality.isnull() & X.Productivity.notnull()),1,0)
        #Variable for when both Productivity and Quality are null
        X['Both_Null_Prod_Qual'] = np.where((X.Quality.isnull() & X.Productivity.isnull()),1,0)
        return X
    
    def fit(self,X,y):
        return self

In [1708]:
#Keep only useful features
class Keep_Features(TransformerMixin):
    
    def __init__(self):
        self.feats =['Gender','primary_designation','secondary_designation',\
           'Client','secondary_functionality','MaritalStatus','NullProd','NullQual','Both_Null_Prod_Qual',\
           'HighestDegree','primary_functionality','Districts','PrevCompanyExp','Moved',\
           'total_leaves','age_at_join','degree_title','Dependents','Productivity','LateEntry',\
           'Operation_Assistant','Shared_Software','Shift']
        pass
    
    def transform(self,X,y=None):
        
        X = X[self.feats]
        return X
    
    def fit(self,X,y):
        return self

In [1709]:
#Create dummy variables
class Dummy(TransformerMixin):
    
    def __init__(self,d_accepted):
        self.d_accepted = d_accepted
        pass
    
    def transform(self,X,y=None):
        for var in X.columns:
            if X[var].dtypes == object:
                X[var] = pd.Categorical(X[var],self.d_accepted[var])
        X = pd.get_dummies(X)
        return X
        
    def fit(self,X,y):
        return self

In [1710]:
#Impute values for Productivity
class Impute(TransformerMixin):
    
    def __init__(self,mean):
        self.mean = mean
        pass
    
    def transform(self,X,y=None):
        X['Productivity']=X.Productivity.fillna(self.mean)
        return X
    
    def fit(self,X,y):
        return self

In [1728]:
X

Unnamed: 0,Empcode,Name,Gender,L1Supervisor,L2Supervisor,Designation,Client,Functionality,DOJ,DOB,...,Shift,L1_Supervisor_Rating,L2_Supervisor_Rating,primary_designation,secondary_designation,descriptor_designation,Client_Rating,secondary_functionality,primary_functionality,age_at_join
0,C07383,Dineshkumar Umapathi,M,seju.sebastian,harish.chandras,Assistant Team Leader,Source Medical,Operations - Revenue Cycle,2012-07-11,1986-06-19,...,Day,Bad,Neutral,assistant,team leader,,Bad,revenue cycle,operations,9519.0
1,F034899,Rajeshwari Amarsingh,F,sathishkumar.ch,kannan.poornali,Executive,Athena,Operations - Clinical Document Services,2015-03-13,1992-07-09,...,Day,Bad,Bad,executive,,,Bad,clinical document services,operations,8282.0
2,E093441,Abinash S,M,rathish.seethar,magesh.m1,Senior Executive,Shared Services,Shared Services - Human Resource,2014-09-08,1992-07-25,...,Day,Good,Neutral,senior,executive,,Good,human resource,shared services,8080.0
3,D051008,Logesh Kumar,M,sathishkumar.ch,kannan.poornali,Senior Executive,Athena,Operations – Clinical,2013-05-20,1991-08-03,...,Day,Bad,Bad,senior,executive,,Bad,clinical,operations,7961.0
4,F116696,Raja K,M,rajan.s,sathishkumar.j,Executive,Athena,Operations - Data Entry,2015-11-27,1988-05-14,...,Day,Bad,Bad,executive,,,Bad,data entry,operations,10058.0
5,G038672,Posen Nokpa,F,seju.sebastian,harish.chandras,Senior Team Leader,Source Medical,Operations - Revenue Cycle,2016-03-22,1986-12-11,...,Day,Bad,Neutral,senior,team leader,,Bad,revenue cycle,operations,10694.0
6,F126884,Vinothini J,F,nishamary.kuria,mirza.karim,Assistant Executive,Athena,Operations - Data Entry,2015-12-16,1989-05-13,...,Day,Neutral,Bad,assistant,executive,,Bad,data entry,operations,9713.0
7,F126978,Bhaskar Sawant,M,sarita.raitani,suleman.shaikh,Assistant Executive,Athena,Operations – Clinical,2015-12-18,1990-07-05,...,Day,Bad,Good,assistant,executive,,Bad,clinical,operations,9297.0
8,F127004,Bickas Bahadur T,M,aliakbar.sibbak,rajkumar.balasu,Executive,Athena,Operations - Payments Posting Data Entry,2015-12-23,1995-07-19,...,Day,Bad,Neutral,executive,,,Bad,payments posting data entry,operations,7462.0
9,D081399,Mythili Devi,F,sivashankar.chi,nirmalvickne.s,Subject Matter Expert,ALN,Operations - Revenue Cycle,2013-08-19,1990-03-08,...,Day,Bad,Bad,subject,matter expert,,Bad,revenue cycle,operations,8565.0


In [1731]:
file = "HR Data.xlsx"
df1 = pd.read_excel(file)

#Cleaning up the DOL field
df1['DOL'] = df1.DOL.apply(lambda x: 0 if x=='No' else 1 if x=='Yes' else 0 if x==0 else 1)

df1=df1.rename(columns={'Productivity%':'Productivity','Quality%':'Quality'})

y = df1['DOL']

X = df1.drop('DOL',axis=1)

d_accepted = joblib.load('column_vals.pkl')

d_supervisors = joblib.load('sup_dict.pkl')

processor = Pipeline([
                ('supervisor',Rate_Supervisor(d_supervisors)),
                ('designation',Split_Designation()),
                ('clients',Rate_Clients()),
                ('functionality',Split_Functionality()),
                ('age',Calc_Age()),
                ('districts',Add_District()),
                ('degree',Clean_HighestDegree()),
                ('leaves',Add_Leaves()),
                ('misc',Add_Vars()),
                ('feats',Keep_Features()),
                ('Impute_prod',Impute(X_train.Productivity.mean())),
                ('get_dummies',Dummy(d_accepted)),
                ('gbm',GradientBoostingClassifier(random_state=2,learning_rate=0.1,n_estimators=174,max_depth=8,
                                          min_samples_split=650,max_features =0.5,subsample = 0.95))
                ])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=2)

model = processor.fit(X_train,y_train)
pred = model.predict(X_test)
gbc_score_base = accuracy_score(pred,y_test)
print('Accuracy is: '+str(gbc_score_base))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Accuracy is: 0.889066666667


In [1732]:
joblib.dump(model,'model.pkl')

['model.pkl']

In [1717]:
joblib.dump(model,'pipeline.h5')

['pipeline.h5']

In [1691]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X,y,test_size=0.4,random_state=45)
pred2 = processor.predict(X_test2)
gbc_score_base2 = accuracy_score(pred2,y_test2)
print('Accuracy is: '+str(gbc_score_base2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Accuracy is: 0.904


In [1733]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X,y,test_size=0.4,random_state=456)
test_pipe = joblib.load('model.pkl')
pred3 = test_pipe.predict(X_test2)
gbc_score_base3 = accuracy_score(pred3,y_test2)
print('Accuracy is: '+str(gbc_score_base3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Accuracy is: 0.9016


In [1736]:
test_pipe = joblib.load('model.pkl')
pred3 = test_pipe.predict(X)
pred4 = pred3.tolist()
gbc_score_base3 = accuracy_score(pred3,y)
print('Accuracy is: '+str(gbc_score_base3))

Accuracy is: 0.8976


In [1735]:
Emps = X.loc[0::,['Empcode']]
Emps['Prediction'] = pred4
Emps['Actual'] = y
print(Emps)

       Empcode  Prediction  Actual
0       C07383           0       0
1      F034899           0       0
2      E093441           0       0
3      D051008           1       1
4      F116696           1       1
5      G038672           0       0
6      F126884           1       1
7      F126978           1       1
8      F127004           0       0
9      D081399           1       1
10     E073222           0       0
11     E093453           0       0
12     F126918           0       0
13     F126753           1       1
14     F126979           0       0
15     G027530           0       0
16     F127009           0       0
17     F127012           0       0
18     E103756           0       0
19     E113937           1       1
20     G069683           1       1
21     F024683           0       1
22     E124186           0       0
23     G037810           0       0
24     G038257           1       1
25     G038423           0       0
26     G059359           0       0
27     G038431      