# Import libraries

In [28]:
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Reading Data

In [29]:
# Load the preprocessed train and test splits from the working directory,
# then preview the first rows of the training frame.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_data_prep.csv', 'test_data_prep.csv')
)
df_train.head()

Unnamed: 0,num__Age,num__Monthly Income,num__Distance from Home,cat__Work-Life Balance_Fair,cat__Work-Life Balance_Good,cat__Work-Life Balance_Poor,cat__Overtime_Yes,cat__Company Reputation_Fair,cat__Company Reputation_Good,cat__Company Reputation_Poor,...,cat__Job Satisfaction_Medium,cat__Job Satisfaction_Very High,cat__Marital Status_Married,cat__Marital Status_Single,cat__Number of Dependents_low,cat__Performance Rating_Below Average,cat__Performance Rating_High,cat__Performance Rating_Low,cat__Gender_Male,Attrition
0,-1.287802,2.294077,-0.808698,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,-0.539896,-0.442406,0.419519,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0
2,0.374213,0.974902,-0.28232,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0
3,-1.204702,0.964621,0.279151,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1
4,-1.454004,-0.531192,-0.913974,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1


In [30]:
# Separate the 'Attrition' target column from the remaining feature columns
# for both splits.
X_train, y_train = df_train.drop(columns='Attrition'), df_train['Attrition']
X_test, y_test = df_test.drop(columns='Attrition'), df_test['Attrition']

# Modelling

## Base Model

In [31]:
# Baseline: logistic regression with default settings, scored with 5-fold
# cross-validation using the estimator's default scorer.
log_reg = LogisticRegression()
log_reg_scores = cross_val_score(estimator=log_reg, X=X_train, y=y_train, cv=5)

# Report the per-fold scores followed by their mean and spread.
for label, value in (
    ('Logistic Regression Scores:', log_reg_scores),
    ('Mean Logistic Regression Score:', log_reg_scores.mean()),
    ('Standard Deviation:', log_reg_scores.std()),
):
    print(label, value)

Logistic Regression Scores: [0.75970055 0.75917334 0.75421763 0.75358499 0.75832982]
Mean Logistic Regression Score: 0.7570012652889078
Standard Deviation: 0.0025763814822374184


## More Models

In [8]:
# Candidate classifiers, all constructed with their default settings.
knn = KNeighborsClassifier()
svm = SVC()
tree = DecisionTreeClassifier()
rf = RandomForestClassifier()

# Score every candidate with 3-fold cross-validated accuracy (the baseline
# cell uses 5 folds) and collect the summary statistics in parallel lists.
models, scores, deviations = [], [], []
for model in (knn, svm, tree, rf):
    model_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=3)
    models += [type(model).__name__]
    scores += [model_scores.mean()]
    deviations += [model_scores.std()]

# Column-oriented results, ready to be turned into a DataFrame.
scores_dic = {
    'Model': models,
    'Mean_Score': scores,
    'Standard_Deviation': deviations,
}

In [9]:
# Display the raw results dictionary (model names, mean scores, standard deviations).
scores_dic

{'Model': ['KNeighborsClassifier',
  'SVC',
  'DecisionTreeClassifier',
  'RandomForestClassifier'],
 'Mean_Score': [0.6871573533315315,
  0.7521510078218677,
  0.6658161768666565,
  0.7408477584580737],
 'Standard_Deviation': [0.0013866328130485819,
  0.002743618504654566,
  0.0028452166107567375,
  0.0006481027954308448]}

In [10]:
# Tabulate the cross-validation results, one row per model.
scores_df = pd.DataFrame(data=scores_dic)
scores_df

Unnamed: 0,Model,Mean_Score,Standard_Deviation
0,KNeighborsClassifier,0.687157,0.001387
1,SVC,0.752151,0.002744
2,DecisionTreeClassifier,0.665816,0.002845
3,RandomForestClassifier,0.740848,0.000648


# Feature Selection

In [None]:
# Fit a random forest on the training data purely to rank the features by
# importance. A fixed seed keeps the ranking, and therefore the feature
# subset selected from it further down, reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
importances = rf.feature_importances_

# sort_values returns a new frame, so its result must be assigned. The
# previous version discarded it and plotted the features in column order.
importances_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

# Trailing semicolon suppresses the Axes repr in the cell output.
sns.barplot(x='Importance', y='Feature', data=importances_df);

In [None]:
# Keep the ten rows with the highest importance and extract their feature names.
top_10 = importances_df.sort_values(by='Importance', ascending=False).iloc[:10]
top_10_features = top_10.loc[:, 'Feature'].values

In [None]:
# Restrict both splits to the selected features. This rebinds X_train and
# X_test, so every later cell sees only these ten columns.
X_train, X_test = X_train[top_10_features], X_test[top_10_features]

# Hyperparameter Tuning

In [19]:
def grid_search(model, params, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1):
    """Run a grid search for one estimator and return the winning configuration.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    cv : default 3
        Forwarded to ``GridSearchCV`` as ``cv``.
    scoring : default 'accuracy'
        Forwarded to ``GridSearchCV`` as ``scoring``.
    n_jobs : default -1
        Forwarded to ``GridSearchCV`` as ``n_jobs``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_)`` read from the
        fitted ``GridSearchCV`` object.
    """
    # The defaults reproduce the previously hard-coded settings, so existing
    # four-argument calls behave exactly as before.
    grid = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid.fit(X_train, y_train)
    return grid.best_estimator_, grid.best_params_, grid.best_score_

In [24]:
# Search space per estimator, keyed by the estimator instance to be tuned.
grid_params = {
    svm: dict(kernel=['linear', 'rbf'], C=[0.1, 1, 10]),
    tree: dict(max_depth=[4, 5], min_samples_split=[4, 5]),
    rf: dict(n_estimators=[100, 200, 300], max_depth=[4, 5]),
}


In [27]:
# Tune every estimator listed in grid_params and collect one entry per model.
grid_dict = {}
models = []
best_estimators = []
best_params_lst = []
best_scores = []
for model, params in grid_params.items():
    best_estimator, best_params, best_score = grid_search(model, params, X_train, y_train)
    models.append(type(model).__name__)
    best_estimators.append(best_estimator)
    best_params_lst.append(best_params)
    best_scores.append(best_score)

grid_dict['Model'] = models
grid_dict['Best_Estimator'] = best_estimators
# Store the accumulated list. `best_params` only holds the dict for the last
# model tuned, and mixing that dict with the other lists breaks the
# DataFrame construction that consumes grid_dict.
grid_dict['Best_Params'] = best_params_lst
grid_dict['Best_Score'] = best_scores

AttributeError: 'dict' object has no attribute 'append'

In [None]:
# Tabulate the grid search results, one row per tuned model.
models_df = pd.DataFrame(data=grid_dict)
models_df

In [None]:
# Assumption: the baseline logistic regression is taken as the final model.
# cross_val_score only fits clones of the estimator it receives, so log_reg
# itself is still unfitted at this point. Fit it on X_train as it stands here
# (after the feature-selection cell that is the reduced column set) so that
# the saved object can actually predict. fit() returns the estimator itself.
best_model = log_reg.fit(X_train, y_train)

# Model Saving

In [None]:
import pickle

# Serialize the chosen model to model.pkl. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)