In [None]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the attrition CSV. Set the ATTRITION_CSV environment variable to
# point at your own copy; the original author's path is kept as the fallback so
# existing setups keep working.
DATA_PATH = os.environ.get(
    "ATTRITION_CSV",
    r"C:\Users\anasagar\Documents\LearnBay Machine Learning\PROJECT SESSION LEARNBAY\23rd & 24th SEP(HR DOMAIN)\Attrition.csv",
)
dataset = pd.read_csv(DATA_PATH)
dataset.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
dataset.info()

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

In [None]:
# Descriptive Analytics
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

**EDA(EXPLORATORY DATA ANALYSIS)**

In [None]:
# Column labels available for the exploratory plots below.
dataset.columns

In [None]:
# Share of each Attrition class.
# value_counts() orders the classes by frequency, so the slice labels are taken
# from its index; a hard-coded ['Yes', 'No'] list mislabels the slices whenever
# 'No' is the more frequent class.
attrition_counts = dataset['Attrition'].value_counts()
plt.pie(attrition_counts, labels=attrition_counts.index, autopct='%1.1f%%')
plt.title('Attrition')
plt.show()

In [None]:
# Analyse Age vs Attrition
# One density curve per Attrition class. sns.distplot is deprecated, so the
# curves are drawn with kdeplot; the explicit legend is what makes the
# 'No'/'Yes' labels visible.
sns.kdeplot(dataset.loc[dataset['Attrition']=='No', 'Age'], label='No', color='black')
sns.kdeplot(dataset.loc[dataset['Attrition']=='Yes', 'Age'], label='Yes', color='red')
plt.title("Relation between Age and Attrition of the Employee")
plt.legend(title='Attrition')
plt.show()


In [None]:
# Department vs Attrition: head-count per class and attrition rate per department
department = pd.crosstab(dataset['Department'], dataset['Attrition'])
department['Total'] = department['No'] + department['Yes']
# Scale to 0-100 so the values match the 'Percentage' column name.
department['Percentage'] = 100 * department['Yes'] / department['Total']
department

In [None]:
# Attrition counts per department as grouped bars.
# The frame is built from the Department column, so both its name and the
# chart title say "Department".
department_wise = pd.crosstab(dataset['Department'], dataset['Attrition'])
department_wise.plot(kind='bar')
plt.title("Department vs Attrition")
plt.show()

In [None]:
# Analyse PercentSalaryHike vs Attrition
# One density curve per Attrition class (kdeplot replaces the deprecated
# sns.distplot); the legend shows which colour is which class.
sns.kdeplot(dataset.loc[dataset['Attrition']=='No', 'PercentSalaryHike'], label='No', color='black')
sns.kdeplot(dataset.loc[dataset['Attrition']=='Yes', 'PercentSalaryHike'], label='Yes', color='red')
plt.title("Relation between PercentSalaryHike and Attrition of the Employee")
plt.legend(title='Attrition')
plt.show()

In [None]:
# Attrition counts for each gender, drawn as grouped bars
gender_wise = pd.crosstab(index=dataset['Gender'], columns=dataset['Attrition'])
gender_wise.plot.bar()
plt.title("Gender vs Attrition")

In [None]:
# Analyse WorkLifeBalance vs Attrition
# One density curve per Attrition class (kdeplot replaces the deprecated
# sns.distplot); the legend shows which colour is which class.
sns.kdeplot(dataset.loc[dataset['Attrition']=='No', 'WorkLifeBalance'], label='No', color='black')
sns.kdeplot(dataset.loc[dataset['Attrition']=='Yes', 'WorkLifeBalance'], label='Yes', color='red')
plt.title("Relation between WorkLifeBalance and Attrition of the Employee")
plt.legend(title='Attrition')
plt.show()

In [None]:
# # Pandas profiling
# !pip install pandas_profiling
# import pandas_profiling
# profile = pandas_profiling.ProfileReport(dataset)
# profile.to_file(output_file="dataset_analysis.html")

In [None]:
# !pip install dtale
# import dtale
# dtale.show(dataset)

In [None]:
# This is just for your reference - dataset = titanic, which is available on Kaggle
# !pip install dataprep
# from dataprep.datasets import load_dataset
# from dataprep.eda import create_report
# df = load_dataset("titanic")
# create_report(df).show()

**Outlier Treatment**

In [None]:
def distplots(col):
  """Plot the distribution of one column of the module-level `dataset`.

  Draws a density-scaled histogram with a KDE overlay, prints a banner
  containing the column name, and shows the figure.

  Args:
    col: label of the column to plot.
  """
  # sns.distplot is deprecated; histplot with kde=True and stat='density'
  # reproduces its default histogram + density curve.
  sns.histplot(dataset[col], kde=True, stat='density')
  print("**************************",col,"***********************")
  plt.show()

# One figure per non-object column.
for col in dataset.select_dtypes(exclude='object').columns:
  distplots(col)

In [None]:
def boxplots(col):
  """Draw a horizontal box plot of one column of the module-level `dataset`.

  Prints a banner containing the column name and shows the figure.

  Args:
    col: label of the column to plot.
  """
  # Pass the values as `x=` explicitly: the meaning of seaborn's first
  # positional argument differs between releases (x vs. data), which changes
  # the orientation of the box.
  sns.boxplot(x=dataset[col])
  print("**************************",col,"***********************")
  plt.show()

# One figure per non-object column.
for col in dataset.select_dtypes(exclude='object').columns:
  boxplots(col)


In [None]:
 # Checking correlation between features
# Correlate the numeric columns only: the frame still contains object (string)
# columns at this point, and DataFrame.corr() raises on those in pandas >= 2.0.
numeric_corr = dataset.select_dtypes(include='number').corr()
plt.figure(figsize=(20,15))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.show()

**Encoding**

In [None]:
# Names of every column stored with the generic `object` dtype
cat_column = [name for name in dataset.columns if dataset[name].dtype == 'object']
print(cat_column)

In [None]:
# Label encoder
# Categorical columns with at most two distinct values are replaced by their
# integer category codes.
binary_columns = [name for name in cat_column if dataset[name].nunique() <= 2]
for name in binary_columns:
  dataset[name] = dataset[name].astype('category').cat.codes

In [None]:
# One Hot Encoder
# Dummy-encode the categorical columns that have more than two levels
# (drop_first=True drops one level per column). A column that a previous run of
# this cell already expanded is no longer in the frame, so it is skipped
# instead of raising a KeyError when the cell is re-executed.
for i in cat_column:
  if i in dataset.columns and dataset[i].nunique()>2:
    dataset=pd.get_dummies(dataset,columns=[i],drop_first=True)

In [None]:
# Separate the target column from the predictors
y = dataset['Attrition']
x = dataset.drop(columns='Attrition')

In [None]:
# First rows of the predictor frame (every column except 'Attrition').
x.head()

In [None]:
# First values of the target column 'Attrition'.
y.head()

**Imbalance Checks**

In [None]:
# Number of rows in each target class.
y.value_counts()

In [None]:
import imblearn

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Random oversampling duplicates minority-class rows until the classes are
# balanced. The seed is fixed so the same rows are duplicated on every run.
over = RandomOverSampler(random_state=101)
x_over, y_over = over.fit_resample(x,y)

In [None]:
# Class counts before and after oversampling, separated by an empty line
print(y.value_counts(), y_over.value_counts(), sep="\n\n")

In [None]:
# split the data into training and test
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
# Split the ORIGINAL rows first and oversample only the training part.
# Splitting the already oversampled frame puts copies of the same minority row
# into both train and test, which leaks test rows into training and inflates
# every test metric reported below.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=101, stratify=y)
# stratify=y keeps the original class ratio in the held-out test set; the
# training set is then balanced by duplicating its own minority rows.
x_train, y_train = RandomOverSampler(random_state=101).fit_resample(x_train, y_train)

**Pycaret Package**

In [None]:
# creating training data
# Sample 90% of the rows and keep their ORIGINAL index for now: it is what
# identifies the sampled rows in `dataset`. Resetting the index before the
# drop would remove the first 90% of rows by position instead, leaving
# sampled rows inside the "unseen" set.
sampled = dataset.sample(frac=0.90, random_state=123)

# Creating test data: the rows that were not sampled
data_unseen = dataset.drop(sampled.index).reset_index(drop=True)
data = sampled.reset_index(drop=True)

# `data` is used for model building and `data_unseen` for prediction


In [None]:
# Report the (rows, columns) shape of the modelling and hold-out frames
print(f"data for modelling : {data.shape}")
print(f"unseen data for prediction : {data_unseen.shape}")

In [None]:
!pip install pycaret

In [None]:
# !pip install --upgrade scipy
# !pip install --upgrade numpy pandas pycaret


In [None]:
# setting up an environment in pycaret
# Import only the functions this notebook calls, so it is clear where each
# name comes from and nothing else in the namespace is overwritten.
from pycaret.classification import (
    compare_models,
    create_model,
    evaluate_model,
    predict_model,
    save_model,
    setup,
    tune_model,
)

In [None]:
# Initialise the PyCaret experiment on `data` with 'Attrition' as the target.
# session_id=123 is the seed PyCaret uses for the experiment.
exp_clf=setup(data=data,target='Attrition',session_id=123)

In [None]:
# check model
# Let PyCaret train and rank its available classifiers on the experiment data.
compare_models()

In [None]:
# Hyper Parameter tuning
# A random forest has hyperparameters such as max_depth, criterion
# (gini/entropy), max_features and bootstrap; tune_model searches over
# candidate values and returns the best-scoring model.
# tune_model needs a model created inside this PyCaret experiment. No `rf`
# exists at this point on a fresh kernel, so create one first.
rf_model = create_model('rf')
tuned_rf = tune_model(rf_model)

In [None]:

# Train the model registered in PyCaret under the ID 'gbc'.
gbc=create_model('gbc')

In [None]:
# Evaluate the model
# Opens PyCaret's evaluation view for the trained `gbc` model.
evaluate_model(gbc)


In [None]:
# No `data` argument is passed here, so the predictions are made on the data
# PyCaret held out itself during setup().
predict_model(gbc)

In [None]:
# Predict the unseen test data
# Score the rows set aside in `data_unseen` and display the returned frame.
unseen_prediction=predict_model(gbc,data=data_unseen)
unseen_prediction

In [None]:
# Save model for deployment
# Persists `gbc` under the name 'gradientBoosting_model'.
save_model(gbc,'gradientBoosting_model')

**RandomForest Classifier Model**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the fitted trees, and therefore every metric reported
# below, are the same on each run.
rf = RandomForestClassifier(random_state=101)
rf.fit(x_train, y_train)

In [None]:
# Predictions of the fitted forest on the training and test splits
y_pred_train, y_pred_test = (rf.predict(features) for features in (x_train, x_test))

In [None]:
# Evaluation metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Training report, a separator line, then the test report
print(
    classification_report(y_train, y_pred_train),
    "************"*10,
    classification_report(y_test, y_pred_test),
    sep="\n",
)

In [None]:
# Training confusion matrix, a separator line, then the test confusion matrix
print(
    confusion_matrix(y_train, y_pred_train),
    "************"*10,
    confusion_matrix(y_test, y_pred_test),
    sep="\n",
)

In [None]:
# Training accuracy, a separator line, then the test accuracy
print(
    accuracy_score(y_train, y_pred_train),
    "************"*10,
    accuracy_score(y_test, y_pred_test),
    sep="\n",
)

**Cross Validation Model**

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated scores of the forest on the training split
training_accuracy = cross_val_score(rf, x_train, y_train, cv=10)
cv_summary = {
    "Avg Accuracy :": training_accuracy.mean(),
    "Max Accuracy :": training_accuracy.max(),
    "Min Accuracy :": training_accuracy.min(),
}
for caption, value in cv_summary.items():
    print(caption, value)

**SVM**

In [None]:
from sklearn.svm import SVC

In [None]:
# SVM - compare the four built-in kernels on the same train/test split
def evaluate_svm_kernel(kernel):
    """Fit an SVC with the given kernel and score it on the test split.

    Uses the module-level x_train/y_train for fitting and x_test/y_test for
    scoring.

    Args:
        kernel: kernel name passed to sklearn.svm.SVC.

    Returns:
        Tuple of (fitted model, predictions for x_test, accuracy on y_test).
    """
    model = SVC(kernel=kernel)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return model, predictions, accuracy_score(y_test, predictions)

# kernel = linear
svm_linear, y_pred_linear, accuracy_linear = evaluate_svm_kernel('linear')
linear = svm_linear  # fit() returns the estimator itself, so both names are the same object

# kernel = sigmoid
svm_sigmoid, y_pred_sigmoid, accuracy_sigmoid = evaluate_svm_kernel('sigmoid')
sigmoid = svm_sigmoid

# kernel = poly
svm_poly, y_pred_poly, accuracy_poly = evaluate_svm_kernel('poly')
poly = svm_poly

# kernel = rbf
svm_rbf, y_pred_rbf, accuracy_rbf = evaluate_svm_kernel('rbf')
rbf = svm_rbf

# Show the four test accuracies side by side as the cell output.
pd.Series(
    {
        'linear': accuracy_linear,
        'sigmoid': accuracy_sigmoid,
        'poly': accuracy_poly,
        'rbf': accuracy_rbf,
    },
    name='test accuracy',
)

**Boosting method**

In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [None]:
# AdaBoost Model
# Seeded so the result is reproducible.
adaboost = AdaBoostClassifier(random_state=101)
adbt = adaboost.fit(x_train, y_train)
y_pred_adbt = adaboost.predict(x_test)
accuracy_adbt = accuracy_score(y_test, y_pred_adbt)
# Display the test accuracy as the cell output.
accuracy_adbt

In [None]:
# GradientBoostingClassifier
# Seeded so the result is reproducible.
gdboosting = GradientBoostingClassifier(random_state=101)
gdbt = gdboosting.fit(x_train, y_train)
y_pred_gdbt = gdboosting.predict(x_test)
accuracy_gdbt = accuracy_score(y_test, y_pred_gdbt)
# Display the test accuracy as the cell output.
accuracy_gdbt


In [None]:
# XGBClassifier
# Seeded so the result is reproducible.
xgboost = XGBClassifier(random_state=101)
xgbt = xgboost.fit(x_train, y_train)
y_pred_xgbt = xgboost.predict(x_test)
accuracy_xgbt = accuracy_score(y_test, y_pred_xgbt)
# Display the test accuracy as the cell output.
accuracy_xgbt


**KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Select the best K value to decide "n_neighbors" basis error_rate
# For every K from 1 to 49: fit KNN on the training split and record the
# fraction of x_test rows it misclassifies.
error_rate = []
for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    misclassified = y_pred != y_test
    error_rate.append(misclassified.mean())

In [None]:
# Misclassification rate on x_test for K = 1..49, in that order.
error_rate

In [None]:
# Error rate as a function of K: dashed red line with blue-filled markers
fig, ax = plt.subplots(figsize=(16, 16))
ax.plot(
    range(1, 50),
    error_rate,
    color='red',
    linestyle='dashed',
    marker='o',
    markersize=12,
    markerfacecolor='blue',
)
ax.set_title("Error Rate vs K-Value")
ax.set_xlabel("K-Value")
ax.set_ylabel("Error Rate")
plt.show()

In [None]:
# Refit KNN on the training split with K fixed at 3.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)

In [None]:
# Predictions of the fitted KNN model on the training and test splits
y_pred_train, y_pred_test = map(knn.predict, (x_train, x_test))

In [None]:
# Evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Training confusion matrix, a separator line, then the test confusion matrix
print(
    confusion_matrix(y_train, y_pred_train),
    "*****************"*10,
    confusion_matrix(y_test, y_pred_test),
    sep="\n",
)

In [None]:
# Training report, a separator line, then the test report
print(
    classification_report(y_train, y_pred_train),
    "*****************"*10,
    classification_report(y_test, y_pred_test),
    sep="\n",
)

In [None]:
# Accuracy of the KNN model on each split, with a separator line between them
knn_train_accuracy = accuracy_score(y_train, y_pred_train)
knn_test_accuracy = accuracy_score(y_test, y_pred_test)
print("Training Accuracy :", knn_train_accuracy)
print("*****************"*10)
print("Test Accuracy :", knn_test_accuracy)

**Hyperparameter Tuning**

Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
#Support Vector Machine
# C is the cost of misclassification (regularization parameter, inversely
# proportional to lambda: λ=1/C) and gamma controls the influence of a single
# training sample. Neither has a universally better direction; the search
# keeps the combination with the best cross-validated score.
# gamma is ignored by the linear kernel, so that kernel gets its own sub-grid
# instead of being refitted once per gamma value.
c_values=[0.1,1,10,100]
param_grid=[
    {'kernel':['linear'],'C':c_values},
    {'kernel':['sigmoid','poly','rbf'],'C':c_values,'gamma':[1,0.1,0.01,0.0001]},
]

# n_jobs=-1 runs the candidate fits in parallel; refit=True retrains the best
# candidate on the whole training split so `grid` can predict directly.
grid=GridSearchCV(SVC(),param_grid,refit=True,n_jobs=-1)
grid.fit(x_train,y_train)
grid_pred_train=grid.predict(x_train)
grid_pred_test=grid.predict(x_test)
print("Training Accuracy :",accuracy_score(y_train,grid_pred_train))
print("Test Accuracy :",accuracy_score(y_test,grid_pred_test))

print("Best Tuned HyperParameters :{}".format(grid.best_params_))

RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Support Vector Machine - randomized search over the same space as the grid search
param_grid={'C':[0.1,1,10,100],'gamma':[1,0.1,0.01,0.0001],'kernel':['linear','sigmoid','poly','rbf']}
# C is the cost of misclassification (regularization parameter, inversely
# proportional to lambda: λ=1/C) and gamma controls the influence of a single
# training sample. The search keeps the sampled combination with the best
# cross-validated score.

# random_state fixes which combinations are sampled, so the reported best
# parameters are reproducible; n_jobs=-1 runs the candidate fits in parallel.
rnd=RandomizedSearchCV(SVC(),param_grid,refit=True,random_state=101,n_jobs=-1)
rnd.fit(x_train,y_train)
rnd_pred_train=rnd.predict(x_train)
rnd_pred_test=rnd.predict(x_test)
print("Training Accuracy :",accuracy_score(y_train,rnd_pred_train))
print("Test Accuracy :",accuracy_score(y_test,rnd_pred_test))

print("Best Tuned HyperParameters :{}".format(rnd.best_params_))