In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Load the CSV dataset into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
csv_path = '/desktop/Bootcamp/data1.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [None]:
# Peek at the first row to check the column layout after loading.
df.head(1)

In [None]:
# Summarise column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Visualise where values are missing: one heatmap cell per DataFrame entry.
plt.figure(figsize=(10, 6))
null_mask = df.isna()
sns.heatmap(null_mask, cmap='viridis', yticklabels=False)

In [None]:
# List the column names available in the dataset.
df.columns

In [None]:
# Convert both date columns to pandas datetimes so their parts can be extracted.
for date_col in ('Dateofjoining', 'LastWorkingDate'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Split the parsed dates into calendar components using the vectorised `.dt`
# accessor instead of a per-row Python lambda.
# Rows with a missing LastWorkingDate yield NaN in the *_of_leave columns.
df['Year_of_join'] = df['Dateofjoining'].dt.year
df['Month_of_join'] = df['Dateofjoining'].dt.month
df['Day_of_join'] = df['Dateofjoining'].dt.day
df['Year_of_leave'] = df['LastWorkingDate'].dt.year
df['Month_of_leave'] = df['LastWorkingDate'].dt.month

In [None]:
# The raw joining date is no longer needed once its year/month/day parts exist.
df = df.drop(columns=['Dateofjoining'])

In [None]:
# Create the target column as an all-NaN placeholder; it is populated in a later cell.
df = df.assign(Attrition=np.nan)

In [None]:
# Reposition the target and the join-date parts: each column is popped and
# re-inserted at a fixed position, in this exact order.
column_positions = [
    (1, 'Attrition'),
    (8, 'Year_of_join'),
    (9, 'Month_of_join'),
    (10, 'Day_of_join'),
]
for position, column in column_positions:
    moved = df.pop(column)
    df.insert(position, column, moved)

In [None]:
# Store the join-date components as integers.
join_parts = ['Year_of_join', 'Month_of_join', 'Day_of_join']
df = df.astype(dict.fromkeys(join_parts, int))

In [None]:
# Target label: 1 when a last working date is recorded, 0 when it is missing.
has_left = df['LastWorkingDate'].notna()
df['Attrition'] = has_left.astype(int)

In [None]:
# The raw leaving date is now represented by Attrition and the *_of_leave parts.
df = df.drop(columns=['LastWorkingDate'])

In [None]:
# Inspect the first rows after the date/target feature engineering.
df.head(3)

In [None]:
# Row counts per Gender, split by Education_Level.
plt.figure(figsize=(10, 6))
sns.countplot(
    x='Gender',
    hue='Education_Level',
    data=df,
)

In [None]:
# Count non-null Emp_ID rows per (join year, join month); months become columns.
joiners = (
    df.groupby(['Year_of_join', 'Month_of_join'])['Emp_ID']
    .count()
    .unstack()
)

In [None]:
# Heatmap of the join counts (rows: year, columns: month), annotated with the values.
plt.figure(figsize=(10, 6))
sns.heatmap(
    joiners,
    cmap='magma',
    annot=True,
    fmt='.4g',
)

In [None]:
# Class balance of the target: number of rows per Attrition value.
df['Attrition'].value_counts()

In [None]:
# Count non-null Emp_ID rows per (leave year, leave month); months become columns.
leavers = (
    df.groupby(['Year_of_leave', 'Month_of_leave'])['Emp_ID']
    .count()
    .unstack()
)

In [None]:
# Heatmap of the leave counts (rows: year, columns: month), annotated with the values.
plt.figure(figsize=(10, 6))
sns.heatmap(
    leavers,
    cmap='magma',
    annot=True,
    fmt='.4g',
)

In [None]:
# Joint distribution of Age and Salary, coloured by Gender.
sns.jointplot(
    x='Age',
    y='Salary',
    hue='Gender',
    data=df,
    height=10,
)

In [None]:
# Scatter of Salary vs Age in a grid: one column per Gender, one row per join year.
# FacetGrid is figure-level and creates its own figure, so a preceding
# plt.figure() call would only leave an extra empty figure in the output.
fg = sns.FacetGrid(df, col="Gender", row='Year_of_join')
fg.map(sns.scatterplot, "Salary", "Age")

In [None]:
# Non-null Emp_ID counts per (Education_Level, Gender, Attrition), with City as columns.
group_keys = ['Education_Level', 'Gender', 'Attrition', 'City']
df.groupby(group_keys)['Emp_ID'].count().unstack()

In [None]:
# Joint distribution of Age and Salary, coloured by the Attrition label.
sns.jointplot(
    x='Age',
    y='Salary',
    hue='Attrition',
    data=df,
    height=10,
)

In [None]:
# Scatter of Salary vs Age in a grid: one column per Attrition value, one row per join year.
# FacetGrid is figure-level and creates its own figure, so a preceding
# plt.figure() call would only leave an extra empty figure in the output.
fg = sns.FacetGrid(df, col="Attrition", row='Year_of_join')
fg.map(sns.scatterplot, "Salary", "Age")

In [None]:
# Scatter of Salary vs Age in a grid: one column per Quarterly Rating, one row per Attrition value.
# FacetGrid is figure-level and creates its own figure, so a preceding
# plt.figure() call would only leave an extra empty figure in the output.
fg = sns.FacetGrid(df, col="Quarterly Rating", row='Attrition')
fg.map(sns.scatterplot, "Salary", "Age")

In [None]:
# Tenure in whole calendar years: leave year minus join year.
# Rows without a leave year get NaN.
df['Length_of_work'] = df['Year_of_leave'].sub(df['Year_of_join'])

In [None]:
# Joint distribution of Age and Salary, coloured by Length_of_work.
sns.jointplot(
    x='Age',
    y='Salary',
    hue='Length_of_work',
    data=df,
    height=10,
)

In [None]:
# Scatter of Salary vs Age in a grid: one column per Length_of_work value, one row per Attrition value.
# FacetGrid is figure-level and creates its own figure, so a preceding
# plt.figure() call would only leave an extra empty figure in the output.
fg = sns.FacetGrid(df, col="Length_of_work", row='Attrition')
fg.map(sns.scatterplot, "Salary", "Age")

In [None]:
# Scatter of Salary vs Age in a grid: one column per Length_of_work value, one row per Gender.
# FacetGrid is figure-level and creates its own figure, so a preceding
# plt.figure() call would only leave an extra empty figure in the output.
fg = sns.FacetGrid(df, col="Length_of_work", row='Gender')
fg.map(sns.scatterplot, "Salary", "Age")

In [None]:
# Joint distribution of Age and Total Business Value, coloured by Attrition.
sns.jointplot(
    x='Age',
    y='Total Business Value',
    hue='Attrition',
    data=df,
    height=10,
)

In [None]:
# Joint distribution of Age and Total Business Value, coloured by Gender.
sns.jointplot(
    x='Age',
    y='Total Business Value',
    hue='Gender',
    data=df,
    height=10,
)

In [None]:
# Re-check dtypes and non-null counts after feature engineering, before modelling.
df.info()

In [None]:
# Look at one row to decide which columns need encoding or dropping.
df.head(1)

In [None]:
# One-hot encode the categorical columns.
# drop_first=True is applied to all three so every encoding omits one reference
# level, which avoids perfectly collinear dummy columns in the linear model.
# The prefixes keep the City / Education_Level column names unambiguous strings.
sex = pd.get_dummies(df['Gender'], drop_first=True)
city = pd.get_dummies(df['City'], prefix='City', drop_first=True)
edu = pd.get_dummies(df['Education_Level'], prefix='Education', drop_first=True)

In [None]:
# Work on an independent copy so the exploration DataFrame stays untouched.
train = df.copy(deep=True)

In [None]:
# Confirm the copy has the expected columns.
train.head(1)

In [None]:
# Remove columns that are not fed to the models as-is: the period/identifier
# columns, the raw categoricals, the designation columns, and the leave-derived
# columns.
unused_columns = [
    'MMM-YY',
    'Emp_ID',
    'Gender',
    'City',
    'Education_Level',
    'Joining Designation',
    'Designation',
    'Year_of_leave',
    'Month_of_leave',
    'Length_of_work',
]
train = train.drop(columns=unused_columns)

In [None]:
# Append the one-hot encoded frames to the feature table, column-wise.
encoded_frames = [sex, city, edu]
train = pd.concat([train, *encoded_frames], axis='columns')

In [None]:
# Separate the target (Attrition) from the feature matrix.
y = train['Attrition']
x = train.drop(columns='Attrition')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out half of the rows for evaluation. The fixed random_state makes the
# split, and every metric computed from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier using the liblinear solver.
logreg = LogisticRegression(solver='liblinear')

In [None]:
# Fit the logistic regression on the training split.
logreg.fit(x_train,y_train)

In [None]:
# Predict Attrition labels for the held-out split.
logreg_pred = logreg.predict(x_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Report held-out performance of the logistic regression: text metrics first,
# then the confusion matrix as an annotated heatmap.
rule = '-' * 20
header = f"{rule} LOGISTIC REGRESSION {rule}"
print(header)
print('Classification report:')
print(classification_report(y_test, logreg_pred))
print('-' * 61)
logreg_cm = confusion_matrix(y_test, logreg_pred)
plt.title('Confusion Matrix')
sns.heatmap(
    logreg_cm,
    annot=True,
    fmt='g',
    cmap='cubehelix',
    cbar=False,
    xticklabels=False,
    yticklabels=False,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision-tree classifier with default hyper-parameters; the fixed
# random_state makes the fitted tree reproducible across runs.
dtree = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit on the TRAINING split. Fitting on x_test/y_test would leak the evaluation
# data into the model and make the reported test metrics meaningless.
dtree.fit(x_train, y_train)

In [None]:
# Predict Attrition labels for the held-out split with the decision tree.
dtree_pred = dtree.predict(x_test)

In [None]:
# Report held-out performance of the decision tree: text metrics first,
# then the confusion matrix as an annotated heatmap.
rule = '-' * 20
header = f"{rule} DECISION TREE {rule}"
print(header)
print('Classification report:')
print(classification_report(y_test, dtree_pred))
print('-' * 61)
dtree_cm = confusion_matrix(y_test, dtree_pred)
plt.title('Confusion Matrix')
sns.heatmap(
    dtree_cm,
    annot=True,
    fmt='g',
    cmap='cubehelix',
    cbar=False,
    xticklabels=False,
    yticklabels=False,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 100 trees; the fixed random_state makes the bootstrap
# sampling and feature selection reproducible across runs.
rndfrst = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit the random forest on the training split.
rndfrst.fit(x_train,y_train)

In [None]:
# Predict Attrition labels for the held-out split with the random forest.
rndfrst_pred = rndfrst.predict(x_test)

In [None]:
# Report held-out performance of the random forest: text metrics first,
# then the confusion matrix as an annotated heatmap.
rule = '-' * 20
header = f"{rule} RANDOM FOREST {rule}"
print(header)
print('Classification report:')
print(classification_report(y_test, rndfrst_pred))
print('-' * 61)
rndfrst_cm = confusion_matrix(y_test, rndfrst_pred)
plt.title('Confusion Matrix')
sns.heatmap(
    rndfrst_cm,
    annot=True,
    fmt='g',
    cmap='cubehelix',
    cbar=False,
    xticklabels=False,
    yticklabels=False,
)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyper-parameters for the SVM grid search (4 x 3 = 12 combinations).
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': [1, 0.01, 0.001],
}

In [None]:
# Default-configured support vector classifier.
# NOTE(review): `svm` is not referenced by any later visible cell (the grid
# search builds its own SVC()) — confirm whether it is still needed.
svm = SVC()

In [None]:
# Cross-validated search over param_grid for an SVC; verbose=3 prints
# per-fit progress.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    verbose=3,
)

In [None]:
# Run the grid search on the training split only.
grid.fit(x_train,y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Build the final SVM from the hyper-parameters the grid search selected rather
# than hard-coding them, so the model stays in sync with `grid` if the data or
# the parameter grid changes.
final_svm = SVC(**grid.best_params_)

In [None]:
# Fit the final SVM on the training split.
final_svm.fit(x_train,y_train)

In [None]:
# Predict Attrition labels for the held-out split with the final SVM.
svm_pred = final_svm.predict(x_test)

In [None]:
# Report held-out performance of the SVM: text metrics first, then the
# confusion matrix as an annotated heatmap.
# The header is labelled 'SVM' because these are the SVC predictions, not KNN,
# and the closing rule is sized to match the header line.
viz_str = '-' * 20
header = f"{viz_str} SVM {viz_str}"
print(header)
print('Classification report:')
print(classification_report(y_test, svm_pred))
print('-' * len(header))
plt.title('Confusion Matrix')
sns.heatmap(
    confusion_matrix(y_test, svm_pred),
    annot=True,
    fmt='g',
    cmap='cubehelix',
    cbar=False,
    yticklabels=False,
    xticklabels=False,
)