In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df = pd.read_csv('data')
df

In [None]:
df['EmployeeID'].nunique(), df['MaritalStatus'].value_counts(), df['Over18'].value_counts()

In [None]:
df = pd.read_csv('data', index_col='EmployeeID')
df

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
df.hist(grid=False, bins=20, figsize=(15, 10));

In [None]:
df_copy = df.copy()

In [None]:
df_copy['Attrition'] = df_copy['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0)
df_copy['Gender'] = df_copy['Gender'].apply(lambda x: 1 if x == 'Female' else 0)

marital_status = {'Married':0, 'Single':1, 'Divorced':2}
df_copy['MaritalStatus'] = df_copy['MaritalStatus'].apply(lambda x: marital_status[x])

df_copy

In [None]:
df_copy = df_copy.drop(['EmployeeCount', 'StandardHours', 'Over18'], axis=1)
df_copy

In [None]:
round(df_copy['Attrition'].value_counts()[1]/len(df_copy), 2)*100

In [None]:
df_copy[df_copy['Attrition']==0].describe().T

In [None]:
df_copy[df_copy['Attrition']==1].describe().T

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_copy.corr(numeric_only=True), annot = True)

In [None]:
plt.figure(figsize=[20,10])
plt.subplot(2,2,1)
plt.xticks(rotation=30, fontsize=8)
sns.countplot(x = 'JobRole', hue = 'Attrition', data = df_copy)
plt.subplot(2,2,2)
sns.countplot(x = 'MaritalStatus', hue = 'Attrition', data = df_copy)
plt.subplot(2,2,3)
sns.countplot(x = 'Age', hue = 'Attrition', data = df_copy)
plt.subplot(2,2,4)
sns.countplot(x = 'JobLevel', hue = 'Attrition', data = df_copy)
plt.tight_layout()

In [None]:
sns.kdeplot(x='MonthlyIncome', hue = 'Attrition', data=df_copy, fill=True)

In [None]:
sns.kdeplot(x='DistanceFromHome', hue = 'Attrition', data=df_copy, fill=True)

In [None]:
X = df_copy.drop('Attrition', axis=1)
y = df_copy['Attrition']

In [None]:
num_cols = [col for col in df_copy.columns if pd.api.types.is_numeric_dtype(df_copy[col]) and col != 'Attrition']
cat_cols = [col for col in df_copy.columns if col not in num_cols and col != 'Attrition']

In [None]:
len(cat_cols), len(num_cols), len(df_copy.columns)

In [None]:
df_copy[cat_cols].nunique()

In [None]:
df_copy['JobRole'].value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y)

In [None]:
imputer = SimpleImputer(strategy="mean")
scaler = MinMaxScaler()
onehot = OneHotEncoder()

num_pipe = Pipeline(steps=[('miss', imputer), ('scale', scaler)])
preprocessor = ColumnTransformer([('numeric', num_pipe, num_cols), ('categorical', onehot, cat_cols)])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# X_train_name = pd.DataFrame(X_train, columns=preprocessor.get_feature_names_out())
# X_test_name = pd.DataFrame(X_test, columns=preprocessor.get_feature_names_out())

In [None]:
model = RandomForestClassifier(random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot()

In [None]:
# Chain preprocessing and the classifier into one estimator, so cross-validation
# refits both steps on the training part of every fold.
preprocessor_cv = Pipeline([('pre', preprocessor), ('model', model)])
# Mean score over the default CV splits, evaluated on the full X / y.
cross_val_score(estimator=preprocessor_cv, X=X, y=y).mean()

In [None]:
import pickle

# Persist the fitted classifier. The context manager closes the file handle (and
# flushes the write) even if pickling raises; a bare open() was never closed.
# NOTE(review): "data" is the same path literal the CSV is read from at the top of
# the notebook, so this write would replace the dataset — confirm the output path.
# NOTE(review): only `model` is saved; it expects input that has already been passed
# through `preprocessor`, which is not persisted here.
with open("data", "wb") as model_file:
    pickle.dump(model, model_file)