In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by pandas and
# seaborn calls further down — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Read the employee dataset into a DataFrame and preview its first rows.
csv_path = '/content/Employee.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Column names, non-null counts and dtypes of the frame.
data.info()

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Count of missing values per column, before any imputation.
data.isna().sum()

In [None]:
# Fill missing Work-Life Balance values with the column mode.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which does not update `data` when
# pandas Copy-on-Write is active.
data['Work-Life Balance'] = data['Work-Life Balance'].fillna(data['Work-Life Balance'].mode()[0])

In [None]:
# Fill missing Performance Rating values with the column mode.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which does not update `data` when
# pandas Copy-on-Write is active.
data['Performance Rating'] = data['Performance Rating'].fillna(data['Performance Rating'].mode()[0])

In [None]:
# Fill missing Employee Recognition values with the column mode.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which does not update `data` when
# pandas Copy-on-Write is active.
data['Employee Recognition'] = data['Employee Recognition'].fillna(data['Employee Recognition'].mode()[0])

In [None]:
# Fill Number of Dependents, Company Size, Remote Work and Job Level with their mode.
# Each filled column is assigned back explicitly: fillna(inplace=True) on a
# column selection is chained assignment and does not update `data` when
# pandas Copy-on-Write is active.
for column in ['Number of Dependents', 'Company Size', 'Remote Work', 'Job Level']:
    data[column] = data[column].fillna(data[column].mode()[0])



In [None]:
# Distribution of Number of Promotions: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot with stat='density' and kde=True
# (cut=3 for the KDE) is the replacement that reproduces its default output.
sns.histplot(data['Number of Promotions'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Impute the remaining gaps: Number of Promotions with its mean, and the two
# categorical columns with their mode.
# Results are assigned back to the columns: fillna(inplace=True) on a column
# selection is chained assignment and does not update `data` when pandas
# Copy-on-Write is active.
data['Number of Promotions'] = data['Number of Promotions'].fillna(data['Number of Promotions'].mean())
for column in ['Marital Status', 'Education Level']:
    data[column] = data[column].fillna(data[column].mode()[0])

In [None]:
# Re-count missing values per column after the imputation cells above.
data.isna().sum()

*Data Visualizations*

In [None]:
# Frequency of each Attrition value.
fig, ax = plt.subplots()
ax.hist(data['Attrition'])
ax.set_xlabel('Attrition')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Attrition')
plt.show()

In [None]:
# List the column labels of the frame.
print(data.columns)

In [None]:
# Histogram for understanding the distribution of a numerical feature (Monthly Income).
# The former `plt.width=0.2` line was removed: it only set an attribute on the
# pyplot module and never influenced the bars.
plt.hist(data['Monthly Income'])
plt.xlabel('Monthly Income')
plt.ylabel('Frequency')
plt.title('Histogram of Monthly Income')
plt.show()

In [None]:
# Share of each Attrition outcome within every Work-Life Balance category.
# sns.barplot aggregates a numeric variable, but Attrition is only label-encoded
# in a later cell (presumably it still holds text labels here — confirm), so the
# per-category proportions are computed explicitly and drawn as stacked bars.
attrition_share = pd.crosstab(data['Work-Life Balance'], data['Attrition'], normalize='index')

fig, ax = plt.subplots(figsize=(8, 6))
attrition_share.plot(kind='bar', stacked=True, ax=ax)
ax.set_xlabel('Work-Life Balance')
ax.set_ylabel('Proportion of Employees')
ax.set_title('Bar Plot of Work-Life Balance vs. Attrition')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Pie chart showing how employees are distributed across job roles.
fig, ax = plt.subplots(figsize=(8, 6))
role_counts = data['Job Role'].value_counts()
role_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Distribution of Job Role')
ax.set_ylabel('')  # hide the default series-name label next to the pie
plt.show()

In [None]:
# Scatter plot of two numerical features: Age against Monthly Income.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Age', y='Monthly Income', data=data)
plt.xlabel('Age')
plt.ylabel('Monthly Income')
plt.title('Scatter Plot of Age vs. Monthly Income')
# Render explicitly, like every other plotting cell, instead of leaving the
# Text object returned by plt.title as the cell output.
plt.show()

In [None]:
# Missing-value count per column (repeat of the earlier check, after the plotting cells).
data.isna().sum()

In [None]:
# Employee counts per job role, split by Attrition value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, x='Job Role', hue='Attrition', ax=ax)
ax.set_title('Attrition by Job Role')
plt.xticks(rotation=45)
plt.show()

In [None]:
#encoding
from sklearn.preprocessing import LabelEncoder

# Columns replaced by integer codes, listed in the order they are encoded.
categorical_columns = [
    'Attrition',
    'Marital Status',
    'Education Level',
    'Job Role',
    'Work-Life Balance',
    'Company Size',
    'Remote Work',
    'Gender',
    'Job Level',
    'Company Reputation',
    'Employee Recognition',
    'Performance Rating',
    'Job Satisfaction',
    'Overtime',
]

le = LabelEncoder()
for column_name in categorical_columns:
    # The single encoder is refit on every column, so after the loop it only
    # remembers the classes of the last column ('Overtime').
    data[column_name] = le.fit_transform(data[column_name])



In [None]:
# Remove the two opportunity columns; they were not encoded above.
opportunity_columns = ['Leadership Opportunities', 'Innovation Opportunities']
data = data.drop(columns=opportunity_columns)

In [None]:
# dtype of every remaining column after encoding and the column drop.
data.dtypes

In [None]:
# Missing-value count per column after encoding.
data.isna().sum()

In [None]:
# Relative frequency of each Attrition class (normalize=True returns fractions
# instead of counts); Attrition holds the label-encoded codes at this point.
attrition_rate = data['Attrition'].value_counts(normalize=True)
print(attrition_rate)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(20, 8))
correlations = data.corr()
sns.heatmap(correlations, annot=True, cmap='flare', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Box plot of every column to spot outliers.
fig, ax = plt.subplots(figsize=(8, 5))
data.boxplot(ax=ax)
plt.xticks(rotation=90)
ax.set_title('Boxplot for Numerical Features')
plt.show()

In [None]:
# Cap Monthly Income outliers at the 1.5 * IQR fences.
Q1, Q3 = data['Monthly Income'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Values outside [lower_bound, upper_bound] are replaced by the nearest bound.
data['Monthly Income'] = data['Monthly Income'].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Box plot of every column again, after clipping Monthly Income.
fig, ax = plt.subplots(figsize=(15, 10))
data.boxplot(ax=ax)
plt.xticks(rotation=90)
ax.set_title('Boxplot for Numerical Features')
plt.show()

In [None]:
# Columns excluded from modelling.
columns_to_drop = [
    'Employee ID',
    'Company Tenure',
    'Company Reputation',
    'Distance from Home',
    'Number of Dependents',
]
data = data.drop(columns=columns_to_drop)

In [None]:
# Preview the first rows of the frame that goes into model training.
data.head()

***Model Training***

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is Attrition; every other remaining column is a feature.
y = data['Attrition']
X = data.drop('Attrition', axis=1)

# Hold out 30% of the rows for testing, keeping the class ratio of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Standardise the features: the scaler learns its statistics from the training
# split only and the same transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Random forest baseline: fit on the training split, then predict the test split.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# Report accuracy, confusion matrix and per-class metrics for the current predictions.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
for heading, report in (
    ("\nConfusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
):
    print(heading)
    print(report)


In [None]:
# Bar chart of the fitted model's feature importances, most important first.
# `importances` and `indices` were never defined in the notebook, so this cell
# raised NameError on a fresh run; they are derived from the fitted model here.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions sorted by descending importance

plt.figure(figsize=(8, 5))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices], align="center")
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
plt.show()

In [None]:
# Drop one feature from every pair whose absolute correlation exceeds 0.9.
# The target column is left out of the correlation matrix so that it can never
# be selected for removal.
# NOTE(review): X_train / X_test were built before this cell, so they must be
# rebuilt from `data` for this pruning to affect the models trained below.
corr_matrix = data.drop(columns=['Attrition']).corr().abs()
# Keep only the strict upper triangle so each pair is inspected once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
data = data.drop(columns=to_drop)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting classifier: fit on the training split, then predict the test split.
# Note that `model` and `y_pred` now refer to this classifier, not the random forest.
model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Report accuracy, confusion matrix and per-class metrics for the current predictions.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
for heading, report in (
    ("\nConfusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
):
    print(heading)
    print(report)

***Hyperparameter Tuning using GridSearchCV***

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter values to search; the full grid has
# 4 * 4 * 4 * 3 * 3 * 3 = 1728 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 4, 5, 6],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=[0.6, 0.8, 1.0],
)


In [None]:

# Exhaustive grid search over param_grid with 3-fold cross-validation, scored by accuracy.
# warm_start is left at its default: GridSearchCV clones the estimator for every
# candidate and fold, so no earlier fit is ever continued and the flag had no effect.
gbc = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameter combination and the estimator refit with it on the whole training split
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)


Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


In [None]:
  y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
