In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
dataset = pd.read_csv("finattrition.csv")
dataset.drop('Unnamed: 0',axis=1,inplace=True)

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Age                           1470 non-null   int64 
 1   Attrition                     1470 non-null   object
 2   BusinessTravelFrequency       1470 non-null   object
 3   DistanceFromHome              1470 non-null   int64 
 4   EmpEducationLevel             1470 non-null   int64 
 5   EmpID                         1470 non-null   int64 
 6   Gender                        1470 non-null   object
 7   EmpJobInvolvement             1470 non-null   int64 
 8   EmpJobLevel                   1470 non-null   int64 
 9   JobSatisfaction               1470 non-null   int64 
 10  MaritalStatus                 1470 non-null   object
 11  MonthlyIncome                 1470 non-null   int64 
 12  NumCompaniesWorked            1470 non-null   int64 
 13  OverTime          

In [4]:
label_encoder = LabelEncoder()
categorical_columns = ['BusinessTravelFrequency', 'Gender', 'MaritalStatus', 'OverTime']
for column in categorical_columns:
    dataset[column] = label_encoder.fit_transform(dataset[column])

In [5]:
y = dataset.iloc[:, 1]
X = dataset.drop(['Attrition','EmpID'], axis = 1)

In [6]:
lb = LabelEncoder()
y = lb.fit_transform(y)

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Age                           1470 non-null   int64 
 1   Attrition                     1470 non-null   object
 2   BusinessTravelFrequency       1470 non-null   int32 
 3   DistanceFromHome              1470 non-null   int64 
 4   EmpEducationLevel             1470 non-null   int64 
 5   EmpID                         1470 non-null   int64 
 6   Gender                        1470 non-null   int32 
 7   EmpJobInvolvement             1470 non-null   int64 
 8   EmpJobLevel                   1470 non-null   int64 
 9   JobSatisfaction               1470 non-null   int64 
 10  MaritalStatus                 1470 non-null   int32 
 11  MonthlyIncome                 1470 non-null   int64 
 12  NumCompaniesWorked            1470 non-null   int64 
 13  OverTime          

In [8]:
X_train, X_test, y_train, y_test, names_train, names_test = train_test_split(
    X, y, dataset['EmpID'], test_size=0.2, random_state=42
)

In [9]:
model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
accuracy1 = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy1:.2f}')
print("Classification Report:\n", classification_rep)

Accuracy: 0.87
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.99      0.93       255
           1       0.57      0.10      0.17        39

    accuracy                           0.87       294
   macro avg       0.72      0.55      0.55       294
weighted avg       0.84      0.87      0.83       294



from sklearn.linear_model import LogisticRegression
model1=LogisticRegression()
y_pred=model1.predict(X_test)
accuracy2 = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy2:.2f}")
print("Classification Report:\n", classification_rep)

import matplotlib.pyplot as plt
models = ['Random Forest','Logistic Regression']
accuracies=[accuracy1,accuracy2]
plt.bar(models, accuracies, color=['blue', 'green'])
plt.ylabel('Accuracy')
plt.title('Accuracy Comparison: Logistic Regression vs. Random Forest')
plt.show()

from sklearn.metrics import roc_curve, auc

logistic_model = LogisticRegression()
logistic_model.fit(X_train,y_train)
rf_model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_logistic = logistic_model.predict_proba(X_test)[:, 1]
y_pred_rf = rf_model.predict_proba(X_test)[:, 1]

# Compute ROC curve and AUC
fpr_logistic, tpr_logistic, _ = roc_curve(y_test, y_pred_logistic)
roc_auc_logistic = auc(fpr_logistic, tpr_logistic)

fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# Plot ROC curves
plt.figure(figsize=(8, 6))
plt.plot(fpr_logistic, tpr_logistic, color='blue', lw=2, label=f'Logistic Regression (AUC = {roc_auc_logistic:.2f})')
plt.plot(fpr_rf, tpr_rf, color='green', lw=2, label=f'Random Forest (AUC = {roc_auc_rf:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', label='Random Guess')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison: Logistic Regression vs. Random Forest')
plt.legend(loc='lower right')
plt.show()

In [11]:
predictions_df = pd.DataFrame({
    'EmpID': names_test,
    'Attrition_Prediction': y_pred
})

In [12]:
attrition_predictions = predictions_df[predictions_df['Attrition_Prediction'] == 1]['EmpID']

print(attrition_predictions)

363     485
892    1248
777    1079
23       30
416     556
422     566
921    1286
Name: EmpID, dtype: int64


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

classifiers = {
    "Random Forest":RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, random_state=42),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(probability=True),
    "Decision Tree": DecisionTreeClassifier()
}

# Train and evaluate each classifier
results = {"Algorithm": [], "Accuracy": [], "ROC AUC": []}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    results["Algorithm"].append(name)
    results["Accuracy"].append(accuracy)
    results["ROC AUC"].append(roc_auc)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
results_df

Unnamed: 0,Algorithm,Accuracy,ROC AUC
0,Random Forest,0.870748,0.717044
1,Logistic Regression,0.853741,0.702363
2,SVM,0.867347,0.470588
3,Decision Tree,0.782313,0.581297


In [15]:
import pickle
file=open('attrition_model.pkl','wb')
pickle.dump(model,file)