<a href="https://colab.research.google.com/github/anenaanilkumar/ML-Project/blob/main/project_staff.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import uniform, randint

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Model Selection & Resampling
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Evaluation Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Location of the raw staff-satisfaction CSV (Colab upload directory)
DATA_PATH = '/content/Staff Satisfaction.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
df

#**INSPECTION**

In [None]:
df.head(10)

In [None]:
df.tail()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.size

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

# **HANDLING NULL VALUES & CATEGORICAL DATA**

In [None]:
# Missing entries in the promotion column are filled with the string "No"
df["Promoted in the last 5 years?"] = df["Promoted in the last 5 years?"].fillna("No")

In [None]:
# One dedicated encoder per categorical column, kept in its own variable so the
# fitted mapping of each column stays available afterwards
le_salary, le_department, le_promotion, le_accident = (LabelEncoder() for _unused in range(4))

# Replace every categorical column with its integer codes
for column_name, encoder in (
    ("salary", le_salary),
    ("Department", le_department),
    ("Promoted in the last 5 years?", le_promotion),
    ("Work Accident", le_accident),
):
    df[column_name] = encoder.fit_transform(df[column_name])

In [None]:
# Parse "Date Hired" into pandas datetimes so the year can be extracted
df["Date Hired"] = pd.to_datetime(df["Date Hired"])

# Derive tenure as (current calendar year - hire year).
# NOTE(review): `current_year` comes from the wall clock, so this feature (and every
# model trained on it) changes when the notebook is re-run in a later year; consider
# pinning a fixed reference year.
current_year = pd.Timestamp.now().year
df["Years Since Hired"] = current_year - df["Date Hired"].dt.year

# The raw date is no longer needed once the numeric tenure feature exists
df.drop(columns=["Date Hired"], inplace=True)

In [None]:
df.head()

In [None]:
# Separate encoder for the target so its class mapping can be read back via le_left.classes_
le_left = LabelEncoder()

# Replace the "Left" column with integer class codes
df["Left"] = le_left.fit_transform(df["Left"])

In [None]:
# Correlation of every numeric column with the target column 'Left'
correlation = df.corr(numeric_only=True)['Left']

# Print the correlation values
print(correlation)

In [None]:
df = df.drop(columns=["salary", "Department", "Number of Projects", "Last Evaluation"])

# **VISUALIZATIONS**

**How Many Employees Left?**

 What It Shows:

A countplot to visualize how many employees stayed (0) vs. left (1).

Uses green for employees who stayed and red for those who left.

Helps understand the proportion of employee turnover in the dataset

In [None]:
# Bar colours in category order: green for stayed, red for left
custom_colors = ["#4CAF50", "#FF5733"]

# Draw on the axes object returned by seaborn instead of the pyplot state machine
turnover_ax = sns.countplot(x=df["Left"], palette=custom_colors)
turnover_ax.set_title("Employee Turnover Count (0 = Stayed, 1 = Left)")
turnover_ax.set_xlabel("Left")
turnover_ax.set_ylabel("Count")
plt.show()


**Work Accidents & Employee Turnover**

What It Shows:

A countplot grouped by whether an employee had a work accident (0 = No, 1 = Yes).

The hue="Left" groups the data by employees who stayed vs. left.

Helps determine if work accidents have an effect on employee attrition.


In [None]:
# Accident counts split by the target, drawn through the returned axes object
accident_ax = sns.countplot(x=df["Work Accident"], hue=df["Left"], palette="coolwarm")
accident_ax.set_title("Impact of Work Accidents on Employee Turnover")
accident_ax.set_xlabel("Work Accident (0 = No, 1 = Yes)")
accident_ax.set_ylabel("Count")
accident_ax.legend(title="Left (0 = Stayed, 1 = Left)")
plt.show()

**Average Monthly Hours Distribution**

What It Shows:

A histogram to visualize the distribution of employees' average monthly working hours.

The KDE (Kernel Density Estimate) curve shows the density of working hours.

Helps understand if there are any patterns, like overtime or underworking.

In [None]:
# 30-bin histogram with a KDE overlay of the monthly-hours column
hours_ax = sns.histplot(df["Average Monthly Hours"], bins=30, kde=True, color="blue")
hours_ax.set_title("Distribution of Average Monthly Hours")
hours_ax.set_xlabel("Average Monthly Hours")
hours_ax.set_ylabel("Count")
plt.show()

# **OUTLIERS**

In [None]:
# Columns to cap with the IQR rule.
# "Work Accident" is intentionally NOT listed: it was label-encoded into integer class
# codes above, and for such a flag the IQR is 0 whenever one class covers more than 75%
# of the rows. The clip below would then force every value to a single constant and
# wipe the feature out.
num_cols = ["Satisfaction Level", "Average Monthly Hours", "Years in the Company", "Years Since Hired"]

# Compute Q1 (25th percentile) and Q3 (75th percentile) per column
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1  # Interquartile range

# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean frame marking the outlying cells
outliers = ((df[num_cols] < lower_bound) | (df[num_cols] > upper_bound))

# Print the number of outliers in each column
print("Number of outliers in each column:\n", outliers.sum())

# Cap (winsorize) instead of dropping rows, so the dataset keeps its size;
# axis=1 aligns the per-column bounds with the frame's columns
df[num_cols] = df[num_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)


In [None]:
# Columns to cap at their 5th / 95th percentiles.
# "Work Accident" is left out on purpose: it holds label-encoded class codes, and if
# fewer than 5% of the rows carry the positive code its 95th percentile equals the
# negative code, so clipping would silently erase every positive value.
num_cols = ["Satisfaction Level", "Average Monthly Hours", "Years in the Company", "Years Since Hired"]

# Compute the lower (5th percentile) and upper (95th percentile) bounds per column
lower_bound = df[num_cols].quantile(0.05)
upper_bound = df[num_cols].quantile(0.95)

# Apply clipping to cap values within the range (axis=1 aligns bounds with columns)
df[num_cols] = df[num_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)

# Print dataset shape to confirm no rows were removed
print("After capping, dataset shape:", df.shape)

In [None]:
# White-grid seaborn theme for this figure and the ones that follow
sns.set_style("whitegrid")

# Columns whose distributions are inspected after capping
num_cols = [
    "Satisfaction Level",
    "Work Accident",
    "Average Monthly Hours",
    "Years in the Company",
    "Years Since Hired",
]

# One histogram + KDE per column on a 2 x 3 grid (the sixth slot stays unused)
plt.figure(figsize=(12, 8))
for position, col in enumerate(num_cols, start=1):
    panel = plt.subplot(2, 3, position)
    sns.histplot(df[col], bins=30, kde=True, color="blue", ax=panel)
    panel.set_title(f"Distribution of {col}")

plt.tight_layout()
plt.show()

# **TRAIN - TEST SPLIT**

In [None]:
# Target: the encoded "Left" column (employee attrition)
y = df["Left"]
# Features: every remaining column
x = df.drop(columns=["Left"])

In [None]:
# Fit the scaler on the training rows only, so no test-set statistics leak into the
# features. The throwaway split below uses the same test_size / random_state / stratify
# arguments as the real train_test_split call further down; the partition depends only
# on those arguments and on y, so it selects exactly the rows that become xtrain there.
x_fit = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)[0]

sd = StandardScaler()
sd.fit(x_fit)

# Transform every row (train and test alike) with the train-only mean / std.
# After this line x is a NumPy array, as before.
x = sd.transform(x)

In [None]:
y.value_counts()

In [None]:
# 80/20 hold-out split; stratify=y keeps the class ratio of `Left` equal in both parts,
# and the fixed random_state makes the partition repeatable
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state =42,stratify=y)

In [None]:
ytrain.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the TRAINING split only (the test split stays untouched).
# Seeded so the synthetic samples, and every "after SMOTE" score computed from them, are
# reproducible and match the seeded SMOTE call used later in the decision-tree section.
sm = SMOTE(random_state=42)
xres, yres = sm.fit_resample(xtrain, ytrain)

In [None]:
yres.value_counts()

# **MODEL BUILDING**



---



# **KNN**

---



**BEFORE SMOTE**

In [None]:
# Baseline KNN with default hyperparameters, trained on the original training split
knn = KNeighborsClassifier().fit(xtrain, ytrain)
ypred = knn.predict(xtest)

# Hold-out metrics: accuracy, confusion matrix, per-class report
acc = accuracy_score(ytest, ypred)
cn = confusion_matrix(ytest, ypred)
cr = classification_report(ytest, ypred)
print(acc, cn, cr, sep="\n")


In [None]:
training_score=knn.score(xtrain,ytrain)
print(training_score)

In [None]:
testing_score=knn.score(xtest,ytest)
print(testing_score)

**AFTER SMOTE**

In [None]:
# Same default KNN, this time trained on the SMOTE-resampled training data
knnsm = KNeighborsClassifier().fit(xres, yres)
ypredsm = knnsm.predict(xtest)

# Evaluate on the untouched test split
acc = accuracy_score(ytest, ypredsm)
cn = confusion_matrix(ytest, ypredsm)
cr = classification_report(ytest, ypredsm)
print(acc, cn, cr, sep="\n")

In [None]:
training_score=knnsm.score(xres,yres)
print(training_score)

In [None]:
testing_score=knnsm.score(xtest,ytest)
print(testing_score)

**HYPER TUNING**

In [None]:
# Show only the tunable hyperparameters and their current values
# (help(knn) dumped the entire class documentation into the output)
knn.get_params()

In [None]:
# Estimator whose hyperparameters are searched
knn1 = KNeighborsClassifier()

# Search space: odd k from 1 to 29, three distance metrics, two weighting schemes
param_dist = dict(
    n_neighbors=range(1, 30, 2),
    metric=['euclidean', 'manhattan', 'minkowski'],
    weights=['uniform', 'distance'],
)

# 10 random candidates, each scored by 5-fold CV accuracy on the training split
random_search = RandomizedSearchCV(
    knn1,
    param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(xtrain, ytrain)

# Report the winning combination and the refitted model's test accuracy
print("Best Hyperparameters:", random_search.best_params_)
print(random_search.score(xtest,ytest))


**Best Hyperparameters**





In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize KNN classifier with best hyperparameters
knn_hy = KNeighborsClassifier(metric='manhattan', n_neighbors=9, weights='distance')

In [None]:
# Fit the tuned KNN on the SMOTE-resampled training data, then predict the test split
ypred_hy = knn_hy.fit(xres, yres).predict(xtest)

# Test accuracy
acc_hy = accuracy_score(ytest, ypred_hy)
print(acc_hy)

# Per-class precision / recall / F1
cr_hy = classification_report(ytest, ypred_hy)
print(cr_hy)

In [None]:
# knn_hy was fitted on the SMOTE-resampled data (xres, yres), so its training score is
# measured on that same set, as for every other "after SMOTE" model in this notebook
training_score = knn_hy.score(xres, yres)
print(training_score)

In [None]:
testing_score=knn_hy.score(xtest,ytest)
print(testing_score)



---

# **RANDOM FOREST**  


---



**BEFORE SMOTE**

In [None]:
# 100-tree random forest trained on the original (non-resampled) training split
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(xtrain, ytrain)
ypred_rand = rf.predict(xtest)

# Test-set metrics
acc_rand = accuracy_score(ytest, ypred_rand)
cn_rand = confusion_matrix(ytest, ypred_rand)
cr_rand = classification_report(ytest, ypred_rand)
print(acc_rand, cn_rand, cr_rand, sep="\n")


In [None]:
training_score=rf.score(xtrain,ytrain)
print(training_score)

In [None]:
testing_score=rf.score(xtest,ytest)
print(testing_score)

**AFTER SMOTE**

In [None]:
# Identical forest configuration, trained on the SMOTE-resampled training data
rfa = RandomForestClassifier(n_estimators=100, random_state=42).fit(xres, yres)
ypred_r = rfa.predict(xtest)

# Test-set metrics
acc_rand1 = accuracy_score(ytest, ypred_r)
cn_rand1 = confusion_matrix(ytest, ypred_r)
cr_rand1 = classification_report(ytest, ypred_r)
print(acc_rand1, cn_rand1, cr_rand1, sep="\n")

In [None]:
training_score=rfa.score(xres,yres)
print(training_score)

In [None]:
testing_score=rfa.score(xtest,ytest)
print(testing_score)



---



# **RANDOM FOREST: RANDOMIZED SEARCH CV**

---



**BEFORE SMOTING**

In [None]:
# Forest whose hyperparameters are searched (seeded for repeatability)
rf1 = RandomForestClassifier(random_state=42)

# Discrete search space for the forest
param_dist = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
    bootstrap=[True, False],
)

# 10 random candidates, 3-fold CV accuracy, all cores, on the original training split
random_s = RandomizedSearchCV(
    rf1,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_s.fit(xtrain, ytrain)

# Winning combination and its mean CV accuracy
print("Best Hyperparameters:", random_s.best_params_)
print("Best Score:", random_s.best_score_)

In [None]:
# Random forest configured with hand-copied hyperparameters.
# NOTE(review): these values are typed in, not read from random_s.best_params_;
# confirm they still match the search output after any re-run.
rd_hy = RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=2,max_features='log2', max_depth=20, bootstrap=False, random_state=42)


In [None]:
# Fit the tuned forest on the original training split and predict the test split
ypred_rd = rd_hy.fit(xtrain, ytrain).predict(xtest)

# Accuracy, per-class report, confusion matrix (printed in that order)
acc_hy = accuracy_score(ytest, ypred_rd)
cr_rd = classification_report(ytest, ypred_rd)
cn_rd = confusion_matrix(ytest, ypred_rd)
print(acc_hy, cr_rd, cn_rd, sep="\n")

In [None]:
# Training and Testing Scores
training_score = rd_hy.score(xtrain, ytrain)
print(training_score)

testing_score = rd_hy.score(xtest, ytest)
print(testing_score)

In [None]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(rd_hy, xtrain, ytrain, cv=5, scoring='accuracy')
print("Cross-validation scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

**AFTER SMOTE**

In [None]:
# Forest whose hyperparameters are searched on the SMOTE-resampled data
rf11 = RandomForestClassifier(random_state=42)

# Same discrete search space as the before-SMOTE search
param_dist = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
    bootstrap=[True, False],
)

# 10 random candidates, 3-fold CV accuracy.
# NOTE(review): the folds are cut from data that was already oversampled, so synthetic
# points can sit in a different fold than the real points they were interpolated from;
# the CV score is presumably optimistic.
random_ss = RandomizedSearchCV(
    rf11,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
random_ss.fit(xres, yres)

# Winning combination and its mean CV accuracy
print("Best Hyperparameters:", random_ss.best_params_)
print("Best Score:", random_ss.best_score_)

In [None]:
# Random forest for the SMOTE-resampled data, configured with hand-copied hyperparameters.
# NOTE(review): the values are identical to the before-SMOTE model (rd_hy) and are not
# read from random_ss.best_params_ -- verify they really are this search's result.
rd_hyy = RandomForestClassifier(
    n_estimators=100, min_samples_split=5, min_samples_leaf=2,
    max_features='log2', max_depth=20, bootstrap=False, random_state=42
)

In [None]:
# Fit on the SMOTE-resampled training data, then predict the untouched test split
ypred_rdd = rd_hyy.fit(xres, yres).predict(xtest)

# Accuracy followed by the per-class report
acc_hy = accuracy_score(ytest, ypred_rdd)
cr_rd = classification_report(ytest, ypred_rdd)
print(acc_hy, cr_rd, sep="\n")

In [None]:
# Training and Testing Scores
training_score = rd_hyy.score(xres, yres)
print(training_score)

testing_score = rd_hyy.score(xtest, ytest)
print(testing_score)

In [None]:
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline as ImbPipeline

# Cross-validate on the ORIGINAL training split with SMOTE inside the pipeline, so
# oversampling is re-fitted on each training fold only. Cross-validating on xres/yres
# directly lets synthetic points interpolated from validation-fold rows leak into the
# training folds, which inflates the score.
# cross_val_score works on clones, so the already-fitted rd_hyy is left unchanged.
smote_rf = ImbPipeline([
    ("smote", SMOTE(random_state=42)),
    ("rf", rd_hyy),
])

cv_scores = cross_val_score(smote_rf, xtrain, ytrain, cv=5, scoring='accuracy')
print("Cross-validation scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


---

# **SVM**

---



**BEFORE SMOTING**

In [None]:
# RBF-kernel SVM trained on the original training split
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale').fit(xtrain, ytrain)

# Predictions for the held-out test split
svm_preds = svm_model.predict(xtest)

# Test-set metrics
svm_acc = accuracy_score(ytest, svm_preds)
svm_confusion_matrix = confusion_matrix(ytest, svm_preds)
svm_classification_report = classification_report(ytest, svm_preds)

# Report
print("SVM Accuracy:", svm_acc)
print("SVM Classification Report:\n", svm_classification_report)
print("SVM Confusion Matrix:\n", svm_confusion_matrix)

**AFTER SMOTING**

In [None]:
# Same RBF-kernel SVM, trained on the SMOTE-resampled training data
svm_smo = SVC(kernel='rbf', C=1.0, gamma='scale').fit(xres, yres)

# Predictions for the held-out test split
svm_preds_sm = svm_smo.predict(xtest)

# Test-set metrics
acc_svm = accuracy_score(ytest, svm_preds_sm)
confusion_matrix_svm = confusion_matrix(ytest, svm_preds_sm)
classification_report_svm = classification_report(ytest, svm_preds_sm)

# Report
print("SVM Accuracy:", acc_svm)
print("SVM Classification Report:\n", classification_report_svm)
print("SVM Confusion Matrix:\n", confusion_matrix_svm)

**HYPER TUNING**

In [None]:
# Estimator whose hyperparameters are searched
svm = SVC()

# Search space: continuous C, four kernels, two gamma heuristics, polynomial degree 2-4
param_dist = dict(
    C=uniform(0.01, 10),
    kernel=["linear", "rbf", "poly", "sigmoid"],
    gamma=["scale", "auto"],
    degree=randint(2, 5),
)

# Randomized search (default number of candidates), 5-fold CV accuracy
random_search3 = RandomizedSearchCV(
    svm,
    param_dist,
    cv=5,
    scoring="accuracy",
    random_state=42,
)
random_search3.fit(xtrain, ytrain)

# Winning combination
print(random_search3.best_params_)

In [None]:
# Build the tuned SVM from the hyperparameters found by the randomized search above.
# A bare SVC() would just repeat the default baseline and ignore the search entirely.
svm_hy = SVC(**random_search3.best_params_)

svm_hy.fit(xtrain, ytrain)

# Predict the held-out test split
ypre = svm_hy.predict(xtest)

# Test accuracy
acc_hy = accuracy_score(ytest, ypre)
print(acc_hy)

# Per-class precision / recall / F1
cr_hy = classification_report(ytest, ypre)
print(cr_hy)



---

# **LOGISTIC REGRESSION**

---



**BEFORE SMOTE**

In [None]:
# Default logistic regression trained on the original training split
log_reg_model = LogisticRegression().fit(xtrain, ytrain)
log_reg_preds = log_reg_model.predict(xtest)

# Test-set metrics
log_reg_acc = accuracy_score(ytest, log_reg_preds)
log_reg_confusion_matrix = confusion_matrix(ytest, log_reg_preds)
log_reg_classification_report = classification_report(ytest, log_reg_preds)

print("Logistic Regression Accuracy:", log_reg_acc)
print("Logistic Regression Classification Report:\n", log_reg_classification_report)
print("Logistic Regression Confusion Matrix:\n", log_reg_confusion_matrix)

In [None]:
training_score = log_reg_model.score(xtrain, ytrain)
print(training_score)
testing_score = log_reg_model.score(xtest, ytest)
print(testing_score)

**AFTER SMOTE**

In [None]:
# Default logistic regression trained on the SMOTE-resampled training data
log_reg_sm = LogisticRegression().fit(xres, yres)
log_pred = log_reg_sm.predict(xtest)

# Test-set metrics
reg_acc = accuracy_score(ytest, log_pred)
reg_confusion_matrix = confusion_matrix(ytest, log_pred)
reg_classification_report = classification_report(ytest, log_pred)

print("Logistic Regression Accuracy:", reg_acc)
print("Logistic Regression Classification Report:\n", reg_classification_report)
print("Logistic Regression Confusion Matrix:\n", reg_confusion_matrix)

In [None]:
training_score = log_reg_sm.score(xres, yres)
print(training_score)
testing_score = log_reg_sm.score(xtest, ytest)
print(testing_score)

**HYPER TUNING**

In [None]:
# Define the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)

# Search space, split into sub-spaces that contain only VALID solver/penalty pairs.
# A single flat dict would also sample combinations that cannot be fitted
# (liblinear with 'elasticnet' or no penalty; 'elasticnet' without l1_ratio), and each
# of those wastes an iteration on a failed fit that is scored as NaN.
param_dist = [
    # liblinear supports only L1 and L2 penalties
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': uniform(0.01, 10)},
    # saga with a plain L1 or L2 penalty
    {'solver': ['saga'], 'penalty': ['l1', 'l2'], 'C': uniform(0.01, 10)},
    # elastic net is saga-only and needs the L1/L2 mixing ratio
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': uniform(0, 1), 'C': uniform(0.01, 10)},
    # unpenalized model: C has no effect, so it is not sampled
    {'solver': ['saga'], 'penalty': [None]},
]

# Perform Randomized Search: 20 candidates, 5-fold CV accuracy on the training split
random_searchl = RandomizedSearchCV(
    log_reg, param_distributions=param_dist,
    n_iter=20, scoring='accuracy',
    cv=5, random_state=42, n_jobs=-1
)
random_searchl.fit(xtrain, ytrain)

# Print best parameters and best accuracy score
print("Best Parameters:", random_searchl.best_params_)
print("Best Accuracy:", random_searchl.best_score_)





In [None]:
# Train Logistic Regression model with best hyperparameters
log_reg_hy = LogisticRegression(C=8.334426408004218, penalty='l2', solver='saga', max_iter=1000)

In [None]:
# Fit the tuned logistic regression and predict the test split in one chain
ypred_a = log_reg_hy.fit(xtrain, ytrain).predict(xtest)

# Accuracy followed by the per-class report
acc_hy = accuracy_score(ytest, ypred_a)
cr_hy = classification_report(ytest, ypred_a)
print(acc_hy, cr_hy, sep="\n")

In [None]:
# Training and Testing Scores
training_score = log_reg_hy.score(xtrain, ytrain)
print(training_score)
testing_score = log_reg_hy.score(xtest, ytest)
print(testing_score)



---

# **NAIVE BAYES**

---



**BEFORE SMOTE**

In [None]:
# Gaussian Naive Bayes trained on the original training split
nb_model = GaussianNB().fit(xtrain, ytrain)

# Test-split predictions
nb_preds = nb_model.predict(xtest)

# Test-set metrics
nb_acc = accuracy_score(ytest, nb_preds)
nb_confusion_matrix = confusion_matrix(ytest, nb_preds)
nb_classification_report = classification_report(ytest, nb_preds)

print("Naive Bayes Accuracy:", nb_acc)
print("Naive Bayes Classification Report:\n", nb_classification_report)
print("Naive Bayes Confusion Matrix:\n", nb_confusion_matrix)

In [None]:
# Training & Testing Scores
training_score = nb_model.score(xtrain, ytrain)
print(training_score)
testing_score = nb_model.score(xtest, ytest)
print(testing_score)

In [None]:
df.columns

**AFTER SMOTE**

In [None]:
# Gaussian Naive Bayes trained on the SMOTE-resampled training data
nb_smo = GaussianNB().fit(xres, yres)

# Test-split predictions
nb_preds_sm = nb_smo.predict(xtest)

# Test-set metrics
acc_nb = accuracy_score(ytest, nb_preds_sm)
confusion_matrix_nb = confusion_matrix(ytest, nb_preds_sm)
classification_report_nb = classification_report(ytest, nb_preds_sm)

print("Naive Bayes Accuracy:", acc_nb)
print("Naive Bayes Classification Report:\n", classification_report_nb)
print("Naive Bayes Confusion Matrix:\n", confusion_matrix_nb)

In [None]:
training_score=nb_smo.score(xres,yres)
print(training_score)
testing_score=nb_smo.score(xtest,ytest)
print(testing_score)

**HYPER TUNING**

In [None]:
# Estimator whose smoothing term is tuned
nb_model1 = GaussianNB()

# var_smoothing sampled uniformly from [1e-9, 1e-9 + 1e-2]
param_dist = dict(var_smoothing=uniform(1e-9, 1e-2))

# 10 random candidates, 5-fold CV accuracy on the training split, all cores
random_g = RandomizedSearchCV(
    nb_model1,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_g.fit(xtrain, ytrain)

# Winning value and its mean CV accuracy
print("Best Hyperparameters:", random_g.best_params_)
print("Best Score:", random_g.best_score_)

In [None]:
# Train GaussianNB with best hyperparameters
nb_model_hy = GaussianNB(var_smoothing=0.009507144064099162)


In [None]:
# Fit the tuned Naive Bayes model and predict the test split
ypred_b = nb_model_hy.fit(xtrain, ytrain).predict(xtest)

# Accuracy followed by the per-class report
acc_hy1 = accuracy_score(ytest, ypred_b)
cl_hy = classification_report(ytest, ypred_b)
print(acc_hy1, cl_hy, sep="\n")


In [None]:
# Training & Testing Scores
training_score = nb_model_hy.score(xtrain, ytrain)
print(training_score)
testing_score = nb_model_hy.score(xtest, ytest)
print(testing_score)



---

# **DECISION TREE CLASSIFIER**

---



**BEFORE SMOTE**

In [None]:
# Unconstrained decision tree trained on the original training split
dt_model = DecisionTreeClassifier().fit(xtrain, ytrain)

# Test-split predictions
dt_preds = dt_model.predict(xtest)

# Test-set metrics
dt_acc = accuracy_score(ytest, dt_preds)
dt_confusion_matrix = confusion_matrix(ytest, dt_preds)
dt_classification_report = classification_report(ytest, dt_preds)

print("Decision Tree Accuracy:", dt_acc)
print("Decision Tree Classification Report:\n", dt_classification_report)
print("Decision Tree Confusion Matrix:\n", dt_confusion_matrix)

In [None]:
training_score = dt_model.score(xtrain, ytrain)
print(training_score)
testing_score = dt_model.score(xtest, ytest)
print(testing_score)

**AFTER SMOTE**

In [None]:
# Re-run SMOTE on the training split with a fixed seed.
# NOTE(review): this overwrites the global xres / yres created earlier in the notebook,
# so every "after SMOTE" model from here on trains on this resample rather than on
# the one the earlier models used.
smote = SMOTE(random_state=42)
xres, yres = smote.fit_resample(xtrain, ytrain)

# Decision tree (no random_state set) trained on the resampled data
dt_smo = DecisionTreeClassifier()
dt_smo.fit(xres, yres)

# Predictions for the untouched test split
dt_preds_sm = dt_smo.predict(xtest)

# Test-set metrics
acc_dt = accuracy_score(ytest, dt_preds_sm)
classification_report_dt = classification_report(ytest, dt_preds_sm)
confusion_matrix_dt = confusion_matrix(ytest, dt_preds_sm)

print("Decision Tree Accuracy:", acc_dt)
print("Decision Tree Classification Report:\n", classification_report_dt)
print("Decision Tree Confusion Matrix:\n", confusion_matrix_dt)

**HYPER TUNING**

In [None]:
# Seed the tree itself: feature sub-sampling (max_features) and tie-breaking between
# equally good splits are random, so without a seed the search can pick a different
# "best" combination on every run.
dt = DecisionTreeClassifier(random_state=42)

# Search space: split criterion, depth, node-size limits and feature sub-sampling
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": randint(3, 50),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 10),
    "max_features": ["sqrt", "log2", None]
}

# 20 random candidates, 5-fold CV accuracy on the original training split
random_search1 = RandomizedSearchCV(dt, param_dist, n_iter=20, cv=5, scoring="accuracy", random_state=42)
random_search1.fit(xtrain, ytrain)

print(random_search1.best_params_)

In [None]:
dt_hy=DecisionTreeClassifier(criterion= 'gini', max_depth= 11, max_features= None, min_samples_leaf= 2, min_samples_split= 5)

In [None]:
# Fit the tuned tree on the original training split and predict the test split
ypred_c = dt_hy.fit(xtrain, ytrain).predict(xtest)

# Accuracy, per-class report, confusion matrix (printed in that order)
acc_hy = accuracy_score(ytest, ypred_c)
cl_hy = classification_report(ytest, ypred_c)
cn_hy = confusion_matrix(ytest, ypred_c)
print(acc_hy, cl_hy, cn_hy, sep="\n")

In [None]:
training_score=dt_hy.score(xtrain,ytrain)
print(training_score)
testing_score=dt_hy.score(xtest,ytest)
print(testing_score)



---

# **ADABOOST**

---



**BEFORE SMOTE**

In [None]:
# AdaBoost over unconstrained decision trees, trained on the original training split
base_estimator = DecisionTreeClassifier()
adaboost_model = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=50,
    random_state=42,
).fit(xtrain, ytrain)

# Test-split predictions
adaboost_preds = adaboost_model.predict(xtest)

# Test-set metrics
adaboost_acc = accuracy_score(ytest, adaboost_preds)
adaboost_confusion_matrix = confusion_matrix(ytest, adaboost_preds)
adaboost_classification_report = classification_report(ytest, adaboost_preds)

print("AdaBoost Accuracy:", adaboost_acc)
print("AdaBoost Classification Report:\n", adaboost_classification_report)
print("AdaBoost Confusion Matrix:\n", adaboost_confusion_matrix)

In [None]:
training_score = adaboost_model.score(xtrain, ytrain)
print(training_score)
testing_score = adaboost_model.score(xtest, ytest)
print(testing_score)

**AFTER SMOTE**

In [None]:
# Same AdaBoost configuration, trained on the SMOTE-resampled training data
base_estimator = DecisionTreeClassifier()
adaboost_smo = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=50,
    random_state=42,
).fit(xres, yres)

# Test-split predictions
adaboost_preds_sm = adaboost_smo.predict(xtest)

# Test-set metrics
acc_ada = accuracy_score(ytest, adaboost_preds_sm)
confusion_matrix_ada = confusion_matrix(ytest, adaboost_preds_sm)
classification_report_ada = classification_report(ytest, adaboost_preds_sm)

print("AdaBoost Accuracy:", acc_ada)
print("AdaBoost Classification Report:\n", classification_report_ada)
print("AdaBoost Confusion Matrix:\n", confusion_matrix_ada)

In [None]:
training_score=adaboost_smo.score(xres,yres)
print(training_score)
testing_score=adaboost_smo.score(xtest,ytest)
print(testing_score)

**HYPER TUNING**

In [None]:
# AdaBoost over decision trees; the tree depth is tuned through the nested
# `estimator__max_depth` parameter
adaboost = AdaBoostClassifier(estimator=DecisionTreeClassifier(), random_state=42)
param_dist_adaboost = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    estimator__max_depth=randint(1, 10),
)

# 10 random candidates, 5-fold CV accuracy on the original training split
random_search_adaboost = RandomizedSearchCV(
    estimator=adaboost,
    param_distributions=param_dist_adaboost,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
    n_iter=10,
    random_state=42,
)
random_search_adaboost.fit(xtrain, ytrain)
print("Best Hyperparameters (AdaBoost):\n", random_search_adaboost.best_params_)

In [None]:
# Rebuild the estimator that was searched (decision-tree base learner, fixed seed) and
# apply ALL tuned values, including the nested `estimator__max_depth`. Passing only
# n_estimators / learning_rate would fall back to AdaBoost's default base learner and
# drop the tuned tree depth and the seed.
adaboost_hy = AdaBoostClassifier(estimator=DecisionTreeClassifier(), random_state=42)
adaboost_hy.set_params(**random_search_adaboost.best_params_)
adaboost_hy.fit(xtrain, ytrain)

In [None]:
# Predictions of the tuned AdaBoost model on the test split
ypred_e = adaboost_hy.predict(xtest)

# Accuracy, per-class report, confusion matrix (printed in that order)
acc_hy = accuracy_score(ytest, ypred_e)
cl_hy = classification_report(ytest, ypred_e)
cn_hy = confusion_matrix(ytest, ypred_e)
print(acc_hy, cl_hy, cn_hy, sep="\n")

In [None]:
training_score=adaboost_hy.score(xtrain,ytrain)
print(training_score)
testing_score=adaboost_hy.score(xtest,ytest)
print(testing_score)



---

# **XGBOOST**

---



**BEFORE SMOTE**

In [None]:
# Small gradient-boosted model (50 depth-3 trees) trained on the original training split
xgb = XGBClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
)
xgb.fit(xtrain, ytrain)
ypred_x = xgb.predict(xtest)

# Accuracy, per-class report, confusion matrix (printed in that order)
acc = accuracy_score(ytest, ypred_x)
cl = classification_report(ytest, ypred_x)
cn = confusion_matrix(ytest, ypred_x)
print(acc, cl, cn, sep="\n")

In [None]:
training_score=xgb.score(xtrain,ytrain)
print(training_score)
testing_score=xgb.score(xtest,ytest)
print(testing_score)

In [None]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(xgb, xtrain, ytrain, cv=5, scoring='accuracy')
print("Cross-validation scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

**AFTER SMOTE**

In [None]:
# Same XGBoost configuration, trained on the SMOTE-resampled training data
xgb_sm = XGBClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
)
xgb_sm.fit(xres, yres)
ypred_xsm = xgb_sm.predict(xtest)

# Accuracy, per-class report, confusion matrix (printed in that order)
acc = accuracy_score(ytest, ypred_xsm)
cl = classification_report(ytest, ypred_xsm)
cn = confusion_matrix(ytest, ypred_xsm)
print(acc, cl, cn, sep="\n")

In [None]:
training_score=xgb_sm.score(xres,yres)
print(training_score)
testing_score=xgb_sm.score(xtest,ytest)
print(testing_score)

**HYPER TUNING**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import numpy as np

# Estimator whose hyperparameters are searched
xgbx = XGBClassifier(eval_metric='logloss')

# Discrete search grid for the booster
xgb_param_dist = dict(
    n_estimators=[50, 100, 200, 500],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    max_depth=[3, 5, 7, 10],
    min_child_weight=[1, 3, 5],
    subsample=[0.5, 0.7, 1.0],
    colsample_bytree=[0.5, 0.7, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
)

# 20 random candidates, 5-fold CV accuracy on the original training split, all cores
random_xgb = RandomizedSearchCV(
    xgbx,
    param_distributions=xgb_param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

random_xgb.fit(xtrain, ytrain)
print("Best Hyperparameters:\n", random_xgb.best_params_)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build the tuned model directly from the search result. The hand-typed values used
# before included learning_rate=0.03, which is not in the searched grid
# [0.01, 0.1, 0.2, 0.3], so they could not have been the search's best parameters.
# eval_metric matches the estimator that was searched (xgbx).
xgb_hy = XGBClassifier(eval_metric='logloss', **random_xgb.best_params_)


In [None]:
# Fit the tuned XGBoost model on the original training split
xgb_hy.fit(xtrain, ytrain)
ypred_d = xgb_hy.predict(xtest)

# Accuracy, per-class report, confusion matrix (printed in that order)
acc_hy = accuracy_score(ytest, ypred_d)
cl_hy = classification_report(ytest, ypred_d)
cn_hy = confusion_matrix(ytest, ypred_d)
print(acc_hy, cl_hy, cn_hy, sep="\n")

In [None]:
training_score=xgb_hy.score(xtrain,ytrain)
print(training_score)
testing_score=xgb_hy.score(xtest,ytest)
print(testing_score)

In [None]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(xgb_hy, xtrain, ytrain, cv=5, scoring='accuracy')
print("Cross-validation scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

In [None]:
import pickle
# Persist the baseline random forest (`rf`, trained before SMOTE and without tuning)
# as the deployed model.
# NOTE(review): only the classifier is saved; the fitted StandardScaler `sd` is not,
# although the model was trained on scaled features -- confirm that whoever loads this
# file has a way to scale new data identically.
with open("best_model.pkl", "wb") as file:
    pickle.dump(rf, file)

In [None]:
# Reload the persisted model.
# pickle.load can execute arbitrary code: only load files this notebook wrote itself.
with open("best_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)

# `new_data` was never defined, so this cell failed on a fresh Run All. Use the first
# five test rows as a smoke-test input; they are already scaled with `sd`. Real new
# records must be passed through sd.transform(...) before calling predict.
new_data = xtest[:5]
prediction = loaded_model.predict(new_data)