In [None]:
pip install hvplot holoviews

In [None]:
# Import libraries from python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import hvplot.pandas

In [None]:
# Load the csv files
import os
from pathlib import Path

# Directory that holds both CSV files. Defaults to the original workspace path;
# set the ATTRITION_DATA_DIR environment variable to run the notebook elsewhere
# without editing the code.
DATA_DIR = Path(os.environ.get('ATTRITION_DATA_DIR', '/workspaces/Employee_Turnover_Prediction'))

df_IT = pd.read_csv(DATA_DIR / 'HR-Employee-Attrition.csv')
df_health = pd.read_csv(DATA_DIR / 'watson_healthcare.csv')

Drop unwanted columns from both datasets

In [None]:
# Remove the four columns of each frame that the rest of the notebook does not use.
# The result is re-bound to the same name instead of mutating the frame in place.
df_IT = df_IT.drop(columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'])
df_health = df_health.drop(columns=['EmployeeCount', 'EmployeeID', 'Over18', 'StandardHours'])

In [None]:
# Check for duplicate rows in IT sector data
# duplicated() flags each row that repeats an earlier row; the sum is the number of flagged rows.
df_IT.duplicated().sum()

In [None]:
# Check for duplicate rows in healthcare data
# duplicated() flags each row that repeats an earlier row; the sum is the number of flagged rows.
df_health.duplicated().sum()

As there are 56 duplicate rows in the healthcare dataset, we need to drop those rows.

In [None]:
# Keep only the first occurrence of each repeated healthcare row
# (the surviving rows keep their original index labels).
df_health = df_health.drop_duplicates()

In [None]:
# Basic summary statistics for numerical columns in df_health
# Transposed so that each column of df_health becomes one row of the summary table.
df_health.describe().T

In [None]:
# Basic summary statistics for numerical columns in df_IT
# Transposed so that each column of df_IT becomes one row of the summary table.
df_IT.describe().T

In [None]:
# Compare the column names of both datasets, position by position.
# Each dataset gets its own labelled column. Wrapping the names in Series lets
# pandas pad the shorter list with NaN instead of raising when the two frames
# do not have the same number of columns.
pd.DataFrame({
    'df_IT': pd.Series(df_IT.columns.to_list()),
    'df_health': pd.Series(df_health.columns.to_list()),
})

However, in the df_IT dataframe, the column that corresponds to 'Shift' is named 'StockOptionLevel'.

In [None]:
# Align df_IT with df_health: 'StockOptionLevel' is renamed to 'Shift'
# so that the two frames share the same column label before they are stacked.
df_IT = df_IT.rename(columns={'StockOptionLevel': 'Shift'})

In [None]:
# Stack the IT rows on top of the healthcare rows and renumber the index from 0.
df_merge = pd.concat(
    [df_IT, df_health],
    axis=0,
    ignore_index=True,
)

In [None]:
# Check for duplicate value in merged data
# Counts rows of the stacked frame that repeat an earlier row.
df_merge.duplicated().sum()

In [None]:
# Shape of data
# (number of rows, number of columns) of the merged frame.
df_merge.shape

In [None]:
# Sample data of df_merge
# tail() shows the last five rows; df_health was passed second to concat, so these come from it.
df_merge.tail()

In [None]:
# For every non-object column with at most 20 distinct values,
# print the column name followed by those distinct values.
for col in df_merge.columns:
    column = df_merge[col]
    if column.dtype == 'object':
        continue
    if column.nunique() > 20:
        continue
    print(f" {col} : {column.unique()}\n")


In [None]:
# For every non-object column with more than 20 distinct values, print the
# number of distinct values (labelled "Count"), the minimum and the maximum.
for col in df_merge.columns:
    column = df_merge[col]
    if column.dtype == 'object':
        continue
    if column.nunique() <= 20:
        continue
    print(f" {col} :\n Count: {column.nunique()}, Minimum: {column.min()}, Maximum: {column.max()}\n")


In [None]:
# Columns stored with the object dtype are treated as categorical
categorical_cols = df_merge.select_dtypes(include=['object']).columns
print("\nCategorical Columns:", categorical_cols.tolist(), sep="\n")

# Frequency table of each categorical column
print("\nUnique values in categorical columns:")
for col in categorical_cols:
    level_counts = df_merge[col].value_counts()
    print(f" {level_counts}\n")

In [None]:
# Label Encoding
# Hand-written integer codes for the three columns that are not one-hot encoded:
#   OverTime / Attrition : Yes=1, No=0
#   BusinessTravel       : Non-Travel=0, Travel_Rarely=1, Travel_Frequently=2
label_encodings = {
    'OverTime': {'Yes': 1, 'No': 0},
    'Attrition': {'Yes': 1, 'No': 0},
    'BusinessTravel': {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2},
}

for column, mapping in label_encodings.items():
    # A column that is already numeric has been encoded by an earlier run of this
    # cell; mapping it again would turn every value into NaN, so leave it alone.
    if pd.api.types.is_numeric_dtype(df_merge[column]):
        continue

    encoded = df_merge[column].map(mapping)

    # .map() silently yields NaN for labels missing from the mapping. Fail here,
    # with the offending labels, rather than let NaN leak into the model inputs.
    unmapped = df_merge.loc[encoded.isna() & df_merge[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in column '{column}': {list(unmapped)}")

    df_merge[column] = encoded

In [None]:
# One hot Encoding
# Every remaining object-dtype column is expanded into 0/1 indicator columns
dummy_col = df_merge.select_dtypes(include=['object']).columns.tolist()

# drop_first=True omits the first level of each column; indicators are stored as uint8
data = pd.get_dummies(
    df_merge,
    columns=dummy_col,
    drop_first=True,
    dtype='uint8',
)

# Column info of encoded data
data.info()

# Exploratory Data Analysis

In [None]:
# Correlation of every numeric column with Attrition, highest first
correlation = (
    data.corr(numeric_only=True)['Attrition']
    .sort_values(ascending=False)
)

# Display correlation values
print("Correlation with Attrition:")
print(correlation)

# Take the 10 highest and the 10 lowest correlations, leaving out Attrition itself
correlation_filtered = correlation.drop('Attrition')
top_positive = correlation_filtered.iloc[:10]
top_negative = correlation_filtered.iloc[-10:]

# Combine for plotting
combined = pd.concat([top_positive, top_negative])

# Horizontal bar chart, one bar per selected feature
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=combined.values, y=combined.index, palette='coolwarm')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Features')
ax.grid(True)
plt.tight_layout()
plt.savefig('correlation_plot.png')

In [None]:
# Correlation matrix restricted to the features most correlated with Attrition.
# A feature is kept when |corr(feature, Attrition)| exceeds this threshold
# (Attrition itself always qualifies, its self-correlation being 1).
ATTRITION_CORR_THRESHOLD = 0.085

correlation1 = data.corr(numeric_only=True)
imp_feat = correlation1['Attrition'].abs() > ATTRITION_CORR_THRESHOLD
imp_corr = correlation1.loc[imp_feat, imp_feat]

# Triangular Heatmap: the mask hides the upper triangle including the diagonal,
# so every feature pair is drawn exactly once.
mask = np.triu(np.ones_like(imp_corr, dtype=bool))
plt.figure(figsize=(6,4))
sns.heatmap(imp_corr, mask=mask, cmap='RdBu', center=0, vmin=-0.3, vmax=0.5)
plt.title(" Correlation Heatmap")
plt.savefig('correlation_heatmap.png')

In [None]:
# Stacked histogram of TotalWorkingYears, split by Attrition (one bar per integer value)
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(
    data=df_merge,
    x='TotalWorkingYears',
    hue='Attrition',
    multiple='stack',
    discrete=True,
    palette=['cornflowerblue', 'orangered'],
    ax=ax,
)
fig.savefig('TotalWorkingYears_bar_plot.png')

In [None]:
# Stacked histogram of Age, split by Attrition (one bar per integer value)
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(
    data=df_merge,
    x='Age',
    hue='Attrition',
    multiple='stack',
    discrete=True,
    palette=['cornflowerblue', 'orangered'],
    ax=ax,
)
fig.savefig('Age_bar_plot.png')

In [None]:
# Interactive histogram of MonthlyIncome: one series per Attrition class,
# both drawn on the same axes (subplots=False), 10 bins.
data.hvplot.hist(
    y='MonthlyIncome',
    by='Attrition',
    subplots=False,
    width=600,
    height=300,
    bins=10,
    ylabel='Count',
    color=['cornflowerblue', 'orangered'],
)

In [None]:
# Interactive histogram of the encoded OverTime flag: one series per Attrition
# class, both drawn on the same axes (subplots=False).
data.hvplot.hist(
    y='OverTime',
    by='Attrition',
    subplots=False,
    width=600,
    height=300,
    ylabel= 'Frequency',
    color=['cornflowerblue', 'orangered'],
)

In [None]:
# Grouped bar chart: number of employees per (MaritalStatus, Attrition) pair
df = df_merge.groupby('MaritalStatus')['Attrition'].value_counts()

# The counts carry a two-level index: level 0 is the marital status, level 1 the Attrition value
marital_status = df.index.get_level_values(0)
attrition_flag = df.index.get_level_values(1)

sns.barplot(
    x=marital_status,
    y=df.values,
    hue=attrition_flag,
    palette=['cornflowerblue', 'orangered'],
)
plt.xlabel('Marital Status')
plt.ylabel('Count')

plt.savefig('MarritalStatus_bar_plot.png')

# Data Preparation for Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every encoded column except the target
X = data.drop(columns='Attrition')
y = data['Attrition']

# 70:30 train/test split, stratified on the target so both parts keep its class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Standard Scaling: the scaler is fitted on the training part only and then
# applied unchanged to both parts.
standard_scaler = StandardScaler().fit(X_train)
X_train_std = standard_scaler.transform(X_train)
X_test_std = standard_scaler.transform(X_test)



In [None]:
# Class counts of the full (unsplit) target y
y.value_counts()

In [None]:
# Create the pie chart of Attrition
y = data["Attrition"]

# Count each class in a fixed order (0 = not left, 1 = left, per the Yes=1/No=0
# encoding) so that the labels, colours and explode offsets below always line up
# with the right wedge, instead of relying on value_counts() listing class 0 first.
count_class = y.value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(4,4))
plt.pie(
    count_class.values,
    labels= ['not left', 'left'],
    colors= ['cyan', 'deeppink'] ,
    explode= (0, 0.1),           # pull the 'left' wedge slightly out of the pie
    autopct='%1.1f %%',          # print each wedge's share with one decimal
    shadow=True,
    startangle=90
)

plt.axis('equal')  # Equal aspect ratio ensures it's a circle
plt.title('Employee Attrition', fontsize=12)
plt.savefig('pie_chart.png')



In [None]:
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the standardized training split with SMOTE.
# Only the training data is resampled; the test split is left untouched.
smote = SMOTE(random_state=42, sampling_strategy='minority')
resampled = smote.fit_resample(X_train_std, y_train)
X_train_std_resampled, y_train_std_resampled = resampled

# Class counts after resampling
y_train_std_resampled.value_counts()

In [None]:
from sklearn.decomposition import PCA

# Apply PCA (after SMOTE)
# The projection is fitted on the SMOTE-resampled training data only and then
# re-used, unchanged, to project the standardized test data.
pca1 = PCA(n_components=.95)  # Reduce till 95% variance dimensions
X_train_std_pca = pca1.fit_transform(X_train_std_resampled)
X_test_std_pca = pca1.transform(X_test_std)

In [None]:
# Apply PCA (no SMOTE)
# Same procedure as for pca1, but fitted on the original (not resampled)
# standardized training data; the test data is projected with the fitted model.
pca2 = PCA(n_components=.95)  # Reduce till 95% variance dimensions
X_train_pca = pca2.fit_transform(X_train_std)
X_test_pca = pca2.transform(X_test_std)

In [None]:
# Per-component and running-total share of variance retained by pca1 (the PCA fitted after SMOTE)
variance_ratio = pca1.explained_variance_ratio_
print("Explained variance:", variance_ratio)
print("Cumulative:", variance_ratio.cumsum())

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Define a function to evaluate the model
def _positive_class_scores(model, X):
    """Return continuous positive-class scores for X, or None if the model offers none.

    Prefers `predict_proba` (second column, i.e. the larger class label of a binary
    classifier) and falls back to `decision_function`.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return None


def evaluate(model, X_train, X_test, y_train, y_test):
    """Print train and test classification metrics for an already fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator exposing `predict`, and ideally `predict_proba`
        or `decision_function`.
    X_train, y_train : data the confusion matrix, accuracy and classification
        report of the "training" section are computed on.
    X_test, y_test : data the "testing" section is computed on; precision,
        recall, F1 and ROC AUC are additionally printed for it.

    Returns
    -------
    None. All results are printed.
    """
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    # ROC AUC is a ranking metric and needs continuous scores. Scoring it on the
    # hard 0/1 predictions collapses the curve to a single operating point.
    # Fall back to the labels only when the model exposes no score at all.
    y_test_score = _positive_class_scores(model, X_test)
    if y_test_score is None:
        y_test_score = y_test_pred

    print("TRAINIG RESULTS: \n===============================")
    clf_report = pd.DataFrame(classification_report(y_train, y_train_pred, output_dict=True))
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_train, y_train_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_train, y_train_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")

    print("TESTING RESULTS: \n===============================")
    clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict=True))
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_test, y_test_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_test, y_test_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print(f"PRECISION SCORE:\n{precision_score(y_test, y_test_pred):.4f}")
    print(f"RECALL SCORE:\n{recall_score(y_test, y_test_pred):.4f}")
    print(f"F1 SCORE:\n{f1_score(y_test, y_test_pred):.4f}")
    print(f"AUC SCORE:\n{roc_auc_score(y_test, y_test_score):.4f}")
    print("===============================")

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic Regression with SMOTE and PCA
# L1-penalised model fitted with liblinear. The solver shuffles the data
# internally, so random_state is fixed to make the reported metrics reproducible.
lr_clf = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
lr_clf.fit(X_train_std_pca, y_train_std_resampled)

evaluate(lr_clf, X_train_std_pca, X_test_std_pca, y_train_std_resampled, y_test)

In [None]:
# Logistic Regression with SMOTE
# L1-penalised model fitted with liblinear. The solver shuffles the data
# internally, so random_state is fixed to make the reported metrics reproducible.
lr_clf = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
lr_clf.fit(X_train_std_resampled, y_train_std_resampled)

evaluate(lr_clf, X_train_std_resampled, X_test_std, y_train_std_resampled, y_test)

In [None]:
# Logistic Regression with PCA
# L1-penalised model fitted with liblinear. The solver shuffles the data
# internally, so random_state is fixed to make the reported metrics reproducible.
lr_clf = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
lr_clf.fit(X_train_pca, y_train)

evaluate(lr_clf, X_train_pca, X_test_pca, y_train, y_test)

In [None]:
# Logistic Regression without SMOTE/PCA
# L1-penalised model fitted with liblinear. The solver shuffles the data
# internally, so random_state is fixed to make the reported metrics reproducible.
lr_clf = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
lr_clf.fit(X_train_std, y_train)

evaluate(lr_clf, X_train_std, X_test_std, y_train, y_test)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random Forest with SMOTE and PCA
# 100 bootstrapped trees, each limited to depth 15, trained on the
# PCA projection of the SMOTE-resampled training data.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    bootstrap=True,
    random_state=42,
).fit(X_train_std_pca, y_train_std_resampled)

evaluate(rf_clf, X_train_std_pca, X_test_std_pca, y_train_std_resampled, y_test)

In [None]:
# Random Forest with SMOTE
# 100 bootstrapped trees of unlimited depth, trained on the SMOTE-resampled
# standardized features.
# NOTE(review): random_state is 4 here but 42 in the PCA variants — confirm this is intentional.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    bootstrap=True,
    random_state=4,
).fit(X_train_std_resampled, y_train_std_resampled)

evaluate(rf_clf, X_train_std_resampled, X_test_std, y_train_std_resampled, y_test)

In [None]:
def feature_imp(df, model):
    """Pair the column names of `df` with `model.feature_importances_`.

    Returns a DataFrame with the columns "feature" and "importance", ordered
    from the most to the least important feature. Each row keeps the position
    of its feature in `df.columns` as index label.
    """
    ranking = pd.DataFrame(
        {
            "feature": df.columns,
            "importance": model.feature_importances_,
        }
    )
    return ranking.sort_values(by="importance", ascending=False)

In [None]:
# Feature Importance according to Random Forest
# Uses whichever forest is currently bound to rf_clf; its importances are paired
# with the column names of X, so it must have been trained on X's full feature set.
# Only the 30 highest-ranked features are plotted.
df = feature_imp(X, rf_clf).head(30).set_index('feature')
df.plot(kind='barh', figsize=(6, 6))
plt.xlabel('Feature Importance Score')

In [None]:
# Random Forest with PCA
# 100 bootstrapped trees, each limited to depth 15, trained on the
# PCA projection of the original (not resampled) training data.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    bootstrap=True,
    random_state=42,
).fit(X_train_pca, y_train)

evaluate(rf_clf, X_train_pca, X_test_pca, y_train, y_test)

In [None]:
# Random Forest without SMOTE or PCA
# 100 bootstrapped trees of unlimited depth, trained on the standardized features.
# NOTE(review): random_state is 4 here but 42 in the PCA variants — confirm this is intentional.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    bootstrap=True,
    random_state=4,
).fit(X_train_std, y_train)

evaluate(rf_clf, X_train_std, X_test_std, y_train, y_test)

# Linear-SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Linear-SVM with SMOTE and PCA
# Linear-kernel SVC with default hyper-parameters; fit() returns the fitted estimator.
svm_clf = SVC(kernel='linear').fit(X_train_std_pca, y_train_std_resampled)

evaluate(svm_clf, X_train_std_pca, X_test_std_pca, y_train_std_resampled, y_test)

In [None]:
# Linear-SVM with SMOTE
# Linear-kernel SVC with default hyper-parameters; fit() returns the fitted estimator.
svm_clf = SVC(kernel='linear').fit(X_train_std_resampled, y_train_std_resampled)

evaluate(svm_clf, X_train_std_resampled, X_test_std, y_train_std_resampled, y_test)

In [None]:
# Linear-SVM with PCA
# Linear-kernel SVC with default hyper-parameters; fit() returns the fitted estimator.
svm_clf = SVC(kernel='linear').fit(X_train_pca, y_train)

evaluate(svm_clf, X_train_pca, X_test_pca, y_train, y_test)

In [None]:
# Linear-SVM without SMOTE or PCA
# Linear-kernel SVC with default hyper-parameters; fit() returns the fitted estimator.
svm_clf = SVC(kernel='linear').fit(X_train_std, y_train)

evaluate(svm_clf, X_train_std, X_test_std, y_train, y_test)

# Radial Basis Function SVM

In [None]:
# GridSearch (with SMOTE and PCA data)
from sklearn.model_selection import GridSearchCV
svm_clf = SVC(random_state=42)

# Two sub-grids: linear kernels vary C only, RBF kernels vary C and gamma
linear_grid = {'C': [1, 10, 100], 'kernel': ['linear']}
rbf_grid = {'C': [.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

# 5-fold cross-validated search ranked by ROC AUC; the best candidate is refitted on all rows.
# NOTE(review): the folds are cut from already SMOTE-resampled data, so validation
# folds contain synthetic samples — confirm the CV scores are not over-optimistic.
search = GridSearchCV(
    svm_clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=True,
    verbose=1,
)
search.fit(X_train_std_pca, y_train_std_resampled)

In [None]:
# RBF-SVM with SMOTE and PCA
# NOTE(review): C and gamma are hard-coded; presumably copied from a grid-search
# result — confirm against search.best_params_.
svm_clf = SVC(kernel='rbf', C=10, gamma=0.001).fit(X_train_std_pca, y_train_std_resampled)

evaluate(svm_clf, X_train_std_pca, X_test_std_pca, y_train_std_resampled, y_test)

In [None]:
# GridSearch (with SMOTE data)
from sklearn.model_selection import GridSearchCV
svm_clf = SVC(random_state=42)

# Two sub-grids: linear kernels vary C only, RBF kernels vary C and gamma
linear_grid = {'C': [1, 10, 100], 'kernel': ['linear']}
rbf_grid = {'C': [.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

# 5-fold cross-validated search ranked by ROC AUC; the best candidate is refitted on all rows.
# NOTE(review): the folds are cut from already SMOTE-resampled data, so validation
# folds contain synthetic samples — confirm the CV scores are not over-optimistic.
search = GridSearchCV(
    svm_clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=True,
    verbose=1,
)
search.fit(X_train_std_resampled, y_train_std_resampled)

In [None]:
# RBF-SVM with SMOTE
# NOTE(review): C and gamma are hard-coded; presumably copied from a grid-search
# result — confirm against search.best_params_.
svm_clf = SVC(kernel='rbf', C=10, gamma=0.001).fit(X_train_std_resampled, y_train_std_resampled)

evaluate(svm_clf, X_train_std_resampled, X_test_std, y_train_std_resampled, y_test)

In [None]:
# GridSearch (with PCA only data)
from sklearn.model_selection import GridSearchCV
svm_clf = SVC(random_state=42)

# Two sub-grids: linear kernels vary C only, RBF kernels vary C and gamma
linear_grid = {'C': [1, 10, 100], 'kernel': ['linear']}
rbf_grid = {'C': [.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

# 5-fold cross-validated search ranked by ROC AUC on the PCA-projected
# (not resampled) training data; the best candidate is refitted on all rows.
search = GridSearchCV(
    svm_clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=True,
    verbose=1,
)
search.fit(X_train_pca, y_train)

In [None]:
# RBF-SVM with PCA
# Hyper-parameters come from `search`, i.e. whichever grid search was fitted last
# in this kernel session.
# NOTE(review): the searched grid also contains a linear kernel, so the selected
# model is not guaranteed to be an RBF SVM — confirm.
best_params = search.best_params_
svm_clf = SVC(**best_params).fit(X_train_pca, y_train)

evaluate(svm_clf, X_train_pca, X_test_pca, y_train, y_test)

In [None]:
# RBF-SVM without SMOTE or PCA
# NOTE(review): C and gamma are hard-coded and no grid search is run on this
# data variant in the notebook — confirm where these values come from.
svm_clf = SVC(kernel='rbf', C=100, gamma=0.001).fit(X_train_std, y_train)

evaluate(svm_clf, X_train_std, X_test_std, y_train, y_test)

# Artificial Neural Network

In [None]:
pip install tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# ANN with SMOTE and PCA

# Seed
tf.keras.utils.set_random_seed(42)

# The input width is declared with an explicit Input object: passing
# `input_shape` to the first Dense layer is deprecated in Keras 3.
model_ann = Sequential([
    tf.keras.Input(shape=(X_train_std_pca.shape[1],)),
    Dense(64, activation='relu'),     # first hidden layer
    Dropout(0.3),                     # helps prevent overfitting
    Dense(32, activation='relu'),     # second hidden layer
    Dense(1, activation='sigmoid'),   # output layer: binary classification
])

model_ann.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

# NOTE(review): the test split is passed as validation data. It is only used for
# per-epoch monitoring here (no callbacks are configured) — confirm this is intended.
history = model_ann.fit(
    X_train_std_pca, y_train_std_resampled,
    validation_data=(X_test_std_pca, y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)



In [None]:
# Predict on test set: sigmoid outputs are probabilities, thresholded at 0.5 for the class label
y_test_prob = model_ann.predict(X_test_std_pca).ravel()
y_test_pred = (y_test_prob > 0.5).astype(int)

# Predict on train set
y_train_prob = model_ann.predict(X_train_std_pca).ravel()
y_train_pred = (y_train_prob > 0.5).astype(int)

print("TRAINIG RESULTS: \n===============================")
clf_report = pd.DataFrame(classification_report(y_train_std_resampled, y_train_pred, output_dict=True))
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_train_std_resampled, y_train_pred)}")
print(f"ACCURACY SCORE:\n{accuracy_score(y_train_std_resampled, y_train_pred):.4f}")
print(f"CLASSIFICATION REPORT:\n{clf_report}")

print("TESTING RESULTS: \n===============================")
clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict=True))
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"ACCURACY SCORE:\n{accuracy_score(y_test, y_test_pred):.4f}")
print(f"CLASSIFICATION REPORT:\n{clf_report}")
print(f"PRECISION SCORE:\n{precision_score(y_test, y_test_pred):.4f}")
print(f"RECALL SCORE:\n{recall_score(y_test, y_test_pred):.4f}")
print(f"F1 SCORE:\n{f1_score(y_test, y_test_pred):.4f}")
# ROC AUC is a ranking metric: score it on the predicted probabilities,
# not on the thresholded labels, which would reduce it to one operating point.
print(f"AUC SCORE:\n{roc_auc_score(y_test, y_test_prob):.4f}")
print("===============================")

In [None]:
# ANN with SMOTE

# Seed
tf.keras.utils.set_random_seed(40)

# The input width is declared with an explicit Input object: passing
# `input_shape` to the first Dense layer is deprecated in Keras 3.
model_ann = Sequential([
    tf.keras.Input(shape=(X_train_std_resampled.shape[1],)),
    Dense(64, activation='relu'),     # first hidden layer
    Dropout(0.3),                     # helps prevent overfitting
    Dense(32, activation='relu'),     # second hidden layer
    Dense(1, activation='sigmoid'),   # output layer: binary classification
])

model_ann.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# NOTE(review): the test split is passed as validation data. It is only used for
# per-epoch monitoring here (no callbacks are configured) — confirm this is intended.
history = model_ann.fit(
    X_train_std_resampled, y_train_std_resampled,
    validation_data=(X_test_std, y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)

In [None]:
# Predict on test set: sigmoid outputs are probabilities, thresholded at 0.5 for the class label
y_test_prob = model_ann.predict(X_test_std).ravel()
y_test_pred = (y_test_prob > 0.5).astype(int)

# Predict on train set
y_train_prob = model_ann.predict(X_train_std_resampled).ravel()
y_train_pred = (y_train_prob > 0.5).astype(int)

print("TRAINIG RESULTS: \n===============================")
clf_report = pd.DataFrame(classification_report(y_train_std_resampled, y_train_pred, output_dict=True))
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_train_std_resampled, y_train_pred)}")
print(f"ACCURACY SCORE:\n{accuracy_score(y_train_std_resampled, y_train_pred):.4f}")
print(f"CLASSIFICATION REPORT:\n{clf_report}")

print("TESTING RESULTS: \n===============================")
clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict=True))
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"ACCURACY SCORE:\n{accuracy_score(y_test, y_test_pred):.4f}")
print(f"CLASSIFICATION REPORT:\n{clf_report}")
print(f"PRECISION SCORE:\n{precision_score(y_test, y_test_pred):.4f}")
print(f"RECALL SCORE:\n{recall_score(y_test, y_test_pred):.4f}")
print(f"F1 SCORE:\n{f1_score(y_test, y_test_pred):.4f}")
# ROC AUC is a ranking metric: score it on the predicted probabilities,
# not on the thresholded labels, which would reduce it to one operating point.
print(f"AUC SCORE:\n{roc_auc_score(y_test, y_test_prob):.4f}")
print("===============================")

In [None]:
# ANN with PCA

# Seed
tf.keras.utils.set_random_seed(32)

# The input width is declared with an explicit Input object: passing
# `input_shape` to the first Dense layer is deprecated in Keras 3.
model_ann = Sequential([
    tf.keras.Input(shape=(X_train_pca.shape[1],)),
    Dense(64, activation='relu'),     # first hidden layer
    Dropout(0.3),                     # helps prevent overfitting
    Dense(32, activation='relu'),     # second hidden layer
    Dense(1, activation='sigmoid'),   # output layer: binary classification
])

model_ann.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# NOTE(review): the test split is passed as validation data. It is only used for
# per-epoch monitoring here (no callbacks are configured) — confirm this is intended.
history = model_ann.fit(
    X_train_pca, y_train,
    validation_data=(X_test_pca, y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)

In [None]:
# Predict on test set: sigmoid outputs are probabilities, thresholded at 0.5 for the class label
y_test_prob = model_ann.predict(X_test_pca).ravel()
y_test_pred = (y_test_prob > 0.5).astype(int)

# Predict on train set
y_train_prob = model_ann.predict(X_train_pca).ravel()
y_train_pred = (y_train_prob > 0.5).astype(int)

print("TRAINIG RESULTS: \n===============================")
clf_report = pd.DataFrame(classification_report(y_train, y_train_pred, output_dict=True))
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_train, y_train_pred)}")
print(f"ACCURACY SCORE:\n{accuracy_score(y_train, y_train_pred):.4f}")
print(f"CLASSIFICATION REPORT:\n{clf_report}")

print("TESTING RESULTS: \n===============================")
clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict=True))
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"ACCURACY SCORE:\n{accuracy_score(y_test, y_test_pred):.4f}")
print(f"CLASSIFICATION REPORT:\n{clf_report}")
print(f"PRECISION SCORE:\n{precision_score(y_test, y_test_pred):.4f}")
print(f"RECALL SCORE:\n{recall_score(y_test, y_test_pred):.4f}")
print(f"F1 SCORE:\n{f1_score(y_test, y_test_pred):.4f}")
# ROC AUC is a ranking metric: score it on the predicted probabilities,
# not on the thresholded labels, which would reduce it to one operating point.
print(f"AUC SCORE:\n{roc_auc_score(y_test, y_test_prob):.4f}")
print("===============================")

In [None]:
# ANN without SMOTE or PCA

# Set random seeds for reproducibility
tf.keras.utils.set_random_seed(40)

# The input width is declared with an explicit Input object: passing
# `input_shape` to the first Dense layer is deprecated in Keras 3.
model_ann = Sequential([
    tf.keras.Input(shape=(X_train_std.shape[1],)),
    Dense(64, activation='relu'),     # first hidden layer
    Dropout(0.3),                     # prevent overfitting
    Dense(32, activation='relu'),     # second hidden layer
    Dense(1, activation='sigmoid'),   # output layer: binary classification
])

model_ann.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

# NOTE(review): the test split is passed as validation data. It is only used for
# per-epoch monitoring here (no callbacks are configured) — confirm this is intended.
history = model_ann.fit(
    X_train_std, y_train,
    validation_data=(X_test_std, y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)

In [None]:
# Predict on test set: sigmoid outputs are probabilities, thresholded at 0.5 for the class label
y_test_prob = model_ann.predict(X_test_std).ravel()
y_test_pred = (y_test_prob > 0.5).astype(int)

# Predict on train set
y_train_prob = model_ann.predict(X_train_std).ravel()
y_train_pred = (y_train_prob > 0.5).astype(int)

print("TRAINIG RESULTS: \n===============================")
clf_report = pd.DataFrame(classification_report(y_train, y_train_pred, output_dict=True))
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_train, y_train_pred)}")
print(f"ACCURACY SCORE:\n{accuracy_score(y_train, y_train_pred):.4f}")
print(f"CLASSIFICATION REPORT:\n{clf_report}")

print("TESTING RESULTS: \n===============================")
clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict=True))
print(f"CONFUSION MATRIX:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"ACCURACY SCORE:\n{accuracy_score(y_test, y_test_pred):.4f}")
print(f"CLASSIFICATION REPORT:\n{clf_report}")
print(f"PRECISION SCORE:\n{precision_score(y_test, y_test_pred):.4f}")
print(f"RECALL SCORE:\n{recall_score(y_test, y_test_pred):.4f}")
print(f"F1 SCORE:\n{f1_score(y_test, y_test_pred):.4f}")
# ROC AUC is a ranking metric: score it on the predicted probabilities,
# not on the thresholded labels, which would reduce it to one operating point.
print(f"AUC SCORE:\n{roc_auc_score(y_test, y_test_prob):.4f}")
print("===============================")