<a href="https://colab.research.google.com/github/cbonnin88/InnovateNext-HR_Project/blob/main/Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import pandas as pd
import polars as pl
import gdown as gd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go

In [25]:
# Fetch the master dataset from Google Drive into the working directory.
# The call is the last expression of the cell, so the returned path is displayed.
url = 'https://drive.google.com/uc?id=1tUdeNd0_9-itaIgdJtK8_FgWlbDkvxyk'
gd.download(url=url, output='master_df.csv', quiet=True)

'master_df.csv'

In [26]:
# Load the downloaded CSV into a Polars DataFrame.
df_final = pl.read_csv(source='master_df.csv')

In [4]:
# Preview the first five rows (5 is the default row count for head()).
df_final.head(n=5)

EmployeeID,FirstName,LastName,Department,Role,HireDate,TerminationDate,Gender,Location,Salary,Bonus,StockOptions,PerformanceRating,Promotion,EngagementScore,ManagerRating,Attrition,TenureInDays
str,str,str,str,str,str,str,str,str,i64,f64,i64,i64,str,i64,i64,i64,i64
"""E1023""","""Claude""","""Renard""","""Engineering""","""Senior Accountant""","""2016-01-24""",,"""Unknown""","""Nantes""",199042,6642.06,1200,3.0,"""No""",3,6,0,3571
"""E1647""","""Joséphine""","""Riou""","""Finance""","""Senior Accountant""","""2019-02-08""",,"""Female""","""Bordeaux""",84362,3984.27,200,3.0,"""No""",9,3,0,2460
"""E1941""","""Stéphane""","""Rocher""","""Sales""","""Analyst""","""2021-06-26""",,"""Unknown""","""Bordeaux""",232090,10044.19,200,4.0,"""No""",8,3,0,1591
"""E1695""","""Alix""","""Chauvin""","""Unknown""","""VP of Sales""","""2016-09-21""",,"""Female""","""Bordeaux""",193920,5380.39,500,,"""Yes""",3,5,0,3330
"""E1584""","""Vincent""","""Gonzalez""","""Engineering""","""Digital Marketing Analyst""","""2022-08-27""","""2023-12-10""","""Female""","""Paris""",178481,7453.43,500,3.0,"""No""",6,10,1,470


In [27]:
# Name of the label column. It is declared here because this cell needs it
# before the feature/target configuration cell further down has run; without
# this line a fresh "Restart & Run All" stops here with a NameError.
TARGET = 'Attrition'

# Check the class balance in the original df_final (Polars DataFrame)
print("Attrition distribution in original df_final:")
display(df_final.group_by(TARGET).len())

# Convert to pandas, which is what the scikit-learn steps below operate on
df_ml = df_final.to_pandas()

# Drop rows where the target is null (if any): a row without a label
# cannot be used for training or evaluation.
df_ml = df_ml.dropna(subset=[TARGET])

# Show the class balance again so any rows lost to the null filter are visible
print("\nAttrition distribution in df_ml after dropping nulls in TARGET:")
display(df_ml[TARGET].value_counts())

# Nulls in feature columns are left in place here; the preprocessing
# pipeline imputes them.
display(df_ml.head())

Attrition distribution in original df_final:


Attrition,len
i64,u32
0,691
1,310



Attrition distribution in df_ml after dropping nulls in TARGET:


Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
0,691
1,310


Unnamed: 0,EmployeeID,FirstName,LastName,Department,Role,HireDate,TerminationDate,Gender,Location,Salary,Bonus,StockOptions,PerformanceRating,Promotion,EngagementScore,ManagerRating,Attrition,TenureInDays
0,E1023,Claude,Renard,Engineering,Senior Accountant,2016-01-24,,Unknown,Nantes,199042,6642.06,1200,3.0,No,3,6,0,3571
1,E1647,Joséphine,Riou,Finance,Senior Accountant,2019-02-08,,Female,Bordeaux,84362,3984.27,200,3.0,No,9,3,0,2460
2,E1941,Stéphane,Rocher,Sales,Analyst,2021-06-26,,Unknown,Bordeaux,232090,10044.19,200,4.0,No,8,3,0,1591
3,E1695,Alix,Chauvin,Unknown,VP of Sales,2016-09-21,,Female,Bordeaux,193920,5380.39,500,,Yes,3,5,0,3330
4,E1584,Vincent,Gonzalez,Engineering,Digital Marketing Analyst,2022-08-27,2023-12-10,Female,Paris,178481,7453.43,500,3.0,No,6,10,1,470


In [28]:
# Columns fed to the model as predictors (numeric first, then categorical)
FEATURES = [
    'TenureInDays',
    'Salary',
    'PerformanceRating',
    'EngagementScore',
    'ManagerRating',
    'Department',
    'Role',
    'Location',
    'Promotion',
    'Gender',
]

# Column the model learns to predict
TARGET = 'Attrition'

In [29]:
# Separate the predictor columns (X) from the label column (y)
X, y = df_ml[FEATURES], df_ml[TARGET]

In [30]:
# Split the data: 80% for training and 20% for testing.
# The target is imbalanced (691 stayed vs 310 left), so the split is stratified
# on y to keep the same class ratio in both partitions; an unstratified split
# lets the test-set class mix drift, which skews the evaluation metrics.
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# **Creating a Preprocessing Pipeline**

In [31]:
# Group the feature columns by type so each group gets its own preprocessing
numeric_features = [
    'TenureInDays',
    'Salary',
    'PerformanceRating',
    'EngagementScore',
    'ManagerRating',
]
categorical_features = [
    'Department',
    'Role',
    'Location',
    'Promotion',
    'Gender',
]

In [32]:
# Numeric columns: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),  # fill missing values
        ('scaler', StandardScaler()),                   # scale the data
    ]
)

In [33]:
# Categorical columns: replace nulls with the literal 'missing', then one-hot encode
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        # handle_unknown='ignore' is set so categories not seen during fit
        # do not raise at transform time
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [34]:
# Route each column group through its own transformer.
# remainder='drop' is the default, spelled out here: columns not listed are discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

In [35]:
# Full pipeline: preprocessing followed by a logistic-regression classifier
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # class_weight='balanced' is used to help with the class imbalance
        ('classifier', LogisticRegression(class_weight='balanced')),
    ]
)

# Fit preprocessing and classifier together on the training partition only
print('Training the scikit-learn model...')
model_pipeline.fit(X_train, y_train)
print('Model training complete')

Training the scikit-learn model...
Model training complete


# **Evaluating the Model**

In [36]:
# Predict attrition labels for the held-out test partition
y_pred = model_pipeline.predict(X=X_test)

In [39]:
# Calculate the confusion matrix (rows = true label, columns = predicted label).
# The class order is pinned to [0, 1] so the result is always a 2x2 matrix in
# the order the heatmap's axis labels assume, even if a class happens to be
# absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

In [40]:
# Human-readable tick labels for the confusion matrix, ordered by class code
axis_labels = [
    'Stayed (0)',
    'Left (1)',
]

In [43]:
# Draw the confusion matrix as an annotated heatmap
fig_cm = px.imshow(
    cm,
    x=axis_labels,
    y=axis_labels,
    labels={'x': 'Predicted Label', 'y': 'True Label', 'color': 'Count'},
    color_continuous_scale='Blues',
    text_auto=True,  # print the count inside each cell
    title='Plotly Confusion Matrix',
)

# Place the predicted-label axis along the bottom edge
fig_cm.update_xaxes(side='bottom')
fig_cm.show()

# **Classification Report**

In [56]:
# Build the classification report as a nested dict rather than a printed string
report_dict = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    output_dict=True,
)

In [57]:
# Accuracy is a single value, not a per-class row: keep it for the table title
# and remove it from the dict so only per-class/average rows remain.
# Note: this mutates report_dict, so re-running this cell alone raises KeyError.
accuracy = report_dict['accuracy']
del report_dict['accuracy']

In [59]:
# Flatten the nested report into one record per class/average row,
# carrying the row name in a 'Class' field ahead of its metrics
report_list = [
    {'Class': class_name, **metrics}
    for class_name, metrics in report_dict.items()
]

# Build the Polars frame, fix the column order, and store 'support' as an integer
df_report = (
    pl.DataFrame(report_list)
    .select(['Class', 'precision', 'recall', 'f1-score', 'support'])
    .with_columns(pl.col('support').cast(pl.Int64))
)

In [62]:
# Render the classification report as a Plotly table.
# Columns are handed to Plotly as plain Python lists, one list per column.
fig_table = go.Figure(data=[go.Table(
    header=dict(
        values=list(df_report.columns),
        fill_color='#004A99',
        align='left',
        font=dict(color='white', size=12),
    ),
    cells=dict(
        values=[df_report[col].to_list() for col in df_report.columns],
        fill_color='#F5F8FF',
        align='left',
        font=dict(size=11),
        # One d3-format spec per column: no format for the text 'Class' column,
        # two decimals for precision/recall/f1-score, and zero decimals for
        # 'support'. The previous '0.f' was not a valid specifier (a precision
        # must be written as '.<digits>'), so it is corrected to '.0f'.
        format=[None] + ['.2f'] * 3 + ['.0f'],
    ),
)])

# Put the overall accuracy in the title
fig_table.update_layout(
    title_text=f'Classification Report (Overall Accuracy: {accuracy:.2%})'
)

fig_table.show()