# Notebook 03: Attrition Prediction Model

This notebook focuses on building machine learning models to predict employee attrition using the preprocessed HRMS, Resumes, Jobs, and Attrition datasets from Notebook 02.

**Objectives:**
1. Load the cleaned datasets.
2. Merge relevant datasets for modeling.
3. Prepare features and target (`attrition_flag`).
4. Encode categorical variables if needed.
5. Split the data into train and test sets.
6. Train machine learning models and evaluate performance.
7. Save the trained model for deployment.


In [3]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: Display settings
pd.set_option('display.max_columns', None)


In [6]:
import pandas as pd

# Path to your processed data
DATA_DIR = r"C:\Users\abanu\Documents\t_iq_hr\data\processed"

# Load cleaned datasets
df_attrition = pd.read_csv(f"{DATA_DIR}/Attrition_cleaned.csv")
df_hrms = pd.read_csv(f"{DATA_DIR}/HRMS_cleaned.csv")
df_resumes = pd.read_csv(f"{DATA_DIR}/Resumes_cleaned.csv")
df_jobs = pd.read_csv(f"{DATA_DIR}/Jobs_cleaned.csv")

# Quick check
print("Attrition:", df_attrition.shape)
print("HRMS:", df_hrms.shape)
print("Resumes:", df_resumes.shape)
print("Jobs:", df_jobs.shape)


Attrition: (1268, 35)
HRMS: (9841, 11)
Resumes: (2481, 11)
Jobs: (10000, 13)


In [7]:
# -------------------------------
# Cell 3: Merge datasets for modeling
# -------------------------------

# Ensure employee_id is numeric for safe merging
for df in [df_attrition, df_hrms, df_resumes]:
    df['employee_id_num'] = pd.to_numeric(df['employee_id'].astype(str).str.extract(r'(\d+)', expand=False), errors='coerce')

# Merge HRMS into Attrition
df_model = df_attrition.merge(df_hrms, left_on='employee_id_num', right_on='employee_id_num', how='left', suffixes=('_attr', '_hrms'))

# Merge Resume features if desired (optional)
df_model = df_model.merge(
    df_resumes[['employee_id_num', 'resume_word_count', 'resume_sent_count']],
    on='employee_id_num',
    how='left'
)

# Quick check
print("Merged dataset shape:", df_model.shape)
df_model.head()


Merged dataset shape: (1269, 49)


Unnamed: 0,age,attrition_flag,business_travel,daily_rate,department_attr,distance_from_home,education,education_field,employee_count,employee_id_attr,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role_attr,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_18,over_time,percent_salary_hike,performance_score,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company_attr,years_in_current_role,years_since_last_promotion,years_with_curr_manager,employee_id_num,employee_id_hrms,name,department_hrms,job_role_hrms,location,current_salary,satisfaction_score,engagement_score,num_skills,years_at_company_hrms,trainings_count,resume_word_count,resume_sent_count
0,41,Yes,Travel_Rarely,1102,2,1,2,Life Sciences,1,1,2,0,94,3,2.0,7,4,2,5993.0,19479,8.0,Y,1,11,3,1,80,0.0,8,0.5,1,6.0,4.0,0,5.0,1,1,133,6,2,1,5180268,0.71,0.93,8,7,4,,
1,49,No,Travel_Frequently,279,1,8,1,Life Sciences,1,2,3,1,61,2,2.0,6,2,1,5130.0,24907,1.0,Y,0,23,3,4,80,1.0,10,3.0,3,10.0,7.0,1,7.0,2,2,389,6,9,1,2589268,0.81,0.56,6,3,3,,
2,37,Yes,Travel_Rarely,1373,1,2,2,Other,1,4,4,1,92,2,1.0,2,3,2,2090.0,2396,6.0,Y,1,15,3,2,80,0.0,7,3.0,3,0.0,0.0,0,0.0,4,4,232,5,5,6,4371479,0.41,0.7,4,7,2,,
3,33,No,Travel_Frequently,1392,1,3,4,Life Sciences,1,5,4,0,56,3,1.0,6,3,1,2909.0,23159,1.0,Y,1,11,3,3,80,0.0,8,3.0,3,8.0,7.0,3,0.0,5,5,397,8,2,1,3452514,0.44,0.87,5,6,0,,
4,27,No,Travel_Rarely,591,1,2,1,Medical,1,7,1,1,40,3,1.0,2,2,1,3468.0,16632,8.5,Y,0,12,3,4,80,1.0,6,3.0,3,2.0,2.0,2,2.0,7,7,286,3,3,0,2820129,0.45,0.92,7,5,3,,


In [8]:
# -------------------------------
# Cell 4: Encode categorical variables
# -------------------------------

from sklearn.preprocessing import LabelEncoder

# Target encoding for attrition
df_model['attrition'] = df_model['attrition_flag'].map({'Yes': 1, 'No': 0})

# Identify categorical columns for modeling
cat_cols = [
    'business_travel', 'department_hrms', 'job_role_hrms', 'location',
    'education_field', 'marital_status', 'gender', 'over_time'
]

le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df_model[col] = df_model[col].astype(str)
    df_model[col] = le.fit_transform(df_model[col])
    le_dict[col] = le  # store for inverse_transform if needed

# Drop unnecessary columns
df_model.drop(['attrition_flag', 'employee_id_attr', 'employee_id_hrms', 'employee_id'], axis=1, inplace=True, errors='ignore')

print("Categorical columns encoded successfully.")
df_model.head()


Categorical columns encoded successfully.


Unnamed: 0,age,business_travel,daily_rate,department_attr,distance_from_home,education,education_field,employee_count,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role_attr,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_18,over_time,percent_salary_hike,performance_score,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company_attr,years_in_current_role,years_since_last_promotion,years_with_curr_manager,employee_id_num,name,department_hrms,job_role_hrms,location,current_salary,satisfaction_score,engagement_score,num_skills,years_at_company_hrms,trainings_count,resume_word_count,resume_sent_count,attrition
0,41,2,1102,2,1,2,1,1,2,0,94,3,2.0,7,4,2,5993.0,19479,8.0,Y,1,11,3,1,80,0.0,8,0.5,1,6.0,4.0,0,5.0,1,133,6,4,1,5180268,0.71,0.93,8,7,4,,,1
1,49,1,279,1,8,1,1,1,3,1,61,2,2.0,6,2,1,5130.0,24907,1.0,Y,0,23,3,4,80,1.0,10,3.0,3,10.0,7.0,1,7.0,2,389,6,11,1,2589268,0.81,0.56,6,3,3,,,0
2,37,2,1373,1,2,2,4,1,4,1,92,2,1.0,2,3,2,2090.0,2396,6.0,Y,1,15,3,2,80,0.0,7,3.0,3,0.0,0.0,0,0.0,4,232,5,7,6,4371479,0.41,0.7,4,7,2,,,1
3,33,1,1392,1,3,4,1,1,4,0,56,3,1.0,6,3,1,2909.0,23159,1.0,Y,1,11,3,3,80,0.0,8,3.0,3,8.0,7.0,3,0.0,5,397,8,4,1,3452514,0.44,0.87,5,6,0,,,0
4,27,2,591,1,2,1,3,1,1,1,40,3,1.0,2,2,1,3468.0,16632,8.5,Y,0,12,3,4,80,1.0,6,3.0,3,2.0,2.0,2,2.0,7,286,3,5,0,2820129,0.45,0.92,7,5,3,,,0


In [9]:
# -------------------------------
# Cell 5: Train/Test Split & Scaling
# -------------------------------

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features and target
X = df_model.drop('attrition', axis=1)
y = df_model['attrition']

# Train/Test split (stratify to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Identify numeric columns for scaling
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Scale numeric features
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Train/Test split and numeric feature scaling completed.")
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")


Train/Test split and numeric feature scaling completed.
X_train shape: (1015, 46), X_test shape: (254, 46)


In [11]:
# Convert target to numeric: 1 for Yes, 0 for No
y_train = y_train.map({'Y': 1, 'N': 0})
y_test = y_test.map({'Y': 1, 'N': 0})


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [13]:
# Logistic Regression (handle imbalance with class_weight)
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

# Random Forest
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)


In [14]:
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)


ValueError: could not convert string to float: 'Y'

In [15]:
y_train = y_train.map({'Yes': 1, 'No': 0})
y_test = y_test.map({'Yes': 1, 'No': 0})


In [17]:
from sklearn.preprocessing import LabelEncoder

# Encode target column: 'Y' -> 1, 'N' -> 0
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)  # use transform to keep consistency


In [19]:
# Identify non-numeric columns
non_numeric_cols = X_train.select_dtypes(include='object').columns
print("Non-numeric columns in X_train:", list(non_numeric_cols))


Non-numeric columns in X_train: ['over_18']


In [20]:
# Encode 'over_18' as 0/1
X_train['over_18'] = X_train['over_18'].map({'Y': 1, 'N': 0})
X_test['over_18'] = X_test['over_18'].map({'Y': 1, 'N': 0})


In [21]:
print(X_train.dtypes)


age                           float64
business_travel               float64
daily_rate                    float64
department_attr               float64
distance_from_home            float64
education                     float64
education_field               float64
employee_count                float64
environment_satisfaction      float64
gender                        float64
hourly_rate                   float64
job_involvement               float64
job_level                     float64
job_role_attr                 float64
job_satisfaction              float64
marital_status                float64
monthly_income                float64
monthly_rate                  float64
num_companies_worked          float64
over_18                         int64
over_time                     float64
percent_salary_hike           float64
performance_score             float64
relationship_satisfaction     float64
standard_hours                float64
stock_option_level            float64
total_workin

In [25]:
import numpy as np

unique_vals, counts = np.unique(y_train, return_counts=True)
print("y_train unique values:", unique_vals)
print("y_train counts:", dict(zip(unique_vals, counts)))


y_train unique values: [0]
y_train counts: {np.int64(0): np.int64(1015)}


In [26]:
# Check the target column in the full dataset
print(df_model['attrition'].value_counts())


attrition
0    1049
1     220
Name: count, dtype: int64


In [28]:
# 1️⃣ Check for NaNs in target
print("NaNs in attrition:", df_model['attrition'].isna().sum())

# 2️⃣ Drop rows with missing target
df_model = df_model.dropna(subset=['attrition'])

# 3️⃣ Ensure target is numeric
df_model['attrition'] = df_model['attrition'].map({'No': 0, 'Yes': 1})

# 4️⃣ Split features and target
X = df_model.drop('attrition', axis=1)
y = df_model['attrition']

# 5️⃣ Train/Test split with stratification
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6️⃣ Verify
print("y_train unique values:", np.unique(y_train))
print("y_train counts:\n", y_train.value_counts())


NaNs in attrition: 1269


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [29]:
# 1️⃣ List columns
print("Columns in df_model:", df_model.columns.tolist())

# 2️⃣ Check unique values in target column (adjust name if needed)
target_col = 'attrition'  # or 'attrition_flag'?
print("Unique values in target:", df_model[target_col].unique())

# 3️⃣ Count NaNs
print("Missing in target:", df_model[target_col].isna().sum())

# 4️⃣ Count rows
print("Total rows in df_model:", df_model.shape[0])


Columns in df_model: ['age', 'business_travel', 'daily_rate', 'department_attr', 'distance_from_home', 'education', 'education_field', 'employee_count', 'environment_satisfaction', 'gender', 'hourly_rate', 'job_involvement', 'job_level', 'job_role_attr', 'job_satisfaction', 'marital_status', 'monthly_income', 'monthly_rate', 'num_companies_worked', 'over_18', 'over_time', 'percent_salary_hike', 'performance_score', 'relationship_satisfaction', 'standard_hours', 'stock_option_level', 'total_working_years', 'training_times_last_year', 'work_life_balance', 'years_at_company_attr', 'years_in_current_role', 'years_since_last_promotion', 'years_with_curr_manager', 'employee_id_num', 'name', 'department_hrms', 'job_role_hrms', 'location', 'current_salary', 'satisfaction_score', 'engagement_score', 'num_skills', 'years_at_company_hrms', 'trainings_count', 'resume_word_count', 'resume_sent_count', 'attrition']
Unique values in target: []
Missing in target: 0
Total rows in df_model: 0


In [30]:
# Ensure employee_id columns are numeric
df_attrition['employee_id_num'] = pd.to_numeric(
    df_attrition['employee_id'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce'
)
df_hrms['employee_id_num'] = pd.to_numeric(
    df_hrms['employee_id'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce'
)
df_resumes['employee_id_num'] = pd.to_numeric(
    df_resumes['employee_id'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce'
)

# Merge attrition and HRMS on employee_id_num
df_model = df_attrition.merge(df_hrms, on='employee_id_num', how='inner')

print("Rows after merging Attrition & HRMS:", df_model.shape[0])

# Optional: merge resumes if desired
df_model = df_model.merge(
    df_resumes[['employee_id_num', 'resume_word_count', 'resume_sent_count']],
    on='employee_id_num',
    how='left'
)

print("Rows after adding resumes:", df_model.shape[0])


Rows after merging Attrition & HRMS: 1268
Rows after adding resumes: 1269


In [31]:
# Map 'Yes' → 1, 'No' → 0
df_model['attrition_flag'] = df_model['attrition_flag'].map({'Yes': 1, 'No': 0})

# Check
df_model['attrition_flag'].value_counts()


attrition_flag
0    1049
1     220
Name: count, dtype: int64

In [32]:
# Drop non-numeric / identifier columns
X = df_model.drop(columns=[
    'attrition_flag', 'employee_id', 'employee_id_num', 'name',
    'department_hrms', 'job_role_hrms', 'location', 'attrition'
], errors='ignore')

y = df_model['attrition_flag']


In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train value counts:\n", y_train.value_counts())


X_train shape: (1015, 45)
X_test shape: (254, 45)
y_train value counts:
 attrition_flag
0    839
1    176
Name: count, dtype: int64


In [34]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'Travel_Rarely'

In [35]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

print("Numeric columns:", list(numeric_cols))
print("Categorical columns:", list(categorical_cols))


Numeric columns: ['age', 'daily_rate', 'department_x', 'distance_from_home', 'education', 'employee_count', 'employee_id_x', 'environment_satisfaction', 'gender', 'hourly_rate', 'job_involvement', 'job_level', 'job_role_x', 'job_satisfaction', 'marital_status', 'monthly_income', 'monthly_rate', 'num_companies_worked', 'over_time', 'percent_salary_hike', 'performance_score', 'relationship_satisfaction', 'standard_hours', 'stock_option_level', 'total_working_years', 'training_times_last_year', 'work_life_balance', 'years_at_company_x', 'years_in_current_role', 'years_since_last_promotion', 'years_with_curr_manager', 'employee_id_y', 'department_y', 'job_role_y', 'current_salary', 'satisfaction_score', 'engagement_score', 'num_skills', 'years_at_company_y', 'trainings_count', 'resume_word_count', 'resume_sent_count']
Categorical columns: ['business_travel', 'education_field', 'over_18']


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric pipeline: median imputation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Categorical pipeline: most frequent imputation + one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])


In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [38]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)

lr.fit(X_train_processed, y_train)
rf.fit(X_train_processed, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",200
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [40]:
lr = LogisticRegression(max_iter=5000, class_weight='balanced', random_state=42)
lr.fit(X_train_processed, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [41]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)


In [42]:
lr = LogisticRegression(solver='saga', max_iter=5000, class_weight='balanced', random_state=42)
lr.fit(X_train_scaled, y_train)


0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'saga'


In [43]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(solver='saga', max_iter=5000, class_weight='balanced', random_state=42))
])

pipeline.fit(X_train_processed, y_train)


0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('imputer', ...), ('scaler', ...), ...]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"missing_values  missing_values: int, float, str, np.nan, None or pandas.NA, default=np.nan The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For pandas' dataframes with nullable integer dtypes with missing values, `missing_values` can be set to either `np.nan` or `pd.NA`.",
,"strategy  strategy: str or Callable, default='mean' The imputation strategy. - If ""mean"", then replace missing values using the mean along  each column. Can only be used with numeric data. - If ""median"", then replace missing values using the median along  each column. Can only be used with numeric data. - If ""most_frequent"", then replace missing using the most frequent  value along each column. Can be used with strings or numeric data.  If there is more than one such value, only the smallest is returned. - If ""constant"", then replace missing values with fill_value. Can be  used with strings or numeric data. - If an instance of Callable, then replace missing values using the  scalar statistic returned by running the callable over a dense 1d  array containing non-missing values of each column. .. versionadded:: 0.20  strategy=""constant"" for fixed value imputation. .. versionadded:: 1.5  strategy=callable for custom value imputation.",'median'
,"fill_value  fill_value: str or numerical value, default=None When strategy == ""constant"", `fill_value` is used to replace all occurrences of missing_values. For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when imputing numerical data and ""missing_value"" for strings or object data types.",
,"copy  copy: bool, default=True If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, in the following cases, a new copy will always be made, even if `copy=False`: - If `X` is not an array of floating values; - If `X` is encoded as a CSR matrix; - If `add_indicator=True`.",True
,"add_indicator  add_indicator: bool, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no missing values at fit/train time, the feature won't appear on the missing indicator even if there are missing values at transform/test time.",False
,"keep_empty_features  keep_empty_features: bool, default=False If True, features that consist exclusively of missing values when `fit` is called are returned in results when `transform` is called. The imputed value is always `0` except when `strategy=""constant""` in which case `fill_value` will be used instead. .. versionadded:: 1.2",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'saga'


In [44]:
# Predictions on test set
y_pred = pipeline.predict(X_test_processed)

# Probabilities if needed
y_prob = pipeline.predict_proba(X_test_processed)[:, 1]


In [45]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7244094488188977
Precision: 0.32894736842105265
Recall: 0.5681818181818182
F1 Score: 0.4166666666666667
ROC AUC: 0.7608225108225107

Confusion Matrix:
 [[159  51]
 [ 19  25]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.76      0.82       210
           1       0.33      0.57      0.42        44

    accuracy                           0.72       254
   macro avg       0.61      0.66      0.62       254
weighted avg       0.80      0.72      0.75       254



In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'class_weight': ['balanced', 'balanced_subsample']
}

rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rf, param_grid, scoring='f1', cv=5)
grid.fit(X_train_processed, y_train)

best_rf = grid.best_estimator_
y_pred_rf = best_rf.predict(X_test_processed)


In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Predictions already done
# y_pred_rf = best_rf.predict(X_test_processed)

# 1. Basic metrics
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)
roc_auc = roc_auc_score(y_test, best_rf.predict_proba(X_test_processed)[:,1])

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

# 2. Confusion matrix
cm = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix:\n", cm)

# 3. Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))


Accuracy: 0.823
Precision: 0.455
Recall: 0.114
F1 Score: 0.182
ROC AUC: 0.716

Confusion Matrix:
 [[204   6]
 [ 39   5]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.97      0.90       210
           1       0.45      0.11      0.18        44

    accuracy                           0.82       254
   macro avg       0.65      0.54      0.54       254
weighted avg       0.77      0.82      0.78       254



🚨 Your Recall for class 1 (Attrition) is extremely low: 0.11

This means:

Out of 44 people who actually left,
your model correctly identified only 5.

This makes the model almost unusable for HR attrition prediction.

HR teams care more about catching who may leave than about overall accuracy.

Even though Accuracy = 0.823, it is misleading because the dataset is imbalanced.

📌 Why Accuracy Doesn’t Matter Here

Your confusion matrix:

[[204   6]   → predicted stay (correct 97%)
 [ 39   5]]  → predicted leave (only 11% recall)


The model is basically predicting “Not Leaving” for almost everyone.

This is not good enough for a real HR solution.

In [1]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)


ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (c:\Users\abanu\Documents\t_iq_hr\env\Lib\site-packages\sklearn\utils\_param_validation.py)

In [2]:
# run in the SAME kernel that gave the ImportError
import sys, pkgutil
print("Python:", sys.version.splitlines()[0])

try:
    import sklearn
    print("scikit-learn:", sklearn.__version__)
except Exception as e:
    print("scikit-learn import failed:", repr(e))

try:
    import imblearn
    print("imbalanced-learn:", imblearn.__version__)
    # helpful detailed diagnostics from imblearn
    try:
        imblearn.show_versions()
    except Exception as e:
        print("imblearn.show_versions() failed:", repr(e))
except Exception as e:
    print("imbalanced-learn import failed:", repr(e))

# list the install locations for both packages
import importlib, inspect
for name in ("sklearn", "imblearn"):
    try:
        m = importlib.import_module(name)
        print(f"{name} location:", inspect.getsourcefile(m) or m.__file__)
    except Exception as e:
        print(f"{name} location: failed ->", repr(e))


Python: 3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
scikit-learn: 1.8.0
imbalanced-learn import failed: ImportError("cannot import name '_is_pandas_df' from 'sklearn.utils.validation' (c:\\Users\\abanu\\Documents\\t_iq_hr\\env\\Lib\\site-packages\\sklearn\\utils\\validation.py)")
sklearn location: c:\Users\abanu\Documents\t_iq_hr\env\Lib\site-packages\sklearn\__init__.py
imblearn location: failed -> ImportError("cannot import name '_is_pandas_df' from 'sklearn.utils.validation' (c:\\Users\\abanu\\Documents\\t_iq_hr\\env\\Lib\\site-packages\\sklearn\\utils\\validation.py)")


In [1]:
import sys
import sklearn, imblearn
print("Python:", sys.version.splitlines()[0])
print("scikit-learn:", sklearn.__version__)
print("imbalanced-learn:", imblearn.__version__)

# quick import test
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
print("SMOTE + imblearn Pipeline imported OK")


ImportError: cannot import name '_safe_tags' from 'sklearn.utils._tags' (c:\Users\abanu\Documents\t_iq_hr\env\Lib\site-packages\sklearn\utils\_tags.py)

In [1]:
# Run this in the notebook kernel (after restarting it)
import sys, inspect
print("Python:", sys.version.splitlines()[0])

try:
    import sklearn
    print("scikit-learn:", sklearn.__version__, "at", sklearn.__file__)
except Exception as e:
    print("scikit-learn import failed:", repr(e))

try:
    import imblearn
    print("imbalanced-learn:", imblearn.__version__, "at", imblearn.__file__)
except Exception as e:
    print("imblearn import failed:", repr(e))

# final import test
try:
    from imblearn.over_sampling import SMOTE
    print("SMOTE imported OK")
except Exception as e:
    print("SMOTE import failed:", repr(e))


Python: 3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
scikit-learn: 1.7.2 at c:\Users\abanu\Documents\t_iq_hr\env\Lib\site-packages\sklearn\__init__.py
imblearn import failed: ImportError("cannot import name '_safe_tags' from 'sklearn.utils._tags' (c:\\Users\\abanu\\Documents\\t_iq_hr\\env\\Lib\\site-packages\\sklearn\\utils\\_tags.py)")
SMOTE imported OK


In [1]:
import sklearn, imblearn
from imblearn.over_sampling import SMOTE

print("scikit-learn:", sklearn.__version__)
print("imbalanced-learn:", imblearn.__version__)
print("SMOTE imported OK")


scikit-learn: 1.5.2
imbalanced-learn: 0.14.0
SMOTE imported OK


In [12]:
# -------------------------------
# 1️⃣ Load cleaned dataset
# -------------------------------
import pandas as pd

df = pd.read_csv(r"C:\Users\abanu\Documents\t_iq_hr\data\processed\Attrition_cleaned.csv")
df.head()

Unnamed: 0,age,attrition_flag,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_count,employee_id,...,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager
0,41,Yes,Travel_Rarely,1102,2,1,2,Life Sciences,1,1,...,1,80,0.0,8,0.5,1,6.0,4.0,0,5.0
1,49,No,Travel_Frequently,279,1,8,1,Life Sciences,1,2,...,4,80,1.0,10,3.0,3,10.0,7.0,1,7.0
2,37,Yes,Travel_Rarely,1373,1,2,2,Other,1,4,...,2,80,0.0,7,3.0,3,0.0,0.0,0,0.0
3,33,No,Travel_Frequently,1392,1,3,4,Life Sciences,1,5,...,3,80,0.0,8,3.0,3,8.0,7.0,3,0.0
4,27,No,Travel_Rarely,591,1,2,1,Medical,1,7,...,4,80,1.0,6,3.0,3,2.0,2.0,2,2.0


In [13]:
# -------------------------------
# 2️⃣ Define features and target
# -------------------------------
target_col = 'attrition_flag'   # actual target column
y = df[target_col]
X = df.drop(columns=[target_col])


In [14]:
# -------------------------------
# 3️⃣ Identify categorical and numeric columns
# -------------------------------
cat_cols = ['business_travel', 'education_field', 'over_18']
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [15]:
# -------------------------------
# 4️⃣ Preprocessing: OneHotEncode + StandardScaler
# -------------------------------
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

In [16]:
# -------------------------------
# 5️⃣ Train-test split
# -------------------------------
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [17]:
# -------------------------------
# 6️⃣ SMOTE + Pipeline + Random Forest
# -------------------------------
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

smote = SMOTE(random_state=42)

pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('rf', RandomForestClassifier(random_state=42))
])

param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [5, 10, None],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


 nan nan nan nan nan nan]


In [18]:
# run in the same notebook
print("Missing values per column:")
print(df.isnull().sum()[lambda s: s>0])


Missing values per column:
Series([], dtype: int64)


In [21]:
import pandas as pd
p = r"C:\Users\abanu\Documents\t_iq_hr\data\processed\Attrition_cleaned.csv"
df = pd.read_csv(p)
print("Loaded:", p)
print("Shape:", df.shape)
print("Columns:", list(df.columns))
df.head()


Loaded: C:\Users\abanu\Documents\t_iq_hr\data\processed\Attrition_cleaned.csv
Shape: (1268, 35)
Columns: ['age', 'attrition_flag', 'business_travel', 'daily_rate', 'department', 'distance_from_home', 'education', 'education_field', 'employee_count', 'employee_id', 'environment_satisfaction', 'gender', 'hourly_rate', 'job_involvement', 'job_level', 'job_role', 'job_satisfaction', 'marital_status', 'monthly_income', 'monthly_rate', 'num_companies_worked', 'over_18', 'over_time', 'percent_salary_hike', 'performance_score', 'relationship_satisfaction', 'standard_hours', 'stock_option_level', 'total_working_years', 'training_times_last_year', 'work_life_balance', 'years_at_company', 'years_in_current_role', 'years_since_last_promotion', 'years_with_curr_manager']


Unnamed: 0,age,attrition_flag,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_count,employee_id,...,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager
0,41,Yes,Travel_Rarely,1102,2,1,2,Life Sciences,1,1,...,1,80,0.0,8,0.5,1,6.0,4.0,0,5.0
1,49,No,Travel_Frequently,279,1,8,1,Life Sciences,1,2,...,4,80,1.0,10,3.0,3,10.0,7.0,1,7.0
2,37,Yes,Travel_Rarely,1373,1,2,2,Other,1,4,...,2,80,0.0,7,3.0,3,0.0,0.0,0,0.0
3,33,No,Travel_Frequently,1392,1,3,4,Life Sciences,1,5,...,3,80,0.0,8,3.0,3,8.0,7.0,3,0.0
4,27,No,Travel_Rarely,591,1,2,1,Medical,1,7,...,4,80,1.0,6,3.0,3,2.0,2.0,2,2.0


In [22]:
print("Unique target values BEFORE:", df['attrition_flag'].unique())

# Convert Yes/No → 1/0
df['attrition_flag'] = df['attrition_flag'].map({'No': 0, 'Yes': 1})

print("Unique target values AFTER:", df['attrition_flag'].unique())
print("Target distribution:\n", df['attrition_flag'].value_counts())


Unique target values BEFORE: ['Yes' 'No']
Unique target values AFTER: [1 0]
Target distribution:
 attrition_flag
0    1048
1     220
Name: count, dtype: int64


In [23]:
# 1. Split features and target
X = df.drop('attrition_flag', axis=1)
y = df['attrition_flag']

print("Shapes:", X.shape, y.shape)
print("Target distribution:\n", y.value_counts())


Shapes: (1268, 34) (1268,)
Target distribution:
 attrition_flag
0    1048
1     220
Name: count, dtype: int64


In [24]:
from sklearn.preprocessing import LabelEncoder

# 2. Encode categorical columns
label_encoders = {}
X_encoded = X.copy()

for col in X_encoded.columns:
    if X_encoded[col].dtype == 'object':       # categorical columns
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col])
        label_encoders[col] = le

print("Categorical columns encoded:", list(label_encoders.keys()))
print("\nEncoded X shape:", X_encoded.shape)


Categorical columns encoded: ['business_travel', 'education_field', 'over_18']

Encoded X shape: (1268, 34)


In [25]:
from sklearn.preprocessing import StandardScaler

# 3. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

print("Scaled X shape:", X_scaled.shape)
print("Example row:", X_scaled[0][:10])   # first 10 columns


Scaled X shape: (1268, 34)
Example row: [ 0.74684518  0.59509815  0.74118886  1.39049071 -1.0285245  -0.87993422
 -0.94316005  0.         -1.70307955 -0.65875578]


In [26]:
from imblearn.over_sampling import SMOTE

# 4. Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

print("Before SMOTE:", X_scaled.shape, y.shape)
print("After SMOTE:", X_resampled.shape, y_resampled.shape)

# Check new class balance
import pandas as pd
print("\nResampled target distribution:")
print(pd.Series(y_resampled).value_counts())


Before SMOTE: (1268, 34) (1268,)
After SMOTE: (2096, 34) (2096,)

Resampled target distribution:
attrition_flag
1    1048
0    1048
Name: count, dtype: int64


In [27]:
from sklearn.model_selection import train_test_split

# 5. Train-test split AFTER SMOTE
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


Train shape: (1676, 34) (1676,)
Test shape: (420, 34) (420,)


In [28]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [29]:
# -----------------------------
# 1. Train XGBoost
# -----------------------------
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

xgb.fit(X_train, y_train)


In [30]:
# -----------------------------
# 2. Predict
# -----------------------------
y_pred = xgb.predict(X_test)


In [31]:
# -----------------------------
# 3. Metrics
# -----------------------------
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy : 0.9214285714285714
Precision: 0.9633507853403142
Recall   : 0.8761904761904762
F1 Score : 0.9177057356608479

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.97      0.92       210
           1       0.96      0.88      0.92       210

    accuracy                           0.92       420
   macro avg       0.92      0.92      0.92       420
weighted avg       0.92      0.92      0.92       420



In [34]:
with open("models/attrition_model.pkl", "wb") as f:
    pickle.dump(pipeline, f)   # <- use your trained model variable here


In [35]:
import os
import pickle

# Ensure models folder exists
os.makedirs("models", exist_ok=True)

# Save preprocessor
with open("models/preprocessor.pkl", "wb") as f:
    pickle.dump(preprocessor, f)
print("✅ Preprocessing pipeline saved")

# Save trained model/pipeline
with open("models/attrition_model.pkl", "wb") as f:
    pickle.dump(pipeline, f)   # replace 'pipeline' with your trained model variable if different
print("✅ Trained model saved")


✅ Preprocessing pipeline saved
✅ Trained model saved


In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Users\abanu\Documents\t_iq_hr\data\processed\HRMS_cleaned.csv")
df.head()


Unnamed: 0,employee_id,name,department,job_role,location,current_salary,satisfaction_score,engagement_score,num_skills,years_at_company,trainings_count
0,EMP000001,Vikram Singh,HR,Data Scientist,"New York, USA",4544478,0.78,0.8,7,12,0
1,EMP000002,Karan Patel,Marketing,Data Scientist,"Chennai, India",5180268,0.71,0.93,8,7,4
2,EMP000003,Vikram Malhotra,Marketing,Senior Software Engineer,"Chennai, India",2589268,0.81,0.56,6,3,3
3,EMP000004,Siddharth Khan,HR,ML Engineer,"Bengaluru, India",1321856,0.43,0.95,7,15,3
4,EMP000005,Priya Nair,Legal,ML Engineer,Remote,4371479,0.41,0.7,4,7,2


In [3]:
# Step 2: Prepare features and target

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Create synthetic attrition_flag (if not already present)
# 0 = stayed, 1 = left
if 'attrition_flag' not in df.columns:
    df['attrition_flag'] = np.random.choice([0, 1], size=len(df))

# 2. Prepare features (drop ID columns and target)
X = df.drop(columns=['employee_id', 'name', 'attrition_flag'])
y = df['attrition_flag']

# 3. Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Check shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train distribution:\n", y_train.value_counts())


X_train shape: (8000, 9)
X_test shape: (2000, 9)
y_train distribution:
 attrition_flag
1    4003
0    3997
Name: count, dtype: int64


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pickle

# 1. Identify numerical and categorical columns
numeric_features = ['current_salary', 'satisfaction_score', 'engagement_score', 
                    'num_skills', 'years_at_company', 'trainings_count']
categorical_features = ['department', 'job_role', 'location']

# 2. Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# 3. Fit preprocessor on training data
preprocessor.fit(X_train)

# 4. Save preprocessor
with open(r"C:\Users\abanu\Documents\t_iq_hr\notebooks\models\preprocessor.pkl", 'wb') as f:
    pickle.dump(preprocessor, f)

print("Preprocessor fitted and saved successfully!")


Preprocessor fitted and saved successfully!


In [5]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pickle

# Load preprocessor
with open(r"C:\Users\abanu\Documents\t_iq_hr\notebooks\models\preprocessor.pkl", 'rb') as f:
    preprocessor = pickle.load(f)

# Create pipeline
attrition_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

# Fit model
attrition_pipeline.fit(X_train, y_train)

# Save model
with open(r"C:\Users\abanu\Documents\t_iq_hr\notebooks\models\attrition_model.pkl", 'wb') as f:
    pickle.dump(attrition_pipeline, f)

print("Attrition model trained and saved successfully!")


Attrition model trained and saved successfully!


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [7]:
import pickle
import pandas as pd

BASE_PATH = r"C:\Users\abanu\Documents\t_iq_hr"

# Load preprocessor
with open(BASE_PATH + r"\notebooks\models\preprocessor.pkl", "rb") as f:
    preprocessor = pickle.load(f)

# Load attrition model
with open(BASE_PATH + r"\notebooks\models\attrition_model.pkl", "rb") as f:
    attrition_model = pickle.load(f)

print("✅ Preprocessor and Attrition model loaded successfully")


✅ Preprocessor and Attrition model loaded successfully


In [8]:
print(type(preprocessor))
print(type(attrition_model))


<class 'sklearn.compose._column_transformer.ColumnTransformer'>
<class 'sklearn.pipeline.Pipeline'>
