In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import os
os.chdir('/content/drive/MyDrive/Academics/Visiting Lectures/2026-H1/202601-SDP-AU/Session-11-Introduction-to-Machine-Learning')

### **Data Aquisition**

In [7]:
import pandas as pd
data = pd.read_excel('Data/bank_marketing_data.xlsx')

In [8]:
data.shape

(41188, 21)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [10]:
numeric_cols = data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)

Numeric Columns: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Categorical Columns: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']


## **Encoding Target Variable**

In [16]:
# Map 'yes' to 1 and 'no' to 0 in the 'y' column
data['y'] = data['y'].map({'yes': 1, 'no': 0})

print("Value counts for the encoded 'y' variable:")
display(data['y'].value_counts())

print("\nFirst 5 rows of the DataFrame with encoded 'y' variable:")
display(data.head())

print("\nData types of the 'y' variable after encoding:")
display(data['y'].dtype)

Value counts for the encoded 'y' variable:


Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
0,36548
1,4640



First 5 rows of the DataFrame with encoded 'y' variable:


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0



Data types of the 'y' variable after encoding:


dtype('int64')

## **Feature Engineering with Categorical Features**

In [37]:
import pandas as pd

def perform_feature_engineering(df, categorical_cols):
    """
    Performs one-hot encoding on categorical columns of a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        categorical_cols (list): A list of column names identified as categorical.

    Returns:
        pd.DataFrame: The DataFrame with categorical features one-hot encoded.
    """
    data_processed = df.copy()

    # Identify categorical columns to be one-hot encoded
    # The 'y' column is typically the target variable and often handled separately or after encoding other features
    cols_to_encode = [col for col in categorical_cols if col != 'y']

    print(f"Categorical columns to be one-hot encoded (excluding 'y'): {cols_to_encode}")

    # Perform one-hot encoding using pd.get_dummies
    encoded_features = pd.get_dummies(data_processed[cols_to_encode], dtype=int)

    # Drop the original categorical columns from data_processed
    data_processed = data_processed.drop(columns=cols_to_encode)

    # Join the encoded features back to the data_processed DataFrame
    data_processed = pd.concat([data_processed, encoded_features], axis=1)

    return data_processed

# Apply the function to your data
data_encoded = perform_feature_engineering(data.copy(), categorical_cols)

print("\nDataFrame after One-Hot Encoding and joining (first 5 rows):")
display(data_encoded.head())


Categorical columns to be one-hot encoded (excluding 'y'): ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

DataFrame after One-Hot Encoding and joining (first 5 rows):


Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0


### **Data Splitting for Training and Testing**

Now that our data is prepared with all categorical features one-hot encoded and the target variable converted to numerical form, we can split it into training and testing sets. This is a crucial step in machine learning to evaluate the model's performance on unseen data.

We will use a 70:30 split, meaning 70% of the data will be used for training the model, and the remaining 30% for testing its performance. A `random_state` is set to ensure reproducibility of the split.

In [18]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
# Drop the 'y' column from the features set
X = data_encoded.drop('y', axis=1)
y = data_encoded['y']

print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")

# Split the data into training and testing sets
# test_size=0.30 means 30% of the data will be used for testing
# random_state ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=12345)

print("\nData split successfully!")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

# Display first few rows of training features and target
print("\nFirst 5 rows of X_train:")
display(X_train.head())
print("\nFirst 5 values of y_train:")
display(y_train.head())

Shape of features (X): (41188, 63)
Shape of target (y): (41188,)

Data split successfully!
Shape of X_train: (28831, 63)
Shape of X_test: (12357, 63)
Shape of y_train: (28831,)
Shape of y_test: (12357,)

First 5 rows of X_train:


Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
26378,55,26,6,999,0,-0.1,93.2,-42.0,4.076,5195.8,...,0,0,0,0,1,0,0,0,1,0
38473,50,151,2,6,2,-3.4,92.431,-26.9,0.731,5017.5,...,1,0,0,1,0,0,0,1,0,0
29821,25,137,1,999,1,-1.8,93.075,-47.1,1.405,5099.1,...,0,0,0,1,0,0,0,1,0,0
23788,41,47,13,999,0,1.4,93.444,-36.1,4.963,5228.1,...,0,0,1,0,0,0,0,0,1,0
19696,34,336,3,999,0,1.4,93.444,-36.1,4.968,5228.1,...,0,0,0,0,1,0,0,0,1,0



First 5 values of y_train:


Unnamed: 0,y
26378,0
38473,0
29821,0
23788,0
19696,0


## **Model Training**



In [19]:
from xgboost import XGBClassifier

# Initialize the XGBoost classifier with specified parameters
# objective='binary:logistic' for binary classification
# eval_metric='logloss' for evaluation metric during training
model_xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=12345)

print("XGBoost Classifier initialized successfully:")
print(model_xgb)

XGBoost Classifier initialized successfully:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...)


## **Training the Model on the Training Data**

After evaluating the model with cross-validation and on the test set, it's good practice to retrain the model on the entire training dataset (`X_train` and `y_train`). This ensures that the final model benefits from all available training examples before being potentially deployed or used for new predictions.

In [27]:
print("Retraining the XGBoost model on the full training dataset (X_train, y_train)...")
model_xgb.fit(X_train, y_train)
print("Model retraining complete.")

Retraining the XGBoost model on the full training dataset (X_train, y_train)...
Model retraining complete.


## **Model Evaluation on Training Set**

In [28]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score
import numpy as np

print("Evaluating XGBoost Model on the Training Set...")

# Make predictions on the training set
y_pred_train = model_xgb.predict(X_train)

# Calculate metrics for the training set
cm_train = confusion_matrix(y_train, y_pred_train)
acc_train = accuracy_score(y_train, y_pred_train)
sens_train = recall_score(y_train, y_pred_train, pos_label=1) # Sensitivity is recall of positive class
spec_train = recall_score(y_train, y_pred_train, pos_label=0) # Specificity is recall of negative class
f1_train = f1_score(y_train, y_pred_train)

print("\n--- Training Set Performance ---")
print(f"Confusion Matrix:\n{cm_train}")
print(f"Accuracy: {acc_train:.4f}")
print(f"Sensitivity (Recall): {sens_train:.4f}")
print(f"Specificity: {spec_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")

print("\nModel evaluation on training set complete.")

Evaluating XGBoost Model on the Training Set...

--- Training Set Performance ---
Confusion Matrix:
[[25222   351]
 [  722  2536]]
Accuracy: 0.9628
Sensitivity (Recall): 0.7784
Specificity: 0.9863
F1-Score: 0.8254

Model evaluation on training set complete.


In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score
import numpy as np

print("Evaluating XGBoost Model on the Test Set...")

# Make predictions on the test set
y_pred_test = model_xgb.predict(X_test)

# Calculate metrics for the test set
cm_test = confusion_matrix(y_test, y_pred_test)
acc_test = accuracy_score(y_test, y_pred_test)
sens_test = recall_score(y_test, y_pred_test, pos_label=1) # Sensitivity is recall of positive class
spec_test = recall_score(y_test, y_pred_test, pos_label=0) # Specificity is recall of negative class
f1_test = f1_score(y_test, y_pred_test)

print("\n--- Test Set Performance ---")
print(f"Confusion Matrix:\n{cm_test}")
print(f"Accuracy: {acc_test:.4f}")
print(f"Sensitivity (Recall): {sens_test:.4f}")
print(f"Specificity: {spec_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

print("\nModel evaluation on test set complete.")

Evaluating XGBoost Model on the Test Set...

--- Test Set Performance ---
Confusion Matrix:
[[10780   195]
 [  413   969]]
Accuracy: 0.9508
Sensitivity (Recall): 0.7012
Specificity: 0.9822
F1-Score: 0.7612

Model evaluation on test set complete.


## **Save Trained Model**

Saving the trained model is crucial for future use. This allows you to load the model later without having to retrain it, which is especially useful for deployment or making new predictions. We will save the model using `joblib` in a 'model' directory.

In [30]:
import joblib
import os

# Define the directory to save the model
model_dir = 'Model'
# Define the filename for the saved model
model_filename = 'bank_marketing.joblib'

# Create the directory if it doesn't exist
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    print(f"Created directory: {model_dir}")

# Combine directory and filename for the full path
model_path = os.path.join(model_dir, model_filename)

# Save the model to the specified path
joblib.dump(model_xgb, model_path)

print(f"Model successfully saved to: {model_path}")

Model successfully saved to: Model/bank_marketing.joblib


## **Production**

## **Create a sample data for illustration**


In [32]:
import pandas as pd

# Draw a random sample of 100 records from the DataFrame df
# random_state is set for reproducibility
prod_data = data.sample(n=100, random_state=42)

print("Sampled DataFrame (first 5 rows):")
display(prod_data.head())

print(f"\nShape of the sampled DataFrame: {prod_data.shape}")

Sampled DataFrame (first 5 rows):


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
32884,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,0
3169,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,0
32206,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,0
9403,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,0
14020,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,0



Shape of the sampled DataFrame: (100, 21)


In [33]:
numeric_cols = prod_data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = prod_data.select_dtypes(include=['object']).columns.tolist()

print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)

Numeric Columns: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']
Categorical Columns: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']


## **Deleting 'y' Column from Production Data**

To prepare the `prod_data` for making predictions with the trained model, we need to remove the target variable 'y' from it, as this would typically not be available in a real-world production scenario.

In [34]:
# Delete the 'y' column from prod_data
prod_data_features = prod_data.drop('y', axis=1)

print("prod_data after deleting 'y' column (first 5 rows):")
display(prod_data_features.head())

print(f"\nShape of prod_data after deleting 'y' column: {prod_data_features.shape}")

prod_data after deleting 'y' column (first 5 rows):


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
32884,57,technician,married,high.school,no,no,yes,cellular,may,mon,371,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1
3169,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,285,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0
32206,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,52,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1
9403,36,admin.,married,high.school,no,no,no,telephone,jun,fri,355,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1
14020,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,189,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1



Shape of prod_data after deleting 'y' column: (100, 20)


In [39]:
prod_data_encoded = perform_feature_engineering(prod_data_features, categorical_cols)
prod_data_encoded.shape

Categorical columns to be one-hot encoded (excluding 'y'): ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']


(100, 57)

## **Load Trained Model**

To use the previously saved model for predictions or further analysis, we need to load it back into the environment. We will use `joblib` for this purpose.

In [40]:
import joblib
import os

# Define the directory and filename where the model is saved
model_dir = 'Model'
model_filename = 'bank_marketing.joblib'
model_path = os.path.join(model_dir, model_filename)

# Load the model
loaded_model = joblib.load(model_path)

print(f"Model successfully loaded from: {model_path}")
print("Loaded model type:", type(loaded_model))

Model successfully loaded from: Model/bank_marketing.joblib
Loaded model type: <class 'xgboost.sklearn.XGBClassifier'>


## **Scoring Function**

This function will take the loaded model and the pre-processed production data (`prod_data_encoded`) to generate predictions. The predictions will indicate whether a customer is likely to subscribe to a term deposit (1) or not (0).

In [50]:
def score_data(model, data_to_score):
    """
    Generates predictions using a trained model on new data.

    Args:
        model: The trained machine learning model (e.g., XGBoost classifier).
        data_to_score (pd.DataFrame): The pre-processed DataFrame containing features for scoring.

    Returns:
        np.array: An array of predictions (e.g., 0 or 1 for binary classification).
    """
    print("Generating predictions...")
    predictions = model.predict(data_to_score)
    print("Predictions generated successfully.")
    return predictions

# Ensure prod_data_encoded has the same columns as X_train, and in the same order
# This is crucial for models that expect a fixed set of features, like XGBoost after one-hot encoding
missing_cols = set(X_train.columns) - set(prod_data_encoded.columns)
print(missing_cols)
for c in missing_cols:
    prod_data_encoded[c] = 0

# Reindex the columns to ensure they are in the same order as X_train
prod_data_aligned = prod_data_encoded[X_train.columns]

# Apply the scoring function to the production data
prod_predictions = score_data(loaded_model, prod_data_aligned)

print("\nFirst 10 predictions for the production data:")
print(prod_predictions[:10])

print("\nValue counts of the predictions:")
unique, counts = np.unique(prod_predictions, return_counts=True)
for u, c in zip(unique, counts):
    print(f"Predicted Class {u}: {c} instances")

set()
Generating predictions...
Predictions generated successfully.

First 10 predictions for the production data:
[0 0 0 0 0 0 0 0 0 0]

Value counts of the predictions:
Predicted Class 0: 88 instances
Predicted Class 1: 12 instances
