In [38]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Machine Learning Models and Utilities
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split, GridSearchCV

# Evaluation Metrics
from sklearn.metrics import classification_report, hamming_loss, make_scorer, f1_score

# Association Rule Mining
from mlxtend.frequent_patterns import apriori, association_rules


In [39]:
# ================================
# 1. Data Loading & Preprocessing
# ================================



# Assuming your file is in 'My Drive/data/my_file.csv'
file_path = '../Data DSA3101/customer_data_with_labels_only.csv'


# Load dataset (make sure 'customers_with_clusters.csv' is in your working directory)
data = pd.read_csv(file_path)

In [40]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9429 entries, 0 to 9428
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         9429 non-null   int64 
 1   age                 9429 non-null   int64 
 2   job                 9429 non-null   object
 3   marital             9429 non-null   object
 4   education           9429 non-null   object
 5   credit_default      9429 non-null   int64 
 6   region              9429 non-null   object
 7   income              9429 non-null   int64 
 8   created_at          9429 non-null   object
 9   credit_card         9429 non-null   int64 
 10  personal_loan       9429 non-null   int64 
 11  mortgage            9429 non-null   int64 
 12  savings_account     9429 non-null   int64 
 13  investment_product  9429 non-null   int64 
 14  auto_loan           9429 non-null   int64 
 15  wealth_management   9429 non-null   int64 
 16  cluster             9429

In [41]:
# Drop the customer_id column as it is an identifier
if 'customer_id' in data.columns:
    data.drop('customer_id', axis=1, inplace=True)

# 3. Define Target Product Columns and Feature Columns
# The product columns are from index 9 to 15 as provided.
product_cols = ['credit_card', 'personal_loan', 'mortgage', 'savings_account',
                'investment_product', 'auto_loan', 'wealth_management']

# 4. Preprocess Categorical Features
# Identify categorical columns: job, marital, education, customer_segment, region.
categorical_cols = ['job', 'marital', 'education', 'cluster', 'region']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)



# 4.1 Convert product columns to boolean to avoid the DeprecationWarning in Apriori
data[product_cols] = data[product_cols].astype(bool)

data['days_since_acc_created'] = (pd.Timestamp.now() - pd.to_datetime(data['created_at'])) / pd.Timedelta(days=1)
data.drop('created_at', axis=1, inplace=True)

# Update feature columns after encoding (they are all non-product columns now)
feature_cols = [col for col in data.columns if col not in product_cols]

# Fill missing values in features (if any)
data[feature_cols] = data[feature_cols].fillna(method='ffill')



  data[feature_cols] = data[feature_cols].fillna(method='ffill')


In [42]:
# 5. Apriori Analysis on Product Columns
frequent_itemsets = apriori(data[product_cols], min_support=0.5, use_colnames=True)
frequent_itemsets

# Generate association rules using a confidence threshold (e.g., 0.7)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(personal_loan),(credit_card),0.508219,0.829209,0.988947
1,(credit_card),(savings_account),0.756496,0.902226,1.005447
2,(savings_account),(credit_card),0.756496,0.843045,1.005447
3,(credit_card),(auto_loan),0.589246,0.702757,1.023683
4,(auto_loan),(credit_card),0.589246,0.858335,1.023683
5,(personal_loan),(savings_account),0.550536,0.898252,1.001019
6,(auto_loan),(savings_account),0.617881,0.900046,1.003018
7,"(credit_card, auto_loan)",(savings_account),0.533355,0.905148,1.008703
8,"(credit_card, savings_account)",(auto_loan),0.533355,0.705033,1.026998
9,"(auto_loan, savings_account)",(credit_card),0.533355,0.863199,1.029485


In [43]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.838477,(credit_card)
1,0.612896,(personal_loan)
2,0.512992,(mortgage)
3,0.897338,(savings_account)
4,0.686499,(auto_loan)
5,0.508219,"(credit_card, personal_loan)"
6,0.756496,"(credit_card, savings_account)"
7,0.589246,"(credit_card, auto_loan)"
8,0.550536,"(savings_account, personal_loan)"
9,0.617881,"(savings_account, auto_loan)"


In [44]:
# For feature engineering, we focus on bundles of 2 or more products.
frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].apply(lambda x: len(x) >= 2)]
frequent_itemsets

Unnamed: 0,support,itemsets
5,0.508219,"(credit_card, personal_loan)"
6,0.756496,"(credit_card, savings_account)"
7,0.589246,"(credit_card, auto_loan)"
8,0.550536,"(savings_account, personal_loan)"
9,0.617881,"(savings_account, auto_loan)"
10,0.533355,"(credit_card, auto_loan, savings_account)"


In [45]:
# 6. Create Engineered Bundle Features as Boolean
# For each frequent product bundle, we create a new binary feature.
# This feature is set to True if a customer has all products in the bundle, otherwise False.
for idx, row in frequent_itemsets.iterrows():
    bundle = row['itemsets']
    # Generate a feature name by concatenating product names (sorted for consistency)
    feature_name = 'has_' + '_'.join(sorted(bundle))
    # Create the new feature: check if all products in the bundle are True for each customer.
    data[feature_name] = data[list(bundle)].all(axis=1)

# 7. Display the Data with New Boolean Features
# For demonstration, print the first few rows to see the engineered bundle features.
data.head()


Unnamed: 0,age,credit_default,income,credit_card,personal_loan,mortgage,savings_account,investment_product,auto_loan,wealth_management,...,cluster_2,region_Suburban,region_Urban,days_since_acc_created,has_credit_card_personal_loan,has_credit_card_savings_account,has_auto_loan_credit_card,has_personal_loan_savings_account,has_auto_loan_savings_account,has_auto_loan_credit_card_savings_account
0,38,0,2603,True,True,True,True,True,True,True,...,True,False,True,1644.067838,True,True,True,True,True,True
1,73,0,3130,True,False,False,False,True,True,False,...,True,False,True,1343.444113,False,False,True,False,False,False
2,87,0,2401,True,True,False,True,False,False,False,...,False,False,True,338.954392,True,True,False,True,False,False
3,46,0,3355,True,True,False,True,False,True,True,...,True,False,True,1825.219183,True,True,True,True,True,True
4,70,0,4918,True,True,True,True,False,True,False,...,True,False,True,1750.531113,True,True,True,True,True,True


In [46]:
data.columns

Index(['age', 'credit_default', 'income', 'credit_card', 'personal_loan',
       'mortgage', 'savings_account', 'investment_product', 'auto_loan',
       'wealth_management', 'job_blue-collar', 'job_entrepreneur',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'marital_married',
       'marital_single', 'education_secondary', 'education_tertiary',
       'cluster_1', 'cluster_2', 'region_Suburban', 'region_Urban',
       'days_since_acc_created', 'has_credit_card_personal_loan',
       'has_credit_card_savings_account', 'has_auto_loan_credit_card',
       'has_personal_loan_savings_account', 'has_auto_loan_savings_account',
       'has_auto_loan_credit_card_savings_account'],
      dtype='object')

In [47]:
product_cols.extend(['has_credit_card_personal_loan',
       'has_credit_card_savings_account', 'has_auto_loan_credit_card',
       'has_personal_loan_savings_account', 'has_auto_loan_savings_account',
       'has_auto_loan_credit_card_savings_account'])

In [48]:
# 6. Prepare Data for Multi-label Classification
X = data[feature_cols]
y = data[product_cols]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
# 3. Build the Base Multi-label Model using a Classifier Chain
# We start with a RandomForestClassifier as our base estimator.
base_clf = RandomForestClassifier(random_state=42)

# Initialize the Classifier Chain
chain = ClassifierChain(base_estimator=base_clf, random_state=42)

# 4. Define a Scoring Function for Grid Search
# We use macro F1-score since it gives equal weight to all classes.
scorer = make_scorer(f1_score, average='macro')

# 5. Set Up Parameter Grid for Tuning the RandomForest within the Classifier Chain
# Note: Parameter names for the base estimator are prefixed with 'base_estimator__'
param_grid = {
    'base_estimator__n_estimators': [50, 100, 200],
    'base_estimator__max_depth': [None, 10, 20],
    'base_estimator__min_samples_split': [2, 5, 10]
}

In [50]:
# 6. Perform Grid Search with Cross-Validation
grid = GridSearchCV(chain, param_grid=param_grid, scoring=scorer, cv=3, verbose=2, n_jobs=-1)
grid.fit(X_train, y_train)

# Output the best parameters and best score from grid search
print("Best Parameters:")
print(grid.best_params_)
print("Best Cross-Validated Macro F1 Score:", grid.best_score_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END base_estimator__max_depth=None, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time=   2.3s
[CV] END base_estimator__max_depth=None, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time=   2.3s
[CV] END base_estimator__max_depth=None, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time=   2.4s
[CV] END base_estimator__max_depth=None, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time=   2.5s
[CV] END base_estimator__max_depth=None, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time=   2.2s
[CV] END base_estimator__max_depth=None, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time=   4.6s
[CV] END base_estimator__max_depth=None, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time=   4.7s
[CV] END base_estimator__ma

In [51]:
# 7. Evaluate the Best Model on the Test Set
best_chain = grid.best_estimator_
y_pred = best_chain.predict(X_test)
y_prob = best_chain.predict_proba(X_test)  # if you need probabilities for further adjustment

print("\nFinal Classification Report:")
print(classification_report(y_test, y_pred))
print("Final Hamming Loss:", hamming_loss(y_test, y_pred))


Final Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.99      0.91      1585
           1       0.63      0.88      0.74      1170
           2       0.62      0.72      0.66       972
           3       0.90      1.00      0.95      1698
           4       0.57      0.27      0.36       623
           5       0.70      0.96      0.81      1279
           6       0.60      0.16      0.25       519
           7       0.53      0.86      0.65       976
           8       0.77      0.99      0.87      1440
           9       0.61      0.97      0.75      1091
          10       0.57      0.88      0.69      1063
          11       0.63      0.97      0.76      1148
          12       0.55      0.98      0.70       986

   micro avg       0.67      0.88      0.76     14550
   macro avg       0.66      0.82      0.70     14550
weighted avg       0.68      0.88      0.75     14550
 samples avg       0.67      0.87      0.73     14

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# 3. Build the Base Multi-label Model using a Classifier Chain
# We start with an XGBClassifier as our base estimator.
base_clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Initialize the Classifier Chain
chain = ClassifierChain(base_estimator=base_clf, random_state=42)

# 4. Define a Scoring Function for Grid Search
# We use macro F1-score since it gives equal weight to all classes.
scorer = make_scorer(f1_score, average='macro')

# 5. Set Up Parameter Grid for Tuning the XGBClassifier within the Classifier Chain
# Note: Parameter names for the base estimator are prefixed with 'base_estimator__'
param_grid = {
    'base_estimator__n_estimators': [50, 100, 200],
    'base_estimator__max_depth': [3, 5, 7],
    'base_estimator__learning_rate': [0.01, 0.1, 0.2],
    'base_estimator__subsample': [0.7, 0.9, 1.0],
    'base_estimator__colsample_bytree': [0.7, 0.9, 1.0]
}

# 6. Perform Grid Search with Cross-Validation
grid = GridSearchCV(chain, param_grid=param_grid, scoring=scorer, cv=3, verbose=2, n_jobs=-1)
grid.fit(X_train, y_train)

# Output the best parameters and best score from grid search
print("Best Parameters:")
print(grid.best_params_)
print("Best Cross-Validated Macro F1 Score:", grid.best_score_)

# 7. Evaluate the Best Model on the Test Set
best_chain = grid.best_estimator_
y_pred = best_chain.predict(X_test)
y_prob = best_chain.predict_proba(X_test)  # if you need probabilities for further adjustment

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END base_estimator__colsample_bytree=0.7, base_estimator__learning_rate=0.01, base_estimator__max_depth=3, base_estimator__n_estimators=50, base_estimator__subsample=0.7; total time=   0.3s
[CV] END base_estimator__colsample_bytree=0.7, base_estimator__learning_rate=0.01, base_estimator__max_depth=3, base_estimator__n_estimators=50, base_estimator__subsample=0.7; total time=   0.3s
[CV] END base_estimator__colsample_bytree=0.7, base_estimator__learning_rate=0.01, base_estimator__max_depth=3, base_estimator__n_estimators=50, base_estimator__subsample=0.9; total time=   0.3s
[CV] END base_estimator__colsample_bytree=0.7, base_estimator__learning_rate=0.01, base_estimator__max_depth=3, base_estimator__n_estimators=50, base_estimator__subsample=1.0; total time=   0.3s
[CV] END base_estimator__colsample_bytree=0.7, base_estimator__learning_rate=0.01, base_estimator__max_depth=3, base_estimator__n_estimators=50, base_estimat

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [56]:

print("\nFinal Classification Report:")
print(classification_report(y_test, y_pred))
print("Final Hamming Loss:", hamming_loss(y_test, y_pred))


Final Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.99      0.91      1585
           1       0.63      0.91      0.75      1170
           2       0.62      0.69      0.66       972
           3       0.90      1.00      0.95      1698
           4       0.57      0.26      0.35       623
           5       0.70      0.96      0.81      1279
           6       0.60      0.14      0.23       519
           7       0.53      0.90      0.67       976
           8       0.77      0.99      0.87      1440
           9       0.61      0.97      0.75      1091
          10       0.58      0.91      0.71      1063
          11       0.63      0.97      0.77      1148
          12       0.55      0.98      0.71       986

   micro avg       0.67      0.89      0.77     14550
   macro avg       0.66      0.82      0.70     14550
weighted avg       0.68      0.89      0.75     14550
 samples avg       0.67      0.87      0.73     14

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
