# CA1 - Machine learning
Develop a classifier to predict the outcome of a bank's marketing campaign using the provided dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler

## Data import, exploration & preprocessing
This section covers exploring and preprocessing the training set of the bank marketing campaign data. The first step looks for missing values, then explores the value counts of some of the input features, as well as the distribution of values for features of integer type. Lastly, I preprocess the input features and target variable appropriately for model training.

In [None]:
# read the training set from disk and preview its first rows
bank_data = pd.read_csv(filepath_or_buffer="data/trainingset.txt")
bank_data.head(n=5)

In [None]:
# report the dataset dimensions (DataFrame.shape is (rows, columns))
n_rows, n_cols = bank_data.shape
print("Number of columns: ", n_cols)
print("Number of rows: ", n_rows)

In [None]:
# check data type of columns
# (the label is printed first; the dtypes Series is the cell's displayed output)
print("Data types of columns:")
bank_data.dtypes

In [None]:
# check missing values: per-column count of NaN entries
bank_data.isna().sum()

In [None]:
# tally how often the placeholder value appears in each column
# (columns that never contain it report 0)
val_to_count = "unknown"
unknown_counts = {
    col_name: bank_data[col_name].value_counts().get(val_to_count, 0)
    for col_name in bank_data.columns
}

print("Value counts of '", val_to_count, "' across all columns: ")
for col, count in unknown_counts.items():
    print(col + ":", count)

'Unknown' is a value present in certain feature columns, and can be interpreted as NaN. I decided to count its occurrence across all columns to see if it is statistically significant. It is significant in the 'poutcome' feature, where it constitutes 80% of total values. It is also significant in 'contact', so we can't remove it. We can, however, remove 'unknown' values in the education and job columns, as they constitute 4.3% & 0.65% of total values respectively.

In [None]:
# drop the rows whose education is 'unknown', then those whose job is 'unknown'
rows_unknown_education = bank_data.index[bank_data["education"] == "unknown"]
bank_data.drop(rows_unknown_education, inplace=True)

rows_unknown_job = bank_data.index[bank_data["job"] == "unknown"]
bank_data.drop(rows_unknown_job, inplace=True)

# confirm 'unknown' no longer appears in either column
education_counts = bank_data["education"].value_counts()
job_counts = bank_data["job"].value_counts()
print("Value counts: ", education_counts,
      "\n\nValue counts: ", job_counts)

In [None]:
# inspect how many 'unknown' values remain in contact and poutcome
contact_counts = bank_data["contact"].value_counts()
poutcome_counts = bank_data["poutcome"].value_counts()
print("Value counts: ", contact_counts,
      "\n\nValue counts: ", poutcome_counts)

'unknown' value count is still significant in the 'contact' and 'poutcome' columns after removing it from the others. Therefore, we leave as is and we proceed with the next step data preprocessing which is encoding.
Columns with only two possible categorical values will be binary encoded; the ones with more will be one-hot encoded. The target variable will also be binary encoded.
The day & month columns will be combined into a date column, I chose the year 2023 as it has not been specified.

In [None]:
# Encode the categorical columns and build a single 'date' column.
# Three encoded copies of bank_data are produced (binary features, one-hot
# features, target), concatenated side by side, and de-duplicated so that
# each column label appears exactly once.

# binary encode default, housing and loan
# drop_first=True keeps one indicator per column; any label containing 'yes'
# is then renamed so the indicators read e.g. 'default_encoded'
cols_to_bencode = ["default", "housing", "loan"]
data_bencoded = pd.get_dummies(bank_data, columns=cols_to_bencode, drop_first=True)
data_bencoded.columns = data_bencoded.columns.str.replace('yes', 'encoded')

# one-hot encode remaining categorical columns
data_ohencoded = pd.get_dummies(bank_data, columns=["job", "marital", "education", "contact", "poutcome"])

# binary encode target variable Y
# false is Type A, True is TypeB
target_encoded = pd.get_dummies(bank_data, columns=["y"], drop_first=True)
target_encoded.columns = target_encoded.columns.str.replace('TypeB', 'encoded')
# NOTE(review): presumably a no-op, since drop_first should already have
# removed the TypeA indicator -- confirm before relying on it
target_encoded.columns = target_encoded.columns.str.replace('TypeA', 'encoded')

# combine one-hot encoded & binary encoded dataframes
combined_encoded = pd.concat([data_bencoded, data_ohencoded, target_encoded], axis=1)

# drop duplicate cols
# (each encoded copy still carries every column it did not encode, so the
# concatenation repeats labels; only the first occurrence of each is kept)
combined_encoded = combined_encoded.loc[:,~combined_encoded.columns.duplicated()]

# drop non-encoded cols
combined_encoded.drop(["job", "marital","education","contact","poutcome","default", "housing", "loan", "y"], axis=1, inplace=True)

# combine day & month into one column
combined_encoded['date'] = combined_encoded['day'].astype(str) + '-' + combined_encoded['month']

# convert to datetime type
# the format has no year component -- assumes pandas fills in a default year; TODO confirm
combined_encoded['date'] = pd.to_datetime(combined_encoded['date'], format='%d-%b')

# set year as 2023
# shifts every date by (2023 - latest parsed year) years
combined_encoded['date'] = combined_encoded['date'] + pd.offsets.DateOffset(years=2023 - combined_encoded['date'].dt.year.max())

# remove day & month columns from df
combined_encoded.drop(['day', 'month'], axis=1, inplace=True)

# check new dataframe
combined_encoded.head(5)

Next I plotted the values of integer data type to see their distribution and assess whether they needed to be normalized.

In [None]:
# histogram of every integer-typed feature, one panel per feature stacked vertically
cols_distribution = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
n_panels = len(cols_distribution)

fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(6, 3 * n_panels))

for ax, col in zip(axes, cols_distribution):
    ax.hist(combined_encoded[col], bins=20, color='blue', alpha=0.7)
    ax.set(xlabel='Values', ylabel='Frequency', title=f'Distribution of {col}')
    ax.grid(True)

plt.tight_layout()
plt.show()

I realized that the column "duration" only had one type of value = 0, which I confirmed by checking the counts of values. I deduced that the column did not bring any value to predicting the target variable; therefore I removed it. In addition, most of the value distributions skew to the left and have one value towering over the others, therefore I will apply StandardScaler() normalization to even it out.

In [None]:
# check value counts of 'duration'
# (shows how many distinct values the column holds and how often each occurs)
combined_encoded["duration"].value_counts()

In [None]:
# remove the 'duration' feature from the encoded frame and preview the result
combined_encoded.drop(columns="duration", inplace=True)
combined_encoded.head(5)

In [None]:
# standardise age, balance, campaign, pdays & previous
cols_scale = ["age", "balance", "campaign", "pdays", "previous"]
scaler = StandardScaler()

# fit on the numeric columns of the encoded frame; the result is a plain
# array, so it is wrapped back into a DataFrame (with a fresh 0..n-1 index)
scaled_values = scaler.fit_transform(combined_encoded[cols_scale])
scaled_df = pd.DataFrame(scaled_values, columns=cols_scale)
scaled_df.head(5)

In [None]:
# histogram of each scaled feature, one panel per feature stacked vertically
n_scaled_panels = len(cols_scale)
fig, axes = plt.subplots(nrows=n_scaled_panels, ncols=1, figsize=(6, 3 * n_scaled_panels))

for ax, col in zip(axes, cols_scale):
    ax.hist(scaled_df[col], bins=20, color='green', alpha=0.7)
    ax.set(xlabel='Values', ylabel='Frequency', title=f'Distribution of scaled {col}')
    ax.grid(True)

plt.tight_layout()
plt.show()

Distribution remains heavily skewed to the left for 'previous', 'campaign' and 'balance'. They also all have (except for age) one bin significantly larger than the others, especially 'pdays'. This can potentially affect ML models that make assumptions about normal distribution of the data like linear regression. The last step of the preprocessing is to combine scaled features dataframe with the encoded features dataframe to create a final one ready for model training.


In [None]:
# mark the scaled columns and give the encoded frame a matching 0..n-1 index
# so that the two frames line up row by row when concatenated
scaled_df = scaled_df.add_prefix('scaled_')
combined_encoded.reset_index(drop=True, inplace=True)

# join the frames side by side, then discard the unscaled originals
raw_numeric_cols = ["age", "previous", "campaign", "balance", "pdays"]
final_bank = pd.concat([combined_encoded, scaled_df], axis=1).drop(columns=raw_numeric_cols)

# models need a numeric date, so store it as int64
final_bank['date'] = final_bank['date'].astype("int64")

# check all columns are there
final_bank.columns

In [None]:
# check for any missing values as a result of preprocessing
# (per-column NaN count; non-zero values would point to misaligned rows in the concat above)
final_bank.isna().sum()

In [None]:
# fix the column order of the final frame: features first, target ('y_encoded') last
final_order = [
    "scaled_age",
    "job_JobCat1", "job_JobCat2", "job_JobCat3", "job_JobCat4", "job_JobCat5",
    "job_JobCat6", "job_JobCat7", "job_JobCat8", "job_JobCat9", "job_JobCat10",
    "marital_divorced", "marital_married", "marital_single",
    "education_primary", "education_secondary", "education_tertiary",
    "default_encoded",
    "scaled_balance",
    "housing_encoded",
    "loan_encoded",
    "contact_cellular", "contact_telephone", "contact_unknown",
    "date",
    "scaled_campaign", "scaled_pdays", "scaled_previous",
    "poutcome_failure", "poutcome_other", "poutcome_success", "poutcome_unknown",
    "y_encoded",
]
final_bank = final_bank.loc[:, final_order]
final_bank.head(5)

## Model Training
In this section, I will use the final_bank dataframe created during preprocessing to train and test various machine learning models using the sci-kit learn library.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_curve, roc_auc_score


In [None]:
# the last column is the target; every column before it is an input feature
target_col = final_bank.columns[-1]
features_cols = final_bank.columns[:-1]
X, y = final_bank[features_cols], final_bank[target_col]

# hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

I am selecting features based on their variance first, to get rid of those with low variance, meaning those where most of the values are the same. I have set the threshold at 0.2: for a binary (0/1) feature the variance is p(1 - p), so this removes features where roughly 72% or more of the values are the same (a strict 80% cut-off would correspond to a threshold of 0.8 × 0.2 = 0.16). We have 32 features currently.

In [None]:
# report the split sizes before variance thresholding
for split_name, split in (("X_train", X_train), ("X_test", X_test)):
    print(f"Shape of {split_name} before variance thresholding:", split.shape)

In [None]:
# learn which features exceed the variance threshold from the training split only
var_selector = VarianceThreshold(threshold=0.2).fit(X_train)

# keep the same surviving features in both splits
X_train, X_test = var_selector.transform(X_train), var_selector.transform(X_test)

In [None]:
# report the split sizes after variance thresholding
for split_name, split in (("X_train", X_train), ("X_test", X_test)):
    print(f"Shape of {split_name} after variance thresholding:", split.shape)

### Random Forest Classifier
Random Forest models are known to be robust supervised models suitable for classification tasks with in-built feature importance & selection based on node impurity. They are also resilient to noise and modulable. Parameters such as random state, n_estimators and maximum depth will remain the same throughout different iterations and experimentation of the Random forest classifiers.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# baseline random forest trained on the variance-filtered features
rf0 = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=2)
rf0.fit(X_train, y_train)

# predictions on the held-out split
y_pred0 = rf0.predict(X_test)

# compute the headline metrics, then report them
accuracy0 = accuracy_score(y_test, y_pred0)
f1_rf0 = f1_score(y_test, y_pred0)
print("Accuracy:", accuracy0)
print("F1_score:", f1_rf0)

print("Classification Report:")
print(classification_report(y_test, y_pred0))


Initial results show model accuracy is decently high - 88.33%, but has a very low F1-score of 25.70%. This low score in addition to the classification report shows that the model performs well to classify one class (False) over another (True) because of the presence of class imbalance - 4563 vs 637. Before addressing the class imbalance, let's do some more feature selection by applying Recursive Feature Elimination with cross-validation to the training dataset to see if model performance changes.

In [None]:
# recursive feature elimination with 5-fold CV, removing one feature per step,
# using the baseline forest's configuration as the ranking estimator
selector_rfe = RFECV(estimator=rf0, step=1, cv=5)
selector_rfe.fit(X_train, y_train)

# reduce both splits to the features RFECV kept
X_train_rfe, X_test_rfe = selector_rfe.transform(X_train), selector_rfe.transform(X_test)

# retrain the same forest configuration on the reduced feature set
rf_rfe = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=2)
rf_rfe.fit(X_train_rfe, y_train)
y_pred_rfe = rf_rfe.predict(X_test_rfe)

# compute the headline metrics, then report them
accuracy_rfe = accuracy_score(y_test, y_pred_rfe)
f1_rf_rfe = f1_score(y_test, y_pred_rfe)
print("Accuracy:", accuracy_rfe)
print("F1_score:", f1_rf_rfe)

print("Classification Report:")
print(classification_report(y_test, y_pred_rfe))


Model performance improved, specifically F1-score increased by 6.49% while accuracy increased by 0.25%. Let's apply a different method: the built-in feature importance score of the Random Forest classifier to select the top 75th percentile features.

In [None]:
# importance scores of the baseline forest
feature_importances = rf0.feature_importances_

# use the 75th percentile of those scores as the selection threshold
percentile_value = 75
threshold_fi = np.percentile(feature_importances, percentile_value)
selector_fi = SelectFromModel(rf0, threshold=threshold_fi).fit(X_train, y_train)

# reduce both splits to the selected features
X_train_fi, X_test_fi = selector_fi.transform(X_train), selector_fi.transform(X_test)

# retrain the same forest configuration on the selected features
rf_fi = RandomForestClassifier(max_depth=15, random_state=2, n_estimators=200)
rf_fi.fit(X_train_fi, y_train)
y_pred_fi = rf_fi.predict(X_test_fi)

# compute the headline metrics, then report them
accuracy_fi = accuracy_score(y_test, y_pred_fi)
f1_rf_fi = f1_score(y_test, y_pred_fi)
print("Accuracy:", accuracy_fi)
print("F1_score:", f1_rf_fi)

print("Classification Report:")
print(classification_report(y_test, y_pred_fi))

Model performance is similar to the one with RFE cross validation, with slightly lower accuracy and F1-score overall. I will keep the training data with the RFE selected features for the next iterations of the Random Forest classifier model to address class imbalance. The first experiment is to add 'balanced' class weights.

In [None]:
# random forest on the RFE-selected features with 'balanced' class weights
rf2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=2,
    class_weight="balanced",
)
rf2.fit(X_train_rfe, y_train)
y_pred2 = rf2.predict(X_test_rfe)

# compute the headline metrics, then report them
accuracy2 = accuracy_score(y_test, y_pred2)
f1_rf2 = f1_score(y_test, y_pred2)
print("Accuracy:", accuracy2)
print("F1_score:", f1_rf2)

print("Classification Report:")
print(classification_report(y_test, y_pred2))

Adding the parameter 'balanced' weights to the model increased overall F1-score by 14.97% compared to the RFE Model. Recall & F1 metrics for the 'True' class also increased. Let's experiment with custom weights now: I created a parameter grid to grid search for the best weight values using StratifiedKfold validation, the random forest classifier defined above and f1-scoring.
I plotted the scoring of the weights as well to see the evolution of the grid search and to see where the best parameter falls.

In [None]:
# candidate weights for class 0: 25 evenly spaced values over [0, 6]
weights = np.linspace(0.0, 6.0, 25)

# every candidate pairs a class 0 weight with the complementary class 1 weight,
# so the two weights always sum to 6
class_weight_candidates = [{0: w0, 1: 6.0 - w0} for w0 in weights]
param_grid = {'class_weight': class_weight_candidates}

# grid search over the candidates with the RFE forest, stratified k-fold CV and F1 scoring
gridsearch = GridSearchCV(
    estimator=rf_rfe,
    param_grid=param_grid,
    cv=StratifiedKFold(),
    n_jobs=2,
    scoring='f1',
    verbose=2,
)
gridsearch.fit(X_train_rfe, y_train)

# pull the winning weight of each class out of the best parameter set
best_params = gridsearch.best_params_
best_weight_class_0, best_weight_class_1 = (
    best_params['class_weight'][label] for label in (0, 1)
)

print("Best Weight for Class 0 (False):", best_weight_class_0)
print("Best Weight for Class 1 (True):", best_weight_class_1)


In [None]:
# Plot the mean cross-validated F1 score against the weight each class was
# actually given in the grid search. The grid pairs a class 0 weight of
# `weights[i]` with a class 1 weight of `6.0 - weights[i]`, so those are the
# x-values used here; this keeps the red "best weight" marker on the curve.
mean_f1_scores = gridsearch.cv_results_['mean_test_score']
weigh_data0 = pd.DataFrame({'score': mean_f1_scores, 'weight': weights})
weigh_data1 = pd.DataFrame({'score': mean_f1_scores, 'weight': 6.0 - weights})

# both classes were searched over [0, 6], so both panels share the same ticks
weight_ticks = np.arange(0.0, 6.5, 0.5)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
panels = [
    # (axis, scores per weight, best weight, x label, title)
    (axes[0], weigh_data0, best_weight_class_0,
     'Weight for class 0', 'Scoring for different class weights (False Label)'),
    (axes[1], weigh_data1, best_weight_class_1,
     'Weight for class 1', 'Scoring for different class weights (True Label)'),
]

for ax, weigh_data, best_weight, x_label, panel_title in panels:
    ax.plot(weigh_data['weight'], weigh_data['score'], marker='o')
    ax.set_xlabel(x_label)
    ax.set_ylabel('F1 score')
    ax.set_xticks(weight_ticks)
    ax.set_title(panel_title, fontsize=16)
    # mark the weight chosen by the grid search
    ax.axvline(x=best_weight, color='r', linestyle='--', label=f'Best Weight: {best_weight}')
    ax.legend()

plt.tight_layout()
plt.show()

The best weight parameters are 0.50 and 5.5 respectively for majority Class 0 (False) and minority Class 1 (True). I implement these weights below to see their effect on the performance of the Random Forest classifier.

In [None]:
# class weights chosen by the grid search
class_weights = {0: best_weight_class_0, 1: best_weight_class_1}

# random forest on the RFE-selected features using those weights
rf3 = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=2,
    class_weight=class_weights,
)
rf3.fit(X_train_rfe, y_train)
y_pred3 = rf3.predict(X_test_rfe)

# compute the headline metrics, then report them
accuracy3 = accuracy_score(y_test, y_pred3)
f1_rf3 = f1_score(y_test, y_pred3)
print("Accuracy:", accuracy3)
print("F1_score:", f1_rf3)

print("Classification Report:")
print(classification_report(y_test, y_pred3))

Both Accuracy and F1-score worsened, dropping by 8.12% and 6.30% respectively compared to 'balanced' weights parameter in the previous model which is interesting. The reason could be that sci-kit's learn 'balanced' parameter automatically adjusts class weights inversely proportional to class frequencies in the training data, allowing it to effectively handle class imbalance compared to grid search which can be sensitive to complex and/or irregular class distribution and depends on the variability of the dataset.<br><br>
Moving on, another method to handle class imbalance is to undersample the majority class in the training data.

In [None]:
# randomly undersample the RFE-selected training split (the test split is left untouched)
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_rfe, y_train)

# random forest trained on the resampled data, without class weights
rf4 = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2)
rf4.fit(X_train_resampled, y_train_resampled)
y_pred4 = rf4.predict(X_test_rfe)

# compute the headline metrics, then report them
accuracy4 = accuracy_score(y_test, y_pred4)
f1_rf4 = f1_score(y_test, y_pred4)
print("Accuracy:", accuracy4)
print("F1_score:", f1_rf4)

print("Classification Report:")
print(classification_report(y_test, y_pred4))


Undersampling reduced overall model accuracy by 11.27% but increased model F1-score by 9.74% compared to the base RFE model. Recall & F1-score improved significantly for the 'True' class; all three metrics worsened for the 'False' class, similarly to the model where balanced class weights was applied. This is a direct cause of undersampling, which reduces the number of True positives for the majority class. Let's combine the balanced class weights with undersampling to help the model capture more complexities in the dataset.

In [None]:
# random forest trained on the undersampled data, this time with 'balanced' class weights
rf5 = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=2,
    class_weight='balanced',
)
rf5.fit(X_train_resampled, y_train_resampled)
y_pred5 = rf5.predict(X_test_rfe)

# compute the headline metrics, then report them
accuracy5 = accuracy_score(y_test, y_pred5)
f1_rf5 = f1_score(y_test, y_pred5)
print("Accuracy:", accuracy5)
print("F1_score:", f1_rf5)

print("Classification Report:")
print(classification_report(y_test, y_pred5))


Adding balanced class weights to undersampling did not result in any changes in model performance compared to undersampling on its own. This is probably due to undersampling already handling the class imbalance, giving no 'work' to do for the class weights.<br><br> Regarding metrics, when class imbalance is prevalent, it is beneficial to also calculate ROC and area under ROC curve as they are less sensitive to imbalance compared to accuracy, precision and recall.

In [None]:
# Compare all six random forests with ROC curves and AUC.
# Every model is scored on the same held-out test split: the undersampled
# models (rf4, rf5) were trained on X_train_resampled, so scoring them on that
# data would measure training fit rather than generalisation and would not be
# comparable with the other curves.

# probability of the 'True' class on the test split for each model
rf_probs0 = rf0.predict_proba(X_test)[:, 1]             # basic model pre-RFE
rf_probs_rfe = rf_rfe.predict_proba(X_test_rfe)[:, 1]   # basic model with RFE
rf2_probs2 = rf2.predict_proba(X_test_rfe)[:, 1]        # balanced class weight model
rf3_probs3 = rf3.predict_proba(X_test_rfe)[:, 1]        # custom class weight model
rf4_probs4 = rf4.predict_proba(X_test_rfe)[:, 1]        # undersampling model
rf5_probs5 = rf5.predict_proba(X_test_rfe)[:, 1]        # undersampling + balanced class weight model

# ROC curves (the thresholds are not used afterwards)
rf_fpr0, rf_tpr0, thresholds = roc_curve(y_test, rf_probs0)
rf_fpr_rfe, rf_tpr_rfe, thresholds = roc_curve(y_test, rf_probs_rfe)
rf_fpr2, rf_tpr2, thresholds = roc_curve(y_test, rf2_probs2)
rf_fpr3, rf_tpr3, thresholds = roc_curve(y_test, rf3_probs3)
rf_fpr4, rf_tpr4, thresholds = roc_curve(y_test, rf4_probs4)
rf_fpr5, rf_tpr5, thresholds = roc_curve(y_test, rf5_probs5)

# area under each curve
rf_auc0 = roc_auc_score(y_test, rf_probs0)
rf_auc_rfe = roc_auc_score(y_test, rf_probs_rfe)
rf_auc2 = roc_auc_score(y_test, rf2_probs2)
rf_auc3 = roc_auc_score(y_test, rf3_probs3)
rf_auc4 = roc_auc_score(y_test, rf4_probs4)
rf_auc5 = roc_auc_score(y_test, rf5_probs5)

# one entry per curve: (false positive rates, true positive rates, colour, legend label)
roc_curves = [
    (rf_fpr0, rf_tpr0, 'blue', f'Basic RF (AUC = {rf_auc0:.2f})'),
    (rf_fpr_rfe, rf_tpr_rfe, 'black', f'Basic RF + RFE (AUC = {rf_auc_rfe:.2f})'),
    (rf_fpr2, rf_tpr2, 'red', f'RF + balanced class weights (AUC = {rf_auc2:.2f})'),
    (rf_fpr3, rf_tpr3, 'orange', f'RF + custom class weights (AUC = {rf_auc3:.2f})'),
    (rf_fpr4, rf_tpr4, 'green', f'RF + undersampling (AUC = {rf_auc4:.2f})'),
    (rf_fpr5, rf_tpr5, 'purple', f'RF + undersampling & balanced class weights (AUC = {rf_auc5:.2f})'),
]

# plot ROC curves
plt.figure(figsize=(10, 5))
for fpr, tpr, curve_color, curve_label in roc_curves:
    plt.plot(fpr, tpr, color=curve_color, lw=1, label=curve_label)

# diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curves of {len(roc_curves)} RF classifiers')
plt.legend(loc='lower right')
plt.show()

The higher the AUC score, the better the model is at distinguishing between the positive and negative classes. We can see that undersampling significantly improved the model's ability to distinguish from the classes compared to class weights on their own. The basic Random Forest classifier scored similarly to the one with RFE applied. Below I compiled the performance scores of all 6 Random Forest classifiers defined above. As a reminder, all models after the Basic RF + RFE use the training data where Recursive Feature Elimination was applied.

In [None]:
# Summary table of the six random forest variants: one row per model with its
# accuracy, F1-score and AUC. The column labels carry no stray whitespace so
# they can be selected by name (e.g. rf_scores["F1-score"]).
score_columns = ["Model", "Accuracy", "F1-score", "AUC score"]
rf_data = [
    ["Basic RF", accuracy0, f1_rf0, rf_auc0],
    ["Basic RF + RFE", accuracy_rfe, f1_rf_rfe, rf_auc_rfe],
    ["RF + Balanced weights", accuracy2, f1_rf2, rf_auc2],
    ["RF + custom weights", accuracy3, f1_rf3, rf_auc3],
    ["RF + undersampling", accuracy4, f1_rf4, rf_auc4],
    ["RF + undersampling & balanced weights", accuracy5, f1_rf5, rf_auc5],
]
rf_scores = pd.DataFrame(rf_data, columns=score_columns)
rf_scores

The basic RF + RFE classifier has the highest accuracy of them all; however, the low F1-score suggests the model struggles to capture the minority 'True' class appropriately. Adding balanced weights slightly reduced model accuracy but increased its ability to capture the minority class, as indicated by the significantly improved F1-score. The model with the custom weights scored much worse than the one with balanced weights, indicating that our cross-validation was unable to find weights as effective as scikit-learn's 'balanced' setting.<br>
Undersampling severely reduced the model's accuracy, and achieved a lower F1-score compared to the model's with balanced weights; however it achieved the highest AUC score. Adding weights to undersampling did not improve nor worsen model performance.<br><br>
Due to the class imbalance present in our training dataset, AUC and F1-score are more important than accuracy. Therefore, our Random Forest Classifier where Recursive Feature Elimination was applied & paired balanced class weights for the target classes is the best performing one with a good balance between accuracy, F1-score and AUC score at 84.79%, 47.16% and 79.34% respectively.

In [None]:
import joblib

# save best performing model
# rf2 is the forest fitted on the RFE-selected features with 'balanced' class
# weights; it is written to the working directory and reloaded for the final predictions
joblib.dump(rf2, "rf_model_rfe.pkl")

### AdaBoost
Adaptive Boost is an ensemble learning method that combines multiple machine learning models to create a robust & strong classifier. I will fit the AdaBoost classifier below with some basic classifiers and compare their performance to the Random Forest classifier above. The training data used will be the one post RFE.

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# base learners to plug into AdaBoost for comparison with the RF model
dt_clf = DecisionTreeClassifier(class_weight='balanced')
rf_clf = RandomForestClassifier(class_weight='balanced')

# settings shared by all three AdaBoost ensembles
ada_params = dict(n_estimators=50, random_state=2, learning_rate=1, algorithm='SAMME')
adaboost_clf = AdaBoostClassifier(estimator=None, **ada_params)
adaboost_clf_dt = AdaBoostClassifier(estimator=dt_clf, **ada_params)
adaboost_clf_rf = AdaBoostClassifier(estimator=rf_clf, **ada_params)

# --- AdaBoost with its default base estimator ---
adaboost_clf.fit(X_train_rfe, y_train)
y_pred_ada = adaboost_clf.predict(X_test_rfe)
ada_probs = adaboost_clf.predict_proba(X_test_rfe)[:,1]

accuracy_ada = accuracy_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)
ada_auc = roc_auc_score(y_test, ada_probs)

print("AdaBoost:")
print("Accuracy:", accuracy_ada)
print("F1_score:", f1_ada)
print("AUC score:", ada_auc)
print("Classification Report:")
print(classification_report(y_test, y_pred_ada))

# --- AdaBoost over decision trees ---
adaboost_clf_dt.fit(X_train_rfe, y_train)
y_pred_dt = adaboost_clf_dt.predict(X_test_rfe)
ada_dt_probs = adaboost_clf_dt.predict_proba(X_test_rfe)[:,1]

accuracy_dt = accuracy_score(y_test, y_pred_dt)
f1_ada_dt = f1_score(y_test, y_pred_dt)
ada_dt_auc = roc_auc_score(y_test, ada_dt_probs)

print("\nAdaBoost + Decision tree:")
print("Accuracy:", accuracy_dt)
print("F1_score:", f1_ada_dt)
print("AUC score:", ada_dt_auc)
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))

# --- AdaBoost over random forests ---
adaboost_clf_rf.fit(X_train_rfe, y_train)
y_pred_rf = adaboost_clf_rf.predict(X_test_rfe)
ada_rf_probs = adaboost_clf_rf.predict_proba(X_test_rfe)[:,1]

accuracy_rf = accuracy_score(y_test, y_pred_rf)
f1_ada_rf = f1_score(y_test, y_pred_rf)
ada_rf_auc = roc_auc_score(y_test, ada_rf_probs)

print("\nAdaBoost + Random Forest:")
print("Accuracy:", accuracy_rf)
print("F1_score:", f1_ada_rf)
print("AUC score:", ada_rf_auc)
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))


AdaBoost with no estimator scored the highest AUC but the lowest F1-score, indicating it struggled to accurately predict positive instances of the minority class. AdaBoost with the decision tree estimator scored significantly higher F1 but lower AUC. Lastly, AdaBoost with random forest estimator had the highest accuracy, but scored lower F1 & AUC compared to AdaBoost with decision tree estimator. Looking at the classification report, AdaBoost + RF favoured the majority class with higher recall & F1 compared to AdaBoost + DT. Overall, our previous Random Forest model with RFE & balanced weights performed better than any of our AdaBoost models here.

## Final predictions
In this next section, I will use the top performing Random Forest model with balanced weights & recursive feature elimination to predict the class labels on the queries.txt dataset.

### Preprocess queries

In [None]:
# import queries data for prediction
# The separator is a regular expression (a comma with optional surrounding
# whitespace), which requires the python parsing engine. It is written as a raw
# string so that the backslashes reach the regex engine unchanged instead of
# being treated as (invalid) string escape sequences.
queries_data = pd.read_csv("data/queries.txt", delimiter=r'\s*,\s*', engine="python")

# remove any whitespace left around the header names
queries_data.columns = queries_data.columns.str.strip()
queries_data.columns

In [None]:
# preview the first rows of the raw queries data
queries_data.head(5)

In [None]:
# strip the surrounding double quotes from the values of every object-typed column
text_columns = [name for name in queries_data.columns if queries_data[name].dtype == object]
for name in text_columns:
    queries_data[name] = queries_data[name].str.strip('"')

queries_data.head(5)

In [None]:
# Apply the same preprocessing steps as for trainingset.txt, reusing what was
# learned from the training data:
#   * `scaler` is only applied (transform), never refitted, so queries are
#     scaled with the training means / standard deviations;
#   * the final frame is aligned to `features_cols` (the training feature
#     columns, in training order), because the variance selector and RFE
#     select columns by position.

# binary encode default, housing and loan
queries_bencoded = pd.get_dummies(queries_data, columns=['default', 'housing', 'loan'], drop_first=True)
queries_bencoded.columns = queries_bencoded.columns.str.replace('yes', 'encoded')

# one-hot encode remaining categorical columns
queries_ohencoded = pd.get_dummies(queries_data, columns=['job', 'marital', 'education', 'contact', 'poutcome'])

# combine one-hot encoded & binary encoded dataframes
queries_encoded = pd.concat([queries_bencoded, queries_ohencoded], axis=1)

# drop duplicate cols (copy so the frame can be modified safely afterwards)
queries_encoded = queries_encoded.loc[:, ~queries_encoded.columns.duplicated()].copy()

# drop non-encoded cols
queries_encoded = queries_encoded.drop(
    columns=["job", "marital", "education", "contact", "poutcome", "default", "housing", "loan"]
)

# preprocess day & month into one column, shifted to the year 2023 as for training
queries_encoded['date'] = queries_encoded['day'].astype(str) + '-' + queries_encoded['month']
queries_encoded['date'] = pd.to_datetime(queries_encoded['date'], format='%d-%b')
queries_encoded['date'] = queries_encoded['date'] + pd.offsets.DateOffset(years=2023 - queries_encoded['date'].dt.year.max())
queries_encoded = queries_encoded.drop(columns=['day', 'month'])

# drop 'duration' column
queries_encoded = queries_encoded.drop(columns=["duration"])

# scale age, balance, campaign, pdays & previous with the scaler fitted on the training data
queries_scaled = scaler.transform(queries_encoded[cols_scale])
queries_scaled = pd.DataFrame(queries_scaled, columns=cols_scale)
queries_scaled = queries_scaled.add_prefix('scaled_')

# combine scaled & encoded dataframes (indexes must match row by row)
queries_encoded.reset_index(drop=True, inplace=True)
final_queries = pd.concat([queries_encoded, queries_scaled], axis=1)

# drop the unscaled originals
final_queries = final_queries.drop(columns=["age", "previous", "campaign", "balance", "pdays"])

# convert date column to int64 for model prediction
final_queries['date'] = final_queries['date'].astype("int64")

# drop target column
final_queries = final_queries.drop(columns=["y"])

# align with the training features: same columns, same order. Dummy columns
# for categories absent from the queries are added as all-zero; columns the
# model never saw during training are discarded.
final_queries = final_queries.reindex(columns=features_cols, fill_value=0)

# check new dataframe
final_queries.head(5)

In [None]:
# check shape before variance selector
# (rows, columns) of the preprocessed queries frame
final_queries.shape

In [None]:
# Apply the variance selector that was fitted on the training data.
# It must not be refitted here: fitting on the queries could keep a different
# set of columns than the one the model was trained on.
# The queries are first aligned to the training feature columns (same names,
# same order; missing dummy columns become all-zero) so the selector sees the
# layout it was fitted on. Re-aligning an already aligned frame changes nothing.
final_queries = final_queries.reindex(columns=features_cols, fill_value=0)
final_transformed = var_selector.transform(final_queries)

In [None]:
# check shape after variance selector
# (rows, columns) of the array returned by the variance selector
final_transformed.shape

In [None]:
# retrieve features selected by RFE
# RFE was fitted on the variance-filtered matrix, so its indices refer to the
# columns that survived the variance selector, not to the columns of
# final_queries. Map them through the variance selector's mask to get the names.
selected_feats_idx = selector_rfe.get_support(indices=True)
og_col_names = final_queries.columns
variance_kept_cols = og_col_names[var_selector.get_support()]
selected_feats = [variance_kept_cols[i] for i in selected_feats_idx]

print("Features selected by RFE: ", selected_feats)

### Predict class labels for queries

In [None]:
# reload the persisted forest
# NOTE: joblib files are pickles, so only load model files you created yourself
rf_model = joblib.load("rf_model_rfe.pkl")

# keep only the columns RFE selected (positions within the variance-filtered array)
selected_final_queries = np.take(final_transformed, selected_feats_idx, axis=1)

# predict the class of every query and show the result
final_predictions = rf_model.predict(selected_final_queries)
print("Predictions:", final_predictions)


In [None]:
# Collect the predictions in a dataframe: one row per query, numbered from 1.
# The frame is built in a single vectorised step instead of appending row by
# row, and the labels are mapped explicitly so the result is the same whether
# the model returns booleans or 0/1 integers (truthy -> 'TypeB', falsy -> 'TypeA').
num_preds = len(final_predictions)
is_type_b = np.asarray(final_predictions).astype(bool)

predictions_df = pd.DataFrame({
    'query': np.arange(1, num_preds + 1),
    'prediction': np.where(is_type_b, 'TypeB', 'TypeA'),
})
print(predictions_df)

In [None]:
# write the predictions to a comma-separated text file, without the index column
predictions_df.to_csv(path_or_buf='data/D22127697.txt', sep=',', index=False)