<h3>Initial model build<h3>

In [51]:
#import packages and modules
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

In [52]:
#create SQLAlchemy engine
db_uri = 'postgresql://postgres:postgres@localhost:5432/marketing_data_ml' #note that you will need to update the db_uri variable to pull from your local postgres instance
engine = create_engine(db_uri)

#execute SQL query and retrieve data into a DataFrame
query = "SELECT * FROM marketing_data;"
marketing_data_sql = pd.read_sql_query(query, engine)

#print first few rows of the DataFrame
print(marketing_data_sql.head(25))

      id  year_birth   education marital_status   income  kidhome  teenhome  \
0   5524        1957  Graduation         Single  58138.0        0         0   
1   2174        1954  Graduation         Single  46344.0        1         1   
2   4141        1965  Graduation       Together  71613.0        0         0   
3   6182        1984  Graduation       Together  26646.0        1         0   
4   5324        1981         PhD        Married  58293.0        1         0   
5   7446        1967      Master       Together  62513.0        0         1   
6    965        1971  Graduation       Divorced  55635.0        0         1   
7   6177        1985         PhD        Married  33454.0        1         0   
8   4855        1974         PhD       Together  30351.0        1         0   
9   5899        1950         PhD       Together   5648.0        1         1   
10  1994        1983  Graduation        Married      NaN        1         0   
11   387        1976       Basic        Married   75

In [53]:
#import additional libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [54]:
#make copy of the original data for modeling
marketing_data_modeling = marketing_data_sql.copy()

In [55]:
#preprocessing steps
#drop unnecessary columns
columns_to_drop = ['id', 'dt_customer', 'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 
                   'acceptedcmp1', 'acceptedcmp2', 'complain', 'z_costcontact', 'z_revenue']
marketing_data_modeling = marketing_data_modeling.drop(columns=columns_to_drop)

#drop rows with missing data
marketing_data_modeling = marketing_data_modeling.dropna()

marketing_data_modeling.head()

Unnamed: 0,year_birth,education,marital_status,income,kidhome,teenhome,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,response
0,1957,Graduation,Single,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,True
1,1954,Graduation,Single,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,False
2,1965,Graduation,Together,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,False
3,1984,Graduation,Together,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,False
4,1981,PhD,Married,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,False


In [56]:
#define features and target label
features = ['year_birth', 'education', 'marital_status', 'income', 'kidhome', 'teenhome',
            'recency', 'mntwines', 'mntfruits', 'mntmeatproducts', 'mntfishproducts',
            'mntsweetproducts', 'mntgoldprods', 'numdealspurchases', 'numwebpurchases',
            'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth']
target_label = 'response'

#extract features and target
X = marketing_data_modeling[features]
y = marketing_data_modeling[target_label]

#X.head()
#y.head()

In [57]:
#data transformation
# Handle categorical variables (one-hot encoding)
X = pd.get_dummies(X, columns=['education', 'marital_status'], drop_first=True)

# List of columns to scale
columns_to_scale = ['income', 'recency', 'mntwines', 'mntfruits', 'mntmeatproducts',
                    'mntfishproducts', 'mntsweetproducts', 'mntgoldprods']

# Scaling the features
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

X.head()

Unnamed: 0,year_birth,income,kidhome,teenhome,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,...,education_Graduation,education_Master,education_PhD,marital_status_Alone,marital_status_Divorced,marital_status_Married,marital_status_Single,marital_status_Together,marital_status_Widow,marital_status_YOLO
0,1957,0.234063,0,0,0.310532,0.978226,1.549429,1.690227,2.454568,1.484827,...,True,False,False,False,False,False,True,False,False,False
1,1954,-0.234559,1,1,-0.380509,-0.872024,-0.637328,-0.717986,-0.651038,-0.63388,...,True,False,False,False,False,False,True,False,False,False
2,1965,0.769478,0,0,-0.795134,0.358511,0.569159,-0.178368,1.340203,-0.146821,...,True,False,False,False,False,False,False,True,False,False
3,1984,-1.017239,1,0,-0.795134,-0.872024,-0.561922,-0.655551,-0.504892,-0.585174,...,True,False,False,False,False,False,False,True,False,False
4,1981,0.240221,1,0,1.554407,-0.391671,0.418348,-0.218505,0.152766,-0.000703,...,False,False,True,False,False,True,False,False,False,False


In [58]:
#splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

In [59]:
#training the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=33)
rf_model.fit(X_train, y_train)

In [60]:
#making predictions
y_pred = rf_model.predict(X_test)

y_pred

array([False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False,  True, False, False, False, False, False, False,
       False,  True, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False,  True,

In [61]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8676691729323308


In [62]:
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

Confusion Matrix:
[[556  17]
 [ 71  21]]


In [63]:
class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

Classification Report:
              precision    recall  f1-score   support

       False       0.89      0.97      0.93       573
        True       0.55      0.23      0.32        92

    accuracy                           0.87       665
   macro avg       0.72      0.60      0.62       665
weighted avg       0.84      0.87      0.84       665



<h3>Tuning attempt 1: GridSearchCV<h3>

In [64]:
#import GridSearchCV
from sklearn.model_selection import GridSearchCV

#define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [65]:
#create GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=33), 
                           param_grid=param_grid, 
                           cv=3, 
                           n_jobs=-1, 
                           verbose=2)

In [66]:
#fit grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


  warn(


In [67]:
#get the best parameters
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

Best parameters: {'bootstrap': False, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [68]:
#train the model with the best parameters
best_rf_model = RandomForestClassifier(**best_params, random_state=33)
best_rf_model.fit(X_train, y_train)

  warn(


In [69]:
#make predictions with the best model
best_y_pred = best_rf_model.predict(X_test)

In [70]:
#evaluate the best model
best_accuracy = accuracy_score(y_test, best_y_pred)
print(f'Best Model Accuracy: {best_accuracy}')

print(f'Confusion Matrix:\n{confusion_matrix(y_test, best_y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, best_y_pred)}')

Best Model Accuracy: 0.8661654135338346
Confusion Matrix:
[[553  20]
 [ 69  23]]
Classification Report:
              precision    recall  f1-score   support

       False       0.89      0.97      0.93       573
        True       0.53      0.25      0.34        92

    accuracy                           0.87       665
   macro avg       0.71      0.61      0.63       665
weighted avg       0.84      0.87      0.84       665



<h3>Tuning attempt 2: SMOTE + Class weight adjustment<h3>

In [75]:
#import SMOTE
from imblearn.over_sampling import SMOTE

#apply SMOTE to the training data
smote = SMOTE(random_state=33)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [76]:
# Train the Random Forest model with class weight adjustment
rf_model_res = RandomForestClassifier(n_estimators=100, random_state=33, class_weight={0: 1, 1: 5})
rf_model_res.fit(X_train_res, y_train_res)

In [77]:
# Making predictions
y_pred_res = rf_model_res.predict(X_test)

In [79]:
#evaluate model
accuracy_res = accuracy_score(y_test, y_pred_res)
print(f'Accuracy (resampled): {accuracy_res}')

conf_matrix_res = confusion_matrix(y_test, y_pred_res)
print(f'Confusion Matrix (resampled):\n{conf_matrix_res}')

class_report_res = classification_report(y_test, y_pred_res)
print(f'Classification Report (resampled):\n{class_report_res}')

Accuracy (resampled): 0.849624060150376
Confusion Matrix (resampled):
[[531  42]
 [ 58  34]]
Classification Report (resampled):
              precision    recall  f1-score   support

       False       0.90      0.93      0.91       573
        True       0.45      0.37      0.40        92

    accuracy                           0.85       665
   macro avg       0.67      0.65      0.66       665
weighted avg       0.84      0.85      0.84       665



<h3>Tuning attempt 3: ADASYN (oversampling)<h3>

In [80]:
#import ADYSN
from imblearn.over_sampling import ADASYN

In [81]:
#apply ADASYN to the training data
adasyn = ADASYN(random_state=33)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

In [82]:
#train the Random Forest model with class weight adjustment using ADASYN
rf_model_adasyn = RandomForestClassifier(n_estimators=100, random_state=33, class_weight={0: 1, 1: 5})
rf_model_adasyn.fit(X_train_adasyn, y_train_adasyn)

In [83]:
#making predictions
y_pred_adasyn = rf_model_adasyn.predict(X_test)

In [84]:
#evaluate model
accuracy_adasyn = accuracy_score(y_test, y_pred_adasyn)
print(f'Accuracy (ADASYN): {accuracy_adasyn}')

conf_matrix_adasyn = confusion_matrix(y_test, y_pred_adasyn)
print(f'Confusion Matrix (ADASYN):\n{conf_matrix_adasyn}')

class_report_adasyn = classification_report(y_test, y_pred_adasyn)
print(f'Classification Report (ADASYN):\n{class_report_adasyn}')

Accuracy (ADASYN): 0.8406015037593985
Confusion Matrix (ADASYN):
[[525  48]
 [ 58  34]]
Classification Report (ADASYN):
              precision    recall  f1-score   support

       False       0.90      0.92      0.91       573
        True       0.41      0.37      0.39        92

    accuracy                           0.84       665
   macro avg       0.66      0.64      0.65       665
weighted avg       0.83      0.84      0.84       665



<h3>Tuning attempt 4: SMOTE with undersampling<h3>

In [85]:
#import SMOTEENN
from imblearn.combine import SMOTEENN

In [86]:
#combine SMOTE + ENN for undersampling
smote_enn = SMOTEENN(random_state=33)
X_train_smoteenn, y_train_smoteenn = smote_enn.fit_resample(X_train, y_train)

In [87]:
#train df model with class weight adjustment using SMOTE + ENN
rf_model_smoteenn = RandomForestClassifier(n_estimators=100, random_state=33, class_weight={0: 1, 1: 5})
rf_model_smoteenn.fit(X_train_smoteenn, y_train_smoteenn)

In [88]:
#make predictions
y_pred_smoteenn = rf_model_smoteenn.predict(X_test)

In [89]:
#evaluate model
accuracy_smoteenn = accuracy_score(y_test, y_pred_smoteenn)
print(f'Accuracy (SMOTE + ENN): {accuracy_smoteenn}')

conf_matrix_smoteenn = confusion_matrix(y_test, y_pred_smoteenn)
print(f'Confusion Matrix (SMOTE + ENN):\n{conf_matrix_smoteenn}')

class_report_smoteenn = classification_report(y_test, y_pred_smoteenn)
print(f'Classification Report (SMOTE + ENN):\n{class_report_smoteenn}')

Accuracy (SMOTE + ENN): 0.7819548872180451
Confusion Matrix (SMOTE + ENN):
[[472 101]
 [ 44  48]]
Classification Report (SMOTE + ENN):
              precision    recall  f1-score   support

       False       0.91      0.82      0.87       573
        True       0.32      0.52      0.40        92

    accuracy                           0.78       665
   macro avg       0.62      0.67      0.63       665
weighted avg       0.83      0.78      0.80       665

