In [40]:
# import dependencies
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sqlalchemy import create_engine
import psycopg2
from config import db_password
import time
from pathlib import Path

In [41]:
# make a connection string for the engine to connect to the datbase
# "postgresql://[user]:[password]@[location]:[port]/[database]"
db_string = f"postgresql://postgres:{db_password}@marketing-data.c255i23tlogx.us-east-1.rds.amazonaws.com:5432/postgres"
    
# Create the database engine
engine = create_engine(db_string)

In [42]:
# Test the connection by import data in Postgres table into a Panda DataFrame
# Connect to PostgreSQL server
dbConnection = engine.connect()

# Read data from PostgreSQL database table and load into a DataFrame instance
df = pd.read_sql("select * from \"marketing_data_complete\"", dbConnection)

# Close the database connection
dbConnection.close()


In [43]:
# List all columns & datatypes
df.dtypes

ID                               int64
Year_Birth                       int64
Education                       object
Marital_Status                  object
Income                         float64
Kidhome                          int64
Teenhome                         int64
Dt_Customer             datetime64[ns]
Recency                          int64
MntWines                         int64
MntFruits                        int64
MntMeatProducts                  int64
MntFishProducts                  int64
MntSweetProducts                 int64
MntGoldProds                     int64
NumDealsPurchases                int64
NumWebPurchases                  int64
NumCatalogPurchases              int64
NumStorePurchases                int64
NumWebVisitsMonth                int64
AcceptedCmp3                     int64
AcceptedCmp4                     int64
AcceptedCmp5                     int64
AcceptedCmp1                     int64
AcceptedCmp2                     int64
Response                 

In [44]:
# Drop columns that are not useful to build the model
df = df.drop(columns=['ID', 'Year_Birth', 'Dt_Customer','Lat','Long','Per Capita Income', 'Currency Conv to USD','edu_classes', 'relation_status'])


In [45]:
# df.columns

In [46]:
# Transform text into numerical data
le = LabelEncoder()
df['Education'] = le.fit_transform(df['Education'])
df['Marital_Status'] = le.fit_transform(df['Marital_Status'])
df['Country'] = le.fit_transform(df['Country'])
df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,AcceptedCmp1,AcceptedCmp2,Response,Complain,Country,mnt_sum,purchases_sum,Total_Dependents,Total_campaigns,age
0,2,0,84835.0,0,0,0,189,104,379,111,...,0,0,1,0,5,1190,15,0,0,44
1,2,2,21474.0,1,0,0,6,16,24,11,...,0,0,1,0,5,91,8,1,1,25
2,4,2,71691.0,0,0,0,336,130,411,240,...,0,0,1,0,5,1192,17,0,0,56
3,2,3,44931.0,0,1,0,78,0,11,0,...,0,0,0,0,5,96,7,1,0,47
4,3,3,62499.0,1,0,0,140,4,61,0,...,0,0,0,0,5,222,12,1,0,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,2,3,64961.0,0,1,97,382,114,276,75,...,0,0,0,0,4,1009,16,1,0,56
2178,2,2,45989.0,0,1,97,138,33,87,28,...,0,0,0,0,4,322,16,1,0,50
2179,2,4,73455.0,0,0,98,901,61,757,186,...,0,0,0,0,4,2088,22,0,1,46
2180,3,3,24401.0,0,0,98,73,28,217,10,...,0,0,0,0,4,467,16,0,0,35


In [7]:
# Separate the features from the target
y = df["Response"]
X = df.drop(columns="Response")

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

X_train.shape

(1636, 29)

In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# SMOTE 

In [10]:
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from collections import Counter

In [11]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train_scaled, y_train)
Counter(y_resampled)

Counter({0: 1395, 1: 1395})

# Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

In [13]:
# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest model accuracy: {balanced_accuracy_score(y_test,y_pred):.3f}")
print("Random forest model accuracy (training): {0:.3f}".format(
       rf_model.score(
           X_resampled,
           y_resampled)))
print("Random forest accuracy  (validation): {0:.3f}".format(
       rf_model.score(
           X_test_scaled,
           y_test)))

 Random forest model accuracy: 0.754
Random forest model accuracy (training): 0.999
Random forest accuracy  (validation): 0.894


In [14]:
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,442,21
Actual 1,37,46


In [15]:
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.95      0.55      0.94      0.73      0.55       463
          1       0.69      0.55      0.95      0.61      0.73      0.51        83

avg / total       0.89      0.89      0.62      0.89      0.73      0.54       546



# SVM Classifier

In [16]:
from sklearn.svm import SVC
# Create the SVM model
svm = SVC(kernel='linear')
# Train the model
svm.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [17]:
# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {balanced_accuracy_score(y_test,y_pred):.3f}")
print("SVM model accuracy (training): {0:.3f}".format(
       svm.score(
           X_resampled,
           y_resampled)))
print("SVM model accuracy  (validation): {0:.3f}".format(
       svm.score(
           X_test_scaled,
           y_test)))

 SVM model accuracy: 0.833
SVM model accuracy (training): 0.823
SVM model accuracy  (validation): 0.826


In [18]:
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,381,82
Actual 1,13,70


In [19]:
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.82      0.84      0.89      0.83      0.69       463
          1       0.46      0.84      0.82      0.60      0.83      0.70        83

avg / total       0.89      0.83      0.84      0.84      0.83      0.69       546



# Gradient Boosting Classifier

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled,
           y_resampled)

In [21]:
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_resampled,
           y_resampled)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  1
Accuracy score (training): 0.946
Accuracy score (validation): 0.842


In [22]:
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=1, max_features=5, max_depth=3, random_state=0)

classifier.fit(X_resampled, y_resampled)
predictions = classifier.predict(X_test_scaled)

In [23]:
print(f"Accuracy Score : {balanced_accuracy_score(y_test,y_pred):.3f}")

Accuracy Score : 0.833


In [24]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,414,49
Actual 1,37,46


In [25]:
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.89      0.55      0.91      0.70      0.51       463
          1       0.48      0.55      0.89      0.52      0.70      0.48        83

avg / total       0.85      0.84      0.61      0.85      0.70      0.51       546



# K Nearest Neighbour Algorithm

In [26]:
from sklearn.neighbors import KNeighborsClassifier
# the number of points the classifier will look at to determine what class a new point belongs to
KNN_model = KNeighborsClassifier(n_neighbors=5)
KNN_model.fit(X_resampled, y_resampled)

KNeighborsClassifier()

In [27]:
# Evaluate the model
y_pred = KNN_model.predict(X_test_scaled)


In [28]:
print(f" KNN model accuracy: {balanced_accuracy_score(y_test,y_pred):.3f}")

print("KNN model accuracy (training): {0:.3f}".format(
       KNN_model.score(
           X_resampled, 
           y_resampled)))
print("KNN accuracy  (validation): {0:.3f}".format(
       KNN_model.score(
           X_test_scaled,
           y_test)))

 KNN model accuracy: 0.737
KNN model accuracy (training): 0.906
KNN accuracy  (validation): 0.738


In [29]:
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,342,121
Actual 1,22,61


In [30]:
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.74      0.73      0.83      0.74      0.54       463
          1       0.34      0.73      0.74      0.46      0.74      0.54        83

avg / total       0.85      0.74      0.74      0.77      0.74      0.54       546



# Gaussian Naive Bayes Classifier

In [31]:
from sklearn.naive_bayes import GaussianNB
# the number of points the classifier will look at to determine what class a new point belongs to
GNB_model = GaussianNB()
GNB_model.fit(X_resampled, y_resampled)

GaussianNB()

In [32]:
# Evaluate the model
y_pred = GNB_model.predict(X_test_scaled)
print(f" Gaussian NB model accuracy: {balanced_accuracy_score(y_test,y_pred):.3f}")
print("Gaussian NB  model accuracy (training): {0:.3f}".format(
       GNB_model.score(
           X_resampled,
           y_resampled)))
print("Gaussian NB  model accuracy  (validation): {0:.3f}".format(
       GNB_model.score(
           X_test_scaled,
           y_test)))

 Gaussian NB model accuracy: 0.742
Gaussian NB  model accuracy (training): 0.728
Gaussian NB  model accuracy  (validation): 0.813


In [33]:
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,391,72
Actual 1,30,53


In [34]:
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.84      0.64      0.88      0.73      0.55       463
          1       0.42      0.64      0.84      0.51      0.73      0.53        83

avg / total       0.85      0.81      0.67      0.83      0.73      0.55       546



# Linear Discriminant Analysis

In [35]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [36]:
# create the lda model
model = LinearDiscriminantAnalysis()
model.fit(X_resampled, y_resampled)

LinearDiscriminantAnalysis()

In [37]:
# Evaluate the model
y_pred = GNB_model.predict(X_test_scaled)
print(f" Linear Discriminant model accuracy: {balanced_accuracy_score(y_test,y_pred):.3f}")
print("Linear Discriminant model accuracy (training): {0:.3f}".format(
       GNB_model.score(
           X_resampled,
           y_resampled)))
print("Linear Discriminant model accuracy  (validation): {0:.3f}".format(
       GNB_model.score(
           X_test_scaled,
           y_test)))

 Linear Discriminant model accuracy: 0.742
Linear Discriminant model accuracy (training): 0.728
Linear Discriminant model accuracy  (validation): 0.813


In [38]:
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,391,72
Actual 1,30,53


In [39]:
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.84      0.64      0.88      0.73      0.55       463
          1       0.42      0.64      0.84      0.51      0.73      0.53        83

avg / total       0.85      0.81      0.67      0.83      0.73      0.55       546

