In [65]:
# import dependencies
import time
from pathlib import Path
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from config import db_password

In [66]:
# Build the connection string for the engine to connect to the database
# "postgresql://[user]:[password]@[location]:[port]/[database]"
# The password is percent-encoded so that reserved URL characters in it
# (e.g. "@", "/", ":") cannot be misread as part of the URL structure.
db_string = (
    f"postgresql://postgres:{quote_plus(db_password)}"
    "@marketing-data.c255i23tlogx.us-east-1.rds.amazonaws.com:5432/postgres"
)

# Create the database engine
engine = create_engine(db_string)

In [67]:
# Test the connection by importing the Postgres table into a pandas DataFrame.
# The context manager closes the connection even when the query raises,
# so a failed read no longer leaves the connection open.
with engine.connect() as dbConnection:
    # Read data from PostgreSQL database table and load into a DataFrame instance
    market_df = pd.read_sql('select * from "marketing_data_complete"', dbConnection)

market_df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,purchases_sum,Lat,Long,Per Capita Income,Currency Conv to USD,Total_Dependents,Total_campaigns,age,edu_classes,relation_status
0,1826,1970,Graduation,Divorced,84835.0,0,0,2014-06-16,0,189,...,15,40.416775,-3.70379,27057.2,0.846231,0,0,44,2,4
1,5371,1989,Graduation,Single,21474.0,1,0,2014-04-08,0,6,...,8,40.416775,-3.70379,27057.2,0.846231,1,1,25,2,1
2,7348,1958,PhD,Single,71691.0,0,0,2014-03-17,0,336,...,17,40.416775,-3.70379,27057.2,0.846231,0,0,56,5,1
3,1991,1967,Graduation,Together,44931.0,0,1,2014-01-18,0,78,...,7,40.416775,-3.70379,27057.2,0.846231,1,0,47,2,2
4,5642,1979,Master,Together,62499.0,1,0,2013-12-09,0,140,...,12,40.416775,-3.70379,27057.2,0.846231,1,0,35,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,9940,1958,Graduation,Together,64961.0,0,1,2012-12-23,97,382,...,16,-25.731340,28.21837,5090.7,14.717459,1,0,56,2,2
2178,3406,1964,Graduation,Single,45989.0,0,1,2012-10-22,97,138,...,16,-25.731340,28.21837,5090.7,14.717459,1,0,50,2,1
2179,313,1968,Graduation,Widow,73455.0,0,0,2013-10-28,98,901,...,22,-25.731340,28.21837,5090.7,14.717459,0,1,46,2,5
2180,5871,1979,Master,Together,24401.0,0,0,2012-08-31,98,73,...,16,-25.731340,28.21837,5090.7,14.717459,0,0,35,4,2


In [68]:
# Columns that are not useful to build a logistic regression model
unused_columns = [
    'ID', 'Year_Birth', 'Dt_Customer', 'Lat', 'Long', 'Per Capita Income',
    'Currency Conv to USD', 'edu_classes', 'relation_status',
]
# drop() returns a new frame, so market_df itself is left untouched
df = market_df.drop(columns=unused_columns)

# Replace each text column with integer codes; the encoder is refit per column
le = LabelEncoder()
for text_column in ('Education', 'Marital_Status', 'Country'):
    df[text_column] = le.fit_transform(df[text_column])

# The target is the "Response" column; every remaining column is a feature
y = df["Response"]
X = df.drop(columns="Response")

# Hold out a test set (default split proportions, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

X_train.shape

(1636, 29)

In [14]:
# Learn the scaling parameters from the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply that same fitted scaling to both the training and the test split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [21]:
# Helper shared by every experiment below: fit a logistic regression, then predict
def log_regress_fun(X_train_scaled, y_train, X_test_scaled):
    """Fit a logistic regression and predict labels for the test features.

    Parameters
    ----------
    X_train_scaled
        Features used to fit the model.
    y_train
        Labels matching ``X_train_scaled``.
    X_test_scaled
        Features to predict on.

    Returns
    -------
    tuple
        ``(y_pred, model)``: the predictions for ``X_test_scaled`` and the
        fitted ``LogisticRegression`` estimator.
    """
    classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
    classifier.fit(X_train_scaled, y_train)
    predictions = classifier.predict(X_test_scaled)
    return predictions, classifier

In [22]:
# Fit the baseline model on the scaled training data and predict the test split
y_pred, model = log_regress_fun(X_train_scaled, y_train, X_test_scaled)

# Accuracy on the data the model was fitted on
train_accuracy = model.score(X_train_scaled, y_train)
print("Logistic Regression model accuracy (training): {0:.3f}".format(train_accuracy))

# Accuracy on the held-out test split
validation_accuracy = model.score(X_test_scaled, y_test)
print("Logistic Regression accuracy  (validation): {0:.3f}".format(validation_accuracy))

Logistic Regression model accuracy (training): 0.886
Logistic Regression accuracy  (validation): 0.890


In [23]:
# results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
# results.head(20)

In [24]:
# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,451,12
Actual 1,48,35


In [25]:
# Text report of per-class metrics for the test-set predictions
print("Classification Report")
report = classification_report(y_test, y_pred)
print(report)

Classification Report
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       463
           1       0.74      0.42      0.54        83

    accuracy                           0.89       546
   macro avg       0.82      0.70      0.74       546
weighted avg       0.88      0.89      0.88       546



In [46]:
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

from collections import Counter

# Random Oversampling

In [47]:
from imblearn.over_sampling import RandomOverSampler
# Resample the scaled training data with RandomOverSampler;
# the test split is not resampled
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(
    X_train_scaled,
    y_train,
)
# Class counts after resampling
Counter(y_resampled)

Counter({0: 1395, 1: 1395})

In [48]:
# Refit the logistic regression on the randomly oversampled training data
y_pred, model = log_regress_fun(X_resampled, y_resampled, X_test_scaled)

In [49]:
# Balanced accuracy of the test-set predictions
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f" Random Oversampling model accuracy: {balanced_accuracy:.3f}")

# Accuracy on the resampled data the model was fitted on
train_accuracy = model.score(X_resampled, y_resampled)
print("Logistic Regression model accuracy (training): {0:.3f}".format(train_accuracy))

# Accuracy on the untouched test split
validation_accuracy = model.score(X_test_scaled, y_test)
print("Logistic Regression accuracy  (validation): {0:.3f}".format(validation_accuracy))

 Random Oversampling model accuracy: 0.825
Logistic Regression model accuracy (training): 0.808
Logistic Regression accuracy  (validation): 0.830


In [50]:

# Imbalance-aware per-class report for the test-set predictions
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.96      0.83      0.82      0.89      0.83      0.68       463
          1       0.47      0.82      0.83      0.59      0.83      0.68        83

avg / total       0.89      0.83      0.82      0.85      0.83      0.68       546



# Use SMOTE (Synthetic Minority Oversampling Technique)

In [51]:
from imblearn.over_sampling import SMOTE
# Resample the scaled training data with SMOTE; the test split is not resampled
smote = SMOTE(random_state=1, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
# Class counts after resampling
Counter(y_resampled)

Counter({0: 1395, 1: 1395})

In [52]:
# Refit the logistic regression on the SMOTE-resampled training data
y_pred, model = log_regress_fun(X_resampled, y_resampled, X_test_scaled)

In [53]:
# Balanced accuracy of the test-set predictions
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f" SMOTE model accuracy: {balanced_accuracy:.3f}")

# Accuracy on the resampled data the model was fitted on
train_accuracy = model.score(X_resampled, y_resampled)
print("Logistic Regression model accuracy (training): {0:.3f}".format(train_accuracy))

# Accuracy on the untouched test split
validation_accuracy = model.score(X_test_scaled, y_test)
print("Logistic Regression accuracy  (validation): {0:.3f}".format(validation_accuracy))

 SMOTE model accuracy: 0.819
Logistic Regression model accuracy (training): 0.819
Logistic Regression accuracy  (validation): 0.819


In [54]:

# Imbalance-aware per-class report for the test-set predictions
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.96      0.82      0.82      0.88      0.82      0.67       463
          1       0.45      0.82      0.82      0.58      0.82      0.67        83

avg / total       0.88      0.82      0.82      0.84      0.82      0.67       546



# SMOTEENN

In [55]:
from imblearn.combine import SMOTEENN
# Resample the scaled training data with SMOTEENN; the test split is not resampled
# NOTE(review): seeded with 0 here, while the other samplers in this notebook
# use 1 - confirm this is intended
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(
    X_train_scaled,
    y_train,
)
# Class counts after resampling
Counter(y_resampled)

Counter({0: 927, 1: 1343})

In [56]:
# Refit the logistic regression on the SMOTEENN-resampled training data
y_pred, model = log_regress_fun(X_resampled, y_resampled, X_test_scaled)

In [57]:
# Balanced accuracy of the test-set predictions
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f" SMOTEEN model accuracy: {balanced_accuracy:.3f}")

# Accuracy on the resampled data the model was fitted on
train_accuracy = model.score(X_resampled, y_resampled)
print("Logistic Regression model accuracy (training): {0:.3f}".format(train_accuracy))

# Accuracy on the untouched test split
validation_accuracy = model.score(X_test_scaled, y_test)
print("Logistic Regression accuracy  (validation): {0:.3f}".format(validation_accuracy))

 SMOTEEN model accuracy: 0.815
Logistic Regression model accuracy (training): 0.893
Logistic Regression accuracy  (validation): 0.762


In [58]:

# Imbalance-aware per-class report for the test-set predictions
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.74      0.89      0.84      0.81      0.65       463
          1       0.38      0.89      0.74      0.53      0.81      0.67        83

avg / total       0.88      0.76      0.87      0.79      0.81      0.65       546

