In [13]:
#import packages and modules
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [14]:
#create SQLAlchemy engine
db_uri = 'postgresql://postgres:postgres@localhost:5432/marketing_data_ml' #note that you will need to update the db_uri variable to pull from your local postgres instance
engine = create_engine(db_uri)

#execute SQL query and retrieve data into a DataFrame
query = "SELECT * FROM marketing_data;"
marketing_data_sql = pd.read_sql_query(query, engine)

#print first few rows of the DataFrame
marketing_data_sql.head(25)
#print(marketing_data_sql.dtypes)

Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mntwines,...,numwebvisitsmonth,acceptedcmp3,acceptedcmp4,acceptedcmp5,acceptedcmp1,acceptedcmp2,complain,z_costcontact,z_revenue,response
0,5524,1957,Graduation,Single,58138.0,0,0,9/4/2012,58,635,...,7,False,False,False,False,False,False,3,11,True
1,2174,1954,Graduation,Single,46344.0,1,1,3/8/2014,38,11,...,5,False,False,False,False,False,False,3,11,False
2,4141,1965,Graduation,Together,71613.0,0,0,8/21/2013,26,426,...,4,False,False,False,False,False,False,3,11,False
3,6182,1984,Graduation,Together,26646.0,1,0,2/10/2014,26,11,...,6,False,False,False,False,False,False,3,11,False
4,5324,1981,PhD,Married,58293.0,1,0,1/19/2014,94,173,...,5,False,False,False,False,False,False,3,11,False
5,7446,1967,Master,Together,62513.0,0,1,9/9/2013,16,520,...,6,False,False,False,False,False,False,3,11,False
6,965,1971,Graduation,Divorced,55635.0,0,1,11/13/2012,34,235,...,6,False,False,False,False,False,False,3,11,False
7,6177,1985,PhD,Married,33454.0,1,0,5/8/2013,32,76,...,8,False,False,False,False,False,False,3,11,False
8,4855,1974,PhD,Together,30351.0,1,0,6/6/2013,19,14,...,9,False,False,False,False,False,False,3,11,True
9,5899,1950,PhD,Together,5648.0,1,1,3/13/2014,68,28,...,20,True,False,False,False,False,False,3,11,False


In [15]:
#Get all column names
marketing_data_sql.columns

Index(['id', 'year_birth', 'education', 'marital_status', 'income', 'kidhome',
       'teenhome', 'dt_customer', 'recency', 'mntwines', 'mntfruits',
       'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts',
       'mntgoldprods', 'numdealspurchases', 'numwebpurchases',
       'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth',
       'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'acceptedcmp1',
       'acceptedcmp2', 'complain', 'z_costcontact', 'z_revenue', 'response'],
      dtype='object')

In [16]:
#Drop unnecessary columns
marketing_df = marketing_data_sql.drop(columns=['id', 'dt_customer', 'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'acceptedcmp1', 'acceptedcmp2', 'complain', 'z_costcontact', 'z_revenue'])
marketing_df.head()

Unnamed: 0,year_birth,education,marital_status,income,kidhome,teenhome,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,response
0,1957,Graduation,Single,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,True
1,1954,Graduation,Single,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,False
2,1965,Graduation,Together,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,False
3,1984,Graduation,Together,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,False
4,1981,PhD,Married,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,False


In [17]:
# Drop missing values from income
marketing_df = marketing_df.dropna()

In [18]:
# Get dummy variables for education and marital status
marketing_df_encoded = pd.get_dummies(marketing_df, columns=['education', 'marital_status'])
marketing_df_encoded.head()

Unnamed: 0,year_birth,income,kidhome,teenhome,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,...,education_Master,education_PhD,marital_status_Absurd,marital_status_Alone,marital_status_Divorced,marital_status_Married,marital_status_Single,marital_status_Together,marital_status_Widow,marital_status_YOLO
0,1957,58138.0,0,0,58,635,88,546,172,88,...,False,False,False,False,False,False,True,False,False,False
1,1954,46344.0,1,1,38,11,1,6,2,1,...,False,False,False,False,False,False,True,False,False,False
2,1965,71613.0,0,0,26,426,49,127,111,21,...,False,False,False,False,False,False,False,True,False,False
3,1984,26646.0,1,0,26,11,4,20,10,3,...,False,False,False,False,False,False,False,True,False,False
4,1981,58293.0,1,0,94,173,43,118,46,27,...,False,True,False,False,False,True,False,False,False,False


In [28]:
# List columns to scale
columns_to_scale = [
    'income', 'recency', 'mntwines', 'mntfruits', 'mntmeatproducts', 
    'mntfishproducts', 'mntsweetproducts', 'mntgoldprods'	
]

# Initialize standardscaler
scaler = StandardScaler()

# Fit and transform data
marketing_df_encoded[columns_to_scale] = scaler.fit_transform(marketing_df_encoded[columns_to_scale])
marketing_df_encoded.head()

Unnamed: 0,year_birth,income,kidhome,teenhome,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,...,education_Master,education_PhD,marital_status_Absurd,marital_status_Alone,marital_status_Divorced,marital_status_Married,marital_status_Single,marital_status_Together,marital_status_Widow,marital_status_YOLO
0,1957,0.234063,0,0,0.310532,0.978226,1.549429,1.690227,2.454568,1.484827,...,False,False,False,False,False,False,True,False,False,False
1,1954,-0.234559,1,1,-0.380509,-0.872024,-0.637328,-0.717986,-0.651038,-0.63388,...,False,False,False,False,False,False,True,False,False,False
2,1965,0.769478,0,0,-0.795134,0.358511,0.569159,-0.178368,1.340203,-0.146821,...,False,False,False,False,False,False,False,True,False,False
3,1984,-1.017239,1,0,-0.795134,-0.872024,-0.561922,-0.655551,-0.504892,-0.585174,...,False,False,False,False,False,False,False,True,False,False
4,1981,0.240221,1,0,1.554407,-0.391671,0.418348,-0.218505,0.152766,-0.000703,...,False,True,False,False,False,True,False,False,False,False


In [29]:
# Separate features from target
y = marketing_df_encoded["response"]
X = marketing_df_encoded.drop(columns="response")

In [30]:
# Split data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [31]:
lr_model = LogisticRegression(random_state=42)

In [32]:
# Fit model 
lr_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
print(f"Training Data Score: {lr_classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {lr_classifier.score(X_test, y_test)}")

Training Data Score: 0.8616125150421179
Testing Data Score: 0.8664259927797834


In [34]:
# Make prediction using testing data
testing_predictions = lr_model.predict(X_test)

In [35]:
# Generate confusion matrix
marketing_cm = confusion_matrix(y_test, testing_predictions)
marketing_cm_df = pd.DataFrame(marketing_cm)
marketing_cm_df

Unnamed: 0,0,1
0,455,20
1,54,25


In [37]:
# Print the classification report
response = ["Likely to Respond", "Unlikely to Respond"]
print(classification_report(y_test, testing_predictions, target_names=response))

                     precision    recall  f1-score   support

  Likely to Respond       0.89      0.96      0.92       475
Unlikely to Respond       0.56      0.32      0.40        79

           accuracy                           0.87       554
          macro avg       0.72      0.64      0.66       554
       weighted avg       0.85      0.87      0.85       554

