In [1]:
# import dependencies
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [8]:
# Read the prepared marketing dataset into a DataFrame and preview it
df = pd.read_csv(filepath_or_buffer="Resources/market_complete.csv")
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,purchases_sum,Lat,Long,Per Capita Income,Currency Conv to USD,Total_Dependents,Total_campaigns,age,edu_classes,relation_status
0,1826,1970,Graduation,Divorced,84835.0,0,0,2014-06-16,0,189,...,15,40.416775,-3.70379,27057.2,0.846231,0,0,44,2,4
1,5371,1989,Graduation,Single,21474.0,1,0,2014-04-08,0,6,...,8,40.416775,-3.70379,27057.2,0.846231,1,1,25,2,1
2,7348,1958,PhD,Single,71691.0,0,0,2014-03-17,0,336,...,17,40.416775,-3.70379,27057.2,0.846231,0,0,56,5,1
3,1991,1967,Graduation,Together,44931.0,0,1,2014-01-18,0,78,...,7,40.416775,-3.70379,27057.2,0.846231,1,0,47,2,2
4,5642,1979,Master,Together,62499.0,1,0,2013-12-09,0,140,...,12,40.416775,-3.70379,27057.2,0.846231,1,0,35,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,9940,1958,Graduation,Together,64961.0,0,1,2012-12-23,97,382,...,16,-25.731340,28.21837,5090.7,14.717459,1,0,56,2,2
2178,3406,1964,Graduation,Single,45989.0,0,1,2012-10-22,97,138,...,16,-25.731340,28.21837,5090.7,14.717459,1,0,50,2,1
2179,313,1968,Graduation,Widow,73455.0,0,0,2013-10-28,98,901,...,22,-25.731340,28.21837,5090.7,14.717459,0,1,46,2,5
2180,5871,1979,Master,Together,24401.0,0,0,2012-08-31,98,73,...,16,-25.731340,28.21837,5090.7,14.717459,0,0,35,4,2


In [9]:
# List all columns & datatypes
# Displays the dtype pandas assigned to every column of `df`; columns reported
# as `object` hold text and would need encoding before they can be modelled.
df.dtypes

ID                        int64
Year_Birth                int64
Education                object
Marital_Status           object
Income                  float64
Kidhome                   int64
Teenhome                  int64
Dt_Customer              object
Recency                   int64
MntWines                  int64
MntFruits                 int64
MntMeatProducts           int64
MntFishProducts           int64
MntSweetProducts          int64
MntGoldProds              int64
NumDealsPurchases         int64
NumWebPurchases           int64
NumCatalogPurchases       int64
NumStorePurchases         int64
NumWebVisitsMonth         int64
AcceptedCmp3              int64
AcceptedCmp4              int64
AcceptedCmp5              int64
AcceptedCmp1              int64
AcceptedCmp2              int64
Response                  int64
Complain                  int64
Country                  object
mnt_sum                   int64
purchases_sum             int64
Lat                     float64
Long    

In [10]:
# Keep only the predictors used by the logistic regression model, plus the
# 'Response' target; every other column is discarded.
# .copy() makes `df` an independent frame rather than a selection derived from
# the loaded data, so later column assignments on `df` cannot raise
# SettingWithCopyWarning.
model_columns = ['Income', 'mnt_sum', 'purchases_sum', 'Kidhome',
                 'NumWebVisitsMonth', 'Total_campaigns', 'Response']
df = df[model_columns].copy()
df

Unnamed: 0,Income,mnt_sum,purchases_sum,Kidhome,NumWebVisitsMonth,Total_campaigns,Response
0,84835.0,1190,15,0,1,0,1
1,21474.0,91,8,1,7,1,1
2,71691.0,1192,17,0,2,0,1
3,44931.0,96,7,0,5,0,0
4,62499.0,222,12,1,4,0,0
...,...,...,...,...,...,...,...
2177,64961.0,1009,16,0,3,0,0
2178,45989.0,322,16,0,3,0,0
2179,73455.0,2088,22,0,3,1,0
2180,24401.0,467,16,0,8,0,0


In [11]:
# Show the column labels currently present in `df`
df.columns

Index(['Income', 'mnt_sum', 'purchases_sum', 'Kidhome', 'NumWebVisitsMonth',
       'Total_campaigns', 'Response'],
      dtype='object')

In [6]:
# Transform text into numerical data.
# `df` may already have been narrowed to the numeric modelling columns, in which
# case these text columns no longer exist and indexing them raises KeyError on a
# top-to-bottom run. Encode only the ones that are still present.
le = LabelEncoder()
for text_col in ['Education', 'Marital_Status', 'Country']:
    if text_col in df.columns:
        # fit_transform refits `le` on each column, so afterwards `le` only
        # holds the classes of the last column it encoded.
        df[text_col] = le.fit_transform(df[text_col])
df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,AcceptedCmp1,AcceptedCmp2,Response,Complain,Country,mnt_sum,purchases_sum,Total_Dependents,Total_campaigns,age
0,2,0,84835.0,0,0,0,189,104,379,111,...,0,0,1,0,5,1190,15,0,0,44
1,2,2,21474.0,1,0,0,6,16,24,11,...,0,0,1,0,5,91,8,1,1,25
2,4,2,71691.0,0,0,0,336,130,411,240,...,0,0,1,0,5,1192,17,0,0,56
3,2,3,44931.0,0,1,0,78,0,11,0,...,0,0,0,0,5,96,7,1,0,47
4,3,3,62499.0,1,0,0,140,4,61,0,...,0,0,0,0,5,222,12,1,0,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,2,3,64961.0,0,1,97,382,114,276,75,...,0,0,0,0,4,1009,16,1,0,56
2178,2,2,45989.0,0,1,97,138,33,87,28,...,0,0,0,0,4,322,16,1,0,50
2179,2,4,73455.0,0,0,98,901,61,757,186,...,0,0,0,0,4,2088,22,0,1,46
2180,3,3,24401.0,0,0,98,73,28,217,10,...,0,0,0,0,4,467,16,0,0,35


In [12]:
# Split the frame into the feature matrix (X) and the target vector (y)
target_col = "Response"
X = df.drop(columns=target_col)
y = df[target_col]

In [13]:
# Hold out a test set; stratifying on y keeps the class ratio the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

(1636, 6)

In [14]:
# Configure (but do not yet train) the logistic regression classifier
classifier = LogisticRegression(
    random_state=1,
    max_iter=200,
    solver='lbfgs',
)

In [15]:
# Train the classifier on the training split
classifier.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Predict the test split and line the predictions up against the true labels
y_pred = classifier.predict(X_test)
comparison = {"Prediction": y_pred, "Actual": y_test}
results = pd.DataFrame(comparison)
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [17]:
# Score the predictions: fraction of test rows whose label was predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.8553113553113553
