In [83]:
# Import libraries and dependencies

import re
import timeit
from datetime import date, datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
%matplotlib inline

from sklearn import datasets
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [84]:
# Import the data
# http://share.mailcharts.com/3K3e1d2W1h27
# Read the CSV export into a DataFrame; the trailing expression displays (rows, columns).

csv_path = './data/mailcharts_cart_abandon_classification - for ds_project.csv'
df = pd.read_csv(csv_path)
df.shape

(4666, 9)

In [85]:
# Clean the data: lowercase both text columns, then strip basic punctuation
# (comma, period, question mark, hyphen, pipe) from them.
#
# The pattern is a raw string on purpose: in an ordinary string literal the
# sequences "\,", "\.", "\?", "\-" and "\|" are invalid escape sequences,
# which Python reports with a warning. The regex itself is unchanged.
punctuation_pattern = r"\,|\.|\?|\-|\|"

for text_col in ["subject", "full_text"]:
    df[text_col] = (
        df[text_col]
        .str.lower()
        .replace(punctuation_pattern, "", regex=True)
    )

In [86]:
# Make the cart_abandon label numeric (values that cannot be parsed become NaN),
# then drop every row that has a missing value in any column.
df = (
    df
    .assign(cart_abandon=pd.to_numeric(df["cart_abandon"], errors="coerce"))
    .dropna()
)

In [87]:
# Report how many rows survived cleaning and how many carry a positive label,
# then preview the first rows.
row_count = df.shape[0]
cart_abandon_total = df["cart_abandon"].sum()

print("We have this many rows: ", row_count)
print("With this many identified cart abandon emails: ", cart_abandon_total)
df.head(3)

We have this many rows:  1208
With this many identified cart abandon emails:  200.0


Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon
0,2582,3742,f3870de1-3ab6-3fed-3fe2-778a74f3197e,42376.62986,welcome to sephora beauty insider,lorem you're a beauty insider web version seph...,1,https://www.mailcharts.com/emails/f3870de1-3ab...,0.0
1,2582,3742,0880fd5c-fbc5-eeb2-5bd3-8e352eae2b70,42377.72778,new year new rewards,lorem the january rewards are here** web versi...,2,https://www.mailcharts.com/emails/0880fd5c-fbc...,0.0
2,2582,3742,db726d24-2477-ccd6-a0aa-902dcf07f4b9,42377.7875,a friendly reminder,lorem your product picks are waiting at checko...,3,https://www.mailcharts.com/emails/db726d24-247...,1.0


In [88]:
# Create a new DF with all our cart abandon emails (rows labelled 1)
is_cart_abandon = df["cart_abandon"].eq(1)
ca = df.loc[is_cart_abandon]

In [89]:
# Preview the first three cart-abandon rows
ca.head(3)

Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon
2,2582,3742,db726d24-2477-ccd6-a0aa-902dcf07f4b9,42377.7875,a friendly reminder,lorem your product picks are waiting at checko...,3,https://www.mailcharts.com/emails/db726d24-247...,1.0
7,3008,4168,b8184f12-d54f-9513-ca3a-ea0257ec5a9d,42469.14097,complete your purchase,get back here! order summary 1 sport - s / bla...,3,https://www.mailcharts.com/emails/b8184f12-d54...,1.0
14,3010,4170,eefd41a6-e0de-89aa-9134-885778a2b344,42470.0375,take $5 off you purchase!,hey looks like you forgot some things in your ...,5,https://www.mailcharts.com/emails/eefd41a6-e0d...,1.0


In [90]:
# 30 most frequent whitespace-separated tokens across all cart-abandon subject lines
subject_tokens = " ".join(ca["subject"]).lower().split()
pd.Series(subject_tokens).value_counts().head(30)

your         102
you           71
order         31
complete      28
something     26
in            24
forget        22
did           22
for           22
cart          21
left          19
items         17
we            17
purchase      17
off           17
still         15
a             13
at            12
-             12
is            12
to            12
shopping      11
free          10
have           9
saved          9
&              9
don't          9
lorem          8
us             8
from           8
dtype: int64

In [91]:
# 30 most frequent whitespace-separated tokens across all cart-abandon email bodies
full_text_tokens = " ".join(ca["full_text"]).lower().split()
pd.Series(full_text_tokens).value_counts().head(30)

to          959
your        926
you         810
the         494
-           420
and         396
for         337
in          328
we          321
email       319
us          304
our         273
on          273
a           273
this        264
or          264
order       256
if          222
have        218
of          208
complete    183
|           180
with        180
here        178
cart        172
from        172
please      165
at          164
click       163
any         159
dtype: int64

In [94]:
# Keywords chosen by hand to signal a cart-abandon email.
hand_picked_words_positive = [
    "complete",
    "forget",
    "forgot",
    "left",
    "cart",
    "items",
    "still",
    "something",
    "saved",
    "order",
]

# Keywords chosen by hand to signal a non-cart-abandon email.
hand_picked_words_negative = [
    "welcome",
    "account",
    "new",
    "shop",
]

In [95]:
# Add one boolean column per positive keyword: True when the keyword occurs
# as a substring of the subject or of the body text (so "order" also matches
# inside longer words such as "reorder").
# regex=False forces a literal match, so a keyword that contains regex
# metacharacters (e.g. "$5") cannot be misread as a pattern.
for word in hand_picked_words_positive:
    in_subject = df["subject"].str.contains(word, regex=False)
    in_body = df["full_text"].str.contains(word, regex=False)
    df[word] = in_subject | in_body

In [96]:
# Preview the frame again to inspect the newly added keyword flag columns
df.head(3)

Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon,complete,forget,forgot,left,cart,items,still,something,saved,order
0,2582,3742,f3870de1-3ab6-3fed-3fe2-778a74f3197e,42376.62986,welcome to sephora beauty insider,lorem you're a beauty insider web version seph...,1,https://www.mailcharts.com/emails/f3870de1-3ab...,0.0,False,False,False,False,False,False,False,False,False,False
1,2582,3742,0880fd5c-fbc5-eeb2-5bd3-8e352eae2b70,42377.72778,new year new rewards,lorem the january rewards are here** web versi...,2,https://www.mailcharts.com/emails/0880fd5c-fbc...,0.0,False,False,False,False,False,False,False,False,False,True
2,2582,3742,db726d24-2477-ccd6-a0aa-902dcf07f4b9,42377.7875,a friendly reminder,lorem your product picks are waiting at checko...,3,https://www.mailcharts.com/emails/db726d24-247...,1.0,False,False,False,False,False,True,False,False,False,True


In [107]:
# Feature matrix: the boolean keyword flags; target: the cart_abandon label
X = df.loc[:, hand_picked_words_positive]
y = df["cart_abandon"]

In [120]:
# Hold out part of the data for evaluation. train_test_split lives in
# sklearn.model_selection and must be imported before this cell runs.
# test_size=0.25 is scikit-learn's default, spelled out so the split size is
# visible; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

In [125]:
# Fit a logistic regression on the training split and report accuracy on the held-out split
lr = LogisticRegression()
lr = lr.fit(X_train, y_train)  # fit returns the estimator itself
lr.score(X_test, y_test)

0.92715231788079466

In [134]:
from sklearn.metrics import confusion_matrix

# How to read the matrix below (row = actual class, column = predicted class):
#   246 -> actual 0, predicted 0
#     9 -> actual 0, predicted 1
#    13 -> actual 1, predicted 0
#    34 -> actual 1, predicted 1
y_test_pred = lr.predict(X_test)
confusion_matrix(y_test, y_test_pred)

array([[246,   9],
       [ 13,  34]])

In [144]:
# We vectorize each word in our subjects. This helps us understand how important a word is.

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the words of every subject line with TF-IDF weights, which helps us
# understand how important a word is. The vectorizer is fitted on all rows.
subjects = df['subject']
v = TfidfVectorizer()
subject_vector = v.fit_transform(subjects)

In [145]:
# Dimensions of the TF-IDF matrix built from the subject lines
subject_vector.shape

(1208, 1883)

In [146]:
# Refit the existing model on the full TF-IDF matrix and score it on that same
# data, so the number shown is in-sample (training) accuracy.
lr.fit(subject_vector, y).score(subject_vector, y)

0.92549668874172186

In [147]:
# Confusion matrix of the TF-IDF model on the data it was fitted on
tfidf_pred = lr.predict(subject_vector)
confusion_matrix(y, tfidf_pred)

array([[1007,    1],
       [  89,  111]])

In [148]:
from sklearn.feature_extraction.text import CountVectorizer

# Same subjects, but encoded as raw term counts instead of TF-IDF weights.
# Note that `v` is rebound here and no longer refers to the TF-IDF vectorizer.
subjects = df['subject']
v = CountVectorizer()
subject_vector_2 = v.fit_transform(subjects)

In [149]:
# Dimensions of the term-count matrix built from the subject lines
subject_vector_2.shape

(1208, 1883)

In [151]:
# Refit the existing model on the full term-count matrix and score it on that
# same data, so the number shown is in-sample (training) accuracy.
lr.fit(subject_vector_2, y).score(subject_vector_2, y)

0.9677152317880795

In [152]:
# Confusion matrix of the term-count model on the data it was fitted on
count_pred = lr.predict(subject_vector_2)
confusion_matrix(y, count_pred)

array([[1004,    4],
       [  35,  165]])

In [163]:
# Split the term-count matrix and labels, using the same seed as the earlier split
X_train, X_test, y_train, y_test = train_test_split(
    subject_vector_2,
    y,
    random_state=100,
)

In [164]:
# Fresh logistic regression on the term-count training split, scored on the held-out split
lr = LogisticRegression()
lr = lr.fit(X_train, y_train)  # fit returns the estimator itself
lr.score(X_test, y_test)

0.9370860927152318

In [175]:
from sklearn.naive_bayes import GaussianNB

# Convert the sparse term-count matrix to a dense array before splitting.
# Note that `X` is rebound here and no longer holds the keyword flag features.
X = subject_vector_2.toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# Fit Gaussian naive Bayes on the training split and report held-out accuracy
nb = GaussianNB()
nb = nb.fit(X_train, y_train)  # fit returns the estimator itself
nb.score(X_test, y_test)

0.86754966887417218

In [211]:
# let's use bigrams

from nltk import ngrams

# let's use bigrams: collect every distinct pair of adjacent
# whitespace-separated words that occurs in a subject line
n = 2
subject_grams = set()

for subject_line in df["subject"]:
    subject_grams.update(ngrams(subject_line.split(), n))

In [None]:
# Homework: Finish up getting bigrams and re-run models
# We may want to convert this to similarity ratio