In [14]:
# Import libraries and dependencies

from datetime import date, datetime
import pandas as pd
import numpy as np
import re
import scipy
import seaborn as sns
import timeit

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw email export
# Source: http://share.mailcharts.com/3K3e1d2W1h27

data_path = './data/capstone2-v2.csv'
df = pd.read_csv(data_path)
df.shape

(4666, 9)

In [3]:
# Clean the data

# Normalise both text columns the same way:
#   1. lowercase everything
#   2. collapse every run of non alphanumeric characters into a single space
for text_column in ("subject", "full_text"):
    lowered = df[text_column].str.lower()
    df[text_column] = lowered.replace("[^A-Za-z0-9]+", " ", regex=True)

In [4]:
# Coerce the cart_abandon label to a number (values that cannot be parsed become NaN),
# then drop every row that has a missing value in any column (unclassified emails).
df = (
    df
    .assign(cart_abandon=pd.to_numeric(df["cart_abandon"], errors="coerce"))
    .dropna(axis=0, how="any")
)

In [5]:
# Quick summary of the labelled data set
n_rows = df.shape[0]
n_cart_abandon = df["cart_abandon"].sum()

print("We have this many rows: ", n_rows)
print("With this many identified cart abandon emails: ", n_cart_abandon)
# Note: the value printed below is a fraction (0-1), not a percentage
print("Percent of cart abandon emails: ", n_cart_abandon / n_rows)
df.head(3)

We have this many rows:  1581
With this many identified cart abandon emails:  264.0
Percent of cart abandon emails:  0.16698292220113853


Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon
0,2582,3742,f3870de1-3ab6-3fed-3fe2-778a74f3197e,1/7/16 15:07,welcome to sephora beauty insider,lorem you re a beauty insider web version seph...,1,https://www.mailcharts.com/emails/f3870de1-3ab...,0.0
1,2582,3742,0880fd5c-fbc5-eeb2-5bd3-8e352eae2b70,1/8/16 17:28,new year new rewards,lorem the january rewards are here web version...,2,https://www.mailcharts.com/emails/0880fd5c-fbc...,0.0
2,2582,3742,db726d24-2477-ccd6-a0aa-902dcf07f4b9,1/8/16 18:54,a friendly reminder,lorem your product picks are waiting at checko...,3,https://www.mailcharts.com/emails/db726d24-247...,1.0


In [6]:
# Keep only the emails labelled as cart abandon (cart_abandon == 1)
is_cart_abandon = df["cart_abandon"] == 1
ca = df[is_cart_abandon]

In [7]:
# Top 30 words across the subjects of cart abandon emails
subject_words = " ".join(ca["subject"]).lower().split()
pd.Series(subject_words).value_counts().head(30)

your         144
you           92
order         52
complete      46
something     38
cart          37
in            36
purchase      29
we            29
items         27
left          26
forget        26
did           25
for           25
off           23
t             23
to            23
don           19
still         19
at            18
lorem         17
shopping      15
s             15
is            15
10            14
a             14
back          13
free          13
ve            11
have          11
dtype: int64

In [8]:
# Top 30 words across the full_text of cart abandon emails
full_text_words = " ".join(ca["full_text"]).lower().split()
pd.Series(full_text_words).value_counts().head(30)

to          1327
your        1249
you         1160
the          707
and          549
we           523
in           456
email        443
us           441
for          440
order        427
a            395
on           383
this         374
our          356
com          340
or           330
complete     314
if           314
s            306
cart         285
have         282
here         269
of           255
with         240
click        239
at           231
1            229
free         228
t            228
dtype: int64

In [9]:
# Hand-picked words that we expect to signal a cart abandon email;
# each one becomes a boolean feature for a first, simple classifier.
hand_picked_words_positive = [
    "complete",
    "forget",
    "forgot",
    "left",
    "cart",
    "items",
    "still",
    "something",
    "saved",
    "order",
    "waiting",
]

In [10]:
# Add one boolean column per hand-picked word: True when the word appears
# in the subject or in the full_text (substring match, so "order" also matches "orders").
# We work on a copy so that df itself stays untouched.

df1 = df.copy()
for word in hand_picked_words_positive:
    in_subject = df1["subject"].str.contains(word)
    in_full_text = df1["full_text"].str.contains(word)
    df1[word] = in_subject | in_full_text

In [11]:
# Peek at the first rows to check the new boolean word columns
df1.head(3)

Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon,complete,forget,forgot,left,cart,items,still,something,saved,order,waiting
0,2582,3742,f3870de1-3ab6-3fed-3fe2-778a74f3197e,1/7/16 15:07,welcome to sephora beauty insider,lorem you re a beauty insider web version seph...,1,https://www.mailcharts.com/emails/f3870de1-3ab...,0.0,False,False,False,False,False,False,False,False,False,False,False
1,2582,3742,0880fd5c-fbc5-eeb2-5bd3-8e352eae2b70,1/8/16 17:28,new year new rewards,lorem the january rewards are here web version...,2,https://www.mailcharts.com/emails/0880fd5c-fbc...,0.0,False,False,False,False,False,False,False,False,False,True,False
2,2582,3742,db726d24-2477-ccd6-a0aa-902dcf07f4b9,1/8/16 18:54,a friendly reminder,lorem your product picks are waiting at checko...,3,https://www.mailcharts.com/emails/db726d24-247...,1.0,False,False,False,False,False,True,False,False,False,True,True


In [12]:
# Features (X): the boolean hand-picked word columns
# Target (y): the cart_abandon label
X = df1.loc[:, hand_picked_words_positive]
y = df1["cart_abandon"]

In [15]:
# Create our train / test split
# Only random_state is passed: it makes the split reproducible, while the test size
# and the (lack of) stratification are left at the sklearn defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

In [16]:
# Let's begin by using logistic regression
# (default hyper-parameters, accuracy measured on the held-out test split)
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

# Not a bad score! We know that the majority of our data is non cart abandonment emails:
# always predicting class 0 would already score 337 / 396 (about 0.85) on this test split.
# Nonetheless, this performs better than that baseline and than a random guess would.

0.93181818181818177

In [17]:
# Time to build a confusion matrix (rows = actual class, columns = predicted class)

# 332 = it is class 0, we predicted class 0
# 5 = it is class 0, we predicted class 1
# 22 = it is class 1, we predicted class 0
# 37 = it is class 1, we predicted class 1

from  sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lr.predict(X_test))

# There are some improvements to be made when it comes to detecting cart abandon emails:
# 22 of the 59 cart abandon emails in the test set are missed.

array([[332,   5],
       [ 22,  37]])

In [18]:
# Let's see if we get a better result if we instead vectorize every word of each email.
# TF-IDF weights help us understand how important a word is.

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Combine subject and full_text so that we can vectorize them together.
# The column is built from df1 itself (not df) so this cell does not rely on
# index alignment between two different frames.
df1["subject_and_text"] = df1["subject"] + " " + df1["full_text"]

# NOTE: despite its name, subject_vector covers subject + full_text
subject_vector = vectorizer.fit_transform(df1["subject_and_text"])
subject_vector.shape

(1581, 15708)

In [19]:
# Features (X): the TF-IDF matrix; the target y is unchanged
X = subject_vector

# Same reproducible train / test split as before
split = train_test_split(X, y, random_state=100)
X_train, X_test, y_train, y_test = split

In [20]:
# The score is lower than with the hand-picked words (0.907 vs 0.932)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.90656565656565657

In [21]:
# Taking a look at the confusion matrix
confusion_matrix(y_test, lr.predict(X_test))

# This model actually performs worse: it produces no false positives,
# but it misses 37 of the 59 cart abandon emails in the test set.

array([[337,   0],
       [ 37,  22]])

In [22]:
# Let's try another approach, with CountVectorizer — this will tell us how often a word occurs
# (raw term counts per email instead of TF-IDF weights)

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Re-uses the combined subject + full_text column; overwrites the earlier subject_vector
subject_vector = vectorizer.fit_transform(df1["subject_and_text"])
subject_vector.shape

(1581, 15708)

In [23]:
# Features (X): the word-count matrix; the target y is unchanged
X = subject_vector

# Same reproducible train / test split as before
split = train_test_split(X, y, random_state=100)
X_train, X_test, y_train, y_test = split

In [24]:
# Refit the same logistic regression on the raw word counts and score it on the test split
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.95707070707070707

In [25]:
# This model performs better. We have a few false positives (8) and 9 missed
# cart abandon emails, but we can likely live with that.
confusion_matrix(y_test, lr.predict(X_test))

array([[329,   8],
       [  9,  50]])

In [26]:
# Let's give Gaussian NB a whirl
# NOTE(review): the features here are word counts; a count-oriented Naive Bayes variant
# may be a better fit than the Gaussian one — worth trying before discarding NB.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

# We need to convert the sparse matrix to a dense array before fitting GaussianNB
X = subject_vector.toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

nb.fit(X_train, y_train)
nb.score(X_test, y_test)

0.88636363636363635

In [27]:
# Womp womp. That's our worst model so far.

In [29]:
# Maybe looking at 1 word is not enough.
# Let's try using bigrams and trigrams.

from nltk import ngrams
n = 2
# We'll use sets to make sure our ngrams are unique.
# Each bigram is stored as a tuple of words, e.g. ("your", "cart").
subject_bigrams = set()
full_text_bigrams = set()

# Walk the two text columns in lockstep instead of materialising every row
# with iterrows(); set.update() consumes the ngram iterator directly.
for subject, full_text in zip(df["subject"], df["full_text"]):
    subject_bigrams.update(ngrams(subject.split(), n))
    full_text_bigrams.update(ngrams(full_text.split(), n))

In [30]:
# Trigrams: same collection as for the bigrams, with a window of three words

n = 3
subject_trigrams = set()
full_text_trigrams = set()

# Each trigram is stored as a tuple of three consecutive words
for subject, full_text in zip(df["subject"], df["full_text"]):
    subject_trigrams.update(ngrams(subject.split(), n))
    full_text_trigrams.update(ngrams(full_text.split(), n))

In [31]:
# Merge the subject and full_text n-gram sets into one vocabulary per n-gram size
bigrams = subject_bigrams | full_text_bigrams
trigrams = subject_trigrams | full_text_trigrams

In [32]:
# Number of distinct bigrams found across subjects and full texts
len(bigrams)

94147

In [33]:
# def get_unigram_sentence(sentence):
#     return [word for word in word_tokenize(sentence.lower()) if word not in stopwords_set and word not in punctuation]

# Function to get unigrams of question1 and question2.
# def get_unigrams(df):
#     df['question1_unigram'] = df['question1'].apply(lambda x: get_unigram_sentence(x.decode(encoding='utf-8')))
#     df['question2_unigram'] = df['question2'].apply(lambda x: get_unigram_sentence(x.decode(encoding='utf-8')))
    
# def get_bigrams(df):
#     df['subject_bigrams'] = df['question1_unigram'].apply(lambda x: [i for i in ngrams(x, 2)])
#     df['full_text_bigram'] = df['question2_unigram'].apply(lambda x: [i for i in ngrams(x, 2)])

In [63]:
# New frame with subject and full_text combined into one column so that we can vectorize it;
# assign() returns a new DataFrame, so df itself is left untouched.
df2 = df.assign(subject_and_text=df["subject"] + " " + df["full_text"])

# Count unigrams and bigrams this time
vectorizer = CountVectorizer(ngram_range=(1,2))
subject_vector = vectorizer.fit_transform(df2["subject_and_text"])
subject_vector.shape

(1581, 108951)

In [64]:
# Features (X): unigram + bigram counts; target (y): the cart_abandon label
X = subject_vector
y = df2["cart_abandon"]

# Same reproducible train / test split as before
split = train_test_split(X, y, random_state=100)
X_train, X_test, y_train, y_test = split

# Refit the logistic regression and report accuracy on the test split
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.95959595959595956

In [59]:
# Confusion matrix for the unigram + bigram count model
# (rows = actual class, columns = predicted class)
confusion_matrix(y_test, lr.predict(X_test))

array([[332,   5],
       [ 11,  48]])

In [60]:
# New frame with subject and full_text combined into one column so that we can vectorize it;
# assign() returns a new DataFrame, so df itself is left untouched.
df2 = df.assign(subject_and_text=df["subject"] + " " + df["full_text"])

# TF-IDF over unigrams and bigrams this time
vectorizer = TfidfVectorizer(ngram_range=(1,2))
subject_vector = vectorizer.fit_transform(df2["subject_and_text"])
subject_vector.shape

(1581, 108951)

In [61]:
# Features (X): unigram + bigram TF-IDF weights; target (y): the cart_abandon label
X = subject_vector
y = df2["cart_abandon"]

# Same reproducible train / test split as before
split = train_test_split(X, y, random_state=100)
X_train, X_test, y_train, y_test = split

# Refit the logistic regression and report accuracy on the test split
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.88888888888888884

In [62]:
# Confusion matrix for the unigram + bigram TF-IDF model
# (rows = actual class, columns = predicted class)
confusion_matrix(y_test, lr.predict(X_test))

array([[337,   0],
       [ 44,  15]])

In [None]:
# from sklearn.manifold import TSNE

# tsne_model = TSNE(n_components=2, verbose=1, random_state=0, learning_rate=100)
# tsne_tfidf = tsne_model.fit_transform(subject_vector.toarray())

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
