In [1]:
import pandas as pd
import numpy as np
import sklearn
import re

In [2]:
df = pd.read_csv("./data/cart-abandon.csv")

In [3]:
df.shape

(4666, 9)

In [4]:
# data cleaning

# Remove null
df.dropna(inplace=True)

# Remove "?"
df = df[df.cart_abandon != "?"]

# Make sure we're using ints
df.cart_abandon = df.cart_abandon.apply(lambda x: int(x))

In [5]:
df["all_text"] = df["subject"] + " " + df["full_text"].fillna("")

In [6]:
df.head(2)

Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon,all_text
0,2582,3742,f3870de1-3ab6-3fed-3fe2-778a74f3197e,1/7/16 15:07,Welcome to Sephora Beauty Insider,"Lorem, you're a Beauty Insider. Web Version SE...",1,https://www.mailcharts.com/emails/f3870de1-3ab...,0,"Welcome to Sephora Beauty Insider Lorem, you'r..."
1,2582,3742,0880fd5c-fbc5-eeb2-5bd3-8e352eae2b70,1/8/16 17:28,"New year, new rewards","Lorem, the January rewards are here.** Web Ver...",2,https://www.mailcharts.com/emails/0880fd5c-fbc...,0,"New year, new rewards Lorem, the January rewar..."


In [7]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import ngrams
import string


# Steps: Clean up text, stemming, remove stop words and weird chars, tokenizer words

# punctuation = list(set(string.punctuation))
re_punctuation = "\#|\.|\>|\/|\)|\"|\(|\}|\'|\_|\-|\$|\:|\[|\^|\+|\?|\`|\~|\!|\<|\@|\;|\=|\*|\\\|\{|\&|\]|\||\,|\|"
stopwords_set = list(set(stopwords.words('english')))

def get_unigram_sentence(sentence):
    sentence_no_punc = re.sub(re_punctuation, " ", sentence)
    unigram = [word for word in word_tokenize(sentence_no_punc.lower()) if word not in stopwords_set]
    return unigram

In [8]:
df["tokenized_text"] = df.all_text.apply(lambda x: get_unigram_sentence(x))

In [9]:
df.head(2)

Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon,all_text,tokenized_text
0,2582,3742,f3870de1-3ab6-3fed-3fe2-778a74f3197e,1/7/16 15:07,Welcome to Sephora Beauty Insider,"Lorem, you're a Beauty Insider. Web Version SE...",1,https://www.mailcharts.com/emails/f3870de1-3ab...,0,"Welcome to Sephora Beauty Insider Lorem, you'r...","[welcome, sephora, beauty, insider, lorem, bea..."
1,2582,3742,0880fd5c-fbc5-eeb2-5bd3-8e352eae2b70,1/8/16 17:28,"New year, new rewards","Lorem, the January rewards are here.** Web Ver...",2,https://www.mailcharts.com/emails/0880fd5c-fbc...,0,"New year, new rewards Lorem, the January rewar...","[new, year, new, rewards, lorem, january, rewa..."


In [48]:
# from nltk.stem import SnowballStemmer
# from nltk.stem import WordNetLemmatizer

# snow = SnowballStemmer(language='english')
# stem = PorterStemmer()
# word = WordNetLemmatizer()

In [10]:
stemmer = PorterStemmer()

def get_stems(words):
    return [stemmer.stem(word) for word in words]

In [11]:
df["stemmed_tokens"] = df.tokenized_text.apply(lambda x: get_stems(x))

In [12]:
df["stemmed_text"] = df["stemmed_tokens"].apply(lambda x: " ".join(word for word in x))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf = tfidf.fit(df["stemmed_text"])
X = tfidf.transform(df["stemmed_text"])
X = X.toarray()

In [72]:
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

# We know we have 2 labels
lda = LDA(2)
lda.fit(X)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=2, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [73]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
    for i in topic.argsort()[:-top_n - 1:-1]])
        print("=" * 100)

In [74]:
print_topics(lda, tfidf)

Topic 0:
[('œâ', 42.261067068335578), ('de', 10.786949926770131), ('harrod', 3.7519011027823455), ('para', 3.5388111515864202), ('rma', 3.355619376483622), ('oliv', 3.0269897005417681), ('armament', 2.955870948807878), ('en', 2.7033534878571714), ('dafiti', 2.6090830625857544), ('tu', 2.6082079620309471)]
Topic 1:
[('email', 166.06755807926538), ('shop', 143.24441656288906), ('xxx', 138.93847089519465), ('com', 121.26218820122382), ('us', 117.63011084868855), ('order', 116.89001504493865), ('ship', 111.89316026181879), ('offer', 108.86971960435508), ('free', 105.40256830085487), ('pleas', 97.029312996402084)]


In [None]:
# nmf = NMF()
# nmf.fit(X)

In [None]:
# print_topics(nmf, tfidf)

In [14]:
# Use the features to fit supervised learning models for each feature set to predict the category outcomes.

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
y = df.cart_abandon
lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
lr.score(X, y)

0.93232131562302345

In [16]:
from sklearn.metrics import confusion_matrix

confusion_matrix(lr.predict(X), y)

array([[1316,  106],
       [   1,  158]])

In [25]:
example_x = tfidf.transform([df.stemmed_text[2]]).toarray()

In [26]:
lr.predict(example_x)

array([0])

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:
rf.score(X_test, y_test)

0.92929292929292928

In [30]:
confusion_matrix(rf.predict(X_test), y_test)

array([[332,  27],
       [  1,  36]])

In [31]:
rf.predict(example_x)

array([1])