In [40]:
import pandas as pd
import numpy as np
import sklearn

In [41]:
df = pd.read_csv("./data/cart-abandon.csv")

In [42]:
df.shape

(4666, 9)

In [43]:
df["all_text"] = df["subject"] + " " + df["full_text"].fillna("")

In [44]:
df.head(2)

Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon,all_text
0,2582,3742,f3870de1-3ab6-3fed-3fe2-778a74f3197e,1/7/16 15:07,Welcome to Sephora Beauty Insider,"Lorem, you're a Beauty Insider. Web Version SE...",1,https://www.mailcharts.com/emails/f3870de1-3ab...,0,"Welcome to Sephora Beauty Insider Lorem, you'r..."
1,2582,3742,0880fd5c-fbc5-eeb2-5bd3-8e352eae2b70,1/8/16 17:28,"New year, new rewards","Lorem, the January rewards are here.** Web Ver...",2,https://www.mailcharts.com/emails/0880fd5c-fbc...,0,"New year, new rewards Lorem, the January rewar..."


In [76]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import ngrams
import string


# Steps: Clean up text, stemming, remove stop words and weird chars, tokenizer words

# punctuation = list(set(string.punctuation))
re_punctuation = "\#|\.|\>|\/|\)|\"|\(|\}|\'|\_|\-|\$|\:|\[|\^|\+|\?|\`|\~|\!|\<|\@|\;|\=|\*|\\\|\{|\&|\]|\||\,|\|"
stopwords_set = list(set(stopwords.words('english')))

def get_unigram_sentence(sentence):
    sentence_no_punc = re.sub(re_punctuation, " ", sentence)
    unigram = [word for word in word_tokenize(sentence_no_punc.lower()) if word not in stopwords_set]
    return unigram

In [77]:
df["tokenized_text"] = df.all_text.apply(lambda x: get_unigram_sentence(x))

In [78]:
df.head(2)

Unnamed: 0,reg_id,add_id,email_guid,sent_at,subject,full_text,r,email_url,cart_abandon,all_text,tokenized_text,stemmed_tokens
0,2582,3742,f3870de1-3ab6-3fed-3fe2-778a74f3197e,1/7/16 15:07,Welcome to Sephora Beauty Insider,"Lorem, you're a Beauty Insider. Web Version SE...",1,https://www.mailcharts.com/emails/f3870de1-3ab...,0,"Welcome to Sephora Beauty Insider Lorem, you'r...","[welcome, sephora, beauty, insider, lorem, bea...","[welcom, sephora, beauti, insid, lorem, 're, b..."
1,2582,3742,0880fd5c-fbc5-eeb2-5bd3-8e352eae2b70,1/8/16 17:28,"New year, new rewards","Lorem, the January rewards are here.** Web Ver...",2,https://www.mailcharts.com/emails/0880fd5c-fbc...,0,"New year, new rewards Lorem, the January rewar...","[new, year, new, rewards, lorem, january, rewa...","[new, year, new, reward, lorem, januari, rewar..."


In [79]:
stemmer = PorterStemmer()

def get_stems(words):
    return [stemmer.stem(word) for word in words]

In [80]:
df["stemmed_tokens"] = df.tokenized_text.apply(lambda x: get_stems(x))

In [91]:
df["stemmed_text"] = df["stemmed_tokens"].apply(lambda x: " ".join(word for word in x))

In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["stemmed_text"])
X = X.toarray()

In [114]:
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA()
lda.fit(X)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [116]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
    for i in topic.argsort()[:-top_n - 1:-1]])
        print("=" * 100)

In [117]:
print_topics(lda, tfidf)

Topic 0:
[('shopbop', 2.75898794576303), ('lola', 1.7909009684246469), ('wigwam', 0.85717691783088545), ('¹ã', 0.73548630582397523), ('âºã', 0.7354842593903117), ('shopnicekick', 0.69220575079958191), ('5yp', 0.63933284654281608), ('s9j', 0.63933282964942451), ('l6v', 0.63933243342710921), ('yv1', 0.6393323065449773)]
Topic 1:
[('html', 27.820518055734379), ('version', 17.494157535777209), ('hammach', 4.7603372594176268), ('schlemmer', 2.7903521663425614), ('maxdeliveri', 2.1735532740091439), ('œnâ', 2.1432913414764045), ('thorlo', 2.0194295493759973), ('chesapeak', 2.0130716201603938), ('œaâ', 1.8513819881259777), ('wigwam', 1.4765636063212964)]
Topic 2:
[('rawl', 3.2670056900662301), ('bbcor', 1.1572385220075427), ('her', 0.98404196490944051), ('508', 0.84883538636880773), ('knockaroundâ', 0.75979221117168105), ('4327', 0.59505025986379745), ('demarini', 0.44822250693559917), ('510', 0.43711174358781052), ('63141', 0.4371100169886673), ('maryvil', 0.43710974039869888)]
Topic 3:
[('as