In [27]:
import pandas as pd
import numpy as np

import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import stopwords


from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

from tqdm import tqdm

import multiprocessing
cores = multiprocessing.cpu_count()

from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('train_data.csv')

In [5]:
## Drop nas in about

data.dropna(subset=['about'],inplace=True)
len(data)

12041

In [4]:
## Convert the text in 'about' to lowercase, remove punctuation

def cleanText(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return text

data['clean_about'] = data['about'].apply(cleanText)

In [8]:
## Create four equal sized buckets to be used as the predictive variable and add it to the dataframe

meme_popularity = pd.cut(data['target_variable'],
                    [0,data['target_variable'].quantile(0.25),data['target_variable'].quantile(0.5),data['target_variable'].quantile(0.75),max(data['target_variable'])],
                   labels = ['low','medium','high','great'])

data['popularity'] = meme_popularity

In [16]:
## Tokenize the clean text and remove stopwords

def tokenize_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            if word not in stop_words:
                tokens.append(word.lower())
    return tokens

In [17]:
## Tag the tokenized document with the populariity associated with each document

about_tagged = data.apply(
    lambda t: TaggedDocument(words=tokenize_text(t['clean_about']), tags=t.popularity), axis=1)

In [20]:
## Build a Distributed Bag-of-words model and build the vocab

model_dbow = Doc2Vec(dm=0, vector_size=15, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(about_tagged.values)])

100%|██████████| 12041/12041 [00:00<00:00, 2700294.84it/s]


In [22]:
## Train the model we built above for 30 epochs

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(about_tagged.values)]), total_examples=len(about_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 12041/12041 [00:00<00:00, 2605696.75it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3569664.58it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3688010.40it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3322606.21it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3539394.10it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3442176.56it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3490952.82it/s]
100%|██████████| 12041/12041 [00:00<00:00, 2995114.13it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3131424.51it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3859067.35it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3904113.67it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3550841.21it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3397485.00it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3329835.46it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3690166.19it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3496995.88it/s]
100%|██████████| 12041/12041 [00:00<00:00, 3013162.37it/

In [23]:
## Build the Final Vector feature we can use for classification

def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [24]:
y_train, X_train = vec_for_learning(model_dbow, about_tagged)

In [26]:
## Let's try it out using a simple LogisticRegression

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_train)



In [28]:
## Testing the accuracy on training set

accuracy_score(y_train, y_pred)

0.33385931401046426

In [30]:
## Take a look at the embedding

X_train[0]

array([-0.83649707, -1.2550204 ,  0.6318553 , -3.3285842 , -1.0448722 ,
       -3.2441492 ,  0.81032354, -2.7630506 ,  0.7470689 , -2.5316715 ,
       -1.3190255 ,  1.8444138 ,  1.0105989 , -0.20433724, -3.5171907 ],
      dtype=float32)

### Each document has been converted to a vector of length 15 

### X_train contains arrays of tuples each tuple consists of the embedding of the doc