## Applying Deep Learning - Word2Vec and Doc2Vec

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import numpy as np
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import nltk
from gensim.models import Word2Vec
import spacy
import en_core_web_sm
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
from pandas import Panel

  from pandas import Panel


In [2]:
movie_cleaned = pd.read_csv('../IMDB/movie_cleaned.csv')
review_cleaned = pd.read_csv('../IMDB/review_cleaned.csv')
movie_title_cleaned = pd.read_csv('../IMDB/movie_titles_cleaned.csv')
data = pd.merge(review_cleaned, movie_cleaned, on='movie_id')
df = pd.merge(data, movie_title_cleaned, on='movie_id')

In [21]:
def confusion_table(test_y, pred_y):
    tn, fp, fn, tp = confusion_matrix(test_y, pred_y).ravel()
    cm = pd.DataFrame(columns=['Predicted Negative','Predicted Positive'], index=['Actual Negative', 'Actual Positive'])
    cm['Predicted Positive'][1] = tp
    cm['Predicted Positive'][0] = fp
    cm['Predicted Negative'][1] = fn
    cm['Predicted Negative'][0] = tn
    
    print("True Negatives: %s" % tn)
    print("False Negatives: %s" % fn)
    print("False Positives: %s" % fp)
    print("True Positives: %s" % tp)
    return cm

### Word2Vec on one movie

In [14]:
list_movies = list(df['movie_title'].value_counts()[:1].index)
one_movie = df[df['movie_title'].isin(list_movies)]
one_movie = one_movie[['review', 'is_spoiler']]

In [16]:
train_data, test_data = train_test_split(one_movie, test_size=0.2, random_state=42)

In [18]:
%%time 
wv = gensim.models.KeyedVectors.load_word2vec_format("../IMDB/GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)

Wall time: 3min 52s


In [125]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.vectors_norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        return np.zeros(wv.layer1_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, review) for review in text_list ])

In [20]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [127]:
test_tokenized = test_data.apply(lambda r: w2v_tokenize_text(r['review']), axis=1).values
train_tokenized = train_data.apply(lambda r: w2v_tokenize_text(r['review']), axis=1).values

In [23]:
%%time
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

Wall time: 6.37 s


In [24]:
train_tokenized[0]

['My',
 'favorite',
 'superhero',
 'movie',
 'One',
 'of',
 'my',
 'favorite',
 'movies',
 'ever',
 'and',
 'my',
 'favorite',
 'superhero',
 'movie',
 'The',
 'Dark',
 'Knight',
 'shows',
 'the',
 'potential',
 'of',
 'Batman',
 'movie',
 'The',
 'film',
 'has',
 'some',
 'flaws',
 'but',
 'those',
 'are',
 'easily',
 'glossed',
 'over',
 'with',
 'the',
 'brilliant',
 'performance',
 'of',
 'Heath',
 'Ledger',
 'as',
 'the',
 'Joker',
 'for',
 'which',
 'he',
 'won',
 'an',
 'Academy',
 'Award',
 'Christian',
 'Bale',
 'returns',
 'to',
 'play',
 'the',
 'Batman',
 'and',
 'though',
 'this',
 'movie',
 'continues',
 'the',
 'story',
 'that',
 'started',
 'with',
 'Batman',
 'Begins',
 'it',
 'feels',
 'like',
 'wholly',
 'separate',
 'film',
 'that',
 'stands',
 'easily',
 'on',
 'its',
 'own',
 'The',
 'movie',
 'starts',
 'with',
 'very',
 'cool',
 'robbery',
 'scene',
 'remember',
 'this',
 'scene',
 'being',
 'released',
 'early',
 'to',
 'build',
 'excitement',
 'and',
 'watched

In [152]:
%%time
w2v_lr = LogisticRegression(random_state=42, max_iter=1000, class_weight={1:1.75})
w2v_lr.fit(X_train_word_average, train_data['is_spoiler'])

Wall time: 221 ms


LogisticRegression(C=1.0, class_weight={1: 1.75}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [153]:
print('Train accuracy score:', round(w2v_lr.score(X_train_word_average, train_data['is_spoiler'])*100,4),'%')
print('Test accuracy score:', round(w2v_lr.score(X_test_word_average, test_data['is_spoiler'])*100,4),'%')

Train accuracy score: 72.3168 %
Test accuracy score: 70.6914 %


In [154]:
w2v_predict = w2v_lr.predict(X_test_word_average)

In [155]:
confusion_table(test_data['is_spoiler'], w2v_predict)

True Negatives: 538
False Negatives: 167
False Positives: 117
True Positives: 147


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,538,117
Actual Positive,167,147


### Word2Vec on top 10 movie reviews from the most popular genre

In [4]:
list_movies_genre = list(df['genre'].value_counts()[:1].index)
many_movies_genre = df[df['genre'].isin(list_movies_genre)]

In [107]:
list_movies = list(many_movies_genre['movie_title'].value_counts()[0:10].index)
many_movies = many_movies_genre[many_movies_genre['movie_title'].isin(list_movies)]
many_movies = many_movies[['review','is_spoiler']]
train_many, test_many = train_test_split(many_movies, test_size=0.2, random_state=42)

In [108]:
train_tokenized_many = train_many.apply(lambda r: w2v_tokenize_text(r['review']), axis=1).values
test_tokenized_many = test_many.apply(lambda r: w2v_tokenize_text(r['review']), axis=1).values


In [109]:
%%time
X_train_word_ave_many = word_averaging_list(wv,train_tokenized_many)
X_test_word_ave_many = word_averaging_list(wv,test_tokenized_many)

Wall time: 7.94 s


In [110]:
%%time
w2v_many_lr = LogisticRegression(random_state=42, max_iter=1000, class_weight={1:1.75})
w2v_many_lr.fit(X_train_word_ave_many, train_many['is_spoiler'])

Wall time: 283 ms


LogisticRegression(C=1.0, class_weight={1: 1.75}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [111]:
print('Train accuracy score:', round(w2v_many_lr.score(X_train_word_ave_many, train_many['is_spoiler'])*100,4),'%')
print('Test accuracy score:', round(w2v_many_lr.score(X_test_word_ave_many, test_many['is_spoiler'])*100,4),'%')
w2v_many_predict = w2v_many_lr.predict(X_test_word_ave_many)

Train accuracy score: 68.9907 %
Test accuracy score: 67.8426 %


In [112]:
confusion_table(test_many['is_spoiler'], w2v_many_predict)

True Negatives: 653
False Negatives: 188
False Positives: 286
True Positives: 347


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,653,286
Actual Positive,188,347


### Doc2Vec on one movie's reviews

In [134]:
def label_sentences(corpus, label_type):
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(gensim.models.doc2vec.TaggedDocument(v.split(), [label]))
    return labeled


In [135]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [136]:
X_train, X_test, y_train, y_test = train_test_split(one_movie['review'], one_movie['is_spoiler'], random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [137]:
model_dbow = Doc2Vec(dm=0, vector_size=400, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|█████████████████████████████████████████████████████████████████████████| 4845/4845 [00:00<00:00, 1214595.83it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4845/4845 [00:00<00:00, 2425567.30it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4845/4845 [00:00<00:00, 4855771.30it/s]
100%|██████████████████████████████████████████████████████████████████████████| 4845/4845 [00:00<00:00, 808683.31it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4845/4845 [00:00<00:00, 1619235.29it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4845/4845 [00:00<00:00, 2434575.64it/s]
100%|██████████████████████████████████████████████████████████████████████████| 4845/4845 [00:00<00:00, 901810.73it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4845/4845 [00:00<00:00, 2425856.86it/s]
100%|███████████████████████████████████

In [138]:
   
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 400, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 400, 'Test')

In [139]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)
logreg = logreg.fit(train_vectors_dbow, y_train)
y_pred = logreg.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6891334250343879
              precision    recall  f1-score   support

           0       0.75      0.84      0.79      1012
           1       0.48      0.35      0.40       442

    accuracy                           0.69      1454
   macro avg       0.62      0.59      0.60      1454
weighted avg       0.67      0.69      0.67      1454



In [140]:
confusion_table(y_test, y_pred)

True Negatives: 849
False Negatives: 289
False Positives: 163
True Positives: 153


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,849,163
Actual Positive,289,153


### Dov2Vec on top 10 movie reviews from the most popular genre

In [144]:
X_many_train = train_many['review']
X_many_test = test_many['review']
y_many_train = train_many['is_spoiler']
y_many_test = test_many['is_spoiler']

In [145]:
X_many_train = label_sentences(X_many_train, 'Train')
X_many_test = label_sentences(X_many_test, 'Test')
many_data = X_many_train + X_many_test

In [146]:
model_dbow_many = Doc2Vec(dm=0, vector_size=400, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow_many.build_vocab([x for x in tqdm(many_data)])

for epoch in range(20):
    model_dbow_many.train(utils.shuffle([x for x in tqdm(many_data)]), total_examples=len(many_data), epochs=1)
    model_dbow_many.alpha -= 0.002
    model_dbow_many.min_alpha = model_dbow_many.alpha

100%|█████████████████████████████████████████████████████████████████████████| 7369/7369 [00:00<00:00, 2463560.19it/s]
100%|█████████████████████████████████████████████████████████████████████████| 7369/7369 [00:00<00:00, 3692691.30it/s]
100%|█████████████████████████████████████████████████████████████████████████| 7369/7369 [00:00<00:00, 3694456.87it/s]
100%|█████████████████████████████████████████████████████████████████████████| 7369/7369 [00:00<00:00, 1477782.75it/s]
100%|█████████████████████████████████████████████████████████████████████████| 7369/7369 [00:00<00:00, 1231387.50it/s]
100%|█████████████████████████████████████████████████████████████████████████| 7369/7369 [00:00<00:00, 2463363.85it/s]
100%|█████████████████████████████████████████████████████████████████████████| 7369/7369 [00:00<00:00, 1231338.44it/s]
100%|█████████████████████████████████████████████████████████████████████████| 7369/7369 [00:00<00:00, 2461990.30it/s]
100%|███████████████████████████████████

In [147]:
train_vectors_dbow_many = get_vectors(model_dbow_many, len(X_many_train), 400, 'Train')
test_vectors_dbow_many = get_vectors(model_dbow_many, len(X_many_test), 400, 'Test')

In [148]:
logreg_many = LogisticRegression(n_jobs=1, C=1e5)
logreg_many.fit(train_vectors_dbow_many, y_many_train)
logreg_many = logreg_many.fit(train_vectors_dbow_many, y_many_train)
y_pred_many = logreg_many.predict(test_vectors_dbow_many)
print('accuracy %s' % accuracy_score(y_pred_many, y_many_test))
print(classification_report(y_many_test, y_pred_many))

accuracy 0.7008141112618724
              precision    recall  f1-score   support

           0       0.73      0.83      0.78       939
           1       0.61      0.47      0.53       535

    accuracy                           0.70      1474
   macro avg       0.67      0.65      0.66      1474
weighted avg       0.69      0.70      0.69      1474



In [149]:
confusion_table(y_many_test, y_pred_many)

True Negatives: 781
False Negatives: 283
False Positives: 158
True Positives: 252


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,781,158
Actual Positive,283,252
