In [38]:
#!pip install gensim

In [89]:
import pandas as pd
import numpy as np
import re
import multiprocessing

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [43]:
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,length of text,sentiment
0,Match 1: Tag Team Table Match Bubba Ray and Sp...,13704,positive
1,There's a sign on The Lost Highway that says:<...,12988,positive
2,"(Some spoilers included:)<br /><br />Although,...",12930,positive
3,"Back in the mid/late 80s, an OAV anime by titl...",12129,positive
4,**Attention Spoilers**<br /><br />First of all...,10363,positive


In [30]:
SAMPLE_FRAC = 0.80
text_train = df[["review"]].sample(frac=SAMPLE_FRAC, random_state=10)
text_train.head()

Unnamed: 0,review
27632,I was very displeased with this move. Everythi...
36119,Eric Rohmer's 'The Lady and the Duke' is based...
4796,"OK, first of all, who in their right mind woul..."
3648,"For their credit, this is one of their more co..."
24501,This was recommended to me by a friend that sa...


In [32]:
def text_to_word_list(text):
    """
    Preprocess and convert texts to a list of words 
    """
    text = str(text)
    text = text.lower()
    
    # remove html tags
    text = re.sub(r'<[^>]+>', '', text)
    # remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Single character removal
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    text = text.split()
    
    return text

In [33]:
text_train["review"] = text_train["review"].apply(lambda x: text_to_word_list(x))

In [34]:
text_train["review"][0]

['match',
 'tag',
 'team',
 'table',
 'match',
 'bubba',
 'ray',
 'and',
 'spike',
 'dudley',
 'vs',
 'eddie',
 'guerrero',
 'and',
 'chris',
 'benoit',
 'bubba',
 'ray',
 'and',
 'spike',
 'dudley',
 'started',
 'things',
 'off',
 'with',
 'tag',
 'team',
 'table',
 'match',
 'against',
 'eddie',
 'guerrero',
 'and',
 'chris',
 'benoit',
 'according',
 'to',
 'the',
 'rules',
 'of',
 'the',
 'match',
 'both',
 'opponents',
 'have',
 'to',
 'go',
 'through',
 'tables',
 'in',
 'order',
 'to',
 'get',
 'the',
 'win',
 'benoit',
 'and',
 'guerrero',
 'heated',
 'up',
 'early',
 'on',
 'by',
 'taking',
 'turns',
 'hammering',
 'first',
 'spike',
 'and',
 'then',
 'bubba',
 'ray',
 'german',
 'suplex',
 'by',
 'benoit',
 'to',
 'bubba',
 'took',
 'the',
 'wind',
 'out',
 'of',
 'the',
 'dudley',
 'brother',
 'spike',
 'tried',
 'to',
 'help',
 'his',
 'brother',
 'but',
 'the',
 'referee',
 'restrained',
 'him',
 'while',
 'benoit',
 'and',
 'guerrero',
 'ganged',
 'up',
 'on',
 'him',
 'i

In [41]:
#prepare the data for word2vec model
text = [row for row in text_train["review"]]
phrases = Phrases(text, min_count=1)
bigram = Phraser(phrases)
sentences = bigram[text]
sentences[0]

INFO - 15:41:20: collecting all words and their counts
INFO - 15:41:20: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 15:41:24: PROGRESS: at sentence #10000, processed 2210561 words and 778731 word types
INFO - 15:41:28: PROGRESS: at sentence #20000, processed 4418661 words and 1286839 word types
INFO - 15:41:32: PROGRESS: at sentence #30000, processed 6627240 words and 1719575 word types
INFO - 15:41:36: collected 2107879 word types from a corpus of 8831199 words (unigram + bigrams) and 40000 sentences
INFO - 15:41:36: using 2107879 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 15:41:36: source_vocab length 2107879
INFO - 15:41:43: Phraser added 50000 phrasegrams
INFO - 15:41:59: Phraser built with 84276 phrasegrams


['i',
 'was',
 'very',
 'displeased_with',
 'this',
 'move',
 'everything',
 'was',
 'terrible',
 'from',
 'the',
 'start',
 'the',
 'comedy',
 'was',
 'unhumorous',
 'the',
 'action',
 'overdone',
 'the',
 'songs',
 'unmelodious',
 'even',
 'the',
 'storyline',
 'was',
 'weightless',
 'from',
 'writer',
 'who',
 'has',
 'written',
 'successful',
 'scripts',
 'like',
 'guru',
 'and',
 'dhoom',
 'had',
 'high_expectations',
 'the',
 'actors',
 'worked',
 'way',
 'too',
 'hard',
 'and',
 'did',
 'not',
 'help',
 'the',
 'film',
 'at',
 'all',
 'of',
 'course',
 'kareena',
 'rocked',
 'the',
 'screen',
 'in',
 'bikini',
 'but',
 'for',
 'two',
 'seconds',
 'think',
 'hindi',
 'stunt',
 'directors',
 'should',
 'research',
 'how',
 'action',
 'movies',
 'are',
 'done',
 'they',
 'tend',
 'to',
 'exaggerate',
 'way',
 'too_much',
 'in',
 'chinese',
 'films',
 'this',
 'style',
 'works',
 'because',
 'that',
 'is',
 'their',
 'signature_piece',
 'but',
 'hindi_cinema',
 'signature',
 'are',


# Word2vec

In [45]:
# build vocab bag

#300 dimensional embeddings
#lookup window equal to 4
#negative sampling set to 20 words
#sub-sampling set to 1e-5
#learning rate from 0.03 to 0.0007

w2v_model = Word2Vec(min_count = 3,
                     window = 4,
                     size = 300,
                     sample = 1e-5, 
                     alpha = 0.03, 
                     min_alpha = 0.0007, 
                     negative = 20,
                     workers = multiprocessing.cpu_count()-1)

start = time()

w2v_model.build_vocab(sentences)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 15:52:56: collecting all words and their counts
INFO - 15:52:56: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 15:53:02: PROGRESS: at sentence #10000, processed 2030731 words, keeping 93382 word types
INFO - 15:53:07: PROGRESS: at sentence #20000, processed 4058268 words, keeping 129212 word types
INFO - 15:53:13: PROGRESS: at sentence #30000, processed 6086457 words, keeping 151913 word types
INFO - 15:53:19: collected 167138 word types from a corpus of 8111456 raw words and 40000 sentences
INFO - 15:53:19: Loading a fresh vocabulary
INFO - 15:53:20: effective_min_count=3 retains 81792 unique words (48% of original 167138, drops 85346)
INFO - 15:53:20: effective_min_count=3 leaves 7982878 word corpus (98% of original 8111456, drops 128578)
INFO - 15:53:20: deleting the raw counts dictionary of 167138 items
INFO - 15:53:20: sample=1e-05 downsamples 2849 most-common words
INFO - 15:53:20: downsampling leaves estimated 2351567 word corpus (29.5% of prior

Time to build vocab: 0.43 mins


In [46]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 15:55:26: training model with 3 workers on 81792 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 15:55:27: EPOCH 1 - PROGRESS: at 2.53% examples, 58393 words/s, in_qsize 0, out_qsize 0
INFO - 15:55:28: EPOCH 1 - PROGRESS: at 5.45% examples, 64355 words/s, in_qsize 1, out_qsize 0
INFO - 15:55:29: EPOCH 1 - PROGRESS: at 8.46% examples, 65886 words/s, in_qsize 0, out_qsize 0
INFO - 15:55:30: EPOCH 1 - PROGRESS: at 11.49% examples, 66882 words/s, in_qsize 0, out_qsize 0
INFO - 15:55:31: EPOCH 1 - PROGRESS: at 14.97% examples, 68885 words/s, in_qsize 0, out_qsize 0
INFO - 15:55:32: EPOCH 1 - PROGRESS: at 17.94% examples, 68662 words/s, in_qsize 0, out_qsize 0
INFO - 15:55:33: EPOCH 1 - PROGRESS: at 20.89% examples, 68388 words/s, in_qsize 0, out_qsize 0
INFO - 15:55:34: EPOCH 1 - PROGRESS: at 23.81% examples, 68181 words/s, in_qsize 0, out_qsize 0
INFO - 15:55:35: EPOCH 1 - PROGRESS: at 26.52% examples, 67996 words/s, in_qsize 0, out_qsize 0
INFO

INFO - 15:56:47: EPOCH 3 - PROGRESS: at 16.10% examples, 52194 words/s, in_qsize 0, out_qsize 0
INFO - 15:56:48: EPOCH 3 - PROGRESS: at 18.19% examples, 51470 words/s, in_qsize 0, out_qsize 0
INFO - 15:56:49: EPOCH 3 - PROGRESS: at 20.53% examples, 51514 words/s, in_qsize 0, out_qsize 0
INFO - 15:56:50: EPOCH 3 - PROGRESS: at 23.35% examples, 52677 words/s, in_qsize 0, out_qsize 0
INFO - 15:56:51: EPOCH 3 - PROGRESS: at 25.83% examples, 53294 words/s, in_qsize 0, out_qsize 0
INFO - 15:56:52: EPOCH 3 - PROGRESS: at 28.52% examples, 54047 words/s, in_qsize 0, out_qsize 0
INFO - 15:56:53: EPOCH 3 - PROGRESS: at 31.12% examples, 54235 words/s, in_qsize 0, out_qsize 0
INFO - 15:56:54: EPOCH 3 - PROGRESS: at 33.63% examples, 54534 words/s, in_qsize 1, out_qsize 0
INFO - 15:56:55: EPOCH 3 - PROGRESS: at 36.10% examples, 54687 words/s, in_qsize 1, out_qsize 0
INFO - 15:56:56: EPOCH 3 - PROGRESS: at 38.68% examples, 54799 words/s, in_qsize 0, out_qsize 0
INFO - 15:56:57: EPOCH 3 - PROGRESS: at 

INFO - 15:58:09: EPOCH 5 - PROGRESS: at 5.82% examples, 45036 words/s, in_qsize 1, out_qsize 0
INFO - 15:58:10: EPOCH 5 - PROGRESS: at 7.73% examples, 45059 words/s, in_qsize 0, out_qsize 0
INFO - 15:58:11: EPOCH 5 - PROGRESS: at 10.30% examples, 47961 words/s, in_qsize 1, out_qsize 0
INFO - 15:58:12: EPOCH 5 - PROGRESS: at 13.25% examples, 51067 words/s, in_qsize 0, out_qsize 0
INFO - 15:58:13: EPOCH 5 - PROGRESS: at 16.10% examples, 52911 words/s, in_qsize 0, out_qsize 0
INFO - 15:58:14: EPOCH 5 - PROGRESS: at 19.10% examples, 54771 words/s, in_qsize 0, out_qsize 0
INFO - 15:58:15: EPOCH 5 - PROGRESS: at 21.84% examples, 55978 words/s, in_qsize 0, out_qsize 0
INFO - 15:58:16: EPOCH 5 - PROGRESS: at 24.78% examples, 57032 words/s, in_qsize 0, out_qsize 0
INFO - 15:58:17: EPOCH 5 - PROGRESS: at 27.67% examples, 58126 words/s, in_qsize 0, out_qsize 0
INFO - 15:58:18: EPOCH 5 - PROGRESS: at 30.64% examples, 58928 words/s, in_qsize 0, out_qsize 0
INFO - 15:58:19: EPOCH 5 - PROGRESS: at 33

INFO - 15:59:31: EPOCH 7 - PROGRESS: at 30.29% examples, 62950 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:32: EPOCH 7 - PROGRESS: at 32.35% examples, 61453 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:33: EPOCH 7 - PROGRESS: at 34.38% examples, 60237 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:34: EPOCH 7 - PROGRESS: at 36.35% examples, 59044 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:35: EPOCH 7 - PROGRESS: at 38.68% examples, 58689 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:36: EPOCH 7 - PROGRESS: at 41.28% examples, 58871 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:37: EPOCH 7 - PROGRESS: at 43.31% examples, 58134 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:38: EPOCH 7 - PROGRESS: at 46.02% examples, 58347 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:39: EPOCH 7 - PROGRESS: at 48.49% examples, 58134 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:40: EPOCH 7 - PROGRESS: at 50.28% examples, 57336 words/s, in_qsize 0, out_qsize 0
INFO - 15:59:41: EPOCH 7 - PROGRESS: at 

INFO - 16:00:54: EPOCH 9 - PROGRESS: at 39.24% examples, 69067 words/s, in_qsize 0, out_qsize 0
INFO - 16:00:55: EPOCH 9 - PROGRESS: at 42.24% examples, 69283 words/s, in_qsize 0, out_qsize 0
INFO - 16:00:56: EPOCH 9 - PROGRESS: at 45.16% examples, 69229 words/s, in_qsize 0, out_qsize 0
INFO - 16:00:57: EPOCH 9 - PROGRESS: at 48.12% examples, 69185 words/s, in_qsize 0, out_qsize 0
INFO - 16:00:58: EPOCH 9 - PROGRESS: at 51.23% examples, 69192 words/s, in_qsize 0, out_qsize 0
INFO - 16:00:59: EPOCH 9 - PROGRESS: at 54.20% examples, 69195 words/s, in_qsize 0, out_qsize 0
INFO - 16:01:00: EPOCH 9 - PROGRESS: at 57.27% examples, 69191 words/s, in_qsize 0, out_qsize 0
INFO - 16:01:01: EPOCH 9 - PROGRESS: at 60.28% examples, 69227 words/s, in_qsize 0, out_qsize 0
INFO - 16:01:02: EPOCH 9 - PROGRESS: at 63.25% examples, 69286 words/s, in_qsize 0, out_qsize 0
INFO - 16:01:03: EPOCH 9 - PROGRESS: at 66.16% examples, 69192 words/s, in_qsize 0, out_qsize 0
INFO - 16:01:04: EPOCH 9 - PROGRESS: at 

INFO - 16:02:15: EPOCH 11 - PROGRESS: at 20.39% examples, 46238 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:16: EPOCH 11 - PROGRESS: at 23.10% examples, 47652 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:17: EPOCH 11 - PROGRESS: at 25.95% examples, 49212 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:18: EPOCH 11 - PROGRESS: at 28.77% examples, 50487 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:19: EPOCH 11 - PROGRESS: at 31.61% examples, 51423 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:20: EPOCH 11 - PROGRESS: at 34.38% examples, 52201 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:21: EPOCH 11 - PROGRESS: at 36.73% examples, 52203 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:22: EPOCH 11 - PROGRESS: at 39.50% examples, 52958 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:23: EPOCH 11 - PROGRESS: at 42.02% examples, 53304 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:24: EPOCH 11 - PROGRESS: at 44.20% examples, 53213 words/s, in_qsize 0, out_qsize 0
INFO - 16:02:25: EPOCH 11 - PR

INFO - 16:03:37: EPOCH 13 - PROGRESS: at 39.14% examples, 64118 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:38: EPOCH 13 - PROGRESS: at 42.12% examples, 64499 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:39: EPOCH 13 - PROGRESS: at 45.16% examples, 64814 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:40: EPOCH 13 - PROGRESS: at 48.24% examples, 65040 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:41: EPOCH 13 - PROGRESS: at 51.34% examples, 65238 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:42: EPOCH 13 - PROGRESS: at 54.33% examples, 65428 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:43: EPOCH 13 - PROGRESS: at 57.40% examples, 65686 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:44: EPOCH 13 - PROGRESS: at 60.41% examples, 65892 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:45: EPOCH 13 - PROGRESS: at 63.35% examples, 66025 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:46: EPOCH 13 - PROGRESS: at 66.39% examples, 66162 words/s, in_qsize 0, out_qsize 0
INFO - 16:03:47: EPOCH 13 - PR

INFO - 16:04:59: EPOCH 15 - PROGRESS: at 26.88% examples, 69082 words/s, in_qsize 0, out_qsize 0
INFO - 16:05:00: EPOCH 15 - PROGRESS: at 30.03% examples, 69196 words/s, in_qsize 0, out_qsize 0
INFO - 16:05:01: EPOCH 15 - PROGRESS: at 33.05% examples, 69221 words/s, in_qsize 0, out_qsize 0
INFO - 16:05:02: EPOCH 15 - PROGRESS: at 36.23% examples, 69593 words/s, in_qsize 0, out_qsize 0
INFO - 16:05:03: EPOCH 15 - PROGRESS: at 38.80% examples, 68802 words/s, in_qsize 1, out_qsize 0
INFO - 16:05:04: EPOCH 15 - PROGRESS: at 41.05% examples, 67668 words/s, in_qsize 0, out_qsize 0
INFO - 16:05:05: EPOCH 15 - PROGRESS: at 43.10% examples, 66414 words/s, in_qsize 1, out_qsize 0
INFO - 16:05:06: EPOCH 15 - PROGRESS: at 45.64% examples, 65751 words/s, in_qsize 0, out_qsize 0
INFO - 16:05:07: EPOCH 15 - PROGRESS: at 47.51% examples, 64256 words/s, in_qsize 0, out_qsize 0
INFO - 16:05:08: EPOCH 15 - PROGRESS: at 49.80% examples, 63672 words/s, in_qsize 0, out_qsize 0
INFO - 16:05:09: EPOCH 15 - PR

INFO - 16:06:19: EPOCH 17 - PROGRESS: at 33.16% examples, 63579 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:20: EPOCH 17 - PROGRESS: at 35.31% examples, 62689 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:22: EPOCH 17 - PROGRESS: at 38.04% examples, 62648 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:23: EPOCH 17 - PROGRESS: at 41.05% examples, 63274 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:24: EPOCH 17 - PROGRESS: at 43.91% examples, 63447 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:25: EPOCH 17 - PROGRESS: at 47.05% examples, 63733 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:26: EPOCH 17 - PROGRESS: at 49.69% examples, 63564 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:27: EPOCH 17 - PROGRESS: at 52.64% examples, 63641 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:28: EPOCH 17 - PROGRESS: at 55.72% examples, 63964 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:29: EPOCH 17 - PROGRESS: at 58.69% examples, 64302 words/s, in_qsize 0, out_qsize 0
INFO - 16:06:30: EPOCH 17 - PR

INFO - 16:07:40: EPOCH 19 - PROGRESS: at 63.46% examples, 65966 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:41: EPOCH 19 - PROGRESS: at 66.16% examples, 65780 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:42: EPOCH 19 - PROGRESS: at 68.84% examples, 65587 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:43: EPOCH 19 - PROGRESS: at 71.41% examples, 65318 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:44: EPOCH 19 - PROGRESS: at 74.27% examples, 65412 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:45: EPOCH 19 - PROGRESS: at 76.95% examples, 65230 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:46: EPOCH 19 - PROGRESS: at 79.80% examples, 65149 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:47: EPOCH 19 - PROGRESS: at 82.60% examples, 65062 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:48: EPOCH 19 - PROGRESS: at 85.33% examples, 65025 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:49: EPOCH 19 - PROGRESS: at 88.24% examples, 65082 words/s, in_qsize 0, out_qsize 0
INFO - 16:07:50: EPOCH 19 - PR

INFO - 16:09:02: EPOCH 21 - PROGRESS: at 67.85% examples, 55546 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:03: EPOCH 21 - PROGRESS: at 70.29% examples, 55504 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:04: EPOCH 21 - PROGRESS: at 72.95% examples, 55702 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:05: EPOCH 21 - PROGRESS: at 75.47% examples, 55813 words/s, in_qsize 1, out_qsize 0
INFO - 16:09:06: EPOCH 21 - PROGRESS: at 78.16% examples, 55990 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:07: EPOCH 21 - PROGRESS: at 80.69% examples, 55946 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:08: EPOCH 21 - PROGRESS: at 83.43% examples, 56153 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:09: EPOCH 21 - PROGRESS: at 86.16% examples, 56335 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:10: EPOCH 21 - PROGRESS: at 87.88% examples, 55844 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:11: EPOCH 21 - PROGRESS: at 90.50% examples, 55948 words/s, in_qsize 0, out_qsize 0
INFO - 16:09:12: EPOCH 21 - PR

INFO - 16:10:23: EPOCH 23 - PROGRESS: at 63.46% examples, 60739 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:25: EPOCH 23 - PROGRESS: at 65.03% examples, 59725 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:26: EPOCH 23 - PROGRESS: at 67.26% examples, 59341 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:27: EPOCH 23 - PROGRESS: at 69.79% examples, 59238 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:28: EPOCH 23 - PROGRESS: at 72.17% examples, 58972 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:29: EPOCH 23 - PROGRESS: at 74.73% examples, 59085 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:30: EPOCH 23 - PROGRESS: at 77.68% examples, 59348 words/s, in_qsize 1, out_qsize 0
INFO - 16:10:31: EPOCH 23 - PROGRESS: at 80.42% examples, 59423 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:32: EPOCH 23 - PROGRESS: at 83.20% examples, 59554 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:33: EPOCH 23 - PROGRESS: at 85.95% examples, 59709 words/s, in_qsize 0, out_qsize 0
INFO - 16:10:34: EPOCH 23 - PR

INFO - 16:11:44: EPOCH 25 - PROGRESS: at 55.97% examples, 53397 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:45: EPOCH 25 - PROGRESS: at 58.57% examples, 53738 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:46: EPOCH 25 - PROGRESS: at 61.23% examples, 54058 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:47: EPOCH 25 - PROGRESS: at 63.95% examples, 54457 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:48: EPOCH 25 - PROGRESS: at 67.01% examples, 55036 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:49: EPOCH 25 - PROGRESS: at 69.79% examples, 55310 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:50: EPOCH 25 - PROGRESS: at 72.61% examples, 55610 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:51: EPOCH 25 - PROGRESS: at 75.25% examples, 55816 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:52: EPOCH 25 - PROGRESS: at 77.93% examples, 55945 words/s, in_qsize 0, out_qsize 0
INFO - 16:11:53: EPOCH 25 - PROGRESS: at 80.29% examples, 55863 words/s, in_qsize 1, out_qsize 0
INFO - 16:11:54: EPOCH 25 - PR

INFO - 16:13:05: EPOCH 27 - PROGRESS: at 82.11% examples, 70106 words/s, in_qsize 0, out_qsize 0
INFO - 16:13:06: EPOCH 27 - PROGRESS: at 85.22% examples, 70136 words/s, in_qsize 0, out_qsize 0
INFO - 16:13:07: EPOCH 27 - PROGRESS: at 88.24% examples, 70113 words/s, in_qsize 0, out_qsize 0
INFO - 16:13:08: EPOCH 27 - PROGRESS: at 91.33% examples, 70153 words/s, in_qsize 0, out_qsize 0
INFO - 16:13:09: EPOCH 27 - PROGRESS: at 94.44% examples, 70182 words/s, in_qsize 0, out_qsize 0
INFO - 16:13:10: EPOCH 27 - PROGRESS: at 97.39% examples, 70190 words/s, in_qsize 0, out_qsize 0
INFO - 16:13:10: worker thread finished; awaiting finish of 2 more threads
INFO - 16:13:10: worker thread finished; awaiting finish of 1 more threads
INFO - 16:13:10: worker thread finished; awaiting finish of 0 more threads
INFO - 16:13:10: EPOCH - 27 : training on 8111456 raw words (2351332 effective words) took 33.5s, 70274 effective words/s
INFO - 16:13:12: EPOCH 28 - PROGRESS: at 3.00% examples, 68399 words/s,

INFO - 16:14:26: worker thread finished; awaiting finish of 2 more threads
INFO - 16:14:26: worker thread finished; awaiting finish of 1 more threads
INFO - 16:14:26: worker thread finished; awaiting finish of 0 more threads
INFO - 16:14:26: EPOCH - 29 : training on 8111456 raw words (2351952 effective words) took 39.8s, 59057 effective words/s
INFO - 16:14:27: EPOCH 30 - PROGRESS: at 1.82% examples, 40598 words/s, in_qsize 0, out_qsize 0
INFO - 16:14:28: EPOCH 30 - PROGRESS: at 3.65% examples, 40476 words/s, in_qsize 1, out_qsize 0
INFO - 16:14:29: EPOCH 30 - PROGRESS: at 4.88% examples, 36889 words/s, in_qsize 0, out_qsize 0
INFO - 16:14:30: EPOCH 30 - PROGRESS: at 6.61% examples, 37702 words/s, in_qsize 0, out_qsize 0
INFO - 16:14:31: EPOCH 30 - PROGRESS: at 9.04% examples, 41204 words/s, in_qsize 0, out_qsize 0
INFO - 16:14:32: EPOCH 30 - PROGRESS: at 11.36% examples, 43027 words/s, in_qsize 0, out_qsize 0
INFO - 16:14:33: EPOCH 30 - PROGRESS: at 13.81% examples, 44784 words/s, in_

Time to train the model: 19.65 mins


In [47]:
w2v_model.save("word2vec.model")

INFO - 16:15:34: saving Word2Vec object under word2vec.model, separately None
INFO - 16:15:34: storing np array 'vectors' to word2vec.model.wv.vectors.npy
INFO - 16:15:35: not storing attribute vectors_norm
INFO - 16:15:35: storing np array 'syn1neg' to word2vec.model.trainables.syn1neg.npy
INFO - 16:15:35: not storing attribute cum_table
INFO - 16:15:36: saved word2vec.model


In [48]:
# Exporting preprocessed dataset for further steps with replaced bigrams
file_export = text_train.copy()
file_export["review"] = file_export["review"].apply(lambda x: ' '.join(bigram[x]))

In [49]:
file_export["review"][0]

'match_tag team table_match bubba_ray and spike_dudley vs eddie_guerrero and chris_benoit bubba_ray and spike_dudley started things off with tag_team table_match against eddie_guerrero and chris_benoit according to the rules of the match both_opponents have to go through tables in order to get the win benoit and guerrero heated up early on by taking turns hammering first spike and then bubba_ray german suplex by benoit to bubba took the wind out of the dudley brother spike tried to help his brother but the referee restrained him while benoit and guerrero ganged up on him in the corner with benoit stomping away on bubba guerrero set_up table outside spike dashed into the ring and somersaulted over the top_rope onto guerrero on the outside after recovering and taking_care of spike guerrero slipped table into the ring and helped the wolverine set it up the tandem then set_up for double superplex from the middle_rope which would have put bubba through the table but spike knocked the table 

In [50]:
file_export[["review"]].to_csv("cleaned_review.csv", index=False)

# K-Means Clustering

In [51]:
word_vectors = Word2Vec.load("word2vec.model").wv

INFO - 16:18:19: loading Word2Vec object from word2vec.model
INFO - 16:18:20: loading wv recursively from word2vec.model.wv.* with mmap=None
INFO - 16:18:20: loading vectors from word2vec.model.wv.vectors.npy with mmap=None
INFO - 16:18:21: setting ignored attribute vectors_norm to None
INFO - 16:18:21: loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
INFO - 16:18:21: loading trainables recursively from word2vec.model.trainables.* with mmap=None
INFO - 16:18:21: loading syn1neg from word2vec.model.trainables.syn1neg.npy with mmap=None
INFO - 16:18:21: setting ignored attribute cum_table to None
INFO - 16:18:21: loaded word2vec.model


In [53]:
# create KMeans model

# 5 classes
# 50 repeated starting points, to prevent from choosing wrong starting centroid 
# 1000 iterations of reassigning points to clusters

model = KMeans(n_clusters=5, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)

In [60]:
#cosine similarity to coordinates of centroids
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=50, restrict_vocab=None)
# neutral

[('turmoils', 0.9209929704666138),
 ('cultural_political', 0.9208686351776123),
 ('political_environment', 0.9167952537536621),
 ('immediate_gratification', 0.9160435199737549),
 ('holocausts', 0.9158011674880981),
 ('negative_thoughts', 0.9140483140945435),
 ('simplifies', 0.9093090295791626),
 ('sexualities', 0.907524585723877),
 ('liberated_love', 0.905269980430603),
 ('intellectual_stimulation', 0.9021869897842407),
 ('religious_figures', 0.8985882997512817),
 ('our_perception', 0.8973821401596069),
 ('worthiness', 0.8972358703613281),
 ('focuses_exclusively', 0.8967568874359131),
 ('jingoism', 0.8966823220252991),
 ('religious_convictions', 0.8953819870948792),
 ('stigma_against', 0.8947529196739197),
 ('critical_intelligence', 0.8937622904777527),
 ('internalised', 0.8934087753295898),
 ('christian_parable', 0.8931920528411865),
 ('totalitarian_state', 0.8928284049034119),
 ('w_ii', 0.892673671245575),
 ('general_flavor', 0.8920348882675171),
 ('inquires_into', 0.8913379311561584

In [64]:
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=50, restrict_vocab=None)
# positive

[('renne', 0.9604375958442688),
 ('airheaded', 0.948772668838501),
 ('slimy_hilt', 0.947950005531311),
 ('simon_ward', 0.9474081993103027),
 ('delightfully_played', 0.9451876878738403),
 ('deb', 0.9437732696533203),
 ('henry_stephenson', 0.9435991048812866),
 ('mcintosh', 0.9426686763763428),
 ('richard_davalos', 0.9395602941513062),
 ('sullivan_catherine', 0.9388637542724609),
 ('douglass_dumbrille', 0.9377530813217163),
 ('geoffrey_lewis', 0.9370205402374268),
 ('mcallister', 0.9365656971931458),
 ('madge_sinclair', 0.9356828331947327),
 ('hickland', 0.9355055689811707),
 ('lidz', 0.9353559017181396),
 ('sharyn_moffett', 0.9350690841674805),
 ('sarah_paulson', 0.9346758127212524),
 ('joan_woodbury', 0.9345322847366333),
 ('roscoe_lee', 0.9344432353973389),
 ('ulysses_everett', 0.9334567189216614),
 ('architect_linda', 0.9333460927009583),
 ('joey_slotnick', 0.9333344101905823),
 ('gloria_graham', 0.9332959055900574),
 ('ample_support', 0.9328981041908264),
 ('deliciously_played', 0.9

In [56]:
word_vectors.similar_by_vector(model.cluster_centers_[2], topn=10, restrict_vocab=None)
# very negative

[('sooo_bad', 0.8894731998443604),
 ('downright_horrible', 0.878878116607666),
 ('barely_reaches', 0.8780401349067688),
 ('steal_ideas', 0.8780032396316528),
 ('bust_gut', 0.8762840032577515),
 ('honestly_cant', 0.8758729696273804),
 ('entertainingly_bad', 0.8758292198181152),
 ('worth_peek', 0.8742009401321411),
 ('bum_bum', 0.8705578446388245),
 ('m_speechless', 0.8700497150421143)]

In [62]:
word_vectors.similar_by_vector(model.cluster_centers_[3], topn=50, restrict_vocab=None)
# negative

[('passenger_seat', 0.9433311223983765),
 ('yobs', 0.9403911232948303),
 ('dinosaur_egg', 0.9334125518798828),
 ('warp_speed', 0.9332993030548096),
 ('seven_weeks', 0.9276533126831055),
 ('cauldron', 0.9221333265304565),
 ('gets_tangled', 0.9189063906669617),
 ('handshakes', 0.9184724688529968),
 ('loony_bin', 0.9183509349822998),
 ('research_laboratory', 0.9180904030799866),
 ('vacated', 0.9165256023406982),
 ('rebar', 0.9159551858901978),
 ('foolishly_decide', 0.915518581867218),
 ('take_dip', 0.9134750962257385),
 ('secret_passageway', 0.9126226305961609),
 ('precious_cargo', 0.9121806621551514),
 ('fifty_foot', 0.9114546775817871),
 ('mortally', 0.9099901914596558),
 ('hammock', 0.9082291126251221),
 ('front_doors', 0.9080612659454346),
 ('tomahawk', 0.9080014228820801),
 ('his_paws', 0.907641589641571),
 ('panic_stricken', 0.9068843126296997),
 ('stolen_brides', 0.9065980315208435),
 ('he_deems', 0.9062352180480957),
 ('nobody_notices', 0.9061903953552246),
 ('staggers_through', 0

In [63]:
word_vectors.similar_by_vector(model.cluster_centers_[4], topn=50, restrict_vocab=None)
# very positive

[('halestorm_entertainment', 0.9247247576713562),
 ('slasher_completists', 0.923462986946106),
 ('magical_elements', 0.9201725125312805),
 ('meville', 0.9189850091934204),
 ('stooge_short', 0.9116284847259521),
 ('major_departure', 0.9099736213684082),
 ('rating_scale', 0.909446656703949),
 ('turkey_zero', 0.9063595533370972),
 ('absolute_masterpiece', 0.9056006669998169),
 ('tak', 0.904721736907959),
 ('spree_killer', 0.9043179750442505),
 ('filmgoing_experiences', 0.9018747210502625),
 ('ivan_trojan', 0.9008865356445312),
 ('biographic', 0.8998018503189087),
 ('outlay', 0.8992246389389038),
 ('direct_reference', 0.8990446925163269),
 ('enfants', 0.8977972269058228),
 ('fleisher', 0.8977828025817871),
 ('complete_mockery', 0.8971467614173889),
 ('blander', 0.8970561027526855),
 ('compatriot', 0.8969711661338806),
 ('unmotivated_characters', 0.8966857194900513),
 ('usual_dose', 0.8961433172225952),
 ('nadja', 0.8953823447227478),
 ('social_dramas', 0.893265962600708),
 ('twenty_percent

In [78]:
words = pd.DataFrame(word_vectors.vocab.keys())
words = words.rename(columns = {0:"word"})
words["vectors"] = words["word"].apply(lambda x: word_vectors.wv[f'{x}'])
words["cluster"] = words["vectors"].apply(lambda x: model.predict(x.reshape(-1, 300)))

  This is separate from the ipykernel package so we can avoid doing imports until


In [79]:
words["cluster"] = words["cluster"].apply(lambda x: x[0])
words[["cluster"]]

Unnamed: 0,cluster
0,2
1,2
2,2
3,4
4,2
5,2
6,2
7,2
8,2
9,2


In [81]:
words.head(50)

Unnamed: 0,word,vectors,cluster
0,i,"[0.07200982, -0.16429467, -0.060111098, -0.001...",2
1,was,"[0.005916084, -0.14569463, -0.057794064, -0.00...",2
2,very,"[-0.113071345, -0.040633474, 0.11513126, -0.03...",2
3,displeased_with,"[-0.06283038, -0.083826475, -0.024995796, 0.03...",4
4,this,"[0.025090072, -0.012240633, 0.021996802, 0.030...",2
5,move,"[-0.051552393, 0.08243211, 0.05263038, 0.08991...",2
6,everything,"[0.0283298, -0.051362216, 0.0749336, -0.017288...",2
7,terrible,"[-0.008401364, -0.052642964, -0.009378993, -0....",2
8,from,"[-0.014194613, -0.025308458, -0.11038079, 0.01...",2
9,the,"[-0.12263861, -0.0037751037, 0.019183328, 0.06...",2


In [82]:
for i in words["cluster"]:
    if i == 0: # neutral
        words["cluster_value"] = 3
    elif i == 1: # positive
        words["cluster_value"] = 4
    elif i == 2: # very negative
        words["cluster_value"] = 1
    elif i == 3: # negative
        words["cluster_value"] = 2
    else: # very positive
        words["cluster_value"] = 5

In [83]:
words["closeness_score"] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words["sentiment_coeff"] = words["closeness_score"] * words["cluster_value"]

In [84]:
words.head(10)

Unnamed: 0,word,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,i,"[0.07200982, -0.16429467, -0.060111098, -0.001...",2,3,1.123752,3.371256
1,was,"[0.005916084, -0.14569463, -0.057794064, -0.00...",2,3,1.057116,3.171349
2,very,"[-0.113071345, -0.040633474, 0.11513126, -0.03...",2,3,1.023143,3.06943
3,displeased_with,"[-0.06283038, -0.083826475, -0.024995796, 0.03...",4,3,1.562939,4.688817
4,this,"[0.025090072, -0.012240633, 0.021996802, 0.030...",2,3,1.208451,3.625354
5,move,"[-0.051552393, 0.08243211, 0.05263038, 0.08991...",2,3,0.964042,2.892125
6,everything,"[0.0283298, -0.051362216, 0.0749336, -0.017288...",2,3,1.019498,3.058494
7,terrible,"[-0.008401364, -0.052642964, -0.009378993, -0....",2,3,1.121156,3.363468
8,from,"[-0.014194613, -0.025308458, -0.11038079, 0.01...",2,3,0.969215,2.907646
9,the,"[-0.12263861, -0.0037751037, 0.019183328, 0.06...",2,3,1.029196,3.087587


In [85]:
words[["word", "sentiment_coeff"]].to_csv("sentiment_dictionary.csv", index=False)

# Predictions

In [86]:
review = pd.read_csv("cleaned_review.csv")

In [88]:
sentiment_map = pd.read_csv("sentiment_dictionary.csv")
sentiment_dict = dict(zip(sentiment_map["word"].values, sentiment_map["sentiment_coeff"].values))

In [90]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(review["review"])
features = pd.Series(tfidf.get_feature_names())
review_tfidf = tfidf.transform(review["review"])



In [99]:
def create_tfidf_dictionary(x, transformed_file, features):
    """
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    """
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    """
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    """
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x["review"].split()))

In [100]:
%%time
replaced_tfidf_scores = review.apply(lambda x: replace_tfidf_words(x, review_tfidf, features), axis=1)

CPU times: user 18.2 s, sys: 1.31 s, total: 19.5 s
Wall time: 20.5 s


In [101]:
def replace_sentiment_words(word, sentiment_dict):
    """
    replacing each word with its associated sentiment score from sentiment dict
    """
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [102]:
replaced_closeness_scores = review["review"].apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [103]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, review["review"]]).T
replacement_df.columns = ["sentiment_coeff", "tfidf_scores", "review"]
replacement_df["sentiment_rate"] = replacement_df.apply(lambda x: np.array(x.loc["sentiment_coeff"]) @ np.array(x.loc["tfidf_scores"]), axis=1)

In [104]:
for i in replacement_df["sentiment_rate"]:
    if i > 4 and i<=5 :
        replacement_df["prediction"] = "Very Positive"
    elif i>3 and i<=4 :
        replacement_df["prediction"] = "Positive"
    elif i>2 and i<=3 :
        replacement_df["prediction"] = "Neutral"
    elif i>1 and i<=2 :
        replacement_df["prediction"] = "Negative"
    else:
        replacement_df["prediction"] = "Very Negative"