In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

# Load data

In [3]:
data = np.load("task03_data.npy", allow_pickle=True)
reviews_1star = [[x.lower() for x in s] for s in data.item()["reviews_1star"]]
reviews_5star = [[x.lower() for x in s] for s in data.item()["reviews_5star"]]

In [11]:
reviews_5star

[['the',
  'surf',
  'and',
  'turf',
  'here',
  'was',
  'one',
  'of',
  'the',
  'best',
  "i've",
  'had'],
 ['filet', 'mignon', 'and', 'lobster', 'tail', 'was', 'very', 'good'],
 ['i',
  'generally',
  'dont',
  'think',
  'the',
  'food',
  'in',
  'vegas',
  'is',
  'great',
  'but',
  'after',
  'being',
  'dissappointed',
  'on',
  'many',
  'occasions',
  'i',
  'was',
  'pleasantly',
  'surprised',
  'with',
  'the',
  'quality',
  'of',
  'our',
  'meal'],
 ['thanks',
  'to',
  'the',
  'taste',
  'i',
  'was',
  'lured',
  'to',
  'this',
  'restaurant',
  'only',
  'to',
  'find',
  'that',
  'it',
  'is',
  'a',
  'somehat',
  'hidden',
  'jewel',
  'in',
  'new',
  'york',
  'new',
  'york',
  'close',
  'to',
  'the',
  'sometimes',
  'obnoxious',
  'piana',
  'bar',
  'time',
  'square'],
 ['the',
  'side',
  'of',
  'green',
  'beans',
  'were',
  'delish',
  'and',
  'the',
  'potatos',
  'are',
  'just',
  'meh'],
 ['for',
  'desert',
  'they',
  'served',
  'an',

In [4]:
vocabulary = [x for s in reviews_1star + reviews_5star for x in s]
vocabulary, counts = zip(*Counter(vocabulary).most_common(500))

In [12]:
vocabulary

('the',
 'and',
 'i',
 'a',
 'to',
 'was',
 'of',
 'for',
 'it',
 'is',
 'in',
 'we',
 'this',
 'my',
 'food',
 'with',
 'you',
 'they',
 'on',
 'had',
 'that',
 'but',
 'place',
 'not',
 'were',
 'so',
 'have',
 'at',
 'good',
 'our',
 'are',
 'here',
 'be',
 'great',
 'all',
 'very',
 'service',
 'as',
 'out',
 'one',
 'there',
 'like',
 'their',
 'if',
 'get',
 'time',
 'me',
 'back',
 'vegas',
 'just',
 'go',
 'will',
 'your',
 'from',
 'about',
 'best',
 'ordered',
 'he',
 'which',
 'when',
 "it's",
 'us',
 'also',
 'up',
 'an',
 'no',
 'would',
 'even',
 'what',
 'after',
 'delicious',
 'or',
 'them',
 'restaurant',
 'been',
 'only',
 'try',
 'really',
 'definitely',
 "i've",
 'come',
 'came',
 'chicken',
 'got',
 'amazing',
 'order',
 'love',
 'more',
 'because',
 'never',
 'some',
 'by',
 'can',
 'too',
 "don't",
 'wait',
 'always',
 'menu',
 'then',
 'experience',
 'everything',
 'people',
 '2',
 'went',
 'staff',
 'ever',
 'do',
 'over',
 'has',
 'meal',
 'other',
 'eat',
 'p

In [13]:
counts

(2017,
 1393,
 977,
 967,
 943,
 754,
 569,
 502,
 495,
 492,
 433,
 428,
 388,
 360,
 308,
 304,
 302,
 289,
 281,
 276,
 267,
 253,
 238,
 233,
 230,
 223,
 220,
 220,
 212,
 189,
 185,
 178,
 176,
 175,
 172,
 172,
 148,
 147,
 139,
 138,
 135,
 134,
 134,
 127,
 126,
 122,
 121,
 121,
 111,
 110,
 110,
 108,
 104,
 104,
 103,
 100,
 99,
 97,
 96,
 95,
 95,
 94,
 94,
 92,
 91,
 88,
 86,
 85,
 85,
 85,
 85,
 83,
 81,
 81,
 80,
 78,
 77,
 75,
 73,
 73,
 73,
 72,
 72,
 71,
 71,
 70,
 70,
 68,
 67,
 67,
 65,
 64,
 64,
 63,
 62,
 61,
 61,
 60,
 59,
 59,
 58,
 58,
 57,
 57,
 56,
 56,
 54,
 54,
 54,
 53,
 53,
 53,
 52,
 52,
 51,
 51,
 50,
 50,
 50,
 48,
 48,
 48,
 48,
 47,
 47,
 47,
 47,
 47,
 46,
 46,
 46,
 46,
 45,
 45,
 44,
 44,
 44,
 43,
 43,
 43,
 42,
 42,
 42,
 41,
 41,
 41,
 41,
 41,
 40,
 40,
 40,
 40,
 39,
 39,
 39,
 39,
 39,
 39,
 38,
 38,
 38,
 38,
 37,
 37,
 37,
 37,
 37,
 36,
 36,
 36,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 34,
 34,
 34,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 3

In [5]:
VOCABULARY_SIZE = len(vocabulary)
EMBEDDING_DIM = 100

In [6]:
print('Number of positive reviews:', len(reviews_1star))
print('Number of negative reviews:', len(reviews_5star))
print('Number of unique words:', VOCABULARY_SIZE)

Number of positive reviews: 1000
Number of negative reviews: 2000
Number of unique words: 500


In [7]:
word_to_ind = {}
ind_to_word = {}
ind_to_freq = {}
for i in range(VOCABULARY_SIZE):
    word_to_ind[vocabulary[i]] = i
    ind_to_word[i] = vocabulary[i]
    ind_to_freq[i] = counts[i]

In [8]:
print('Word \"%s\" is at position %d appearing %d times' % 
      (ind_to_word[word_to_ind['the']], word_to_ind['the'], ind_to_freq[word_to_ind['the']]))

Word "the" is at position 0 appearing 2017 times


In [9]:
word_to_ind

{'1': 200,
 '10': 209,
 '12': 361,
 '15': 362,
 '2': 102,
 '20': 496,
 '3': 165,
 '30': 311,
 '4': 332,
 '5': 139,
 '50': 475,
 '7': 366,
 'a': 3,
 'able': 443,
 'about': 54,
 'absolutely': 283,
 'actually': 266,
 'added': 393,
 'after': 69,
 'again': 118,
 'all': 34,
 'almost': 476,
 'already': 456,
 'also': 62,
 'always': 96,
 'am': 142,
 'amazing': 84,
 'an': 64,
 'and': 1,
 'another': 232,
 'any': 210,
 'anything': 341,
 'appetizer': 355,
 'are': 30,
 'area': 329,
 'around': 199,
 'arrived': 324,
 'as': 37,
 'asian': 403,
 'ask': 265,
 'asked': 130,
 'at': 27,
 'ate': 356,
 'atmosphere': 286,
 'attentive': 319,
 'away': 264,
 'awesome': 147,
 'ayce': 467,
 'back': 47,
 'bad': 174,
 'bar': 128,
 'be': 32,
 'because': 88,
 'beef': 242,
 'been': 74,
 'before': 167,
 'being': 204,
 'best': 55,
 'better': 119,
 'big': 279,
 'bit': 207,
 'bite': 296,
 'both': 179,
 'boyfriend': 349,
 'bread': 284,
 'breakfast': 202,
 'bring': 439,
 'brought': 450,
 'buffet': 136,
 'burger': 180,
 'busine

In [14]:
ind_to_word

{0: 'the',
 1: 'and',
 2: 'i',
 3: 'a',
 4: 'to',
 5: 'was',
 6: 'of',
 7: 'for',
 8: 'it',
 9: 'is',
 10: 'in',
 11: 'we',
 12: 'this',
 13: 'my',
 14: 'food',
 15: 'with',
 16: 'you',
 17: 'they',
 18: 'on',
 19: 'had',
 20: 'that',
 21: 'but',
 22: 'place',
 23: 'not',
 24: 'were',
 25: 'so',
 26: 'have',
 27: 'at',
 28: 'good',
 29: 'our',
 30: 'are',
 31: 'here',
 32: 'be',
 33: 'great',
 34: 'all',
 35: 'very',
 36: 'service',
 37: 'as',
 38: 'out',
 39: 'one',
 40: 'there',
 41: 'like',
 42: 'their',
 43: 'if',
 44: 'get',
 45: 'time',
 46: 'me',
 47: 'back',
 48: 'vegas',
 49: 'just',
 50: 'go',
 51: 'will',
 52: 'your',
 53: 'from',
 54: 'about',
 55: 'best',
 56: 'ordered',
 57: 'he',
 58: 'which',
 59: 'when',
 60: "it's",
 61: 'us',
 62: 'also',
 63: 'up',
 64: 'an',
 65: 'no',
 66: 'would',
 67: 'even',
 68: 'what',
 69: 'after',
 70: 'delicious',
 71: 'or',
 72: 'them',
 73: 'restaurant',
 74: 'been',
 75: 'only',
 76: 'try',
 77: 'really',
 78: 'definitely',
 79: "i've",