In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from gensim.models.word2vec import Word2Vec



In [2]:
data = pd.read_csv('../../yelp-data/new_data/final_data/manip_data.csv')

In [3]:
text = data.text.dropna().map(lambda x: x.split())

In [4]:
model = Word2Vec(text, size=100, window=5, min_count=5, workers=4)

# size represents how many concepts or topics we should use
# window represents how many words surrounding a sentence we should use as our original features
# min_count is the number of times that context or word must appear
# workers is the number of CPU cores to use to speed up model training

#### https://github.com/ga-students/DS-BOS-12/tree/master/lessons/lesson-14#introduction-word2vec

In [5]:
model.most_similar('delicious')

[('yummy', 0.8551763296127319),
 ('delicious,', 0.8408470153808594),
 ('tasty', 0.8365609645843506),
 ('divine', 0.7977620363235474),
 ('DELICIOUS', 0.7752904891967773),
 ('delish', 0.7725182175636292),
 ('amazing', 0.753511905670166),
 ('scrumptious', 0.743909478187561),
 ('delectable', 0.7341341972351074),
 ('delicious!', 0.7294865846633911)]

In [6]:
model.most_similar('manager')

[('manager,', 0.8661529421806335),
 ('Manager', 0.8340087532997131),
 ('owner', 0.7991610765457153),
 ('GM', 0.7938071489334106),
 ('busboy', 0.7837578654289246),
 ('manger', 0.7790880799293518),
 ('hostess', 0.777050793170929),
 ('cashier', 0.7605875730514526),
 ('host', 0.7564091086387634),
 ('mgr', 0.7459459900856018)]

In [7]:
model.most_similar('time')

[('time,', 0.8824414014816284),
 ('time.', 0.691596269607544),
 ('time)', 0.6746558547019958),
 ('time!', 0.6710756421089172),
 ('time).', 0.646044909954071),
 ('time...', 0.6286982893943787),
 ('day', 0.6172722578048706),
 ('timer', 0.6074073314666748),
 ('visit,', 0.572293221950531),
 ('morning', 0.5706180334091187)]

In [6]:
model.most_similar('bland')

[('bland,', 0.9098657965660095),
 ('tasteless', 0.8751441240310669),
 ('flavorless', 0.8288198709487915),
 ('dry,', 0.8281048536300659),
 ('watery', 0.8082864880561829),
 ('oily', 0.7977049350738525),
 ('mushy', 0.7896466255187988),
 ('undercooked', 0.7890170216560364),
 ('boring', 0.7748020887374878),
 ('rubbery', 0.7734558582305908)]

In [8]:
model.most_similar('cold')

[('cold,', 0.8073718547821045),
 ('lukewarm', 0.7302338480949402),
 ('hot', 0.7089186906814575),
 ('cold.', 0.689141035079956),
 ('stale', 0.6722781658172607),
 ('hot,', 0.6692797541618347),
 ('soggy', 0.6542467474937439),
 ('stale,', 0.635863721370697),
 ('burnt', 0.6355282068252563),
 ('dry', 0.6261243224143982)]

In [7]:
model.most_similar('mediocre')

[('subpar', 0.802825927734375),
 ('marginal', 0.7575305700302124),
 ('so-so', 0.742594838142395),
 ('mediocre,', 0.7301327586174011),
 ('sub-par', 0.7242838144302368),
 ('average', 0.7144842147827148),
 ('underwhelming', 0.6761751770973206),
 ('meh', 0.6691441535949707),
 ('lackluster', 0.6553842425346375),
 ('Mediocre', 0.6416609287261963)]

In [8]:
model.most_similar('flavor')

[('flavour', 0.9123848676681519),
 ('flavor,', 0.881925106048584),
 ('flavoring', 0.839346706867218),
 ('texture', 0.8147541880607605),
 ('seasoning', 0.7863143682479858),
 ('seasoning,', 0.7775187492370605),
 ('spice', 0.7618908286094666),
 ('taste', 0.7474372386932373),
 ('texture,', 0.7442693710327148),
 ('sweetness', 0.7336708903312683)]

In [9]:
model.most_similar('juicy')

[('juicy,', 0.9103448390960693),
 ('tender', 0.8763376474380493),
 ('tender,', 0.8713788986206055),
 ('moist', 0.846558153629303),
 ('flavorful', 0.8123543858528137),
 ('moist,', 0.7971984148025513),
 ('succulent,', 0.7971756458282471),
 ('flavorful,', 0.7845823764801025),
 ('succulent', 0.7817756533622742),
 ('well-seasoned', 0.7815227508544922)]

In [10]:
model.most_similar('usually')

[('typically', 0.8633309006690979),
 ('normally', 0.7643651962280273),
 ('generally', 0.7371158599853516),
 ('always', 0.6841447353363037),
 ('sometimes', 0.6383280754089355),
 ('often', 0.628433108329773),
 ('ALWAYS', 0.5364930629730225),
 ('rarely', 0.5312243700027466),
 ('tend', 0.5176421999931335),
 ('Usually', 0.44421839714050293)]

In [11]:
model.most_similar('however')

[('although', 0.8727309703826904),
 ('but', 0.8342143297195435),
 ('however,', 0.78711998462677),
 ('but,', 0.7000393271446228),
 ('(although', 0.6342678666114807),
 ('(but', 0.5939549207687378),
 ('(though', 0.5766899585723877),
 ('nor', 0.5720763206481934),
 ('and', 0.5434328317642212),
 ('though', 0.531823992729187)]

In [9]:
model.most_similar('pretty')

[('fairly', 0.7897210121154785),
 ('very', 0.7887053489685059),
 ('quite', 0.7202944755554199),
 ('surprisingly', 0.7177211046218872),
 ('super', 0.6665304899215698),
 ('soooo', 0.6634423732757568),
 ('relatively', 0.6605172157287598),
 ('VERY', 0.6587648391723633),
 ('extremely', 0.6506322026252747),
 ('ridiculously', 0.6504016518592834)]

In [10]:
model.most_similar('salad')

[('Salad', 0.7948505878448486),
 ('salad,', 0.7791732549667358),
 ('salad;', 0.7312804460525513),
 ('salad.', 0.7042760252952576),
 ('salads', 0.6994776725769043),
 ('salad:', 0.6956886053085327),
 ('salad-', 0.6580542325973511),
 ('salad...', 0.6278755068778992),
 ('dressing', 0.598473846912384),
 ('salad!', 0.594258189201355)]

In [11]:
model.most_similar('fries')

[('fries,', 0.7918753623962402),
 ('Fries', 0.78719562292099),
 ('fries)', 0.7649980783462524),
 ('frites', 0.7280604243278503),
 ('nachos', 0.7044536471366882),
 ('hashbrowns', 0.6819921731948853),
 ('fries.', 0.6818950772285461),
 ('toast', 0.6688725352287292),
 ('tots', 0.6664113998413086),
 ('fires', 0.6609050035476685)]

In [12]:
model.most_similar('cheese')

[('cheese)', 0.8349871039390564),
 ('cheese,', 0.8311554193496704),
 ('Cheese', 0.7969741821289062),
 ('cheese...', 0.7963688373565674),
 ('cheese;', 0.7749756574630737),
 ('cheese.', 0.768731951713562),
 ('cheese-', 0.7598976492881775),
 ('cheese!', 0.7595844268798828),
 ('cheese).', 0.755033016204834),
 ('cheese:', 0.7530560493469238)]

In [13]:
model.most_similar('wine')

[('wines', 0.7607375979423523),
 ('wine,', 0.7216306924819946),
 ('wine.', 0.7100576162338257),
 ('Malbec', 0.7076953649520874),
 ('Cabernet', 0.690409779548645),
 ('beer', 0.6867533922195435),
 ('pinot', 0.6863394379615784),
 ('Pinot', 0.6406688690185547),
 ('wines.', 0.6373344659805298),
 ('Riesling', 0.6373227834701538)]