In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from gensim.models.word2vec import Word2Vec



In [2]:
data = pd.read_csv('../../yelp-data/new_data/final_data/manip_data.csv')

In [3]:
text = data.text.dropna().map(lambda x: x.split())

In [4]:
model = Word2Vec(text, size=100, window=5, min_count=5, workers=4)

In [5]:
model.most_similar('bomb')

[('bomb,', 0.6664633750915527),
 ('killer', 0.6146372556686401),
 ('yummy', 0.6126831769943237),
 ('delish', 0.605032205581665),
 ('bomb!', 0.6024252772331238),
 ('bomb.', 0.6007922291755676),
 ('legit', 0.5647023916244507),
 ('guava', 0.5639989972114563),
 ('jalape\xc3\xb1o', 0.5568761825561523),
 ('DELICIOUS', 0.5542331337928772)]

In [6]:
model.most_similar('entrecote')

[('Swordfish,', 0.7641465067863464),
 ('respectively).', 0.754440188407898),
 ('Avalon.', 0.7522343397140503),
 ('brainchild', 0.7518200874328613),
 ('Kabocha', 0.7475636005401611),
 ('18-oz', 0.7471858263015747),
 ('Mornay', 0.7457447052001953),
 ('raison', 0.745682954788208),
 ('Frango', 0.744411826133728),
 ('"City', 0.7429077625274658)]

In [7]:
model.most_similar('pretty')

[('very', 0.773969292640686),
 ('fairly', 0.7730783224105835),
 ('quite', 0.730404257774353),
 ('surprisingly', 0.7146251201629639),
 ('soooo', 0.6735637784004211),
 ('ridiculously', 0.6639546751976013),
 ('VERY', 0.6628201007843018),
 ('super', 0.6603380441665649),
 ('relatively', 0.6578768491744995),
 ('really', 0.6485140323638916)]

In [8]:
model.most_similar('sauce')

[('sauce,', 0.8461623191833496),
 ('glaze', 0.7802013158798218),
 ('marinade', 0.7722140550613403),
 ('mayo', 0.7626287937164307),
 ('mustard', 0.7603120803833008),
 ('sauce)', 0.7562144994735718),
 ('horseradish', 0.7527989149093628),
 ('aioli', 0.7514640092849731),
 ('sauce-', 0.7496747970581055),
 ('flavoring', 0.718492865562439)]

In [9]:
model.most_similar('wine')

[('wines', 0.7738568186759949),
 ('Malbec', 0.750299870967865),
 ('wine,', 0.7357649803161621),
 ('wine.', 0.7072179317474365),
 ('pinot', 0.7012085318565369),
 ('beer', 0.6923951506614685),
 ('Cabernet', 0.6793330311775208),
 ('malbec', 0.6592185497283936),
 ('Cab', 0.6465196013450623),
 ('champagne.', 0.6465082764625549)]

In [10]:
model.most_similar('wtf')

[('why..', 0.6708564758300781),
 ('WTF', 0.6654213666915894),
 ('Cuz', 0.6614267230033875),
 ('AMBIANCE', 0.6422114372253418),
 ('Ya', 0.6348941922187805),
 ('SEAT', 0.6255700588226318),
 ('ME?!', 0.6251627206802368),
 ('"WHAT', 0.6235804557800293),
 ('GUYS', 0.6230815649032593),
 ('no."', 0.6224642395973206)]

In [11]:
model.most_similar('rare')

[('rare,', 0.8715862035751343),
 ('medium-rare', 0.8181618452072144),
 ('medium-well', 0.8158857822418213),
 ('rare)', 0.7847388982772827),
 ('medium-rare,', 0.7465656399726868),
 ('medium,', 0.7383071184158325),
 ('well-done', 0.7309146523475647),
 ('med-rare', 0.7149869203567505),
 ('"medium"', 0.7082633376121521),
 ('rare...', 0.7001520991325378)]

In [12]:
model.most_similar('cold')

[('cold,', 0.8282610177993774),
 ('lukewarm', 0.7436223030090332),
 ('stale', 0.6913003921508789),
 ('cold.', 0.6889268159866333),
 ('hot', 0.6849794387817383),
 ('soggy', 0.6724570393562317),
 ('dry', 0.6649828553199768),
 ('burnt', 0.6585512757301331),
 ('hot,', 0.6454548835754395),
 ('stale,', 0.6363958120346069)]

In [13]:
model.most_similar('medium')

[('med', 0.8880178928375244),
 ('medium,', 0.8737934827804565),
 ('medium-rare', 0.8304072618484497),
 ('Medium', 0.8261887431144714),
 ('med.', 0.7815141677856445),
 ('medium-well', 0.7626087665557861),
 ('(medium', 0.7583208084106445),
 ('Med', 0.7490639090538025),
 ('medium.', 0.747045636177063),
 ('med-rare', 0.7415951490402222)]

In [14]:
model.most_similar('well-done')

[('medium-well', 0.8448947668075562),
 ('medium-rare', 0.8125985860824585),
 ('rare)', 0.7956167459487915),
 ('med-rare', 0.7670876979827881),
 ('medium-well,', 0.7642567157745361),
 ('medium-rare,', 0.7612276077270508),
 ('"medium"', 0.7611895203590393),
 ('medium)', 0.754747211933136),
 ('Medium', 0.7445232272148132),
 ('medium,', 0.7439863681793213)]

In [15]:
model.most_similar('well')

[('well,', 0.7750773429870605),
 ('nicely', 0.7740298509597778),
 ('well.', 0.7060009241104126),
 ('poorly', 0.6994184851646423),
 ('perfectly', 0.6826453804969788),
 ('beautifully', 0.6810519695281982),
 ('expertly', 0.6713690757751465),
 ('nicely.', 0.6660995483398438),
 ('well!', 0.6440153121948242),
 ('well).', 0.6409461498260498)]

In [16]:
model.most_similar('done')

[('done,', 0.7681154608726501),
 ('cooked', 0.7227935194969177),
 ('done.', 0.6857039928436279),
 ('cooked,', 0.6690787076950073),
 ('timed', 0.6669354438781738),
 ('executed', 0.6629395484924316),
 ('seasoned', 0.6594668626785278),
 ('prepared,', 0.6432962417602539),
 ('prepared', 0.6380960941314697),
 ('done).', 0.6379367113113403)]

In [17]:
model.most_similar('minutes')

[('mins', 0.951054573059082),
 ('min', 0.9355353713035583),
 ('minutes,', 0.885208010673523),
 ('seconds', 0.8497612476348877),
 ('min.', 0.8010178208351135),
 ('minutes)', 0.7997899651527405),
 ('min,', 0.790583610534668),
 ('mins.', 0.7725257873535156),
 ('minute', 0.7676999568939209),
 ('mins,', 0.7602644562721252)]

In [18]:
model.most_similar('allergic')

[('partial', 0.7365303039550781),
 ('addicted', 0.7363131046295166),
 ('lactose', 0.7049533128738403),
 ('intolerant', 0.6875503063201904),
 ('accustomed', 0.6730118989944458),
 ('referring', 0.6710057258605957),
 ('pregnant', 0.650534451007843),
 ('sensitive', 0.6341856718063354),
 ('guessing', 0.6239946484565735),
 ('vegetarian,', 0.6230154037475586)]

In [19]:
model.most_similar('poisoning')

[('poisoning,', 0.7550253868103027),
 ('poisoning.', 0.7375223636627197),
 ('network', 0.7148500680923462),
 ('coma', 0.6968626976013184),
 ('coma,', 0.6851972937583923),
 ('poisoning!', 0.6745080947875977),
 ('court', 0.6410585045814514),
 ('network,', 0.6366638541221619),
 ('court,', 0.6237775683403015),
 ('court.', 0.6236802339553833)]

In [20]:
model.most_similar('sick')

[('tired', 0.7024308443069458),
 ('sick,', 0.6388643980026245),
 ('hungry', 0.6274431943893433),
 ('upset', 0.5981408357620239),
 ('thirsty', 0.5942469835281372),
 ('hungover', 0.5928487777709961),
 ('starving', 0.5759625434875488),
 ('hungry,', 0.5703878998756409),
 ('bloated', 0.5697811841964722),
 ('vomiting', 0.5678773522377014)]

In [21]:
model.most_similar('taste')

[('flavor', 0.7459472417831421),
 ('flavour', 0.6752249598503113),
 ('smell', 0.65694260597229),
 ('aftertaste', 0.6520683765411377),
 ('texture', 0.6485904455184937),
 ('taste,', 0.643190860748291),
 ('consistency', 0.6175447106361389),
 ('flavor,', 0.613344669342041),
 ('overpower', 0.6129823923110962),
 ('flavoring', 0.6008154153823853)]

In [22]:
model.most_similar('quality')

[('quality,', 0.8338937759399414),
 ('quantity', 0.7406512498855591),
 ('quality.', 0.7250924110412598),
 ('preparation', 0.6476686000823975),
 ('caliber', 0.6391072273254395),
 ('price', 0.6361862421035767),
 ('pricing,', 0.629205584526062),
 ('pricing', 0.628702700138092),
 ('freshness', 0.6242746114730835),
 ('grade', 0.6240543127059937)]

In [23]:
model.most_similar('skimpy')

[('stingy', 0.758736789226532),
 ('sparse', 0.6908819079399109),
 ('heavy', 0.6571880578994751),
 ('bland', 0.6387605667114258),
 ('salty', 0.6216830015182495),
 ('sloppy', 0.6193151473999023),
 ('greasy', 0.612706184387207),
 ('watery', 0.6108258962631226),
 ('oily', 0.6058367490768433),
 ('mushy', 0.6017669439315796)]

In [24]:
model.most_similar('panda')

[('Bosa', 0.5416286587715149),
 ('milanesa', 0.540710985660553),
 ('Dominick', 0.5395267009735107),
 ('Annie', 0.5371854305267334),
 ('cultures,', 0.5331829190254211),
 ('not..', 0.5211428999900818),
 ('FU', 0.5155062079429626),
 ('SCREAM', 0.5134795904159546),
 ('enforcing', 0.5122936964035034),
 ('Hometown.', 0.5104541778564453)]

In [25]:
model.most_similar('circus')

[('Bellagio,', 0.7144170999526978),
 ('Casino', 0.7140147686004639),
 ('Plaza', 0.7135531902313232),
 ('Tuscany', 0.7126450538635254),
 ('Paris,', 0.7105658650398254),
 ('Wynn', 0.7077804803848267),
 ('Hotel', 0.7006837129592896),
 ("Bally's", 0.6982412934303284),
 ('Circus', 0.6972460746765137),
 ('Rio', 0.6971273422241211)]

In [26]:
model.most_similar('bathroom')

[('restroom', 0.8567636013031006),
 ('shower', 0.8140807151794434),
 ('toilet', 0.7956017255783081),
 ('bathroom,', 0.7608997225761414),
 ('lobby', 0.721382737159729),
 ('restrooms', 0.7079805135726929),
 ('space', 0.7008147239685059),
 ('carpet', 0.6981001496315002),
 ('bathrooms', 0.692302942276001),
 ('pool', 0.6917460560798645)]

In [27]:
model.most_similar('hotel')

[('hotel,', 0.8278946280479431),
 ('casino', 0.7943623065948486),
 ('property', 0.7778263092041016),
 ('resort', 0.7753111124038696),
 ('casino,', 0.7156445980072021),
 ('restaurant', 0.7117213010787964),
 ('MGM', 0.7094793915748596),
 ('Palazzo', 0.7034705877304077),
 ('Hotel', 0.7034575939178467),
 ('Aria', 0.6977797746658325)]

In [28]:
model.most_similar(positive=['bomb', 'delicious'], negative=['terrible'])

[('yummy', 0.7192920446395874),
 ('delish', 0.7049578428268433),
 ('tasty', 0.6216047406196594),
 ('guava', 0.6166251301765442),
 ('jalape\xc3\xb1o', 0.6136160492897034),
 ('Yum!', 0.6121785044670105),
 ('house-made', 0.6118607521057129),
 ('scrumptious', 0.6018474102020264),
 ('homemade', 0.5980502367019653),
 ('bomb,', 0.5954315662384033)]

In [29]:
model.doesnt_match("breakfast steak dinner lunch".split())

'steak'