# Q1,2

In [38]:
# import libraries
import pandas as pd
import numpy as np
import string
import textdistance
from tqdm.notebook import tqdm
tqdm.pandas()

# ignore warnings
import warnings
warnings.filterwarnings('ignore')


# load the dataset
df = pd.read_csv(r"D:\OneDrive - NITT\Custom_Download\yelp.csv",
                 delimiter='\t',
                header=None)

# Q3

In [39]:
# ignore all punctuation !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
def clean(x):
    return x.translate(str.maketrans('', '', string.punctuation)).lower()

In [40]:
# clean the dataset and store it in clean_text column
df['clean_text'] = df[0].progress_apply(clean)

  0%|          | 0/15000 [00:00<?, ?it/s]

In [41]:
df

Unnamed: 0,0,clean_text
0,I got 'new' tires from them and within two wee...,i got new tires from them and within two weeks...
1,Don't waste your time. We had two different p...,dont waste your time we had two different peo...
2,All I can say is the worst! We were the only 2...,all i can say is the worst we were the only 2 ...
3,I have been to this restaurant twice and was d...,i have been to this restaurant twice and was d...
4,Food was NOT GOOD at all! My husband & I ate h...,food was not good at all my husband i ate her...
...,...,...
14995,We only went one day for breakfast one day and...,we only went one day for breakfast one day and...
14996,I am not embarrassed to say I LOVE buffets. I ...,i am not embarrassed to say i love buffets i a...
14997,This buffet was so so on a Saturday evening. I...,this buffet was so so on a saturday evening i ...
14998,"This was our next destination for our \""Buffet...",this was our next destination for our buffets ...


In [42]:
# pip install -q transformers

# Q4

In [43]:
# import DistilBertTokenizer and create tokenizer object
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [44]:
# function to get tokens from a given text
def get_tokens(x):
    return tokenizer.tokenize(x)

In [45]:
# get tokens for each lines
df['tokens'] = df['clean_text'].progress_apply(get_tokens)

  0%|          | 0/15000 [00:00<?, ?it/s]

# Q5

In [46]:
# Now tokens column is our corpus for training word2vec model
corpus = df['tokens']

In [47]:
corpus

0        [i, got, new, tires, from, them, and, within, ...
1        [don, ##t, waste, your, time, we, had, two, di...
2        [all, i, can, say, is, the, worst, we, were, t...
3        [i, have, been, to, this, restaurant, twice, a...
4        [food, was, not, good, at, all, my, husband, i...
                               ...                        
14995    [we, only, went, one, day, for, breakfast, one...
14996    [i, am, not, embarrassed, to, say, i, love, bu...
14997    [this, buffet, was, so, so, on, a, saturday, e...
14998    [this, was, our, next, destination, for, our, ...
14999    [only, reason, why, we, went, here, was, becau...
Name: tokens, Length: 15000, dtype: object

In [48]:
# import Word2Vec
from gensim.models import Word2Vec

In [50]:
# Train Word2Vec Model on corpus
model = Word2Vec(corpus, 
                 min_count=1,          # word frequency
                 vector_size=200,      # dimention of word embeddings
                 workers=8,            # Number of processors
                 sg=1                  # 1 for skip-gram
                ) 

In [51]:
model.wv['this'] #check embedding for 'this'

array([ 0.03876384,  0.06792455, -0.01095999, -0.14670996,  0.09555603,
       -0.02762917,  0.35243046,  0.22581412, -0.14392382, -0.04734934,
       -0.01095255, -0.31620818, -0.16370018,  0.32142866,  0.0475617 ,
       -0.3508123 , -0.10558181, -0.00840569, -0.23925132, -0.6425318 ,
        0.06629066, -0.04420595, -0.00610115,  0.07420675, -0.17645268,
       -0.14200823, -0.23664309, -0.3114277 ,  0.13015835,  0.26915824,
       -0.13236913,  0.28407627,  0.26316193, -0.15393344, -0.24959256,
        0.15720378, -0.14831176, -0.33674926, -0.01742163, -0.29692084,
        0.12596937, -0.30064192, -0.23887782, -0.2260668 ,  0.18272957,
        0.03012532, -0.00527017,  0.22925234,  0.17272429,  0.18571417,
        0.02755589, -0.09958533, -0.09802404, -0.32358813,  0.07784869,
       -0.13857004,  0.17406712, -0.10038164,  0.02820028, -0.01346502,
       -0.1436504 ,  0.31138363,  0.11665127, -0.14073247, -0.39761984,
        0.1984581 , -0.06051049,  0.49929556, -0.24489218,  0.33

In [52]:
# pip install textdistance

In [53]:
# get all words for which model is having embedding.
all_word2vec_vocab = set(model.wv.key_to_index)

In [54]:
all_word2vec_vocab

{'soto',
 'start',
 '##gm',
 'grandparents',
 'used',
 '##oid',
 'denying',
 'spirits',
 'driving',
 'differential',
 'denise',
 'viewpoint',
 'royalty',
 'agent',
 '##sty',
 '##icia',
 '##47',
 'prank',
 'sweep',
 'trivial',
 'swung',
 'kali',
 'rests',
 '##icing',
 '##vina',
 'build',
 'theory',
 'anxiously',
 'transmissions',
 'assistant',
 'wards',
 'nightclub',
 '##less',
 'pleasantly',
 'affiliated',
 'unfortunately',
 '##bolt',
 'hurriedly',
 'like',
 'emperor',
 'join',
 'novelty',
 'rogers',
 'old',
 'variance',
 'steamer',
 '##phile',
 'verse',
 '##bury',
 'ramsey',
 'deafening',
 'chesapeake',
 'while',
 'idea',
 'jim',
 'pg',
 'spread',
 'continue',
 '##mark',
 'beau',
 'eponymous',
 'alien',
 'humiliated',
 '##zard',
 'fx',
 'lawyer',
 '##gence',
 'inspiring',
 'duplicate',
 'finest',
 'chinatown',
 'posse',
 'mace',
 'afi',
 'prison',
 'stiff',
 'pines',
 'dreams',
 'instincts',
 'buried',
 'much',
 'manly',
 'dying',
 'shown',
 'reversed',
 'capacity',
 'stranded',
 'arg

# Q 6-7

In [58]:
# get embedding of any word

def embed(word):
    
    # get tokens
    tokens = get_tokens(word.lower())
    
    print('tokens',tokens)
    # initialize it with 0, as we need to do Average.
    embedding = np.zeros((200))
    
    # total number of token present in word
    n = len(tokens)
    
    # for each token get their embedding 
    for token in tokens:
        
        # if token in present in word2vec model then take its embedding from the model itself
        if token in all_word2vec_vocab:
            embedding =  embedding + model.wv.get_vector(token)
            
        # if not present, get the closed word in word2vec model using levenshtein 
        #and get their embedding
        else:
            key_in_model = sorted([(textdistance.levenshtein.normalized_distance(e,token), e) for e in all_word2vec_vocab],reverse=True)[0][1]
            embedding =  embedding + model.wv.get_vector(key_in_model)
            print('Missing:',token,'was replaced by:',key_in_model)
    # return average
    return embedding/n

In [59]:
embed('capturing')

tokens ['capturing']


array([ 0.01319182, -0.03086699,  0.01386468,  0.00400418,  0.05156008,
       -0.02781689,  0.00263223,  0.1186272 ,  0.00237181,  0.04792605,
       -0.01278259, -0.07234085,  0.01668744,  0.06755218, -0.05790672,
       -0.02472644, -0.02157385, -0.02039355,  0.00577635, -0.11322032,
        0.03412848, -0.02939377,  0.0566329 ,  0.01762407,  0.01123586,
       -0.03570064,  0.01979497, -0.06237185, -0.08609715,  0.0027248 ,
        0.05720227,  0.02175297,  0.04432205,  0.0334942 ,  0.03129559,
        0.02755326,  0.02684237,  0.00516881, -0.02638847, -0.04504759,
       -0.03101944, -0.03051468,  0.00969181,  0.00170613,  0.03911392,
        0.01811376,  0.00751254, -0.02651414,  0.08815496,  0.08694479,
        0.02697583,  0.0027897 ,  0.06679496, -0.08807959, -0.01598597,
       -0.02504161,  0.00776912, -0.03141798, -0.09840126,  0.00372895,
       -0.04001177,  0.02679459, -0.03950952, -0.0100818 , -0.10573756,
        0.01116486, -0.0094537 ,  0.14020903, -0.05065979,  0.05

In [60]:
embed('hopelessness')

tokens ['hopeless', '##ness']


array([ 0.06900935, -0.3478472 , -0.04593382, -0.07518924,  0.12024632,
       -0.21340124, -0.09061289,  0.33385693, -0.04049813, -0.05042046,
       -0.07106523, -0.08041959,  0.14822572, -0.01013631, -0.12996206,
       -0.01517398,  0.18260615, -0.05622896, -0.02300552, -0.25276137,
       -0.00378707,  0.00444411,  0.17096332,  0.04935565, -0.0500372 ,
       -0.11579858,  0.17546122, -0.21928538, -0.21453824,  0.08363615,
        0.09906815,  0.0318277 ,  0.27810087,  0.02445544,  0.11972313,
        0.08837485,  0.17467387, -0.25554041, -0.0149664 ,  0.04639356,
        0.07668654, -0.14024627,  0.03573029, -0.02629977,  0.05161199,
        0.00532763, -0.08346561, -0.0548148 ,  0.14903075,  0.27766694,
       -0.03115265,  0.1287857 , -0.10404994, -0.14292534,  0.04834884,
       -0.07164149, -0.00672097, -0.27166202, -0.25866086, -0.03309915,
       -0.1025292 ,  0.22868331,  0.01367408, -0.02897862, -0.1762967 ,
        0.10358536,  0.00225399,  0.20149469,  0.02766315,  0.34

In [61]:
embed('codementor')

tokens ['code', '##mento', '##r']


array([ 7.71957710e-02, -8.80573081e-02,  2.75130272e-02, -9.04211160e-02,
       -2.25953932e-02, -1.79160106e-01,  2.88029869e-03,  7.80728906e-02,
       -5.37512234e-03,  1.89053586e-01, -5.77188569e-02, -2.97674065e-02,
        1.20728331e-01,  2.42129213e-01, -4.99490773e-02, -1.12885113e-01,
       -5.85854091e-02, -2.65053039e-02, -5.71930432e-02, -1.09849115e-01,
        2.72694432e-02, -1.39848366e-01, -2.03149418e-02,  3.87287692e-02,
        2.28951089e-01, -1.36164540e-01,  1.17682544e-01, -2.43023929e-01,
       -2.22660257e-01,  2.81765567e-02,  1.15932594e-01,  1.04949443e-02,
        2.39597410e-01,  1.56480387e-01,  2.37697808e-02, -4.76552000e-02,
        1.49869733e-02, -2.74424590e-02, -1.72447326e-01, -5.09878645e-02,
       -8.71189224e-02, -1.86615787e-01, -9.27077358e-02,  1.04189213e-01,
       -2.16633901e-02, -8.96391867e-02,  5.11476149e-05,  1.09709399e-01,
        1.82081749e-01,  2.05740792e-01,  1.79384854e-01,  1.39577356e-01,
        1.07473468e-01, -

In [64]:
embed('bloodbathiam')

tokens ['blood', '##bat', '##hia', '##m']
Missing: ##hia was replaced by: zur


array([ 0.02846425,  0.00109577,  0.12880867,  0.13495493,  0.0037156 ,
       -0.10363909, -0.12312555,  0.29268183,  0.01487847,  0.0471044 ,
       -0.03741287, -0.11771268,  0.10247838,  0.12693817, -0.100399  ,
       -0.10813975, -0.08467673, -0.10688362,  0.12316071, -0.24680823,
        0.14811322,  0.00084272,  0.15869648,  0.08806059, -0.00604453,
       -0.15899629,  0.0594959 , -0.09119568, -0.27917519, -0.01912675,
        0.18747474, -0.02235155,  0.25409335,  0.0739059 ,  0.02778592,
        0.0643375 ,  0.11557539, -0.00517701, -0.07499179,  0.00629249,
       -0.21721246, -0.14224002, -0.03054655,  0.12677388,  0.20773037,
       -0.10077683, -0.07001822, -0.11418467,  0.20748661,  0.13046798,
        0.01619395,  0.06108155,  0.06370979, -0.16224311, -0.12144628,
        0.01299228, -0.16437192, -0.04633624, -0.20448172,  0.03708749,
       -0.05588313, -0.01729232, -0.13130077, -0.05802228, -0.12057111,
        0.04908667, -0.09839908,  0.20494726, -0.03198866,  0.08