In [1]:
import json
import pandas as pd

In [2]:
f = open("lemma_train.json")
data = json.load(f)
data_items = data.items()
data_list = list(data_items)
df = pd.DataFrame(data_list,columns=["business_ID","review"])
df

Unnamed: 0,business_ID,review
0,aKOP4d8UuFoeShiopqOtBQ,"[think, place, good, rating, location, big, on..."
1,VHfY59ctsugaOd4vVvUVMQ,"[wonderful, staff, delicious, bbq, quick, fril..."
2,bf4J2wsJYWIpmmff2iZTDA,"[yes, used, thelonious, monkfish, hipster, chi..."
3,BXDFhzarvzkCfsm-6oQolw,"[friends, came, watching, lovely, documentary,..."
4,UqukowIJJh-THhVhlEUy4w,"[like, think, boba, frequent, san, gabriel, be..."
...,...,...
99995,JnnMBFQ60M4MKj5Jn5LyIw,"[wary, bars, brighton, allston, undergrad, han..."
99996,Z1lRuV7aA9I-LlD51qFlNQ,"[husband, enjoyed, delightful, al, fresco, din..."
99997,qEuMDrWyLN49p1-i6zDsQA,"[allure, trivia, night, cent, wings, strong, k..."
99998,69XidCXVzOHgiQssXnZi5w,"[came, dinner, last, week, commemorate, moving..."


In [3]:
# build model with gensim
dataset = df['review']
sentence = [row for row in dataset]

## 300 dimension word2vec

In [4]:
from gensim import models
# Set the embedding size to be 300 and the window size to be 5. Can also consider a minimum word count of 1. 
w2v_model_300 = models.Word2Vec(min_count=1, window=5, vector_size=300)
#build vocabulary table
w2v_model_300.build_vocab(sentence, progress_per=100000)
# train word2vec model
w2v_model_300.train(sentence, total_examples=w2v_model_300.corpus_count, epochs=30, report_delay=1)

(233013753, 245526510)

In [5]:
# example_test: excellent ~ outstanding
word_similarity1 = w2v_model_300.wv.similarity('outstanding','excellent')
print(round(word_similarity1,3))

0.722


In [6]:
# initialize the embeddings(with padding)
print("Longest sentence length is: ",max([len(sent) for sent in sentence]))

Longest sentence length is:  534


In [7]:
vocab_size = len(w2v_model_300.wv)
print(vocab_size)

70903


In [8]:
import numpy as np

In [9]:
vocab = list(set([str(word) for sent in sentence for word in sent]))

tok_idx = {t: i for i,t in enumerate(vocab)}
idx_tok = {i: t for i,t in enumerate(vocab)}

In [10]:
print(len(vocab))

70903


In [11]:
embedding_matrix = np.zeros((vocab_size, 300))
count = 0
for i, word in idx_tok.items():
    if word in w2v_model_300.wv:
        embedding_matrix[i] = w2v_model_300.wv[word]
    else:
        count += 1

In [12]:
embedding_matrix.shape

(70903, 300)

In [13]:
embedding_matrix

array([[ 0.17865475,  0.07290468, -0.05059278, ...,  0.11274458,
        -0.03391233,  0.01182021],
       [ 0.13963184,  0.02014055, -0.08031757, ..., -0.10294749,
         0.06813318, -0.05767412],
       [-0.14781737,  0.132543  , -0.05591839, ..., -0.00763279,
        -0.00091019,  0.02200993],
       ...,
       [ 0.07572832,  0.09471691,  0.01944026, ..., -0.00920047,
        -0.06536612, -0.06254303],
       [ 0.01813478,  0.06530383,  0.10736354, ..., -0.21134418,
        -0.27372545, -0.161219  ],
       [-0.29662532,  0.0186099 ,  0.09861816, ...,  0.26055321,
         0.23418316,  0.10958157]])

In [14]:
t1 = embedding_matrix.tolist()

In [15]:
word_vec = {}
for i, word in idx_tok.items():
    word_vec[word] = t1[i]

In [16]:
word_vec['excellent']

[0.03199922665953636,
 -1.1645514965057373,
 -0.6643750667572021,
 -0.6508510112762451,
 -0.5549589395523071,
 -1.6239607334136963,
 -0.8143494725227356,
 -0.5344766974449158,
 -0.5290297269821167,
 1.097898244857788,
 -1.0795907974243164,
 1.3724521398544312,
 -1.5024207830429077,
 2.021359920501709,
 -1.1349811553955078,
 -0.07028519362211227,
 -0.5974878072738647,
 0.5872141718864441,
 -0.6305249929428101,
 -0.03889511525630951,
 3.0996975898742676,
 -0.9435570240020752,
 -1.941014289855957,
 -0.10299648344516754,
 -2.5138118267059326,
 -1.9073374271392822,
 1.045264482498169,
 0.057466115802526474,
 -1.8676754236221313,
 1.8604719638824463,
 0.879635214805603,
 1.6824465990066528,
 1.1966537237167358,
 -0.11161638051271439,
 1.901508092880249,
 0.947869062423706,
 1.1724170446395874,
 1.2869617938995361,
 1.128455638885498,
 0.13767606019973755,
 0.7097157835960388,
 0.4342864453792572,
 -1.6140156984329224,
 1.1452828645706177,
 -1.1261389255523682,
 -0.051310889422893524,
 0.3647

In [17]:
with open('300d_myword2vec_lemma_train.json', 'w') as json_out:
    json_out.write(json.dumps(word_vec, indent=4))

## 200 dimensions word2vec

In [18]:
# Set the embedding size to be 200 and the window size to be 5. Can also consider a minimum word count of 1. 
w2v_model_200 = models.Word2Vec(min_count=1, window=5, vector_size=200)
#build vocabulary table
w2v_model_200.build_vocab(sentence, progress_per=25000)
# train word2vec model
w2v_model_200.train(sentence, total_examples=w2v_model_200.corpus_count, epochs=30, report_delay=1)

(233015355, 245526510)

In [19]:
# example_test: excellent ~ outstanding
word_similarity2 = w2v_model_200.wv.similarity('outstanding','excellent')
print(round(word_similarity2,3))

0.802


In [20]:
vocab_size1 = len(w2v_model_200.wv)
print(vocab_size1)

70903


In [21]:
embedding_matrix1 = np.zeros((vocab_size1, 200))
count = 0
for i, word in idx_tok.items():
    if word in w2v_model_200.wv:
        embedding_matrix1[i] = w2v_model_200.wv[word]
    else:
        count += 1

In [22]:
embedding_matrix1.shape

(70903, 200)

In [23]:
t2 = embedding_matrix1.tolist()

In [24]:
word_vec1 = {}
for i, word in idx_tok.items():
    word_vec1[word] = t2[i]

In [25]:
with open('200d_myword2vec_lemma_train.json', 'w') as json_out:
    json_out.write(json.dumps(word_vec1, indent=4))