# Libraries

**Machine Learning**

In [1]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd

**NLP**

In [2]:
from nltk.tokenize import word_tokenize
import gensim 
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')
# word_tokenize (used by clean_text) relies on the Punkt tokenizer models, which
# are a separate download from the stopword list; without them a fresh
# environment raises LookupError on the first tokenization.
# NOTE(review): newer NLTK releases look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hongphuc95/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Root directory of the dataset. Defaults to the original location but can be
# overridden with the REVIEW_DATASET_DIR environment variable so the notebook
# runs on other machines without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file paths by plain string concatenation (data_path + "cleaned/...").
data_path = os.path.join(
    os.environ.get("REVIEW_DATASET_DIR", "/home/hongphuc95/notebookteam/dataset/"),
    "",
)

# 1. Data Preprocessing

In [4]:
# The cleaned reviews are read with lines=True, i.e. one JSON record per line.
reviews_file = data_path + "cleaned/review_cleaned_2016_2019.json"
review_df = pd.read_json(reviews_file, lines=True)

In [5]:
# Sanity check: (number of rows, number of columns) of the loaded review table.
review_df.shape

(3494121, 9)

# 2. Content Based

In [6]:
all_stopwords = stopwords.words('english')
# Frozen copy used for membership tests: looking a token up in the list above is
# a linear scan, which adds up when every token of every review is checked.
# The list itself is kept so any other cell that reads all_stopwords still works.
_stopword_set = frozenset(all_stopwords)


def clean_text(text):
    """Lower-case and tokenize ``text``, dropping English stopwords.

    Args:
        text: Raw string to tokenize.

    Returns:
        list of str: The tokens produced by ``word_tokenize(text.lower())``,
        in order, excluding any token found in ``all_stopwords``. Only
        stopwords are filtered; punctuation tokens are kept.
    """
    return [token for token in word_tokenize(text.lower()) if token not in _stopword_set]

In [7]:
# Collapse the review table to one row per business: all of a business's review
# texts are concatenated into a single space-separated string.
review_by_business = (
    review_df
    .groupby('business_id')['text']
    .agg(' '.join)
    .reset_index()
)

In [8]:
# Replace each business's concatenated review string with its list of
# stopword-free tokens (overwrites the "text" column in place).
business_text = review_by_business["text"]
review_by_business["text"] = business_text.apply(clean_text)

**Train Word2Vec Model**

In [9]:
# Train word embeddings on the per-business token lists: 200-dimensional
# vectors, a minimum token count of 5, and 4 worker threads.
# NOTE(review): each training "sentence" here is every review of one business
# concatenated together; gensim documents that texts longer than 10000 words
# are truncated by its optimized training code - confirm that is acceptable
# for businesses with many reviews.
model = Word2Vec(
    review_by_business["text"],
    min_count=5,
    size=200,
    workers=4,
)

In [10]:
# Persist the trained model to the current working directory under the name
# "review_full.model".
model.save("review_full.model")

In [11]:
# Preview the per-business token lists (pandas abbreviates the middle rows).
review_by_business

Unnamed: 0,business_id,text
0,--1UhMGODdWsrMastO9DZw,"[last, review, mention, get, charge, extra, ,,..."
1,--6MefnULPED_I942VcFNA,"[decent, food, decent, price, ., standard, chi..."
2,--7zmmkVg-IMGaXbuVd0SQ,"[recent, tour, lake, norman, area, brewery, ,,..."
3,--8LPVSo5i0Oo61X01sV9A,"[dr., purcell, good, thorough, ., office, staf..."
4,--9QQLMTbFzLJ_oT-ON3Xw,"[ever, believe, check, time, ., always, 20, mi..."
...,...,...
165188,zzuOCWxuY39YJ1wnTwQ0Lg,"[wife, purchase, wedding, band, 3, year, ago, ..."
165189,zzvlwkcNR1CCqOPXwuvz2A,"[come, friday, night, look, quick, bite, ., pl..."
165190,zzwaS0xn1MVEPEf0hNLjew,"[place, hole, ..., .trash, especially, bathroo..."
165191,zzwhN7x37nyjP0ZM8oiHmw,"[excellent, every, way, far, ., clean, ,, well..."


In [13]:
def avg_feature_vector(sentence, model, n_features):
    """Average the word vectors of the in-vocabulary tokens of ``sentence``.

    Args:
        sentence: Iterable of tokens (strings).
        model: Object whose ``wv`` attribute supports ``token in model.wv`` and
            ``model.wv[token]`` (e.g. a trained gensim ``Word2Vec`` model).
        n_features: Length of the returned vector; expected to match the
            dimensionality of the vectors stored in ``model.wv``.

    Returns:
        numpy.ndarray: 1-D array of length ``n_features`` holding the
        element-wise mean of the vectors of all tokens found in the
        vocabulary (repeated tokens count once per occurrence). If no token
        is in the vocabulary, a float32 zero vector is returned.
    """
    # Test membership directly on the keyed vectors. The earlier approach of
    # building set(model.wv.index2word) copied the whole vocabulary on every
    # call, which is wasteful when this runs once per row of a large frame.
    keyed_vectors = model.wv
    feature_vec = np.zeros((n_features, ), dtype='float32')
    n_words = 0
    for word in sentence:
        if word in keyed_vectors:
            n_words += 1
            feature_vec = np.add(feature_vec, keyed_vectors[word])
    # Guard the division so an all-out-of-vocabulary sentence yields zeros
    # rather than NaNs.
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [None]:
# Embed every business as the mean word vector of its tokens.
# The vector length is read from the trained model rather than repeating the
# literal 200, so this cell stays correct if the training dimensionality changes.
embedding_dim = model.wv.vector_size
review_by_business["text_vec"] = review_by_business["text"].apply(
    lambda tokens: avg_feature_vector(tokens, model=model, n_features=embedding_dim)
)

**Load Word2Vec Model**

In [12]:
#model = Word2Vec.load("review.model")

In [None]:
#review_by_business["result"] = review_by_business["text"].apply(lambda x: model.wv[x])

In [12]:
#with open("review_vectorized.pickle", "wb") as f:
#    pickle.dump(review_by_business, f, protocol=pickle.HIGHEST_PROTOCOL)