In [11]:
import json
import pickle
import re
from gensim.models import Word2Vec

In [55]:
import xml.etree.ElementTree
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Misspelled polarity labels seen in the annotated data, keyed by their
# stripped, upper-cased form.
_POLARITY_TYPOS = {
    'NEATIVE': 'NEGATIVE',
    'NEGTIVE': 'NEGATIVE',
    'POSITIVETIVE': 'POSITIVE',
    'POSITUVE': 'POSITIVE',
}

# Canonical labels an explicit <aspect> annotation may resolve to.
_VALID_POLARITIES = ('POSITIVE', 'NEGATIVE')


def parse_aspect_node(aspect_node):
    """Convert one aspect element into a ``{category: polarity}`` dict.

    The ``polarity`` attribute is cleaned before it is returned: surrounding
    whitespace is stripped, the value is upper-cased, and the known
    misspellings in ``_POLARITY_TYPOS`` are mapped to their canonical label.

    Parameters
    ----------
    aspect_node : element-like
        Object exposing ``get(name)`` for the ``category`` and ``polarity``
        attributes (e.g. an ``xml.etree.ElementTree.Element``).

    Returns
    -------
    dict
        Single-entry mapping from the ``category`` attribute to either
        ``'POSITIVE'`` or ``'NEGATIVE'``.

    Raises
    ------
    KeyError
        If the polarity attribute is missing or does not resolve to a
        canonical label. The message names the offending value and category.
    """
    category = aspect_node.get('category')
    raw_polarity = aspect_node.get('polarity')

    if raw_polarity is None:
        raise KeyError('missing polarity for category %r' % (category,))

    # Normalise first so that case/whitespace variants do not each need
    # their own entry in the typo table.
    cleaned = raw_polarity.strip().upper()
    cleaned = _POLARITY_TYPOS.get(cleaned, cleaned)

    if cleaned not in _VALID_POLARITIES:
        raise KeyError(
            'unrecognised polarity %r for category %r' % (raw_polarity, category)
        )

    return {category: cleaned}

def parse_aspects_node(aspects_node):
    """Collect the polarity of every aspect under one ``<aspects>`` element.

    Starts from ``'NEUTRAL'`` for the four known categories and overlays the
    result of ``parse_aspect_node`` for each child element, in document order.
    A later child for the same category overwrites an earlier one; a category
    outside the four defaults is added as an extra key.

    Parameters
    ----------
    aspects_node : xml.etree.ElementTree.Element
        Element whose direct children are the individual aspect annotations.

    Returns
    -------
    dict
        Mapping of category name to polarity label.
    """
    aspects = {
        'FOOD': 'NEUTRAL',
        'AMBIENCE': 'NEUTRAL',
        'SERVICE': 'NEUTRAL',
        'PRICE': 'NEUTRAL'
    }

    # Iterate the element directly: Element.getchildren() was deprecated in
    # Python 3.2 and removed in 3.9, where calling it raises AttributeError.
    for aspect_node in aspects_node:
        aspects.update(parse_aspect_node(aspect_node))

    return aspects

def parse_review_node(review_node):
    """Flatten one ``<review>`` element into a list of row dicts.

    One dict is produced per ``<aspects>`` child of the review. Every dict
    carries the review's integer ``rid`` and its ``text``, followed by the
    category/polarity pairs returned by ``parse_aspects_node`` for that child.
    A review without any ``<aspects>`` child yields an empty list.

    Raises ``AttributeError`` when the review has no ``<text>`` child and
    ``TypeError``/``ValueError`` when ``rid`` is missing or not an integer.
    """
    review_text = review_node.find('text').text
    review_id = review_node.get('rid')
    annotation_nodes = review_node.findall('aspects')

    # Fields shared by every row generated from this review.
    shared_fields = {'rid': int(review_id), 'text': review_text}

    return [
        {**shared_fields, **parse_aspects_node(annotation_node)}
        for annotation_node in annotation_nodes
    ]

def filter_same_train_aspects(reviews):
    """Return the reviews whose aspect annotations agree.

    A review (a mapping with an ``'aspects'`` sequence) is kept when that
    sequence has exactly one entry, or when its first two entries compare
    equal. Entries beyond the second are not inspected, and an empty
    sequence raises ``IndexError``.
    """
    def _is_consistent(review):
        annotations = review['aspects']
        if len(annotations) == 1:
            return True
        return annotations[0] == annotations[1]

    return [review for review in reviews if _is_consistent(review)]

def filter_different_train_aspects(reviews):
    """Return the reviews annotated twice with conflicting results.

    A review (a mapping with an ``'aspects'`` sequence) is kept only when
    that sequence has exactly two entries and they do not compare equal.
    """
    conflicting = []
    for review in reviews:
        annotations = review['aspects']
        if len(annotations) != 2:
            continue
        if annotations[0] == annotations[1]:
            continue
        conflicting.append(review)

    return conflicting

def parse_dataset(filename):
    """Parse a review XML file into a DataFrame.

    Every ``<review>`` element directly under the document root is expanded
    with ``parse_review_node``; the resulting row dicts are concatenated in
    document order and handed to ``pd.DataFrame.from_dict``.

    Parameters
    ----------
    filename : path or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per dict produced by ``parse_review_node``.
    """
    document = xml.etree.ElementTree.parse(filename)

    rows = []
    for review_node in document.getroot().findall('review'):
        rows.extend(parse_review_node(review_node))

    return pd.DataFrame.from_dict(rows)

In [56]:
# Parse both labelled splits into DataFrames with parse_dataset.
training_parsed, validation_parsed = [
    parse_dataset(dataset_path)
    for dataset_path in ('../training_set.xml', '../validation_set.xml')
]

In [108]:
def tokenize_zomato_reviews(path='../scrapper/reviews.json'):
    """Tokenize the scraped reviews stored in a JSON file.

    The file must contain a JSON object with a ``'reviews'`` key holding a
    list. Each string entry is lower-cased, every run of characters outside
    ``[a-z0-9]`` is turned into a single space, and the result is split on
    whitespace. Entries that are not strings (e.g. ``null``) are skipped.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file. Defaults to the scraper's output path.

    Returns
    -------
    list of list of str
        One token list per string review, in file order.
    """
    with open(path, 'r') as fp:
        reviews = json.load(fp)['reviews']

    sentences_tokens = []

    for review in reviews:
        # Skip malformed entries explicitly rather than with a bare
        # ``except``, which would also hide KeyboardInterrupt and real bugs.
        if not isinstance(review, str):
            continue
        tokens = re.sub(r"[^a-z0-9]+", " ", review.lower()).split()
        sentences_tokens.append(tokens)

    return sentences_tokens

def tokenize_dataset(res):
    """Tokenize the text of each distinct review in a parsed dataset.

    For every distinct ``rid`` (in ascending order) the ``text`` of the first
    row carrying that rid is lower-cased, every run of characters outside
    ``[a-z0-9]`` is turned into a single space, and the result is split on
    whitespace.

    Parameters
    ----------
    res : pandas.DataFrame
        Frame with ``rid`` and ``text`` columns, e.g. as produced by
        ``parse_dataset``. An empty frame yields an empty list.

    Returns
    -------
    list of list of str
        One token list per distinct rid.
    """
    if res.empty:
        return []

    # One pass to pick the first row per rid, then sort by rid, instead of
    # re-filtering the whole frame once for every unique rid.
    first_rows = res.drop_duplicates(subset='rid', keep='first').sort_values('rid')

    return [
        re.sub(r"[^a-z0-9]+", " ", text.lower()).split()
        for text in first_rows['text']
    ]
        


In [112]:
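# One-off training step, kept commented out: it builds the token corpus,
# trains Word2Vec and pickles the model to 'wordmodel', which the next cell loads.
# NOTE: `size=` below is the gensim < 4.0 keyword; gensim >= 4.0 calls it `vector_size=`.
# scrap_tokenize = tokenize_zomato_reviews()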
# test_tokenize = tokenize_dataset(training_parsed)
# validation_tokenize = tokenize_dataset(validation_parsed)

# all_tokenize = []
# all_tokenize.extend(scrap_tokenize)
# all_tokenize.extend(test_tokenize)
# all_tokenize.extend(validation_tokenize)

# model = Word2Vec(
#     sentences=all_tokenize,
#     size=100,
#     window=5,
#     min_count=3,
#     workers=4,
# )

# with open('wordmodel', 'wb') as fp:
#     pickle.dump(model, fp, pickle.HIGHEST_PROTOCOL)


In [113]:
# Load the model previously pickled to 'wordmodel' (presumably the Word2Vec
# model written by the commented-out training cell above).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a 'wordmodel' file that you produced yourself.
with open('wordmodel', 'rb') as fp:
    model = pickle.load(fp)
    
# Sanity check: print the embedding vector stored for the token 'this'.
print(model.wv['this'])

[ -1.89402383e-02   1.21436203e+00   1.24927247e+00   3.71148556e-01
  -6.13080680e-01   6.43768013e-01   1.18003391e-01   1.99595630e-01
   1.40739167e+00   1.77181530e+00  -7.15958059e-01   3.79979283e-01
   4.35211122e-01  -3.36454995e-02  -5.97849429e-01  -5.31703830e-01
  -8.82247463e-02   1.12970367e-01  -1.06464863e+00   1.00283647e+00
  -1.19164765e+00  -7.93579102e-01   6.59804404e-01  -3.46191764e-01
   1.68263521e-02  -1.90282211e-01  -9.12232161e-01   3.48721176e-01
   1.29581869e-01  -1.73281774e-01  -8.37333679e-01  -9.02165055e-01
   2.88223118e-01  -3.87077481e-01   4.46943253e-01  -3.52096319e-01
  -7.94674993e-01  -1.84090805e+00  -8.73749435e-01   9.36676443e-01
   1.50449514e-01   1.33542359e+00   2.13218597e-03  -5.79675078e-01
   4.26380366e-01  -3.54437739e-01   2.51761413e+00  -8.01229179e-01
   2.08790705e-01  -1.22829750e-01   9.83762443e-01   5.07659554e-01
  -1.62639201e+00   5.79870641e-01  -1.75448060e+00  -9.09258068e-01
  -3.66892934e-01   2.20756078e+00