In [11]:
import json
import pickle
import re
from gensim.models import Word2Vec

In [55]:
import xml.etree.ElementTree
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def parse_aspect_node(aspect_node):
    """Convert one aspect XML element into a ``{category: polarity}`` dict.

    The element's ``category`` attribute is used as the key unchanged. The
    ``polarity`` attribute is normalized (surrounding whitespace stripped,
    upper-cased) and known misspellings are mapped to their canonical label,
    so the value is always ``'POSITIVE'`` or ``'NEGATIVE'``.

    Parameters
    ----------
    aspect_node : xml.etree.ElementTree.Element
        Element exposing ``category`` and ``polarity`` attributes.

    Returns
    -------
    dict
        Single-entry mapping ``{category: canonical_polarity}``.

    Raises
    ------
    KeyError
        If the polarity attribute is missing or is not a recognized label.
    """
    category = aspect_node.get('category')
    raw_polarity = aspect_node.get('polarity')

    # Misspellings seen in the annotations, keyed by their normalized form.
    typo_polarity_map = {
        'NEATIVE': 'NEGATIVE',
        'NEGTIVE': 'NEGATIVE',
        'POSITIVETIVE': 'POSITIVE',
        'POSITUVE': 'POSITIVE',
    }
    valid_polarities = ('POSITIVE', 'NEGATIVE')

    if raw_polarity is None:
        raise KeyError(
            'missing polarity attribute for category {!r}'.format(category)
        )

    # Normalize first so whitespace/casing variants need no table entry each.
    polarity = raw_polarity.strip().upper()
    polarity = typo_polarity_map.get(polarity, polarity)

    if polarity not in valid_polarities:
        raise KeyError(
            'unknown polarity {!r} for category {!r}'.format(raw_polarity, category)
        )

    return {category: polarity}

def parse_aspects_node(aspects_node):
    """Collect the polarity of every aspect category under an aspects element.

    Starts from ``'NEUTRAL'`` for the four categories ``FOOD``, ``AMBIENCE``,
    ``SERVICE`` and ``PRICE``, then overrides each category for which a child
    element is present, using ``parse_aspect_node`` on that child.

    Parameters
    ----------
    aspects_node : xml.etree.ElementTree.Element
        Element whose direct children are individual aspect elements.

    Returns
    -------
    dict
        Mapping of category name to polarity label.
    """
    default_aspects = {
        'FOOD': 'NEUTRAL',
        'AMBIENCE': 'NEUTRAL',
        'SERVICE': 'NEUTRAL',
        'PRICE': 'NEUTRAL'
    }

    # Iterate the element directly: Element.getchildren() was deprecated and
    # then removed in Python 3.9, while iteration yields the same children.
    for aspect in aspects_node:
        default_aspects.update(parse_aspect_node(aspect))

    return default_aspects

def parse_review_node(review_node):
    """Flatten a review element into one record per ``aspects`` child.

    Every record carries the review's integer ``rid`` and its ``text``,
    merged with the category/polarity mapping that ``parse_aspects_node``
    produces for that ``aspects`` child. A review without any ``aspects``
    child yields an empty list.
    """
    review_text = review_node.find('text').text
    raw_rid = review_node.get('rid')
    aspects_nodes = review_node.findall('aspects')

    base_record = {'rid': int(raw_rid), 'text': review_text}

    # Unpacking keeps 'rid' and 'text' first and lets the aspect keys follow.
    return [
        {**base_record, **parse_aspects_node(aspects_node)}
        for aspects_node in aspects_nodes
    ]

def filter_same_train_aspects(reviews):
    """Return the reviews whose annotations agree.

    A review is kept when its ``'aspects'`` list has exactly one entry, or
    when its first two entries compare equal. Input order is preserved.
    """
    def _annotations_agree(review):
        return len(review['aspects']) == 1 or review['aspects'][0] == review['aspects'][1]

    return [review for review in reviews if _annotations_agree(review)]

def filter_different_train_aspects(reviews):
    """Return the reviews whose two annotations disagree.

    A review is kept only when its ``'aspects'`` list has exactly two entries
    and those entries do not compare equal. Input order is preserved.
    """
    disagreeing = []
    for review in reviews:
        # Anything other than exactly two annotations is never a disagreement.
        if len(review['aspects']) != 2:
            continue
        if review['aspects'][0] == review['aspects'][1]:
            continue
        disagreeing.append(review)

    return disagreeing

def parse_dataset(filename):
    """Load an annotated review XML file into a DataFrame.

    Each ``review`` element directly under the document root is expanded by
    ``parse_review_node``; all resulting records are gathered in document
    order and turned into a single DataFrame.
    """
    tree = xml.etree.ElementTree.parse(filename)

    records = []
    for review_node in tree.getroot().findall('review'):
        records.extend(parse_review_node(review_node))

    return pd.DataFrame.from_dict(records)

In [56]:
# Parse the training and validation XML files (paths are relative to the
# notebook's working directory) into DataFrames via parse_dataset.
training_parsed = parse_dataset('../training_set.xml')
validation_parsed = parse_dataset('../validation_set.xml')

In [98]:
def tokenize_zomato_reviews(path='../scrapper/reviews.json'):
    """Tokenize the scraped reviews stored in a JSON file.

    The file must contain an object with a ``'reviews'`` list. Each string
    entry is lower-cased, every run of characters outside ``a-z0-9`` is
    replaced by a space, and the result is split on whitespace.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file. Defaults to the scraper output path used
        by this notebook.

    Returns
    -------
    list of list of str
        One token list per string review, in file order. Entries that are
        not strings (e.g. ``null``) are skipped.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path, 'r', encoding='utf-8') as fp:
        reviews = json.load(fp)['reviews']

    sentences_tokens = []

    for review in reviews:
        # Skip non-text entries explicitly instead of hiding every possible
        # error (including KeyboardInterrupt) behind a bare ``except``.
        if not isinstance(review, str):
            continue
        tokens = re.sub(r"[^a-z0-9]+", " ", review.lower()).split()
        sentences_tokens.append(tokens)

    return sentences_tokens

def tokenize_dataset(res):
    """Tokenize the text of every distinct review in a parsed dataset.

    Rows sharing a ``rid`` are treated as the same review; the text of the
    first such row is used. Each text is lower-cased, every run of
    characters outside ``a-z0-9`` is replaced by a space, and the result is
    split on whitespace.

    Parameters
    ----------
    res : pandas.DataFrame
        Frame with at least the columns ``rid`` and ``text``.

    Returns
    -------
    list of list of str
        One token list per distinct ``rid``, ordered by ascending ``rid``.
    """
    # One row per review (first occurrence), sorted by rid. This replaces a
    # per-id boolean filter, and the stray ``break`` that stopped the loop
    # after the very first review is gone.
    first_rows = res.drop_duplicates(subset='rid', keep='first').sort_values('rid')

    return [
        re.sub(r"[^a-z0-9]+", " ", text.lower()).split()
        for text in first_rows['text']
    ]
        


In [99]:
# Tokenize the scraped reviews plus the training and validation datasets.
scrap_tokenize = tokenize_zomato_reviews()
test_tokenize = tokenize_dataset(training_parsed)
validation_tokenize = tokenize_dataset(validation_parsed)

# Build one flat corpus with a token list per sentence. Appending each list
# as a single element would give a 3-item list of corpora instead of a list
# of sentences, which is not what a sentence-based trainer expects.
all_tokenize = scrap_tokenize + test_tokenize + validation_tokenize

print(len(scrap_tokenize), len(test_tokenize), len(validation_tokenize), len(all_tokenize))
# Word2Vec training, currently disabled; it consumes the flat corpus above.
# model = Word2Vec(
#     sentences=all_tokenize,
#     size=100,
#     window=5,
#     min_count=1,
#     workers=4,
# )

# with open('wordmodel', 'wb') as fp:
#     pickle.dump(model, fp, pickle.HIGHEST_PROTOCOL)


Ini dia toko lawas jaman bapak gua masih ABG kayanya. Namanya emang agak aneh, cocok buat turis2 yg lg cari makan enak dan oleh2. Letaknya di Jl. Sangaji deket daerah harmoni  Kenapa cocok buat wisatawan? Karena disini dijual macem2 oleh2 dari mana aja hampir ada disini. Kalo ga salah ada sambal bu rudy, dll.  Untuk jajanan yg juara ya otak-otaknya. Satu otak2 harga Rp.8.000 tapi dijamin, makan 4 biji udah bisa kenyaang..  Disini juga ada asinan sama juhi. Kemarin makan disini juhinya emang rasa jadul banget, tapi masih kalah enak sama jugi di Jl. Sabang sih hehe (fyi juhi per porsi Rp. 20.000)  Dan es kopyor kalau gak salah harganya 16.000
 Iseng banget kesini sama temen karena udah lama pengen kesini . Dan aku ngerti kenapa tempat ini hype banget . Suasananya enak , bagus buat foto2 , dan MAKANANNYA ENAK SEMUA ! ! ! ! Aku dan temenku kalap banget beli semua makanan . Waitressnya baik semua juga . Pesen ayam keranjang yang enak banget itu , sate ayam , roti goreng , dan ubi crispy bua