In [1]:
import json
import pickle
import re
from gensim.models import Word2Vec

In [2]:
import xml.etree.ElementTree
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def parse_aspect_node(aspect_node):
    category = aspect_node.get('category')
    polarity = aspect_node.get('polarity')
    
    typo_polarity_map = {
        'POSITIVE': 'POSITIVE',
        'NEGATIVE': 'NEGATIVE',
        'NEATIVE': 'NEGATIVE',
        'NEGTIVE': 'NEGATIVE',
        ' NEGATIVE ': 'NEGATIVE',
        'NEGATIVE ': 'NEGATIVE',
        'POSITIVETIVE': 'POSITIVE',
        'POSITUVE': 'POSITIVE'
    }
    
    polarity = typo_polarity_map[polarity]
    
    return {category: polarity}    

def parse_aspects_node(aspects_node):
    default_aspects = {
        'FOOD': 'NEUTRAL',
        'AMBIENCE': 'NEUTRAL',
        'SERVICE': 'NEUTRAL',
        'PRICE': 'NEUTRAL'
    }
    
    for aspect in aspects_node.getchildren():
        default_aspects.update(parse_aspect_node(aspect))
    
    return default_aspects

def parse_review_node(review_node):
    text = review_node.find('text').text
    rid = review_node.get('rid')
    aspects = review_node.findall('aspects')
    
    default_dict = {
        'rid': int(rid),
        'text': text
    }
    
    res = []
    for aspect in aspects:
        cur_dict = default_dict.copy()
        cur_dict.update(parse_aspects_node(aspect))
        res.append(cur_dict)
        
    return res

def parse_review_test_node(review_node):
    text = review_node.find('text').text
    rid = review_node.get('rid')
    
    default_dict = {
        'rid': int(rid),
        'text': text
    }
    
    res = [default_dict]
        
    return res

def filter_same_train_aspects(reviews):
    res = []
    for v in reviews:
        if len(v['aspects']) == 1 or v['aspects'][0] == v['aspects'][1]:
            res.append(v)
    
    return res

def filter_different_train_aspects(reviews):
    res = []
    for v in reviews:
        if len(v['aspects']) == 2 and not(v['aspects'][0] == v['aspects'][1]):
            res.append(v)
            
    return res

def parse_dataset(filename):
    root_node = xml.etree.ElementTree.parse(filename).getroot()
    review_nodes = root_node.findall('review')
    reviews = [item for sublist in review_nodes for item in parse_review_node(sublist)]
    
    return pd.DataFrame.from_dict(reviews)

def parse_testset(filename):
    root_node = xml.etree.ElementTree.parse(filename).getroot()
    review_nodes = root_node.findall('review')
    reviews = [item for sublist in review_nodes for item in parse_review_test_node(sublist)]
    
    return pd.DataFrame.from_dict(reviews)

In [3]:
training_parsed = parse_dataset('../training_set.xml')
validation_parsed = parse_dataset('../validation_set.xml')
test_parsed = parse_testset('../test_set.xml')

In [4]:
def tokenize_zomato_reviews():
    with open('../scrapper/reviews.json', 'r') as fp:
        reviews = json.load(fp)['reviews']
    
    sentences_tokens = []

    for review in reviews:
        try :
            tokens =  re.sub(r"[^a-z0-9]+", " ", review.lower()).split()
            sentences_tokens.append(tokens)
        except:
            continue
            
    return sentences_tokens

def tokenize_dataset(res):
    sentences_tokens=[]
    
    for id in np.unique(res.rid.values):
        df = res[res.rid == id]
        text = df.iloc[0]['text']
        tokens =  re.sub(r"[^a-z0-9]+", " ", text.lower()).split()
        sentences_tokens.append(tokens)
        
    return sentences_tokens
        


In [6]:
scrap_tokenize = tokenize_zomato_reviews()
test_tokenize = tokenize_dataset(training_parsed)
validation_tokenize = tokenize_dataset(validation_parsed)
test_tokenize = tokenize_dataset(test_parsed)

all_tokenize = []
all_tokenize.extend(scrap_tokenize)
all_tokenize.extend(test_tokenize)
all_tokenize.extend(validation_tokenize)
all_tokenize.extend(test_tokenize)

model = Word2Vec(
    sentences=all_tokenize,
    size=20,
    window=5,
    min_count=1,
    workers=4,
)

with open('wordmodel', 'wb') as fp:
    pickle.dump(model, fp, pickle.HIGHEST_PROTOCOL)


In [14]:
with open('wordmodel', 'rb') as fp:
    model = pickle.load(fp)
    
model.similar_by_word('tidak')

  after removing the cwd from sys.path.


[('sedikit', 0.9811491966247559),
 ('tipe', 0.9796196222305298),
 ('istimewa', 0.9747599363327026),
 ('banyak', 0.9714991450309753),
 ('besar', 0.9710503816604614),
 ('variasi', 0.9706156849861145),
 ('ekspektasi', 0.9693906307220459),
 ('renyah', 0.9681794047355652),
 ('tmen2', 0.9665861129760742),
 ('tetapi', 0.9656574726104736)]

In [57]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

Using TensorFlow backend.


In [59]:
data = pd.read_csv('../Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

In [None]:
data = data[data.sentiment != "Neutral"]
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [None]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

In [None]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

In [None]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))