In [None]:
# @hidden_cell
# The project token is an authorization token used to access project resources
# (data sources, connections) and is also used by the platform APIs.
# NOTE(review): the project id and token are inlined below; keep this cell
# hidden or strip the values before sharing the notebook.
from project_lib import Project

project = Project(
    project_id='project-id',
    project_access_token='project-token',
)
# Handle to the project's context object, kept for later cells.
pc = project.project_context


### READ DATA FROM DB2

In [None]:
import pandas as pd
import numpy as np
from ibmdbpy import IdaDataBase, IdaDataFrame
import string
import emoji
from collections import Counter,defaultdict,OrderedDict
from tweebo_parser import API, ServerError
import pyodbc
import json
import re

In [None]:
# DSN name shared by the ibmdbpy connection here and the pyodbc connection later.
conn_string = 'credential-db'

# Open the ibmdbpy connection and display the table listing as the cell output.
db = IdaDataBase(
    dsn=conn_string,
)
db.show_tables(show_all=False)

In [None]:
# USER and BUSINESS are read straight through ibmdbpy.
user = IdaDataFrame(db, 'TGZ44984.USER').as_dataframe()
biz = IdaDataFrame(db, 'TGZ44984.BUSINESS').as_dataframe()

# REVIEW is a big table with VARCHAR columns longer than 1024 characters,
# which IdaDataFrame cannot handle, so it is fetched through pyodbc instead.
# NOTE(review): the names below are applied positionally to `select *`, so
# they assume the table's column order matches this list - confirm against
# the table definition.
review_columns = ['review_id','user_id','biz_id','stars','useful','funny','cool','text','date']

cnx = pyodbc.connect('DSN='+conn_string)
try:
    cursor = cnx.cursor()
    try:
        cursor.execute('select * from TGZ44984.REVIEW')
        # pyodbc returns Row objects; convert them to plain tuples for pandas.
        rows = [tuple(x) for x in cursor.fetchall()]
    finally:
        # Release the cursor even when the query or the fetch fails.
        cursor.close()
finally:
    # Always hand the connection back, otherwise a failed query leaks it.
    cnx.close()

review = pd.DataFrame(rows, columns=review_columns)


### RUN NLU API

In [None]:
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
# import watson_developer_cloud.natural_language_understanding.features.v1 as Features
from watson_developer_cloud.natural_language_understanding_v1 import Features, SentimentOptions, KeywordsOptions
import re


In [None]:
# Build the Watson Natural Language Understanding client: IAM authentication,
# a pinned API version date, and the service endpoint URL.
# NOTE(review): the IAM key is inlined here; strip it before sharing the notebook.
authenticator = IAMAuthenticator('IAMAuth')
natural_language_understanding = NaturalLanguageUnderstandingV1(
    authenticator=authenticator,
    version='2019-07-12',
)

natural_language_understanding.set_service_url(
    'https://gateway.watsonplatform.net/natural-language-understanding/api'
)

In [None]:
# Accumulator for the per-review NLU responses (one dict per analysed review).
result = list()

In [None]:
# Send every usable review text to Watson NLU and store its keyword analysis
# (with per-keyword sentiment and emotion) together with the detected language.

# The requested feature set is identical for every call, so build it once.
keyword_features = Features(keywords=KeywordsOptions(sentiment=True, emotion=True))

# Reviews already present in `result` are skipped, so re-running this cell
# (for example after an interrupted run) resumes instead of appending duplicates.
processed_ids = {entry['review_id'] for entry in result}

for id_, text_ in zip(review['review_id'], review['text']):
    if id_ in processed_ids:
        continue
    # A NULL/NaN text coming from the database is not a str; re.search() and
    # len() would raise TypeError on it, so such rows are skipped.
    if not isinstance(text_, str):
        continue
    # Skip texts shorter than 10 characters or without any word character.
    if len(text_) < 10 or re.search(r'\w+', text_) is None:
        continue
    response = natural_language_understanding.analyze(
        text=text_,
        features=keyword_features,
    ).get_result()
    re_ = response['keywords']
    lang_ = response['language']
    result.append({'review_id':id_, 'result':re_, 'lang':lang_})
    processed_ids.add(id_)
    

In [None]:
# Serialise the collected NLU responses as indented JSON and store them in the
# project under the name "result.json".
result_json = json.dumps(result, indent=2)
project.save_data("result.json", result_json)

### KEYWORDS TO ASPECTS

In [None]:
from nltk.tokenize import word_tokenize
import spacy
from mxnet import gluon
from mxnet import nd
import gluonnlp as nlp

In [None]:
# Load the 'glove.6B.300d' GloVe embedding and attach it to a vocabulary built
# from the embedding's own token list, so tokens can be looked up through
# `vocab.embedding[token]`.
# NOTE: `nlp` is the gluonnlp module here (`import gluonnlp as nlp`); a later
# cell rebinds `nlp` to a spaCy pipeline, so this cell has to run before that
# one and cannot be re-run afterwards without re-importing gluonnlp.
glove_6b300d = nlp.embedding.create('glove', source='glove.6B.300d')
token_counts = nlp.data.Counter(glove_6b300d.idx_to_token)
vocab = nlp.Vocab(token_counts)
vocab.set_embedding(glove_6b300d)

In [None]:
# Translation table replacing every character of string.punctuation by a space.
punct = string.punctuation
transtab = str.maketrans(punct, " " * len(punct))

# Aspects a keyword can be assigned to, and the emotion labels tracked per aspect.
aspects = ['cleanliness','food','service','location','price']
emo_list = ['sadness','joy','fear','disgust','anger']

# Output columns: the review id, one '<aspect>_sentiment' column per aspect,
# then one '<aspect>_<emotion>' column per aspect/emotion pair (aspect-major).
sentiment_cols = [aspect + '_sentiment' for aspect in aspects]
emotion_cols = [aspect + '_' + emotion for aspect in aspects for emotion in emo_list]
cols = ['review_id'] + sentiment_cols + emotion_cols

# NOTE: this rebinds `nlp` from the gluonnlp module (imported above as `nlp`)
# to the spaCy pipeline; from here on `nlp(...)` means spaCy.
nlp = spacy.load("en_core_web_sm")

# One dict per review is appended here by the aggregation loop.
final_result = []

In [None]:
def cos_sim(x, y):
    """Return the cosine similarity between ``x`` and ``y``.

    Evaluates ``dot(x, y) / (norm(x) * norm(y))`` with MXNet ``nd`` operations
    and returns the result of that expression unchanged.
    """
    dot_product = nd.dot(x, y)
    norm_product = nd.norm(x) * nd.norm(y)
    return dot_product / norm_product

def check_similarity(word, aspects = aspects, threshold = 0.30):
    """Return the aspect whose embedding is most similar to ``word``.

    Parameters
    ----------
    word : str
        Token to look up in ``vocab.embedding``.
    aspects : sequence of str, optional
        Candidate aspect tokens; defaults to the module-level ``aspects`` list.
    threshold : float, optional
        Cosine similarity the best aspect must exceed. Defaults to 0.30, the
        value that used to be hard-coded.

    Returns
    -------
    str or None
        The best-matching aspect, or ``None`` when ``word`` is empty, no
        similarity can be computed, or the best similarity does not exceed
        ``threshold``.
    """
    if not word or len(aspects) == 0:
        return None

    # Look the word vector up once instead of once per aspect.
    word_vec = vocab.embedding[word]
    scores = np.array(
        [cos_sim(vocab.embedding[aspect], word_vec).asnumpy()[0] for aspect in aspects],
        dtype=float,
    )

    # cos_sim divides by the product of the norms, so a zero-norm vector gives
    # NaN (presumably the case for tokens unknown to the embedding - confirm).
    # Python's max() and np.argmax() treat NaN differently, so NaN scores are
    # excluded explicitly rather than left to chance.
    if np.isnan(scores).all():
        return None
    best = int(np.nanargmax(scores))
    if scores[best] > threshold:
        return aspects[best]
    return None

In [None]:
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Stand-in ``__iter__`` patched onto the object-storage body below; always returns 0."""
    return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_30d96c6e27154b7d92410a8469740ae4 = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id='apikey',
    ibm_auth_endpoint="https://iam.ng.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com',
)

# Fetch the stored 'result.json' object and take its body from the response.
result_object = client_30d96c6e27154b7d92410a8469740ae4.get_object(Bucket='bucket',Key='result.json')
body = result_object['Body']

# Add the missing __iter__ method, so pandas accepts body as a file-like object.
if not hasattr(body, "__iter__"):
    body.__iter__ = types.MethodType(__iter__, body)

# JSON data can be semi-structured and carry additional metadata, so loading it
# with pandas may fail. See the documentation of 'pandas.read_json()' and
# 'pandas.io.json.json_normalize' for ways to adjust the loading:
# http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html

# df_data_1 = pd.read_json(body, orient='values')
# df_data_1.head()

# The file is parsed with the json module instead of pandas.
re_list = json.load(body)

In [None]:
# Turn the keyword-level NLU output into one row per review: every keyword is
# mapped to an aspect, its sentiment score and dominant emotion score are
# collected per aspect, and the collected values are averaged at the end.
for res in re_list:
    # Start with every output column set to None.
    row = dict.fromkeys(cols)
    row['review_id'] = res['review_id']
    for sent_ in res['result']:
        doc = nlp(sent_['text'])
        # Keep only nouns, pronouns and proper nouns of the keyword phrase.
        pos_tag = [token.text for token in doc if token.pos_ in ['NOUN', 'PRON', 'PROPN']]
        # Replace punctuation by spaces, lowercase, then drop all whitespace.
        text_ = [re.sub(r'\s+','',x.translate(transtab).lower()) for x in pos_tag]

        # The first token that maps to an aspect decides the keyword's aspect.
        asp_ = None
        for t_ in text_:
            if not t_:
                # Token consisted of punctuation only; nothing to look up.
                continue
            # Fixed: the word comes first and the aspect list second (the
            # arguments used to be passed in the opposite order).
            asp_ = check_similarity(t_, aspects)
            if asp_ is not None:
                break
        if asp_ is None:
            continue

        score_ = sent_['sentiment']['score']
        label_ = sent_['sentiment']['label']
        # Make the sign of the score agree with the label.
        # Fixed: this used to assign to an undefined name `score`, which
        # raised NameError and never corrected `score_`.
        if (label_ == 'negative' and score_ > 0) or (label_ == 'positive' and score_ < 0):
            score_ = -score_
        if row[asp_+'_sentiment'] is None:
            row[asp_+'_sentiment'] = []
        row[asp_+'_sentiment'].append(score_)

        # Only the highest-scoring emotion of the keyword is recorded; an
        # absent or empty emotion mapping is skipped.
        if sent_.get('emotion'):
            emo_ = Counter(sent_['emotion']).most_common(1)[0]
            emo_label = asp_+'_'+emo_[0]
            if row[emo_label] is None:
                row[emo_label] = []
            row[emo_label].append(emo_[1])

    # Collapse each collected list into its mean; untouched columns stay None.
    for col in cols[1:]:
        if isinstance(row[col],list):
            row[col] = np.mean(row[col])
    final_result.append(row)

In [None]:
# Convert the per-review rows to CSV text (without the index column) and store
# it in the project under the name 'result.csv'.
result_csv = pd.DataFrame(final_result).to_csv(index=False)
project.save_data('result.csv', result_csv)