### DATA SELECTION

In [None]:
import pandas as pd
import numpy as np
import string
from collections import Counter,defaultdict,OrderedDict
from tweebo_parser import API, ServerError
import json
from nltk.tokenize import word_tokenize
import re

In [None]:
# Each line of the file is parsed as its own JSON document.
with open('./yelp_dataset/business.json', 'r', encoding='utf-8') as f:
    biz_list = [json.loads(line) for line in f]

In [None]:
# Attribute keys checked by filter_food: a business record carrying any of
# these keys in its 'attributes' mapping is kept.
attr_food = {
    'OutdoorSeating',
    'Alcohol',
    'RestaurantsGoodForGroups',
    'RestaurantsAttire',
    'RestaurantsReservations',
    'RestaurantsTakeOut',
    'RestaurantsDelivery',
    'Caters',
}

In [None]:
# Translation table mapping every character of string.punctuation to a space,
# for use with str.translate.
punct = string.punctuation
transtab = str.maketrans(punct, " " * len(punct))

In [None]:
# Food-related keywords. In the visible code they are referenced only by the
# commented-out category-similarity fallback inside filter_food.
keyword_food = ['food','cafe','restaurant','bar','dine','drink','bistro','pub','eat']

In [None]:
def filter_food(entry):
    """Return True when a business record carries a food-related attribute.

    ``entry`` is a dict-like business record. It qualifies when it has a
    non-None 'attributes' mapping that shares at least one key with the
    module-level ``attr_food`` set; otherwise False is returned.
    """
    if 'attributes' not in entry:
        return False
    attributes = entry['attributes']
    if attributes is None:
        return False
    # Disabled fallback (previously sketched here as commented-out code):
    # for records without attributes, split the punctuation-stripped,
    # lower-cased 'categories' string into words and accept the record when
    # any word/keyword_food pair has embedding cosine similarity > 0.3.
    return not attr_food.isdisjoint(attributes.keys())

In [None]:
# Project every business accepted by filter_food onto the fields that become
# the columns of the `biz` DataFrame, in this order.
biz_fields = ('business_id', 'name', 'address', 'city', 'state', 'postal_code',
              'latitude', 'longitude', 'stars', 'review_count', 'is_open')
filter_biz = [tuple(ent_[field] for field in biz_fields)
              for ent_ in biz_list if filter_food(ent_)]

In [None]:
# Tabular view of the selected businesses. Column order mirrors the tuple
# layout of filter_biz. The seventh column was previously misspelled
# 'latitide'; it is named 'latitude' to match the source field.
biz = pd.DataFrame(
    filter_biz,
    columns=['business_id', 'name', 'address', 'city', 'state', 'postal_code',
             'latitude', 'longitude', 'stars', 'review_count', 'is_open'],
)

In [None]:
# Each line of the file is parsed as its own JSON document.
with open('./yelp_dataset/review.json', 'r', encoding='utf-8') as f:
    review_list = [json.loads(line) for line in f]

In [None]:
# business_id values of the selected businesses whose state code is 'PA'
# (presumably Pennsylvania), as a NumPy array.
biz_id = biz.loc[biz.state=='PA','business_id'].values

In [None]:
# Collect reviews that belong to the selected businesses.
# `x in ndarray` performs a full linear scan for every review, so the ids are
# put into a set once to make each membership test O(1).
selected_biz_ids = set(biz_id)
filter_review, i = [], 0

for rev in review_list:
    # Stop once the running count exceeds 100000 (restrict to ~100K reviews
    # to fit into 200mb); the cap itself is unchanged from the original.
    if i > 100000:
        break
    if rev['business_id'] in selected_biz_ids:
        filter_review.append(rev)
        i += 1

In [None]:
review = pd.DataFrame(filter_review)
# Per business: True when it has more than 100 of the selected reviews.
over_100 = review.groupby('business_id')['date'].count() > 100
biz_high = pd.DataFrame(over_100).reset_index()
biz_h_list = biz_high.loc[biz_high['date'], 'business_id'].values
# Keep only reviews of those heavily reviewed businesses.
review = review[review['business_id'].isin(biz_h_list)]
# Drop the large intermediates now that `review` holds the selection.
del biz_list, review_list, filter_biz, filter_review

In [None]:
review.stars.mean()

In [None]:
review.groupby('stars')['business_id'].count()

In [None]:
# review.to_csv('./review_selected.csv',index=False)
# export to keep a copy
# left with 53920 reviews

In [None]:
review = pd.read_csv('./review_selected.csv') 

In [None]:
len(review.user_id.unique()) 
# left with 2K+ unique users

In [None]:
len(review.business_id.unique())
# left with 200+ restaurants

### TOKENIZE, PARSE & TEST WITH TDSA

In [None]:
import spacy
from nltk.parse.dependencygraph import DependencyGraph
from bella import helper
from bella.models.target import TargetDep

nlp = spacy.load("en_core_web_sm")

In [None]:
# Client for the parser server expected at localhost:8000.
tweebo_api = API(hostname='localhost', port=8000)
# Raw-string pattern: '\s' inside a plain string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
whitespace_run = re.compile(r'\s+')
# Sample input: the first and last review texts with whitespace runs collapsed.
text_data = [whitespace_run.sub(' ', review.text.iloc[0]),
             whitespace_run.sub(' ', review.text.iloc[-1])]
try:
    result_conll = tweebo_api.parse_conll(text_data)
except ServerError as e:
    # NOTE: result_conll stays undefined when the server call fails, so the
    # cells that follow will raise NameError until this one succeeds.
    print(f'{e}\n{e.message}')

In [None]:
def add_root_node(list_conll_sentences):
    """Label every token whose HEAD column is 0 with the relation 'ROOT'.

    Args:
        list_conll_sentences: iterable of CoNLL strings; rows are separated
            by newlines and columns by tabs, with HEAD in column 7 and the
            dependency relation in column 8 (1-based).

    Returns:
        A new list with one string per input, identical to the input except
        that the relation column of every HEAD == 0 row is set to 'ROOT'.

    Raises:
        ValueError: if a row with a relation column has a non-integer HEAD.
    """
    head_col, deprel_col = 6, 7
    relabelled = []
    for conll_text in list_conll_sentences:
        rows = []
        for line in conll_text.split('\n'):
            fields = line.split('\t')
            # Blank or truncated lines (trailing newline, sentence separators)
            # have no HEAD/relation columns; pass them through untouched
            # instead of failing with IndexError.
            if len(fields) > deprel_col and int(fields[head_col]) == 0:
                fields[deprel_col] = 'ROOT'
            rows.append('\t'.join(fields))
        relabelled.append('\n'.join(rows))
    return relabelled

In [None]:
nltk_result = add_root_node(result_conll)
nltk_dep_tree = DependencyGraph(nltk_result[0])

In [None]:
nltk_dep_tree.tree()

In [None]:
nltk_dep_tree.nodes[0]

In [None]:
target_dep = helper.download_model(TargetDep, 'SemEval 14 Restaurant')

In [None]:
test_data = []
parser_col = ['ID','TOKEN','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
def extract_entities(conll_results=None):
    """Group runs of consecutive noun tokens into candidate entities.

    For every CoNLL string, tokens whose UPOS column is '^' or 'N'
    (presumably the tagger's noun tags) are collected, and tokens with
    consecutive IDs are joined with a space into one entity. Tokens that
    contain 'UNK' (presumably unknown-token placeholders) never become part
    of an entity and end the run they would otherwise extend.

    Previously, once the current entity contained 'UNK' the state was never
    reset, so every later entity of that text was silently dropped, and the
    final token could be emitted without the 'UNK' check.

    Args:
        conll_results: iterable of CoNLL strings (rows separated by newlines,
            columns by tabs; ID, TOKEN and UPOS in columns 1, 2 and 4).
            Defaults to the module-level ``result_conll`` so that existing
            ``extract_entities()`` calls keep working.

    Returns:
        One ``(entities, count)`` pair per input string, where ``entities``
        is a list of ``(text, start_id, end_id)`` tuples in order of
        appearance and ``count == len(entities)``.
    """
    if conll_results is None:
        conll_results = result_conll
    id_col, token_col, upos_col = 0, 1, 3
    noun_tags = ('^', 'N')

    ent_series = []
    for res in conll_results:
        ent_lst = []
        run_tokens, start, end = [], None, None
        for row in res.split('\n'):
            fields = row.split('\t')
            # Skip blank/short rows and everything that is not a noun token.
            if len(fields) <= upos_col or fields[upos_col] not in noun_tags:
                continue
            idx, token = int(fields[id_col]), fields[token_col]
            extends_run = bool(run_tokens) and idx == end + 1 and 'UNK' not in token
            if extends_run:
                run_tokens.append(token)
                end = idx
                continue
            # The run is over: emit it, then start a new one unless this
            # token is a placeholder that must be discarded.
            if run_tokens:
                ent_lst.append((" ".join(run_tokens), start, end))
            if 'UNK' in token:
                run_tokens, start, end = [], None, None
            else:
                run_tokens, start, end = [token], idx, idx
        if run_tokens:
            ent_lst.append((" ".join(run_tokens), start, end))
        ent_series.append((ent_lst, len(ent_lst)))
    return ent_series
temp = extract_entities()

In [None]:
# Build one model input per extracted entity: the full text, the entity
# string, and the character span of its first occurrence in the text.
for txt_, (entities, _count) in zip(text_data, temp):
    for ent_, _start, _end in entities:
        # The entity is literal text, not a pattern: escape it so characters
        # such as '(' or '+' cannot break or distort the search.
        found = re.search(re.escape(ent_), txt_)
        if found is None:
            # The joined tokens do not occur verbatim in the text; skip the
            # target instead of failing on None.span().
            continue
        test_data.append({'text': txt_, 'target': ent_, 'spans': found.span()})

In [None]:
target_dep.probabilities(test_data)