In [1]:
# import all needed libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from datetime import datetime
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import copy

In [2]:
# distilbert-base-uncased-finetuned-sst-2-english
# Trained on BERT base model as a teacher
from transformers import pipeline
classifier1 = pipeline("sentiment-analysis")

def bert_distilbert(text):
    text = str(text)
    if text == 'nan ' or text == '':
        return math.nan
    return classifier1(text)
# This a bert-base-multilingual-uncased model finetuned for sentiment analysis on product reviews
# in six languages: English, Dutch, German, French, Spanish and Italian
# It predicts the sentiment of the review as a number of stars (between 1 and 5)
classifier2 = pipeline('sentiment-analysis', model='nlptown/bert-base-multilingual-uncased-sentiment')  

def bert_multilingual(text):
    text = str(text)
    if text == 'nan ' or text == '':
        return math.nan
    return classifier2(text)
# The model was fine-tuned and evaluated on 15 data sets from diverse text sources to enhance 
# generalization across different types of texts (reviews, tweets, etc.)
classifier3 = pipeline('sentiment-analysis', model='siebert/sentiment-roberta-large-english')   

def roberta_english(text):
    text = str(text)
    if text == 'nan ' or text == '':
        return math.nan
    return classifier3(text)
# This is a roBERTa-base model trained on ~58M tweets and finetuned for sentiment analysis 
# with the TweetEval benchmark.
# Labels: 0 -> Negative; 1 -> Neutral; 2 -> Positive
classifier4 = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment')     

def roberta_tweet(text):
    text = str(text)
    if text == 'nan ' or text == '':
        return math.nan
    return classifier4(text)

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid_obj= SentimentIntensityAnalyzer()
from sentifish import Sentiment
from afinn import Afinn
afn = Afinn()

def textblob(text):
    text = str(text)
    res = TextBlob(text)
    return res.sentiment.polarity

def vader(text):
    text = str(text)
    temp = sid_obj.polarity_scores(sentence)
    return temp['compound']

def sentifish(text):
    text = str(text)
    obj=Sentiment(text)
    polarity = obj.analyze( )
    return polarity

def afinn(text):
    text = str(text)
    return afn.score(text)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [3]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
stop_words = set(stopwords.words('english'))
nltk.download('averaged_perceptron_tagger')
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy import displacy
    
def dataframe_display(text):
    doc = nlp(text)
    print(doc)
    df = pd.DataFrame(columns=['token','.dep_','.pos_','.tag_','children','children.pos_','children.text'])
    for token in doc:
        c = []
        p = []
        t = []
        for children in token.children:
            c.append(children)
            p.append(children.pos_)
            t.append(children.text)
        df2 = {'token':token,
              '.dep_':token.dep_,
              '.pos_':token.pos_,
              '.tag_':token.tag_,
              'children':c,
              'children.pos_':p,
              'children.text':t}
        df = df.append(df2, ignore_index = True)
    display(pd_centered(df))
    
def pos_display(text):
    print(text)
    doc = nlp(text)
    sentence_spans = list(doc.sents)
    options = {'distance': 100}
    displacy.render(sentence_spans, style='dep',jupyter=True,options=options)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
def print_list(lis):
    print("Length of list:",len(lis))
    for i in range(len(lis)):
        print(lis[i])
        
def pd_centered(df):
    return df.style.set_table_styles([
        {"selector": "th", "props": [("text-align", "center")]},
        {"selector": "td", "props": [("text-align", "center")]}])

In [5]:
df = pd.read_csv('Sample_for testing - Sheet1.csv')
df.shape

(1079, 2)

In [6]:
df.head()

Unnamed: 0,review_content,id
0,"The Room, hotel, menu everything was just perf...",2591082
1,Excellent experience to spend time in this res...,2591117
2,"It's spacious, neat & clean. Ordered misal pav...",2590678
3,Decent size rooms and the heating was very eff...,2590050
4,"If zero Rating is available, this deserves the...",2589882


In [7]:
cd = pd.DataFrame(columns = ['row','Reviewcontent','ReviewContentPos','ReviewContentNeg','ReviewContentNeu'])

counter = 0
for i in range(df.shape[0]):
#     print(i)
    if df.loc[i,'review_content']=='There are no comments available for this review':
        df.loc[i,'review_content'] = math.nan
#         print(i)
    try:
        if math.isnan(df.loc[i,'review_content']):
            counter += 1
            continue
    except:
        a = 1
    
    text = df.loc[i,'review_content']
    now = text
    now = now.split('\n')
    pos = math.nan
    neg = math.nan
    neu = math.nan
            
    for j in range(len(now)):
        x = now[j].split()
        if len(x)==0:
            continue
        if x[0] == '[Positive]:':
            pos = ' '.join(x[1:])
        elif x[0] == '[Negative]:':
            neg = ' '.join(x[1:])
        else:
            neu = ' '.join(x)
    df2 = {'row':i,
           'Reviewcontent':df.loc[i,'review_content'],
           'ReviewContentPos':pos,
           'ReviewContentNeg':neg,
           'ReviewContentNeu':neu}
    cd = cd.append(df2, ignore_index = True)
print(counter)

0


In [8]:
cd["ReviewContentPos"] = cd["ReviewContentPos"].astype(str)
cd["ReviewContentNeg"] = cd["ReviewContentNeg"].astype(str)
cd["ReviewContentNeu"] = cd["ReviewContentNeu"].astype(str)

In [9]:
cd.head(10)

Unnamed: 0,row,Reviewcontent,ReviewContentPos,ReviewContentNeg,ReviewContentNeu
0,0,"The Room, hotel, menu everything was just perf...",,,"The Room, hotel, menu everything was just perf..."
1,1,Excellent experience to spend time in this res...,,,Excellent experience to spend time in this res...
2,2,"It's spacious, neat & clean. Ordered misal pav...",,,"It's spacious, neat & clean. Ordered misal pav..."
3,3,Decent size rooms and the heating was very eff...,,,Decent size rooms and the heating was very eff...
4,4,"If zero Rating is available, this deserves the...",,,"If zero Rating is available, this deserves the..."
5,5,Excellent resort with on the toe services by e...,,,Excellent resort with on the toe services by e...
6,6,[Positive]: Staff courtesy\n[Negative]: Food w...,Staff courtesy,Food was terrible. Restaurant service pathetic,
7,7,One of the most memorable bday ever Thanku so ...,,,One of the most memorable bday ever Thanku so ...
8,8,Nice for parties. Food very greasy. Not recomm...,,,Nice for parties. Food very greasy. Not recomm...
9,9,"The place is cozy, my reservation was upgraded...",,,"The place is cozy, my reservation was upgraded..."


In [10]:
import re
# Remove Punctuations without fullstop
rem_pun = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~'
def remove_punctuation_without_full(text):
    punctuationfree="".join([i if i not in rem_pun else ' ' for i in text])
    return punctuationfree

# Remove Numbers
def remove_numbers(text):
    pattern = r'[0-9]'
    line = re.sub(pattern,' ', text)
    return line

# Remove non ascii characters
def remove_non_ascii(text):
    now = text
    now = now.strip().split()
    now = [''.join([i for i in word if ord(i) < 128]) for word in now]
    text = ""
    for i in range(len(now)):
        text = text + now[i] + " "
    return text

# Convert multiple spaces to single space
def convert_multiple_spaces(text):
    answer = re.sub(' +', ' ', text)
    return answer

# Remove first and  end spaces
def remove_first_end_spaces(string):
    return "".join(string.rstrip().lstrip())

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
def lemma(text):
    now = text
    now = now.strip().split()
    now = [lemmatizer.lemmatize(word) for word in now]
    text = ""
    for i in range(len(now)):
        text = text + now[i] + " "
    return text

In [11]:
print('Preprocessing Started...')

cd['clean_Reviewcontent'] = cd['Reviewcontent'].apply(lambda x:remove_punctuation_without_full(x))
cd['clean_Reviewcontent'] = cd['clean_Reviewcontent'].apply(lambda x:remove_numbers(x))
cd['clean_Reviewcontent'] = cd['clean_Reviewcontent'].apply(lambda x:remove_non_ascii(x))
cd['clean_Reviewcontent'] = cd['clean_Reviewcontent'].apply(lambda x:lemma(x))
cd['clean_Reviewcontent'] = cd['clean_Reviewcontent'].apply(lambda x:convert_multiple_spaces(x))
cd['clean_Reviewcontent'] = cd['clean_Reviewcontent'].apply(lambda x:remove_first_end_spaces(x))

cd['clean_ReviewContentPos'] = cd['ReviewContentPos'].apply(lambda x:remove_punctuation_without_full(x))
cd['clean_ReviewContentPos'] = cd['clean_ReviewContentPos'].apply(lambda x:remove_numbers(x))
cd['clean_ReviewContentPos'] = cd['clean_ReviewContentPos'].apply(lambda x:remove_non_ascii(x))
cd['clean_ReviewContentPos'] = cd['clean_ReviewContentPos'].apply(lambda x:lemma(x))
cd['clean_ReviewContentPos'] = cd['clean_ReviewContentPos'].apply(lambda x:convert_multiple_spaces(x))
cd['clean_ReviewContentPos'] = cd['clean_ReviewContentPos'].apply(lambda x:remove_first_end_spaces(x))

cd['clean_ReviewContentNeg'] = cd['ReviewContentNeg'].apply(lambda x:remove_punctuation_without_full(x))
cd['clean_ReviewContentNeg'] = cd['clean_ReviewContentNeg'].apply(lambda x:remove_numbers(x))
cd['clean_ReviewContentNeg'] = cd['clean_ReviewContentNeg'].apply(lambda x:remove_non_ascii(x))
cd['clean_ReviewContentNeg'] = cd['clean_ReviewContentNeg'].apply(lambda x:lemma(x))
cd['clean_ReviewContentNeg'] = cd['clean_ReviewContentNeg'].apply(lambda x:convert_multiple_spaces(x))
cd['clean_ReviewContentNeg'] = cd['clean_ReviewContentNeg'].apply(lambda x:remove_first_end_spaces(x))

cd['clean_ReviewContentNeu'] = cd['ReviewContentNeu'].apply(lambda x:remove_punctuation_without_full(x))
cd['clean_ReviewContentNeu'] = cd['clean_ReviewContentNeu'].apply(lambda x:remove_numbers(x))
cd['clean_ReviewContentNeu'] = cd['clean_ReviewContentNeu'].apply(lambda x:remove_non_ascii(x))
cd['clean_ReviewContentNeu'] = cd['clean_ReviewContentNeu'].apply(lambda x:lemma(x))
cd['clean_ReviewContentNeu'] = cd['clean_ReviewContentNeu'].apply(lambda x:convert_multiple_spaces(x)) 
cd['clean_ReviewContentNeu'] = cd['clean_ReviewContentNeu'].apply(lambda x:remove_first_end_spaces(x)) 

print('Preprocessing Done!')

Preprocessing Started...
Preprocessing Done!


In [12]:
cd.head(10)

Unnamed: 0,row,Reviewcontent,ReviewContentPos,ReviewContentNeg,ReviewContentNeu,clean_Reviewcontent,clean_ReviewContentPos,clean_ReviewContentNeg,clean_ReviewContentNeu
0,0,"The Room, hotel, menu everything was just perf...",,,"The Room, hotel, menu everything was just perf...","The Room, hotel, menu everything wa just perfe...",,,"The Room, hotel, menu everything wa just perfe..."
1,1,Excellent experience to spend time in this res...,,,Excellent experience to spend time in this res...,Excellent experience to spend time in this res...,,,Excellent experience to spend time in this res...
2,2,"It's spacious, neat & clean. Ordered misal pav...",,,"It's spacious, neat & clean. Ordered misal pav...","It s spacious, neat clean. Ordered misal pav s...",,,"It s spacious, neat clean. Ordered misal pav s..."
3,3,Decent size rooms and the heating was very eff...,,,Decent size rooms and the heating was very eff...,Decent size room and the heating wa very effec...,,,Decent size room and the heating wa very effec...
4,4,"If zero Rating is available, this deserves the...",,,"If zero Rating is available, this deserves the...","If zero Rating is available, this deserves the...",,,"If zero Rating is available, this deserves the..."
5,5,Excellent resort with on the toe services by e...,,,Excellent resort with on the toe services by e...,Excellent resort with on the toe service by ea...,,,Excellent resort with on the toe service by ea...
6,6,[Positive]: Staff courtesy\n[Negative]: Food w...,Staff courtesy,Food was terrible. Restaurant service pathetic,,Positive Staff courtesy Negative Food wa terri...,Staff courtesy,Food wa terrible. Restaurant service pathetic,
7,7,One of the most memorable bday ever Thanku so ...,,,One of the most memorable bday ever Thanku so ...,One of the most memorable bday ever Thanku so ...,,,One of the most memorable bday ever Thanku so ...
8,8,Nice for parties. Food very greasy. Not recomm...,,,Nice for parties. Food very greasy. Not recomm...,Nice for parties. Food very greasy. Not recomm...,,,Nice for parties. Food very greasy. Not recomm...
9,9,"The place is cozy, my reservation was upgraded...",,,"The place is cozy, my reservation was upgraded...","The place is cozy, my reservation wa upgraded ...",,,"The place is cozy, my reservation wa upgraded ..."


In [13]:
raw0 = cd.loc[:,'clean_Reviewcontent'].to_list()
raw1 = cd.loc[:,'clean_ReviewContentPos'].to_list()
raw2 = cd.loc[:,'clean_ReviewContentNeg'].to_list()
raw3 = cd.loc[:,'clean_ReviewContentNeu'].to_list()

In [14]:
rem_all = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
def remove_punctuation_without_gap(text):
    punctuationfree="".join([i for i in text if i not in rem_all])
    # punctuationfree = punctuationfree.replace(' ','')
    return punctuationfree

from nltk.corpus import stopwords
stopwords_lst = stopwords.words('english')
sw = set(['dont','nothing','couldnt','wouldnt','doesnt','mustnt',
          'couldn','wouldn','doesn','mustn','neednt','needn','wont',
          'unfortunately','havent','arent','haven','aren','cant',
          'didnt','didn','shouldnt','shouldn','werent','weren',
          'shant','don','cannot','sorry','awfully','not'])
for word in stopwords_lst:
    sw.add(remove_punctuation_without_gap(word))
    
import requests
stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
stopwords = list(set(stopwords_list.decode().splitlines()))
for word in stopwords:
    sw.add(remove_punctuation_without_gap(word))

def remove_stopwords(text):
    now = text
    now = now.strip().split()
    now = [word for word in now if word not in sw and len(word)>2]
    text = ""
    for i in range(len(now)):
        text = text + now[i] + " "
    return text

In [15]:
def aspect_based1(sentences):
#     print(sentences)
    aspects = []
    for sentence in sentences:
        if sentence == '':
            continue
        doc = nlp(sentence)
        descriptive_term = []
        target = []
        adv = []
        negation = []
        descriptive_term_i = []
        target_i = []
        adv_i = []
        negation_i = []
        sep = []
        sep_i = []
        for token in doc:
            if token.pos_ == 'PUNCT' or token.pos_ == 'CCONJ':
                sep.append(token.text.lower())
                sep_i.append(token.i)
                continue
            if (token.dep_ == 'neg' and token.pos_ =='PART') or token.text.lower()=='no':
                negation.append(token.text.lower())
                negation_i.append(token.i)
                continue
            if remove_punctuation_without_gap(token.text.lower()) in sw:
                continue
            if token.pos_ == 'VERB' or token.pos_ == 'ADV':
                adv.append(token.text.lower())
                adv_i.append(token.i)
            if token.pos_ == 'NOUN':
                target.append(token.text.lower())
                target_i.append(token.i)
            if token.pos_ == 'ADJ':
                descriptive_term.append(token.text.lower())
                descriptive_term_i.append(token.i)
            if token.pos_ == 'ADJ':
                prepend = {}
                for child in token.children:
                    if remove_punctuation_without_gap(child.text.lower()) in sw:
                        continue
                    if child.pos_ == 'ADJ' or child.pos_ == 'ADV':
                        prepend[child.text.lower()] = child.i
                        if child.text.lower() in descriptive_term:
                            ind = descriptive_term.index(child.text.lower())
                            descriptive_term.pop(ind)
                            descriptive_term_i.pop(ind)
                        if token.text.lower() in descriptive_term:
                            ind = descriptive_term.index(token.text.lower())
                            descriptive_term.pop(ind)
                            descriptive_term_i.pop(ind)
                if len(prepend) > 0:
                    prepend[token.text.lower()] = token.i
                    prepend_sorted = sorted(prepend.items(), key=lambda x: x[1]) 
                    text_to_add = ''
                    temp = []
                    for i in range(len(prepend_sorted)-1):
                        text_to_add += prepend_sorted[i][0] + " "
                        temp.append(prepend_sorted[i][1])
                    text_to_add += prepend_sorted[len(prepend_sorted)-1][0]
                    temp.append(prepend_sorted[len(prepend_sorted)-1][1])
                    descriptive_term.append(text_to_add)
                    descriptive_term_i.append(temp)
        to_del = []
        for i in range(len(adv_i)):
            now = adv_i[i]
            for j in range(len(descriptive_term_i)):
                test = descriptive_term_i[j]
                if type(test) == list:
                    for k in range(len(test)):
                        if now == test[k]:
                            to_del.append(now)
                else:
                    if now == test:
                        to_del.append(now)
        for i in range(len(to_del)):
            ind = adv_i.index(to_del[i])
            adv.pop(ind)
            adv_i.pop(ind)
        
        for i in range(len(descriptive_term_i)):
            test = descriptive_term_i[i]
            if type(test) == list:
                descriptive_term_i[i] = np.mean(test)
                
        for i in range(len(negation_i)):
            test = negation_i[i]
            if type(test) == list:
                negation_i[i] = np.mean(test)
                
        for i in range(len(adv_i)):
            test = adv_i[i]
            if type(test) == list:
                adv_i[i] = np.mean(test)
                
        for i in range(len(negation_i)):
            min_dis = 1000
            now = -1
            typ = -1
            for j in range(len(descriptive_term_i)):
                if descriptive_term_i[j] > negation_i[i] and descriptive_term_i[j] - negation_i[i] < min_dis:
                    min_dis = descriptive_term_i[j] - negation_i[i]
                    now = j
                    typ = 1
            for j in range(len(adv_i)):
                if adv_i[j] > negation_i[i] and adv_i[j] - negation_i[i] < min_dis:
                    min_dis = adv_i[j] - negation_i[i]
                    now = j
                    typ = 2
            if now == -1:
                for j in range(len(descriptive_term_i)):
                    if abs(descriptive_term_i[j] - negation_i[i]) < min_dis:
                        min_dis = descriptive_term_i[j] - negation_i[i]
                        now = j
                        typ = 1
                for j in range(len(adv_i)):
                    if abs(adv_i[j] - negation_i[i]) < min_dis:
                        min_dis = adv_i[j] - negation_i[i]
                        now = j
                        typ = 2
            if typ == 1:
                descriptive_term[now] = negation[i] + ' ' + descriptive_term[now]
            if typ == 2:
                adv[now] = negation[i] + ' ' + adv[now]
                    
        aspects.append({'sentence':sentence,'aspect': target,
                        'description': descriptive_term,'adv':adv,
                        'negation':negation,'aspect_i': target_i,
                        'description_i': descriptive_term_i,'adv_i':adv_i,
                        'negation_i':negation_i,'sep':sep,'sep_i':sep_i})
#     print_list(aspects)
    return aspects

In [16]:
def combine_dictionary(a,b):
    final = {}
    if len(a) == 0:
        return b
    elif len(b) == 0:
        return a
    for itm in a:
        final[itm] = [a[itm]]
    for itm in b:
        if itm in final:
            now = final[itm]
            now.append(b[itm])
        else:
            final[itm] = [b[itm]]
    for itm in final:
        now = final[itm]
        score = np.mean(now)
        final[itm] = score
    return final

In [17]:
def aspect_relation1(dict_now):
    ans = {}
    return ans

def aspect_relation2(dict_now,column):
    ans = {}
    target = dict_now['aspect']
    sen = dict_now['sentence']
    if column == 'POSITIVE':
        for i in range(len(target)):
            ans[target[i]] = 1
    elif column == 'NEGATIVE':
        for i in range(len(target)):
            ans[target[i]] = -1
    else:
        s1 = bert_distilbert(sen)[0]
        r1 = 1000
        if s1['label'] == 'POSITIVE':
            r1 = s1['score']
        elif s1['label'] == 'NEGATIVE':
            r1 = -s1['score']
            
        s2 = roberta_english(sen)[0]
        r2 = 1000
        if s2['label'] == 'POSITIVE':
            r2 = s2['score']
        elif s2['label'] == 'NEGATIVE':
            r2 = -s2['score']
            
        score = round((r1+r2)/2,5)
        for i in range(len(target)):
            ans[target[i]] = score
    return ans

def aspect_relation3(dict_now):
    ans = {}
    target = dict_now['aspect']
    sen = dict_now['sentence']

    s1 = bert_distilbert(sen)[0]
    r1 = 1000
    if s1['label'] == 'POSITIVE':
        r1 = s1['score']
    elif s1['label'] == 'NEGATIVE':
        r1 = -s1['score']

    s2 = roberta_english(sen)[0]
    r2 = 1000
    if s2['label'] == 'POSITIVE':
        r2 = s2['score']
    elif s2['label'] == 'NEGATIVE':
        r2 = -s2['score']

    score = round((r1+r2)/2,5)
    ans[target[0]] = score
    return ans

def aspect_relation4(dict_now):
    ans = {}
    target = dict_now['aspect']
    sen = dict_now['sentence']

    s1 = bert_distilbert(sen)[0]
    r1 = 1000
    if s1['label'] == 'POSITIVE':
        r1 = s1['score']
    elif s1['label'] == 'NEGATIVE':
        r1 = -s1['score']

    s2 = roberta_english(sen)[0]
    r2 = 1000
    if s2['label'] == 'POSITIVE':
        r2 = s2['score']
    elif s2['label'] == 'NEGATIVE':
        r2 = -s2['score']

    score = round((r1+r2)/2,5)
    for i in range(len(target)):
        ans[target[i]] = score
    return ans

def aspect_relation5(dict_now):
    ans = {}
    target = dict_now['aspect']
    sen = dict_now['sentence']

    s1 = bert_distilbert(sen)[0]
    r1 = 1000
    if s1['label'] == 'POSITIVE':
        r1 = s1['score']
    elif s1['label'] == 'NEGATIVE':
        r1 = -s1['score']

    s2 = roberta_english(sen)[0]
    r2 = 1000
    if s2['label'] == 'POSITIVE':
        r2 = s2['score']
    elif s2['label'] == 'NEGATIVE':
        r2 = -s2['score']

    score = round((r1+r2)/2,5)
    ans[target[0]] = score
    return ans

def aspect_relation6(dict_now):
    ans = {}
    target = dict_now['aspect']
    target_i = dict_now['aspect_i']
    words1 = dict_now['description']
    words2 = dict_now['adv']
    wordsi_1 = dict_now['description_i']
    wordsi_2 = dict_now['adv_i']
    wordsf_1 = []
    wordsf_2 = []
    for i in range(len(wordsi_1)):
        now = wordsi_1[i]
        if type(now) == list:
            wordsf_1.append(np.mean(now))
        else:
            wordsf_1.append(now)
            
    for i in range(len(wordsi_2)):
        now = wordsi_2[i]
        if type(now) == list:
            wordsf_2.append(np.mean(now))
        else:
            wordsf_2.append(now)
    X = target
    XX = target_i
    Y = words1 + words2
    YY = wordsf_1 + wordsf_2
    
    y = copy.deepcopy(Y)
    yy = copy.deepcopy(YY)
    ans_list = []
    ind_list = []
    for i in range(len(XX)):
        min_dis = 1000
        now = 0
        for j in range(len(YY)):
            if abs(YY[j]-XX[i]) < min_dis:
                min_dis = abs(YY[j]-XX[i])
                now = j
        ans_list.append(Y[now])
        ind_list.append(now)
    ind_list = np.sort(list(set(ind_list)))[::-1].tolist()
    
    for i in range(len(ind_list)):
        index = ind_list[i]
        y.pop(index)
        yy.pop(index)
    
    relation = []
    for i in range(len(ans_list)):
        relation.append([target[i],ans_list[i]])
    
    for i in range(len(yy)):
        min_dis = 1000
        now = 0
        for j in range(len(XX)):
            if abs(yy[i]-XX[j]) < min_dis:
                min_dis = abs(yy[i]-XX[j])
                now = j
        relation.append([X[now],y[i]])
        
    
    for i in range(len(target)):
        ans[target[i]] = []
    for i in range(len(relation)):
        word = relation[i][0]
        sen = relation[i][1]
        
        s1 = bert_distilbert(sen)[0]
        r1 = 1000
        if s1['label'] == 'POSITIVE':
            r1 = s1['score']
        elif s1['label'] == 'NEGATIVE':
            r1 = -s1['score']

        s2 = roberta_english(sen)[0]
        r2 = 1000
        if s2['label'] == 'POSITIVE':
            r2 = s2['score']
        elif s2['label'] == 'NEGATIVE':
            r2 = -s2['score']

        score = round((r1+r2)/2,5)
        
        list_now = ans[word]
        list_now.append(score)
        ans[word] = list_now
    
    for itm in ans:
        value = ans[itm]
        value_mean = np.mean(value)
        ans[itm] = value_mean
    
    return ans

In [18]:
def type_detection(ab,column):
    rows = len(ab)
    final = {}
    for i in range(rows):
        dict_now = ab[i]
        NC = len(dict_now['aspect'])
        DC = len(dict_now['description']) + len(dict_now['adv'])
        if NC == 0:
#             print('Type 1')
            ans = aspect_relation1(dict_now)
            final = combine_dictionary(ans,final)
        elif DC == 0:
#             print('Type 2')
            ans = aspect_relation2(dict_now,column)
            final = combine_dictionary(ans,final)
        elif NC == 1 and DC == 1:
#             print('Type 3')
            ans = aspect_relation3(dict_now)
            final = combine_dictionary(ans,final)
        elif NC > 1 and DC == 1:
#             print('Type 4')
            ans = aspect_relation4(dict_now)
            final = combine_dictionary(ans,final)
        elif NC == 1 and DC > 1:
#             print('Type 5')
            ans = aspect_relation5(dict_now)
            final = combine_dictionary(ans,final)
        elif NC > 1 and DC > 1:
#             print('Type 6')
            ans = aspect_relation6(dict_now)
            final = combine_dictionary(ans,final)
        else:
            print('Type X')
    return final

In [19]:
helper = pd.read_csv('aspect_dictionary.csv')
print(helper.shape)
helper['Phrase'] = helper['Phrase'].astype(str)
helper['Class'] = helper['Class'].astype(str)
keywords = list(set(helper.loc[:,'Class'].to_list()))
classes_dict = {}
for i in range(len(helper)):
    add = lemma(remove_first_end_spaces(helper.loc[i,'Phrase']))
    add = remove_punctuation_without_full(add)
    add = remove_first_end_spaces(add)
    classes_dict[add] = helper.loc[i,'Class']

(1345, 2)


In [20]:
import gensim.downloader as api
from scipy import spatial
glove_model = api.load('glove-twitter-25')

def find_aspect(aspect,keywords):
    sim = []
    sample_glove_embedding1=glove_model[aspect.lower()]
    for i in range(len(keywords)):
        sample_glove_embedding2=glove_model[keywords[i].lower()]
        res = 1 - spatial.distance.cosine(sample_glove_embedding1, sample_glove_embedding2)
        sim.append(res)
    x = np.argmax(sim)
    return keywords[x]

In [21]:
def find_class(aspect, keywords, classes_dict):
    aspect = lemma(remove_first_end_spaces(aspect))
    aspect = remove_punctuation_without_full(aspect)
    aspect = remove_first_end_spaces(aspect)
    if aspect in classes_dict:
        if classes_dict[aspect] == 'dump':
            ret = -1
            return ret
        return classes_dict[aspect]
    try:
        ret = find_aspect(aspect, keywords)
    except:
        ret = -1
    return ret

In [22]:
def collate_classes(dict_now,keywords,classes_dict):
    ans = {}
    for itm in dict_now:
        class_add = find_class(itm,keywords,classes_dict)
        if class_add == -1:
            continue
        if class_add in ans:
            now = ans[class_add]
            now.append(dict_now[itm])
            ans[class_add] = now
        else:
            ans[class_add] = [dict_now[itm]]
    for itm in ans:
        now = ans[itm]
        score = np.mean(now)
        ans[itm] = round(score,5)
    return ans

In [23]:
def ensemble_sentiment(text):
    snip = text
    now = bert_distilbert(snip)[0]
    x = 0
    y = 0
    if now['label'] == 'POSITIVE':
        x += now['score']
    else:
        x -= now['score']
    temp = roberta_english(snip)[0]
    if now['label'] == 'POSITIVE':
        y += now['score']
    else:
        y -= now['score']
#     print("Overall Sentiment Score:",round((x+y)/2,5))
    return round((x+y)/2,5)

In [24]:
def find_aspect_class_relation(relation, td, keywords, classes_dict):
    for itm in td:
        relation[itm] = find_class(itm, keywords, classes_dict)
    return relation

In [25]:
cd.shape

(1079, 9)

In [27]:
import sys, os

# for i in range(cd.shape[0]):
for i in range(cd.shape[0]):
#     print('Run:',i)
    try:
        text = cd.loc[i,'clean_Reviewcontent']
        cd.loc[i,'Sentiment Score'] = ensemble_sentiment(text)

        pol_prop = {}
        index = i
        snip = raw1[index]
        text = snip
        text = snip.split('.')
        ab = aspect_based1(text)
        # pos_display(snip)
        # dataframe_display(snip)
        # ensemble_sentiment(snip)
        # polarity_ab(ab)
        
        relation = {}

        td = type_detection(ab,'POSITIVE')
        relation = find_aspect_class_relation(relation, td, keywords, classes_dict)
        c1 = collate_classes(td,keywords,classes_dict)

        index = i
        snip = raw2[index]
        text = snip
        text = snip.split('.')
        ab = aspect_based1(text)
        # pos_display(snip)
        # dataframe_display(snip)
        # ensemble_sentiment(snip)
        # polarity_ab(ab)

        td = type_detection(ab,'NEGATIVE')
        relation = find_aspect_class_relation(relation, td, keywords, classes_dict)
        c2 = collate_classes(td,keywords,classes_dict)

        index = i
        snip = raw3[index]
        text = snip
        text = snip.split('.')
        ab = aspect_based1(text)
        # pos_display(snip)
        # dataframe_display(snip)
        # ensemble_sentiment(snip)
        # polarity_ab(ab)

        td = type_detection(ab,'NEUTRAL')
        relation = find_aspect_class_relation(relation, td, keywords, classes_dict)
        c3 = collate_classes(td,keywords,classes_dict)

        pol_prop = combine_dictionary(pol_prop,c1)
        pol_prop = combine_dictionary(pol_prop,c2)
        pol_prop = combine_dictionary(pol_prop,c3)

#         for itm in relation:
#             if itm in classes_dict:
#                 continue
#             print(itm,'--->',relation[itm])
        cd.loc[i,'Relation'] = str(relation)
        cd.loc[i,'Properties'] = str(pol_prop)
#         print('---------------------------------')
    except Exception as e:
        continue
        print(i,e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)

Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors


In [31]:
cd.to_csv('results_sample_testing.csv')

In [32]:
cd.head()

Unnamed: 0,row,Reviewcontent,ReviewContentPos,ReviewContentNeg,ReviewContentNeu,clean_Reviewcontent,clean_ReviewContentPos,clean_ReviewContentNeg,clean_ReviewContentNeu,Sentiment Score,Relation,Properties
0,0,"The Room, hotel, menu everything was just perf...",,,"The Room, hotel, menu everything was just perf...","The Room, hotel, menu everything wa just perfe...",,,"The Room, hotel, menu everything wa just perfe...",0.99988,"{'room': 'room', 'view': 'location', 'stay': '...","{'room': 0.99924, 'location': 0.99929, 'food':..."
1,1,Excellent experience to spend time in this res...,,,Excellent experience to spend time in this res...,Excellent experience to spend time in this res...,,,Excellent experience to spend time in this res...,0.99968,"{'food': 'food', 'atmosphere': 'facilities', '...","{'food': 0.9993, 'facilities': 0.99877, 'time'..."
2,2,"It's spacious, neat & clean. Ordered misal pav...",,,"It's spacious, neat & clean. Ordered misal pav...","It s spacious, neat clean. Ordered misal pav s...",,,"It s spacious, neat clean. Ordered misal pav s...",0.99814,"{'snack': 'food', 'misal': -1, 'snacks': 'food'}",{'food': 0.4686}
3,3,Decent size rooms and the heating was very eff...,,,Decent size rooms and the heating was very eff...,Decent size room and the heating wa very effec...,,,Decent size room and the heating wa very effec...,0.9992,"{'location': 'location', 'market': 'location',...","{'location': 0.97639, 'time': 0.49284, 'facili..."
4,4,"If zero Rating is available, this deserves the...",,,"If zero Rating is available, this deserves the...","If zero Rating is available, this deserves the...",,,"If zero Rating is available, this deserves the...",-0.99741,"{'behaviour': 'service', 'staff': 'staff', 'ho...","{'service': -0.33469, 'staff': 0.49978, 'hotel..."
