# Extracting features and descriptive words

In [87]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import stanza
from preprocessing import remove_emojis, handle_contractions, handle_ellipses, handle_apostrophes,lowercase, remove_punctuation



# download English model
stanza.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 12.5MB/s]                    
2024-04-13 16:21:48 INFO: Downloaded file to /Users/shaojieee/stanza_resources/resources.json
2024-04-13 16:21:48 INFO: Downloading default packages for language: en (English) ...
2024-04-13 16:21:49 INFO: File exists: /Users/shaojieee/stanza_resources/en/default.zip
2024-04-13 16:21:52 INFO: Finished downloading models and saved to /Users/shaojieee/stanza_resources
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shaojieee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/shaojieee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shaojieee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [26]:
def aspect_sentiment_analysis(txt, stop_words, nlp):

    txt = txt.lower() # LowerCasing the given Text
    sentList = nltk.sent_tokenize(txt) # Splitting the text into sentences

    fcluster = []
    totalfeatureList = []
    finalcluster = []
    dic = {}
    try:
        for line in sentList:
            txt_list = nltk.word_tokenize(line) # Splitting up into words
            taggedList = nltk.pos_tag(txt_list) # Doing Part-of-Speech Tagging to each word

            newwordList = []
            flag = 0
            for i in range(0,len(taggedList)-1):
                if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"): # If two consecutive words are Nouns then they are joined together
                    newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                    flag=1
                else:
                    if(flag==1):
                        flag=0
                        continue
                    newwordList.append(taggedList[i][0])
                    if(i==len(taggedList)-2):
                        newwordList.append(taggedList[i+1][0])

            
            finaltxt = ' '.join(word for word in newwordList) 
            new_txt_list = nltk.word_tokenize(finaltxt)
            wordsList = [w for w in new_txt_list if not w in stop_words]
            taggedList = nltk.pos_tag(wordsList)

            doc = nlp(finaltxt) 
            # Getting the dependency relations between the words
            dep_node = []
            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])

            # Coverting it into appropriate format
            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]

            featureList = []
            categories = []
            for i in taggedList:
                if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                    featureList.append(list(i)) # For features for each sentence
                    totalfeatureList.append(list(i)) # Stores the features of all the sentences in the text
                    categories.append(i[0])

            for i in featureList:
                filist = []
                for j in dep_node:
                    if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                        if(j[0]==i[0]):
                            filist.append(j[1])
                        else:
                            filist.append(j[0])
                fcluster.append([i[0], filist])
                
        for i in totalfeatureList:
            dic[i[0]] = i[1]
        
        for i in fcluster:
            if(dic[i[0]]=="NN"):
                finalcluster.append(i)
            
        return(finalcluster)
    except:
        return []

nlp = stanza.Pipeline()
stop_words = set(stopwords.words('english'))

2024-04-12 20:36:10 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 26.4MB/s]                    
2024-04-12 20:36:10 INFO: Downloaded file to /Users/shaojieee/stanza_resources/resources.json
2024-04-12 20:36:11 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2024-04-12 20:36:11 

In [27]:
def preprocess_text(text):
    text = lowercase(text)
    text = remove_emojis(text)
    text = handle_contractions(text)
    text = handle_ellipses(text)
    text = handle_apostrophes(text)
    return text

In [28]:
txt = "Service shocking had to ask for 2 beers twice then got presented with astronomical bill I.e 30 dollars for 2 small beers won't be back soon."
txt = preprocess_text(txt)
aspect_sentiment_analysis(txt, stop_words, nlp)

[['service', ['shocking']], ['bill', ['astronomical']]]

In [29]:
data = pd.read_excel('./eval.xls')
data['Review_Clean'] = data['Review'].apply(preprocess_text)

In [31]:
data['absa'] = data['Review_Clean'].apply(lambda x:aspect_sentiment_analysis(x, stop_words, nlp))

In [86]:
data.sample().values

array([['Food was hands down beating expectations. I knew it would be amazing for a Michelin star restaurant but I actually thought it was better quality than just a 1 star. Also appreciated the large diversity in courses and in between tasters for …',
        'POSITIVE',
        'food was hands down beating expectations. i knew it would be amazing for a michelin star restaurant but i actually thought it was better quality than just a 1 star. also appreciated the large diversity in courses and in between tasters for  ',
        list([['food', ['beating']], ['michelinstar', ['starrestaurant']], ['starrestaurant', ['michelinstar']], ['quality', ['it', 'better']], ['star', ['just']], ['diversity', ['large', 'appreciated']]])]],
      dtype=object)

In [66]:
data.to_csv('./results/absa_results.csv')

In [None]:
# array([['Nice rooftop terrace with a view, but that’s it. Very poor service and overpriced food for low quality. Overall disappointing experience.',
#         'NEGATIVE',
#         'nice rooftop terrace with a view, but that s it. very poor service and overpriced food for low quality. overall disappointing experience.',
#         list([['rooftopterrace', ['nice']], ['view', []], ['service', ['poor']], ['food', ['overpriced']], ['quality', ['low']], ['experience', ['overall', 'disappointing']]])]],
#       dtype=object)

In [None]:
# array([['Incredible food and amazing service!\n\nWe came for the chili crab and it was delicious! The soups are also very good. …',
#         'POSITIVE',
#         'incredible food and amazing service!\n\nwe came for the chili crab and it was delicious! the soups are also very good.  ',
#         list([['food', ['incredible']], ['service', ['amazing']]])]],
#       dtype=object)

# Classifying aspect into polarity

In [159]:
data = pd.read_csv('./results/absa_results.csv', converters={'absa': pd.eval}, index_col=0)

In [160]:
data.head()

Unnamed: 0,Review,Label,Review_Clean,absa
0,Japanese style Italian restaurant. Ikura Pasta...,POSITIVE,japanese style italian restaurant. ikura pasta...,"[[style, [restaurant]], [restaurant, [japanese..."
1,Definately visit this love'd their masala dosa,POSITIVE,definately visit this love'd their masala dosa,"[[visit, [definately, love]], [love, [visit]],..."
2,I had a great experience at Meadesmoore! We or...,POSITIVE,i had a great experience at meadesmoore! we or...,"[[experience, [great, had]], [meadesmoore, []]..."
3,Food was good! Service level was slightly belo...,POSITIVE,food was good! service level was slightly belo...,"[[food, [good]], [good, [food]], [servicelevel..."
4,"Food, presentation and service were all top no...",POSITIVE,"food, presentation and service were all top no...",[]


In [161]:
from nltk.corpus import sentiwordnet as swn
nltk.download('sentiwordnet')

def calculate_sentiment_score(text):
    tokens = nltk.word_tokenize(text)
    positive_score = 0
    negative_score = 0
    for token in tokens:
        synsets = list(swn.senti_synsets(token))
        if synsets:
            pos_score = sum(s.pos_score() for s in synsets) / len(synsets)
            neg_score = sum(s.neg_score() for s in synsets) / len(synsets)
            positive_score += pos_score
            negative_score += neg_score
    if positive_score > negative_score:
        return 'POSITIVE'
    elif positive_score < negative_score:
        return 'NEGATIVE'
    else:
        return 'NEUTRAL'

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/shaojieee/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [162]:
def convert_to_sentiment(absa):
    polarities = []
    for x in absa:
        feature, desc = x
        if len(desc)>0:
            polarity = calculate_sentiment_score(' '.join(desc))
            polarities.append([feature, polarity])
    
    return polarities

In [163]:
data['polarity'] = data['absa'].apply(convert_to_sentiment)

In [164]:
data.to_csv('./results/absa_results.csv')