<a href='https://ai.meng.duke.edu'><img align="left" style="padding-top:10px;" src="https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png" alt="Duke AIPI logo"></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from spacy import displacy
import texthero as hero

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
from tqdm import tqdm
import string

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
# spaCy pipeline used only for lemmatization in `tokenize`.
# The English lemmatizer is rule-based and needs part-of-speech tags, so the tagger
# (with the tok2vec layer it listens to) and the attribute_ruler must stay enabled;
# with the lemmatizer alone every lemma falls back to the lower-cased token text.
# The tokenizer always runs and is not a pipeline component, so it is not listed.
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger", "attribute_ruler", "lemmatizer"])

import os
import openai

import stanza
# Build the English stanza pipeline; it is called later by the
# extract_adjectives_* helpers to parse the review text.
# Uncomment the download line if the English models are not installed yet.
#stanza.download("en")
nlp_stanza = stanza.Pipeline("en")

## Read data

In [None]:
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz

In [None]:
# Load the raw hotel reviews from the project's data directory.
data = pd.read_csv(filepath_or_buffer="../data/tripadvisor_hotel_reviews.csv")

In [None]:
# Number of reviews per rating value.
data["Rating"].value_counts()

## Convert ratings to Sentiment

In [None]:
def rating_to_sentiment(rating):
    """Map a numeric rating to a binary sentiment label.

    Args:
        rating: Numeric rating of a review.

    Returns:
        1 (positive) when the rating is above 4, 0 (negative) when it is
        below 2, and None for everything in between.
    """
    if rating > 4:
        return 1  # pos
    if rating < 2:
        return 0  # neg
    # Ratings from 2 to 4 get no label.
    return None

# Label every review, then discard rows that received no label (or contain any
# other missing value) and renumber the remaining rows from zero.
data['Sentiment'] = data['Rating'].apply(rating_to_sentiment)
data = data.dropna().reset_index(drop=True)
data.head()

In [None]:
# Class balance of the sentiment labels.
data["Sentiment"].value_counts()

## Clean data

In [None]:
def tokenize(sentence):
    """Lemmatize a piece of text with the module-level spaCy pipeline ``nlp``.

    Each token's lemma is lower-cased and stripped of surrounding whitespace,
    and the lemmas are joined back into one space-separated string. Stop words
    and punctuation are NOT filtered here.

    Args:
        sentence: Text to process.

    Returns:
        str: The lower-cased lemmas joined by single spaces.
    """
    return " ".join(token.lemma_.lower().strip() for token in nlp(sentence))

In [None]:
# Run texthero's `clean` over the raw review text and keep the result in a new column.
data['cleaned_reviews'] = hero.clean(data['Review'])
data.head()

In [None]:
# Register `progress_apply` on pandas objects, then lemmatize every cleaned review
# in place of the cleaned text (with a progress bar).
tqdm.pandas()
data['cleaned_reviews'] = data['cleaned_reviews'].progress_apply(tokenize)
data

In [None]:
# Replace the in-memory frame with a pickled frame (presumably the cleaned and
# lemmatized reviews saved by an earlier run).
# NOTE(review): build_features reads a `cleaned_Review` column, whereas the cells
# above write `cleaned_reviews` — confirm the pickle carries `cleaned_Review`.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
data = pd.read_pickle("../data/clean_lemmatized_data.pkl")
data.head()

## Create Numerical Features

In [None]:
## n-gram range passed to build_features as `ngram_range`
n_gram_range = (1, 1)

In [None]:
def build_features(train_data, test_data, ngram_range, method='count',
                   text_col='cleaned_Review', min_df=800):
    """Vectorize the review text of the train and test splits.

    The vectorizer is fitted on the training split only and then applied,
    unchanged, to the test split.

    Args:
        train_data: DataFrame holding the training reviews.
        test_data: DataFrame holding the test reviews.
        ngram_range: Value forwarded to the vectorizer's ``ngram_range``.
        method: ``'tfidf'`` selects ``TfidfVectorizer``; any other value
            falls back to ``CountVectorizer`` (word counts).
        text_col: Name of the column that holds the text to vectorize.
        min_df: Value forwarded to the vectorizer's ``min_df``.

    Returns:
        tuple: ``(X_train, X_test, vec)`` — the transformed training data,
        the transformed test data and the fitted vectorizer.
    """
    # Both methods share the same fit/transform steps; only the class differs.
    vectorizer_cls = TfidfVectorizer if method == 'tfidf' else CountVectorizer
    vec = vectorizer_cls(ngram_range=ngram_range, min_df=min_df)

    X_train = vec.fit_transform(train_data[text_col])
    X_test = vec.transform(test_data[text_col])

    return X_train, X_test, vec

In [None]:
# 80/20 split stratified on the sentiment label: everything except the Sentiment
# and Rating columns becomes the feature frames, the labels become arrays.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    data.drop(columns=["Sentiment", "Rating"]),
    data["Sentiment"].values,
    test_size=0.2,
    random_state=0,
    stratify=data["Sentiment"].values,
)

In [None]:
# Renumber the training rows from zero (the old index is discarded).
X_train_df = X_train_df.reset_index(drop=True)
X_train_df

In [None]:
# Renumber the test rows from zero so `instance` below can be used as a row label.
X_test_df = X_test_df.reset_index(drop=True)
X_test_df

In [None]:
# Word-count features for both splits, plus the fitted vectorizer.
X_train_vec, X_test_vec, vec = build_features(
    train_data=X_train_df,
    test_data=X_test_df,
    ngram_range=n_gram_range,
    method='count',
)

In [None]:
# Shapes of the train and test feature matrices, printed side by side.
print(*(matrix.shape for matrix in (X_train_vec, X_test_vec)))

In [None]:
# Random forest on the vectorized reviews. random_state is pinned (same seed as
# the train/test split) so a fresh Restart & Run All rebuilds the same model.
# NOTE(review): class_weight up-weights class 1 (positive) six-fold — confirm that
# class 1 is really the minority class; if it is not, the weights are inverted.
clf = RandomForestClassifier(class_weight={0: 1, 1: 6}, random_state=0)
clf.fit(X_train_vec, y_train)

In [None]:
# Predict the held-out reviews and report plain accuracy.
predictions = clf.predict(X_test_vec)

print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=predictions))

In [None]:
# Class-probability estimates for the test reviews; the cells below read
# column 0 as the negative score and column 1 as the positive score.
pred_prob = clf.predict_proba(X_test_vec)
pred_prob

## SHAP explanation

In [None]:
import shap

In [None]:
# TreeExplainer wrapping the fitted random forest; used below to compute SHAP values.
explainer = shap.TreeExplainer(model=clf)

In [None]:
# Row of the test split that the following cells explain.
instance = 0

In [None]:
# SHAP values for the single test review being explained. Only that one sparse
# row is converted to a dense (1, n_features) array, instead of densifying the
# whole test matrix just to read one row from it.
# NOTE(review): later cells index the result as shap_values[class]; confirm the
# installed shap version returns one array per class for this classifier.
shap_values = explainer.shap_values(
    X_test_vec[instance].toarray().reshape(1, -1),
    check_additivity=False,
)

In [None]:
# Initialise shap's JavaScript support before drawing the force plot below.
shap.initjs()

In [None]:
# Original (uncleaned) text of the review being explained.
print(X_test_df.loc[instance, "Review"])

In [None]:
# Compare the true label with the model's decision and class scores for this review.
print("Actual Sentiment: ", y_test[instance])
print("Predicted Sentiment: ", predictions[instance])
print("Positive Sentiment Score: ", pred_prob[instance, 1])
print("Negative Sentiment Score: ", pred_prob[instance, 0])

# Predicted class as a plain int; used below to select that class's SHAP values.
pred_class = int(predictions[instance])

In [None]:
## Position of the feature with the largest SHAP value for the predicted class
index = np.argmax(shap_values[pred_class])
print(index)

In [None]:
## Vocabulary term with the largest SHAP value
## (despite the variable name, nothing here restricts the term to adjectives)
top_adjective = vec.get_feature_names_out()[index]
print(top_adjective)

In [None]:
# The four vocabulary terms with the largest SHAP values for the predicted class.
# argsort orders ascending, so they are the last four positions of the sorted row.
candidates = list(vec.get_feature_names_out())
keywords = [candidates[feature_idx] for feature_idx in np.argsort(shap_values[pred_class])[0][-4:]]
keywords

In [None]:
# Force plot of the predicted class's SHAP values for the explained review.
shap.force_plot(
    base_value=explainer.expected_value[pred_class],
    shap_values=shap_values[pred_class],
    feature_names=list(vec.get_feature_names_out()),
)

## Dependency parsing

In [None]:
# Read the OpenAI key from the environment instead of embedding it in the notebook.
# Falls back to the previous empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def grammar_correction(text):
    """Ask the OpenAI chat model to correct the grammar of ``text``.

    Sends a single user message to ``gpt-3.5-turbo`` and returns the content
    of the first choice as one line.

    Args:
        text: The text to correct; it is embedded in the prompt between
            single quotes.

    Returns:
        str: The model's reply with every run of whitespace (including line
        breaks) collapsed to a single space.
    """
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user",
             "content": f"Can you correct the grammar of the following sentence? '{text}'"}
        ]
    )
    content = dict(completion.choices[0].message)["content"]
    # Join lines with a space: deleting the line breaks outright would glue the
    # last word of one line to the first word of the next.
    return " ".join(content.split())

# Grammar-corrected text of the review being explained; parsed by stanza below.
review_text = grammar_correction(X_test_df.loc[instance, "Review"])
print(review_text)

In [None]:
# process the sentence with stanza
def extract_adjectives_with_nouns(text):
    """Pair each adjective with the nouns that depend on it.

    Parses ``text`` with the module-level stanza pipeline ``nlp_stanza`` and,
    for every word tagged ``ADJ``, collects the words of the same sentence
    whose head is that adjective and whose tag is ``NOUN``.

    Args:
        text: The text to parse.

    Returns:
        list[tuple[str, str]]: ``(adjective_text, noun_text)`` pairs in the
        order they are found.
    """
    # Parse the argument itself, not the notebook-level `review_text`.
    doc = nlp_stanza(text)
    adj_noun_pairs = []
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.upos != "ADJ":
                continue
            for child in sentence.words:
                # `head` holds the id of a word's governor within the sentence.
                if child.head == word.id and child.upos == "NOUN":
                    adj_noun_pairs.append((word.text, child.text))
    return adj_noun_pairs

def extract_adjectives_with_dependencies(text):
    """Map each adjective to words related to it through its parent word.

    Parses ``text`` with the module-level stanza pipeline ``nlp_stanza``. For
    every word tagged ``ADJ`` that has a parent, and whose parent is not the
    sentence root, the result lists:

    * the parent word,
    * the parent's other children attached via ``amod``, ``nsubj`` or
      ``advmod``,
    * the children of those children (the adjective's own included) attached
      via the same three relations.

    Args:
        text: The text to parse.

    Returns:
        dict[str, list[str]]: Adjective text mapped to the de-duplicated
        related word texts, in discovery order. An adjective that occurs
        several times accumulates the words from all of its occurrences.
    """
    related_deprels = ('amod', 'nsubj', 'advmod')
    doc = nlp_stanza(text)
    adjective_dependencies = {}

    for sent in doc.sentences:
        # Group the words by the id of their head so children are looked up directly.
        children_of = {}
        for word in sent.words:
            children_of.setdefault(word.head, []).append(word)

        for word in sent.words:
            if word.upos != 'ADJ':
                continue
            # Word ids and heads are 1-based and head == 0 marks the root; without
            # this guard `words[head - 1]` would wrap around to the last word.
            if word.head == 0:
                continue
            parent_word = sent.words[word.head - 1]
            if parent_word.deprel == 'root':  # parent is the root: not reported
                continue

            dependencies = [parent_word.text]
            for child in children_of.get(parent_word.id, []):
                if child.deprel not in related_deprels:
                    continue
                # The adjective is itself a child of its parent; it is traversed
                # for its own modifiers but not listed as related to itself.
                if child.id != word.id and child.text != parent_word.text:
                    dependencies.append(child.text)
                for grandchild in children_of.get(child.id, []):
                    if grandchild.deprel in related_deprels and grandchild.text != child.text:
                        dependencies.append(grandchild.text)

            # Merge with earlier occurrences of the same adjective; dict.fromkeys
            # removes duplicates while keeping a stable order.
            merged = adjective_dependencies.get(word.text, []) + dependencies
            adjective_dependencies[word.text] = list(dict.fromkeys(merged))

    return adjective_dependencies

adj_noun_pairs = extract_adjectives_with_nouns(review_text)
adjective_dependencies = extract_adjectives_with_dependencies(review_text)

# Split the (adjective, related word) pairs by whether the adjective is one of the
# top SHAP keywords. The keywords come from the vectorizer vocabulary built on the
# cleaned text (which `tokenize` lower-cases), so adjectives are compared lower-cased.
top_adjective_shap = []
remaining_adjectives = []
keyword_set = {str(keyword).lower() for keyword in keywords}

for adj, noun in adj_noun_pairs:
    if adj.lower() in keyword_set:
        top_adjective_shap.append((adj, noun))
    else:
        remaining_adjectives.append((adj, noun))


for adjective, dependencies in adjective_dependencies.items():
    for dependency in dependencies:
        # Skip a bare sentence-final period picked up as a "dependency".
        if dependency != ".":
            # Pair the adjective with this dependency (not with a noun left over
            # from the loop above).
            if adjective.lower() in keyword_set:
                top_adjective_shap.append((adjective, dependency))
            else:
                remaining_adjectives.append((adjective, dependency))

# Remove duplicate pairs while keeping a stable, reproducible order.
top_adjective_shap = list(dict.fromkeys(top_adjective_shap))
remaining_adjectives = list(dict.fromkeys(remaining_adjectives))

print("\n")
print("Top Adjectives and Nouns using Shap Values: \n")
for adj, noun in top_adjective_shap:
    print(f"{adj} -> {noun}")

print("\nOther Adjectives and Nouns using Dependency Parsing: \n")
for adj, noun in remaining_adjectives:
    print(f"{adj} -> {noun}")

## Note on stanza indexing: `word.id` and `word.head` are 1-based within a sentence and
## `head == 0` marks the root, so `sent.words[word.head - 1]` is only meaningful for
## non-root words (for a root word it wraps around to the last word, usually ".").

In [None]:
## Dependency parsing using Spacy
# #nlp = spacy.load('en_core_web_sm')
# txt = X_test_df.Review[instance]
# doc = nlp(txt)
# doc

In [None]:
# chunks = []

# for chunk in doc.noun_chunks:
#     out = {}
#     noun = chunk.root
#     if noun.pos_ != 'NOUN':
#         continue
#     out['noun'] = noun
#     for tok in chunk:
#         if tok != noun:
#             out[tok.pos_] = tok
#     chunks.append(out)
    
# chunks = [chunk for chunk in chunks if 'ADJ' in chunk.keys()]
    
# print(chunks)