<a href='https://ai.meng.duke.edu'><img align="left" style="padding-top:10px;" src="https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png"></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from spacy import displacy

In [None]:
# Read the hotel review CSV and pull the review strings out into a plain Python list
data = pd.read_csv("../data/tripadvisor_hotel_reviews.csv")
review_texts = data["Review"].tolist()

In [None]:
# Extract candidate 1-grams and 2-grams (English stop words removed)
n_gram_range = (1, 2)
vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stopwords.words('english'))
vectorizer.fit(review_texts)
candidates = vectorizer.get_feature_names_out()

# Get noun phrases and nouns from reviews
nlp = spacy.load('en_core_web_sm')
all_nouns = set()

for doc in review_texts:
    doc_processed = nlp(doc)
    # Add noun chunks. `update` inserts every chunk string; `add` would insert
    # the generator object itself, so no chunk could ever match a candidate.
    all_nouns.update(chunk.text.strip().lower() for chunk in doc_processed.noun_chunks)
    # Add single nouns, lowercased like the chunks so that they compare equal
    # to the candidates (CountVectorizer lowercases its features by default).
    all_nouns.update(token.text.lower() for token in doc_processed if token.pos_ == "NOUN")

# Filter candidate topics to only those in the nouns set
candidates = [c for c in candidates if c in all_nouns]

## Embed candidates and documents and find matching topics

In [None]:
def model_topics(documents, candidates, num_topics, model_name='multi-qa-MiniLM-L6-cos-v1'):
    """Select the candidate topics most similar to each document.

    Documents and candidates are embedded with a sentence-transformers model
    and compared by cosine similarity.

    Args:
        documents: Iterable of document strings.
        candidates: Iterable of candidate topic strings.
        num_topics: Number of candidates to keep per document.
        model_name: Name of the sentence-transformers model to load
            (for example 'all-MiniLM-L6-v2' as an alternative).

    Returns:
        A list with one entry per document. Each entry is a list of at most
        ``num_topics`` candidates ordered by increasing similarity, so the
        best match comes last. Entries are empty when ``num_topics`` is not
        positive or when there are no candidates.
    """
    documents = list(documents)
    candidates = list(candidates)

    if not documents:
        return []
    # Short-circuit before loading the model: `[-0:]` would slice the whole
    # score array (returning every candidate), and an empty candidate list
    # cannot be scored at all.
    if num_topics <= 0 or not candidates:
        return [[] for _ in documents]

    model = SentenceTransformer(model_name)
    # Encode all documents in a single call so the model can batch them
    doc_embeddings = model.encode(documents)
    # Encode the candidate topics
    candidate_embeddings = model.encode(candidates)

    # Score one document at a time to avoid materialising the full
    # documents x candidates similarity matrix.
    review_keywords = []
    for embedding in doc_embeddings:
        scores = cosine_similarity(embedding.reshape(1, -1), candidate_embeddings)
        # argsort is ascending, so the tail holds the highest-scoring candidates
        top_indices = scores.argsort()[0][-num_topics:]
        review_keywords.append([candidates[index] for index in top_indices])

    return review_keywords

In [None]:
# Keep the five best-matching candidate topics for every review
topics = model_topics(documents=review_texts, candidates=candidates, num_topics=5)

# Store each review's keywords in a new column of the review table
data["Topic Keywords"] = topics

In [None]:
# Show the first ten reviews, each followed by its topic keywords and blank lines
for position, keywords in enumerate(topics[:10]):
    print(review_texts[position], f'Topic keywords: {keywords}', "\n", sep="\n")

In [None]:
# Write the review table (including the "Topic Keywords" column) to CSV without the row index
data.to_csv("topics_from_transformer.csv", index=False)

## Dependency parsing

In [None]:
# Load the small English spaCy pipeline and parse one review (the second in the list)
nlp = spacy.load('en_core_web_sm')
txt = review_texts[1]
doc = nlp(txt)
# Bare expression: displayed as the cell output when run in a notebook
doc

In [None]:
# For every noun chunk whose root is a noun, record the root under 'noun' and
# the chunk's remaining tokens under their part-of-speech tag.
chunks = []

for chunk in doc.noun_chunks:
    noun = chunk.root
    if noun.pos_ == 'NOUN':
        # Tokens sharing a POS tag overwrite each other; the last one wins
        out = {'noun': noun, **{tok.pos_: tok for tok in chunk if tok != noun}}
        # Keep only the chunks that contain an adjective
        if 'ADJ' in out:
            chunks.append(out)

print(chunks)

## Random Forest

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Reload the hotel reviews and collect the review strings in a plain Python list
data = pd.read_csv("../data/tripadvisor_hotel_reviews.csv")
review_list = data["Review"].tolist()

In [None]:
def rating_to_sentiment(rating):
    """Map a numeric rating to a three-level sentiment code.

    Returns 2 for a rating above 3 and up to 5, 1 for a rating of exactly 3,
    and 0 for every other value.
    """
    if rating == 3:
        return 1
    if 3 < rating <= 5:
        return 2
    return 0

# Derive a sentiment code for every review from its rating
data['Sentiment'] = data['Rating'].apply(rating_to_sentiment)
# Preview the first rows, now including the new 'Sentiment' column
data.head()

In [None]:
## Create numerical features based on unigram counts (English stop words removed)
n_gram_range = (1, 1)
vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stopwords.words('english'))

# With respect to the memory issues: that is something that I hopefully will have
# fixed in the new version. However, you can reduce the sparse matrix's size quite
# a bit by setting min_df=10 or higher in the vectorizer. This will reduce the number
# of words significantly, which results in less sparsity.


# Disabled: vectorizing the whole corpus up front; the vectorizer is fitted after the split instead
#X = vectorizer.fit_transform(review_list).toarray()
# Target vector: one sentiment code per review
Y = np.array(data.Sentiment)

In [None]:
# Hold out 20% of the raw reviews for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(review_list, Y, test_size=0.2, random_state=0)

In [None]:
# Learn the vocabulary from the training reviews only, then reuse it for the test reviews.
# .toarray() converts the sparse count matrices to dense arrays.
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

In [None]:
# Seed the forest so its randomised training is repeatable between runs,
# in line with the seeded train/test split.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Predict sentiment codes for the held-out reviews
predictions = clf.predict(X_test)

# Fraction of held-out reviews whose predicted code equals the true code
print("Accuracy: ", accuracy_score(y_test, predictions))

## Shap explain

In [None]:
import shap

In [None]:
# Use a 100-row sample of the test matrix as background data instead of the
# full matrix; SHAP warns that large background sets make KernelExplainer slow.
# NOTE: despite its name, `rf_shap_values` holds the explainer object, not SHAP values.
rf_shap_values = shap.KernelExplainer(clf.predict, shap.sample(X_test, nsamples=100, random_state=0))

In [None]:
# Bare expression: displays the object bound to `rf_shap_values` (a KernelExplainer) as the cell output
rf_shap_values

In [None]:
# `rf_shap_values` is a KernelExplainer, while summary_plot needs the SHAP
# values themselves, so they are computed here for a handful of test rows.
# NOTE(review): KernelExplainer is expensive on wide bag-of-words inputs;
# adjust the row count / nsamples to the available time and memory, or use
# the TreeExplainer built later in this file, which suits tree ensembles.
summary_rows = X_test[:5]
shap.summary_plot(
    rf_shap_values.shap_values(summary_rows, nsamples=100),
    summary_rows,
    feature_names=list(vectorizer.get_feature_names_out()),
)

In [None]:
# Build a SHAP explainer specialised for tree models around the fitted random forest
explainer = shap.TreeExplainer(clf)

In [None]:
# One-row DataFrame holding the first test sample, with the vocabulary terms as column names
d = pd.DataFrame(
    X_test[0][np.newaxis, :],
    columns=list(vectorizer.get_feature_names_out()),
)
d

In [None]:
# Calculate Shap values for the single instance held in `d`
choosen_instance = d
# check_additivity=False turns off SHAP's additivity validation for this call
shap_values = explainer.shap_values(choosen_instance, check_additivity=False)
# Load SHAP's JavaScript so the interactive plot can render in the notebook
shap.initjs()
# Index 1 picks the second entry of expected_value and of shap_values.
# NOTE(review): presumably the entry for class label 1 (rating == 3); the layout of
# shap_values for multi-class models differs between SHAP versions -- confirm.
shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance)

In [None]:
# transform() expects an iterable of documents and rejects a bare string,
# so the single review is wrapped in a list.
clf.predict(vectorizer.transform([review_list[0]]).toarray())

In [None]:
# Show every term in the fitted vocabulary
vectorizer.get_feature_names_out().tolist()

In [None]:
# Number of terms in the fitted vocabulary
len(vectorizer.get_feature_names_out())

In [None]:
# Width of the feature vector produced for a single unseen sentence
vectorizer.transform(["My name is archit"]).shape[1]