<a href='https://ai.meng.duke.edu'><img align="left" style="padding-top:10px;" src="https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png"></a>

In [1]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from spacy import displacy
import texthero as hero

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the TripAdvisor hotel reviews and keep the review column as a plain Python list.
data = pd.read_csv("../data/tripadvisor_hotel_reviews.csv")
review_texts = data["Review"].tolist()

In [None]:
# Extract candidate 1-grams and 2-grams.
# CountVectorizer lowercases text by default, so the candidates are lowercase.
n_gram_range = (1, 2)
vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stopwords.words('english'))
vectorizer.fit(review_texts)
candidates = vectorizer.get_feature_names_out()

# Get noun phrases and nouns from reviews
nlp = spacy.load('en_core_web_sm')
all_nouns = set()

# nlp.pipe processes the reviews as a stream, which is the batch-friendly
# equivalent of calling nlp(text) once per review.
for doc_processed in nlp.pipe(review_texts):
    # Add noun chunks. `update` inserts every chunk string; `add` would insert
    # the generator object itself and no chunk would ever match a candidate.
    all_nouns.update(chunk.text.strip().lower() for chunk in doc_processed.noun_chunks)
    # Add single nouns, lowercased so they can match the lowercase candidates.
    all_nouns.update(token.text.lower() for token in doc_processed if token.pos_ == "NOUN")

# Filter candidate topics to only those in the nouns set
candidates = [c for c in candidates if c in all_nouns]

## Embed candidates and documents and find matching topics

In [None]:
def model_topics(documents, candidates, num_topics):
    """Pick the candidate topics most similar to each document.

    Documents and candidates are embedded with the
    'multi-qa-MiniLM-L6-cos-v1' sentence-transformer model and compared by
    cosine similarity.

    Args:
        documents: Iterable of document strings.
        candidates: Iterable of candidate topic strings.
        num_topics: Maximum number of keywords to return per document.

    Returns:
        One list of keywords per document, in input order. Within a list the
        keywords are ordered from least to most similar, so the best match is
        last. Every list is empty when `num_topics` is not positive or there
        are no candidates.
    """
    documents = list(documents)
    candidates = list(candidates)

    # A slice of `[-0:]` would return *every* candidate and a negative count
    # would slice from the wrong end, so handle those cases explicitly. This
    # also avoids loading the model when there is nothing to rank.
    if num_topics <= 0 or len(candidates) == 0:
        return [[] for _ in documents]
    if not documents:
        return []

    # 'all-MiniLM-L6-v2' is an alternative model that was tried here.
    model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    # Encode all documents in one call so the model can batch them.
    doc_embeddings = model.encode(documents)
    # Encode the candidate topics
    candidate_embeddings = model.encode(candidates)

    # Score one document at a time to keep the similarity matrix small.
    review_keywords = []
    for embedding in doc_embeddings:
        scores = cosine_similarity(embedding.reshape(1, -1), candidate_embeddings)
        top_indices = scores.argsort()[0][-num_topics:]
        review_keywords.append([candidates[index] for index in top_indices])

    return review_keywords

In [None]:
# Extract up to five topic keywords per review and store them as a new column.
topics = model_topics(review_texts, candidates, num_topics=5)

data["Topic Keywords"] = topics

In [None]:
# Show the first ten reviews next to their extracted keywords.
for review, keywords in zip(review_texts, topics[:10]):
    print(review)
    print('Topic keywords: {}'.format(keywords))
    print("\n")

In [None]:
# Write the reviews with their topic keywords to a CSV file, without the index column.
data.to_csv("topics_from_transformer.csv", index=False)

## Dependency parsing

In [None]:
nlp = spacy.load('en_core_web_sm')
# NOTE(review): this cell used to read `txt = abc`, but `abc` is only assigned
# further down (as an array of cleaned test reviews, not a single string), so
# a top-to-bottom run failed here. Assumed intent: parse one review text.
# Swap in any other string to inspect a different example.
txt = review_texts[0]
doc = nlp(txt)
doc

In [None]:
# For every noun chunk whose root is a NOUN, record the root under 'noun' and
# each other token under its POS tag. A later token with the same POS tag
# overwrites an earlier one.
chunks = []

for span in doc.noun_chunks:
    head = span.root
    if head.pos_ == 'NOUN':
        entry = {'noun': head}
        for token in span:
            if token != head:
                entry[token.pos_] = token
        chunks.append(entry)

# Keep only the chunks that contain an adjective.
chunks = [entry for entry in chunks if 'ADJ' in entry]

print(chunks)

## Random Forest

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
from tqdm import tqdm
import string

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
# Load only the components needed for lemmatization. The tokenizer always runs
# and is not a pipeline component, so "tokenizer" in `enable` had no effect.
# The English rule-based lemmatizer relies on POS tags, which come from
# tagger + attribute_ruler (the tagger in turn listens to tok2vec).
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger", "attribute_ruler", "lemmatizer"])

In [None]:
# Reload the raw reviews; this rebinds `data` for the classification section.
data = pd.read_csv("../data/tripadvisor_hotel_reviews.csv")

In [None]:
# Number of reviews per rating value.
data.Rating.value_counts()

In [None]:
def rating_to_sentiment(rating):
    """Map a rating to a binary sentiment label.

    Returns 1 (positive) for ratings above 4, 0 (negative) for ratings
    below 2, and None for every rating in between.
    """
    if rating > 4:
        return 1
    if rating < 2:
        return 0
    return None

# Label each review. Ratings that map to None become missing values in the new column.
data['Sentiment'] = data['Rating'].apply(rating_to_sentiment)
# Drop every row with a missing value in any column (this removes the unlabeled reviews).
data.dropna(inplace=True)
# Renumber the remaining rows from 0.
data.reset_index(drop=True, inplace=True)
data.head()

In [None]:
# Number of reviews per sentiment label after filtering.
data.Sentiment.value_counts()

In [None]:
def tokenize(sentence,method='spacy'):
    """Turn a sentence into a single string of space-joined tokens.

    Args:
        sentence: Text to process.
        method: 'nltk' tokenizes with NLTK and removes stop words and
            punctuation (no lemmatization). Any other value, including the
            default 'spacy', runs the module-level spaCy pipeline `nlp` and
            keeps every token's lemma, lowercased and stripped, without
            removing stop words or punctuation.

    Returns:
        The processed tokens joined by single spaces.
    """
    if method == 'nltk':
        # The file only imports from `nltk.corpus`, which does not bind the
        # name `nltk`; import it here so this branch does not raise NameError.
        import nltk

        tokens = nltk.word_tokenize(sentence, preserve_line=True)
        # `STOP_WORDS` is tested directly rather than copied into a list on
        # every call. The punctuation test is a substring check against
        # `string.punctuation`, as before.
        kept = [
            word for word in tokens
            if word not in STOP_WORDS and word not in string.punctuation
        ]
        return " ".join(kept)

    # Tokenize with spaCy and keep the lemma of every token.
    return " ".join(word.lemma_.lower().strip() for word in nlp(sentence))

# def clean(review):
    
#     review = review.lower()
#     review = re.sub('[^a-z A-Z 0-9-]+', '', review)
#     #review = " ".join([word for word in review.split() if word not in stopwords.words('english')])
    
#     return review

# tqdm.pandas()
# data['Review'] = data['Review'].progress_apply(clean)
# data.head()
# # Process the reviews
# #tqdm.pandas()
# #df_term_freq['processed_reviews'] = df_term_freq['Review'].progress_apply(lambda x: tokenize(x,method='nltk'))

In [None]:
# Run texthero's `clean` over the review column and store the result in a new column.
data['cleaned_reviews'] = data['Review'].pipe(hero.clean)

In [None]:
# Register tqdm's `progress_apply` on pandas objects so the next line shows a progress bar.
tqdm.pandas()
# Replace each cleaned review with its spaCy-processed form (see `tokenize`).
data['cleaned_reviews'] = data['cleaned_reviews'].progress_apply(lambda x: tokenize(x,method='spacy'))
data

In [None]:
## Create numerical features based on n-gram counts (unigrams only with this range)
n_gram_range = (1, 1)
#vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stopwords.words('english'), min_df=10 )

# You can reduce the sparse matrix's size quite a bit by setting min_df=10 or
# higher in the vectorizer. This reduces the number of words significantly,
# which results in less sparsity.


#X = vectorizer.fit_transform(review_list).toarray()
#X_train = vectorizer.fit_transform(X_train).toarray()
#X_test = vectorizer.transform(X_test).toarray()

# Sentiment labels as a NumPy array.
Y = np.array(data.Sentiment)

In [None]:
def build_features(train_data, test_data, ngram_range, method='count', min_df=None):
    """Vectorize train and test texts with a vectorizer fitted on the train texts.

    Args:
        train_data: Iterable of training documents; the vectorizer is fitted on these.
        test_data: Iterable of test documents; only transformed.
        ngram_range: `(min_n, max_n)` tuple passed to the vectorizer.
        method: 'tfidf' uses TfidfVectorizer; any other value (default
            'count') uses CountVectorizer.
        min_df: Minimum document frequency passed to the vectorizer. When
            None, the previous built-in values are used: 500 for 'tfidf'
            and 1000 for word counts.

    Returns:
        Tuple `(X_train, X_test, vec)` of the two feature matrices and the
        fitted vectorizer.
    """
    if method == 'tfidf':
        # Create features using TFIDF
        vec = TfidfVectorizer(ngram_range=ngram_range, min_df=500 if min_df is None else min_df)
    else:
        # Create features using word counts
        vec = CountVectorizer(ngram_range=ngram_range, min_df=1000 if min_df is None else min_df)

    # Fit on the training texts only so no test vocabulary leaks into the features.
    X_train = vec.fit_transform(train_data)
    X_test = vec.transform(test_data)

    return X_train, X_test, vec

In [None]:
# Hold out 20% of the reviews for testing, stratified on the sentiment label.
X_train, X_test, y_train, y_test = train_test_split(
    data.cleaned_reviews.values,
    data.Sentiment.values,
    test_size=0.2,
    random_state=0,
    stratify=data.Sentiment.values,
)

In [None]:
# Keep a copy of the test-set texts; `X_test` is rebound to the feature matrix further down.
abc = X_test.copy()
print(abc[0])
print(y_test[0])

In [None]:
# Replace the raw texts with feature matrices (default method: word counts) and keep the fitted vectorizer.
X_train, X_test, vec = build_features(X_train, X_test, n_gram_range)

In [None]:
# Shapes of the train and test feature matrices.
print(X_train.shape, X_test.shape)

In [None]:
# Random forest with class 1 weighted six times as heavily as class 0.
clf = RandomForestClassifier(class_weight={0:1, 1:6})
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out reviews and report accuracy.
predictions = clf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, predictions))

In [None]:
# Per-class predicted probabilities for the held-out reviews.
pred_prob = clf.predict_proba(X_test)

In [None]:
# Display the predicted probabilities.
pred_prob

## Shap explain

In [None]:
import shap

In [None]:
# Create a TreeExplainer for the trained random forest; it is used below to compute SHAP values.
explainer = shap.TreeExplainer(clf)

In [None]:
# Index of the test row to explain; -1 selects the last row.
instance = -1

In [None]:
# SHAP values for the selected test row, densified and reshaped to a single-row 2-D array.
shap_values = explainer.shap_values(X_test.toarray()[instance].reshape(1, -1), check_additivity=False)

In [None]:
# Load SHAP's JavaScript so the interactive plots can render in the notebook.
shap.initjs()

In [None]:
# Text of the review being explained (from the copy of the test texts).
print(abc[instance])

In [None]:
# Flat index of the largest SHAP value in the first entry of `shap_values`.
shap_values[0].argmax()

In [None]:
# Name of the feature with the largest SHAP value in the first entry of
# `shap_values`. The index is computed rather than hard-coded so it stays
# correct when the data, model or explained row changes.
list(vec.get_feature_names_out())[shap_values[0].argmax()]

In [None]:
# Force plot for the first entry of `expected_value` / `shap_values`
# (presumably class 0 — confirm against `clf.classes_`).
shap.force_plot(explainer.expected_value[0], shap_values[0], feature_names=list(vec.get_feature_names_out()))

In [None]:
# Force plot for the second entry of `expected_value` / `shap_values`
# (presumably class 1 — confirm against `clf.classes_`).
shap.force_plot(explainer.expected_value[1], shap_values[1], feature_names=list(vec.get_feature_names_out()))

In [None]:
# Shape of the test feature matrix.
X_test.shape

In [None]:
# NOTE(review): this used to assign the name `d`, which is never defined in
# the notebook. Assumed intent: explain the test row selected by `instance`,
# in the same single-row form used for the earlier SHAP call — confirm.
choosen_instance = X_test.toarray()[instance].reshape(1, -1)
shap_values = explainer.shap_values(choosen_instance, check_additivity=False)
shap.initjs()
# `check_additivity` is an option of `shap_values`, not of `force_plot`, so it is not passed here.
shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance)

In [None]:
# Calculate SHAP values for one test row and draw its force plot.
# NOTE(review): this used to assign the name `d`, which is never defined in
# the notebook. Assumed intent: explain the test row selected by `instance`,
# in the same single-row form used for the earlier SHAP call — confirm.
choosen_instance = X_test.toarray()[instance].reshape(1, -1)
shap_values = explainer.shap_values(choosen_instance, check_additivity=False)
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance)

In [None]:
# Predict the sentiment of one review. The classifier was trained on features
# from `vec` fitted on the cleaned reviews, so the same vectorizer and the
# same cleaned text are used here. `transform` needs an iterable of
# documents, hence the one-element list.
clf.predict(vec.transform([data["cleaned_reviews"].iloc[0]]).toarray())

In [None]:
# Vocabulary of `vectorizer`, the 1-/2-gram CountVectorizer fitted for topic candidates
# (not `vec`, the vectorizer the classifier was trained with).
list(vectorizer.get_feature_names_out())

In [None]:
# Vocabulary size of `vectorizer` (the topic-candidate vectorizer).
len(list(vectorizer.get_feature_names_out()))

In [None]:
# Length of the feature vector `vectorizer` produces for a single sample sentence.
len(vectorizer.transform(["My name is archit"]).toarray()[0])