# **INITIALIZATION**

In [None]:
!pip install textblob --user
!pip install nltk --user
!pip install stanza --user
!pip install transformers --user
!pip install seaborn --user
!pip install pandas --user
!pip install sentencepiece --user

In [None]:
import stanza
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import  PorterStemmer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import string
import re
import warnings
# Silence all warnings for the rest of the notebook.
warnings.simplefilter("ignore")

# Download only the NLTK resources this notebook uses instead of the whole
# 'all' collection: sentence/word tokenizer models, the POS tagger and the
# stopword list. Both the older and the newer resource identifiers are
# requested so the tokenizer and tagger can find their data whichever naming
# the installed NLTK release expects.
for nltk_resource in ("punkt", "punkt_tab",
                      "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",
                      "stopwords"):
    nltk.download(nltk_resource, quiet=True)

# English Stanza models and the pipeline used by get_aspect_pairs().
stanza.download('en')
nlp = stanza.Pipeline('en')

In [None]:
# Display options: never truncate columns, but print at most 10 rows.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)

# Read the review dataset into the working DataFrame.
df = pd.read_json('data/dataset.json')

In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()

In [None]:
# Remove the two review-timestamp columns from the DataFrame.
time_columns = ['reviewTime', 'unixReviewTime']
df.drop(columns=time_columns, inplace=True)

In [None]:
# Column names, dtypes and non-null counts after dropping the time columns.
df.info()

# **RE-CHECK NULL AND DUPLICATES**


**DUPLICATES**

In [None]:
# Count the rows that exactly repeat an earlier row.
duplicate_mask = df.duplicated()
dup = duplicate_mask.sum()
print("Number of duplicates in dataset: ", dup)

In [None]:
# Keep the first occurrence of every row, then renumber the index from 0.
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.info()

**Null**

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics of the numeric columns, split by the `verified` flag.
print(df.groupby("verified").describe())

In [None]:
# Randomly discard a fixed share of the verified reviews
# (presumably to reduce the imbalance between verified and unverified rows).
# A fixed seed makes the sample, and everything computed after it, reproducible.
# NOTE: re-running this cell drops another share of the remaining verified rows.
VERIFIED_DROP_FRACTION = 0.92
SAMPLE_SEED = 42

verified_rows = df[df['verified']]
rows_to_drop = verified_rows.sample(frac=VERIFIED_DROP_FRACTION, random_state=SAMPLE_SEED).index
df = df.drop(rows_to_drop).reset_index(drop=True)

print(df.groupby("verified").describe())
df.info()

# **Text Review**

We have already reviewed the background of the Amazon dataset in depth; this time we take a closer look at the review text (`reviewText`) itself. To guide the pre-processing, we add columns that describe basic properties of each review. These are the counts of:


1. Word
2. Characters (with spaces)
3. Stopwords
4. Punctuations
5. Uppercase characters

After these columns are added, summary statistics are computed to get an idea of how the pre-processing should proceed.

In [None]:
# Work on a plain-string view of the reviews so that every counter below
# treats a missing review the same way (as empty text) instead of some
# counters raising and others returning NaN.
review_text = df['reviewText'].fillna('').astype(str)
review_tokens = review_text.str.split()  # whitespace-separated tokens

#WORD COUNT
# split() without an argument collapses runs of whitespace, so double spaces
# or an empty review no longer inflate the count.
df['total words'] = review_tokens.str.len()

#CHARACTER COUNT
df['total characters'] = review_text.str.len() #spaces are included

#STOPWORDS COUNT
# Count every stopword occurrence (not just the distinct ones), ignoring case
# and punctuation attached to the token, e.g. "The" or "and,".
sw = set(stopwords.words('english'))
df['total stopwords'] = review_tokens.apply(
    lambda words: sum(1 for w in words if w.strip(string.punctuation).lower() in sw)
)

#PUNCTUATION AND SPECIAL CHARA COUNT
def count_p(p1, p2):
    """Return how many characters of `p1` are contained in `p2`."""
    return sum(1 for i in p1 if i in p2)

df['total punctuations'] = review_text.apply(lambda p: count_p(p, string.punctuation))

#UPPERCASE CHARA COUNT
df['total uppercases'] = review_text.str.findall(r'[A-Z]').str.len() #findall - finds all

In [None]:
# Preview the first rows, now including the count columns.
df.head()

SUMMARY

In [None]:
# Summary statistics of the DataFrame's numeric columns.
df.describe()

In [None]:
# The same summary statistics, split by the `verified` flag.
print(df.groupby("verified").describe())

In [None]:
# Pie chart of the share of verified vs. unverified reviews.
label = df['verified'].value_counts()
colors = ['#FED8B1', '#79BAEC']

plt.figure(figsize=(4, 4))
plt.pie(
    label.values,
    labels=label.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('True and False Reviews Count', fontsize=15)
plt.show()

In [None]:
# Number of reviews per star rating, with one bar per `verified` value.
rating_grid = sns.catplot(data=df, x='overall', hue="verified", kind="count")
plt.ylabel("count of reviews")
plt.xlabel("review_rating")
plt.title("Review_Rating Grouped by Verified_Purchase")

In [None]:
# Work on just the two columns needed for this comparison.
cols = ["verified", "reviewText"]
vprt = df[cols]

# Boolean masks for verified and unverified reviews.
checkTrue = vprt["verified"] == True
checkFalse = vprt["verified"] == False
filtered_true = vprt[checkTrue]
filtered_false = vprt[checkFalse]

# Mean review length in characters for each group.
true_average_length = filtered_true["reviewText"].map(len).mean()
false_average_length = filtered_false["reviewText"].map(len).mean()

# Horizontal bar chart: one bar per group.
y_data = ["True", "False"]
x_data = [true_average_length, false_average_length]
sns.barplot(x=x_data, y=y_data)
plt.title("Average Length of Reviews based on Verified Purchases")
plt.xlabel("average length of reviews")
plt.ylabel("verified_purchases")
plt.show()

# **Pre-processing**

In [None]:
# The count columns were only needed for the exploratory statistics; remove them.
count_columns = [
    "total words",
    "total characters",
    "total stopwords",
    "total punctuations",
    "total uppercases",
]
df.drop(columns=count_columns, inplace=True)
df.head()

In [None]:
# Load Aspect-Based Sentiment Analysis model
# `classifier` is called later with a text and a `text_pair` (the aspect).
model_name = "yangheng/deberta-v3-base-absa-v1.1"
absa_tokenizer = AutoTokenizer.from_pretrained(model_name)
absa_model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("text-classification", model=absa_model, tokenizer=absa_tokenizer)

# Load a traditional Sentiment Analysis model
# `sentiment_model` is called later on single sentences of a review.
sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = pipeline("sentiment-analysis", model=sentiment_model_path, tokenizer=sentiment_model_path)

In [None]:
def get_final_txt(text):
    """Clean a review and merge adjacent nouns into single tokens.

    The text is reduced to letters, periods and apostrophes, runs of periods
    are collapsed, and everything is lower-cased. Each sentence is then
    tokenized and POS-tagged; two consecutive tokens tagged ``NN`` are glued
    together (e.g. "battery life" -> "batterylife") so they can later be
    treated as one aspect term.

    Args:
        text: The raw review; any value is converted with ``str()``.

    Returns:
        The processed sentences, joined by single spaces.
    """
    # Keep letters, '.' and "'"; everything else becomes a space.
    cleaned = re.sub(r"[^a-zA-Z.']", " ", str(text))
    cleaned = re.sub(r'\.{2,}', '.', cleaned)
    lowered = cleaned.lower()

    processed_sentences = []
    for line in nltk.sent_tokenize(lowered):
        tagged = nltk.pos_tag(nltk.word_tokenize(line))
        words = []
        pos = 0
        # Walk the tokens with an explicit cursor so that every token is
        # emitted exactly once: a merged pair consumes two tokens, anything
        # else (including the final token and one-token sentences) consumes one.
        while pos < len(tagged):
            is_noun_pair = (
                pos + 1 < len(tagged)
                and tagged[pos][1] == "NN"
                and tagged[pos + 1][1] == "NN"
            )
            if is_noun_pair:
                words.append(tagged[pos][0] + tagged[pos + 1][0])
                pos += 2
            else:
                words.append(tagged[pos][0])
                pos += 1
        processed_sentences.append(' '.join(words))

    # Separate sentences with a space so the last token of one sentence does
    # not fuse with the first token of the next.
    return ' '.join(processed_sentences)

def recursive_find_adjs(root, sent):
    """Collect conjoined adjectives found anywhere below ``root``.

    A word qualifies when its dependency relation is ``conj``, its universal
    POS tag is ``ADJ`` and no ``NOUN`` in the sentence has it as head. Matches
    among the direct children of ``root`` come first, followed by the matches
    found under each child, in sentence order.

    Args:
        root: Word whose subtree is searched (``root.id`` is matched against
            the ``head`` of the other words).
        sent: Sentence object exposing the words as ``sent.words``.

    Returns:
        A list of the matching word objects (empty if ``root`` has no children).
    """
    direct_children = [tok for tok in sent.words if tok.head == root.id]
    if len(direct_children) == 0:
        return []

    def governs_noun(candidate):
        # True when some noun in the sentence attaches to this adjective.
        for tok in sent.words:
            if tok.head == candidate.id and tok.upos == "NOUN":
                return True
        return False

    found = []
    for child in direct_children:
        if child.deprel == "conj" and child.upos == "ADJ" and not governs_noun(child):
            found.append(child)

    # Descend into every child, not only the adjectives.
    for child in direct_children:
        found.extend(recursive_find_adjs(child, sent))

    return found

def get_aspect_pairs(text,nlp):
    """Map each noun in ``text`` to the adjectives that describe it.

    Two sources of adjectives are considered for every noun:

    * the noun's head, when that head is an adjective, plus the adjectives
      conjoined with it;
    * the first adjective that has the noun as its head, plus the adjectives
      conjoined with it.

    Args:
        text: The text to analyse.
        nlp: Callable returning a parsed document with ``sentences``, each
            exposing ``words`` that carry ``id``, ``head``, ``upos`` and ``text``.

    Returns:
        A dict from noun text to a space-separated string of adjective texts.
        Nouns without adjectives are omitted; if the same noun text occurs
        more than once, the last occurrence with adjectives wins.
    """
    doc = nlp(text)
    noun_adj_pairs = {}

    for sent in doc.sentences:
        for noun in (w for w in sent.words if w.upos == "NOUN"):
            adjs = []

            # Word ids are 1-based and head == 0 denotes the sentence root.
            # Without this guard, index -1 would silently pick the last word
            # of the sentence as the noun's "head".
            if noun.head > 0:
                cop_root = sent.words[noun.head - 1]
                if cop_root.upos == "ADJ":
                    adjs = [cop_root] + recursive_find_adjs(cop_root, sent)

            mod_adjs = [w for w in sent.words if w.head == noun.id and w.upos == "ADJ"]
            if mod_adjs:
                mod_adj = mod_adjs[0]
                adjs.extend([mod_adj] + recursive_find_adjs(mod_adj, sent))

            if adjs:
                # Drop repeated adjectives while keeping first-seen order.
                unique_adjs = []
                unique_ids = set()
                for adj in adjs:
                    if adj.id not in unique_ids:
                        unique_adjs.append(adj)
                        unique_ids.add(adj.id)

                noun_adj_pairs[noun.text] = " ".join(adj.text for adj in unique_adjs)

    return noun_adj_pairs

In [None]:
def normalize_score(obj):
    """Turn a classifier result into a single value between 0 and 1.

    The label decides where the confidence lands on a -1..2 scale: negative
    results become ``-score``, positive results ``score + 1`` and any other
    label keeps ``score`` unchanged. That value is then rescaled from
    [-1, 2] to [0, 1].

    Args:
        obj: Mapping with a ``'label'`` string and a numeric ``'score'``.

    Returns:
        The rescaled score as a float.
    """
    label = obj['label'].lower()
    confidence = obj['score']

    if label == 'positive':
        shifted = confidence + 1
    elif label == 'negative':
        shifted = confidence * -1
    else:
        shifted = confidence

    lower_bound, upper_bound = -1, 2
    return (shifted - lower_bound) / (upper_bound - lower_bound)

In [None]:
# Overall sentiment of a review = mean normalized sentiment of its sentences.
# Start from a float column (NaN) so the averaged scores are stored as floats
# rather than being written into a column holding the integer index.
df['Overall Sentiment'] = float('nan')

# Index labels of the reviews that could not be scored.
error_list = []

for index, value in df['reviewText'].items():
    # Missing or non-text reviews yield no sentences and are recorded as errors.
    sent_list = nltk.sent_tokenize(value) if isinstance(value, str) else []
    sentence_scores = []
    for sentence in sent_list:
        try:
            sentence_scores.append(normalize_score(sentiment_model(sentence)[0]))
        except Exception:
            # Stop at the first failing sentence: a partial average would be
            # misleading, and no score from a previous sentence or review
            # may leak into this one.
            print("Error at index" + str(index))
            break

    if sent_list and len(sentence_scores) == len(sent_list):
        # .at writes straight into df (no chained assignment on a copy).
        df.at[index, 'Overall Sentiment'] = sum(sentence_scores) / len(sent_list)
    else:
        error_list.append(index)
df.head()

In [None]:
#DROP ERROR LIST
df.drop(df.index[error_list], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Initialise as a float column: the values written into it later are float averages.
df['Mean ABSA Sentiment'] = 0.0

In [None]:
# For every review: extract the aspect terms, score the review once per aspect
# with the ABSA classifier, and store the mean of those scores.
for index, text in df['reviewText'].items():
    final_string = get_final_txt(text)
    aspectPairs = get_aspect_pairs(final_string, nlp)

    if not aspectPairs:
        # No aspects found: fall back to the review's overall sentiment.
        df.at[index, 'Mean ABSA Sentiment'] = df.at[index, 'Overall Sentiment']
        continue

    score = sum(
        normalize_score(classifier(final_string, text_pair=key)[0])
        for key in aspectPairs
    )
    df.at[index, 'Mean ABSA Sentiment'] = score/len(aspectPairs)

In [None]:
# Calculate the mean ABSA sentiment per 'overall' score grouped by 'verified'.
# The column computed earlier in this notebook is 'Mean ABSA Sentiment';
# no 'sentiment' column is created anywhere in it.
SENTIMENT_COL = 'Mean ABSA Sentiment'
grouped_data = df.groupby(['verified', 'overall'])[SENTIMENT_COL].mean().reset_index()

# Plotting using Seaborn catplot
sns.catplot(x='overall', y=SENTIMENT_COL, hue='verified', data=grouped_data, kind='bar', height=6, aspect=1.5)
plt.title('Mean ABSA Sentiment per Score Rating Grouped by Verified')
plt.xlabel('Score Rating')
plt.ylabel('sentiment Score')
plt.show()

In [None]:
# Average review length (in characters) per star rating and `verified` value.
cols = ["verified", "reviewText", "overall"]
vprt = df[cols]


def _mean_length(reviews):
    # Mean number of characters across one group of reviews.
    return reviews.str.len().mean()


review_groups = vprt.groupby(['verified', 'overall'])['reviewText']
grouped_data = review_groups.apply(_mean_length).reset_index()
# Ensure 'overall' is treated as integer for plotting
grouped_data['overall'] = grouped_data['overall'].astype(int)

# Grouped bar chart: one bar per rating and `verified` value.
plt.figure(figsize=(10, 6))
sns.barplot(data=grouped_data, x='overall', y='reviewText', hue='verified')
plt.xlabel('score rating')
plt.ylabel('average length of review')
plt.title('Average Length of Review per Score Rating Grouped by Verified')
plt.show()

In [None]:
# Write the processed DataFrame to disk as JSON.
df.to_json('data/absa_processed.json')