In [None]:
# @title Load the Drive helper and mount

# Colab-only helper: mount Google Drive at /content/drive so that the files
# referenced under /content/drive/MyDrive in later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title Imports

import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords, words
import numpy as np
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
import spacy
from spacy.lang.ro.examples import sentences
import unicodedata

# Fetch the NLTK resources at runtime; the 'stopwords' corpus backs the
# stopwords.words('romanian') call used further down in this notebook.
nltk.download('punkt')
nltk.download('stopwords')

!python -m spacy download ro_core_news_sm

In [24]:
# @title We load the dataset

# Read the cleaned dataset from the mounted Drive.
df = pd.read_csv("/content/drive/MyDrive/Colab/Facultate/Master/Anul II/NLP2/cleaned_dataset.csv")

# Extract the two columns as plain Python lists in one vectorised step each,
# instead of going through df.iloc[i], which builds a Series for every row.
reviews = df['review'].tolist()
labels = df['label'].astype(int).tolist()

In [25]:
# @title Data preprocessing

# Markers that introduce an English translation appended to a review.
# Matched case-insensitively (ASCII letters only); the leftmost match wins.
_ENGLISH_MARKER_RE = re.compile(r'english review:|english:|\[english\]', re.IGNORECASE | re.ASCII)


def preprocess_review(review):
    """Normalise a raw review and drop an appended English translation.

    Steps, in order:
      1. Unicode NFKC normalisation.
      2. Curly double quotes become straight quotes, and the cedilla forms of
         the Romanian letters s/t become the comma-below forms.
      3. http/https links are removed.
      4. Everything from the earliest "english review:", "english:" or
         "[english]" marker (any letter case) onwards is cut off.

    Args:
        review: The review text as a str.

    Returns:
        The cleaned text. When no English marker is present the text is
        returned in full (nothing is truncated).
    """
    # Normalize the text font
    review = unicodedata.normalize('NFKC', review)
    # Replace inconsistent characters
    review = review.replace('“', '"').replace('”', '"')
    review = review.replace('ş', 'ș').replace("Ş", "Ș").replace('ţ', 'ț').replace("Ţ", "Ț")
    # Remove links
    review = re.sub(r'https?://\S+', '', review)
    # Remove included english reviews. Searching the string itself (rather than
    # a lower-cased copy) keeps the match offset valid for slicing, and slicing
    # only on a match avoids chopping the last character of marker-free reviews.
    marker = _ENGLISH_MARKER_RE.search(review)
    if marker is not None:
        review = review[:marker.start()]

    return review

# Clean every review with preprocess_review, keeping the original order.
reviews = list(map(preprocess_review, reviews))

In [None]:
# @title Average length of each review type, measured in characters

# Character count of every review, followed by the same counts split by label.
len_reviews = [len(text) for text in reviews]
len_reviews_0 = [length for pos, length in enumerate(len_reviews) if labels[pos] == 0]
len_reviews_1 = [length for pos, length in enumerate(len_reviews) if labels[pos] == 1]
len_reviews_2 = [length for pos, length in enumerate(len_reviews) if labels[pos] == 2]

# Mean length for labels 0, 1 and 2 (displayed as the cell output).
tuple(np.mean(group) for group in (len_reviews_0, len_reviews_1, len_reviews_2))

In [None]:
# @title We visualize the samples after data preprocessing

def plot_samples(X, y, mode, mode_str):
    """Project the samples to 2-D and show them as a scatter plot coloured by label.

    Args:
        X: Feature matrix handed to the reducer's ``fit_transform``.
        y: Labels indexable by sample position; samples labelled 0, 1 and 2
            are drawn as 'Negative', 'Neutral' and 'Positive'.
        mode: Reducer class, instantiated as ``mode(n_components=2)``.
        mode_str: Reducer name inserted into the plot title.
    """
    colour_of = {0: 'tab:orange', 1: 'tab:purple', 2: 'tab:green'}

    projected = mode(n_components=2).fit_transform(X)

    plt.rc('axes', axisbelow=True)
    plt.grid()

    # Scatter order: positive, then neutral, then negative.
    handle_of = {}
    for label in (2, 1, 0):
        rows = [row for pos, row in enumerate(projected) if y[pos] == label]
        handle_of[label] = plt.scatter(
            [row[0] for row in rows],
            [row[1] for row in rows],
            marker='.',
            color=colour_of[label],
        )

    plt.legend(
        (handle_of[0], handle_of[1], handle_of[2]),
        ('Negative', 'Neutral', 'Positive'),
        scatterpoints=1,
        loc='best',
        ncol=4,
        fontsize=8,
    )

    plt.title(f"{mode_str} analysis of samples")
    plt.xlabel("x coord")
    plt.ylabel("y coord")

    plt.show()

# Convert text to embeddings: load the pre-trained multilingual
# sentence-transformers model and encode every preprocessed review.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2') # Pre-trained model
embeddings = model.encode(reviews)

# Reduce the embeddings to two components with PCA and plot them by label.
plot_samples(embeddings, labels, PCA, 'PCA')

### Negative and positive words distribution in each review type

In [6]:
# We first get the lists of possible negative and positive words and remove the stop words from them as they are not relevant

def _load_word_list(path, excluded):
    """Read one word per line from `path`, skipping blank lines and words in `excluded`.

    The file is decoded as UTF-8 explicitly so the Romanian diacritics are read
    the same way regardless of the runtime's default encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word and word not in excluded]


stop_words = set(stopwords.words('romanian'))

# NOTE(review): entries are kept exactly as written in the files, while
# count_words compares them against lower-cased tokens - confirm the word
# lists are lower-case.
negative_words = _load_word_list(
    "/content/drive/MyDrive/Colab/Facultate/Master/Anul II/NLP2/negative_words_ro.txt",
    stop_words,
)
positive_words = _load_word_list(
    "/content/drive/MyDrive/Colab/Facultate/Master/Anul II/NLP2/positive_words_ro.txt",
    stop_words,
)

In [7]:
# We then get the number of all negative and positive words from each review type

# Function to count positive and negative words
# Non-word characters glued to the start or end of a whitespace-separated token.
_EDGE_PUNCT_RE = re.compile(r'^\W+|\W+$')


def count_words(text, positive_words, negative_words):
    """Count how many tokens of `text` occur in the positive / negative word lists.

    The text is lower-cased and split on whitespace. Punctuation attached to the
    start or end of a token ("bun," or "(prost)") is stripped before the lookup,
    so such words are still recognised; characters inside a token (for example
    hyphens) are left alone. A token present in both lists counts as positive only.

    Args:
        text: The review text.
        positive_words: Collection of positive words, compared as-is against
            the lower-cased tokens.
        negative_words: Collection of negative words, compared the same way.

    Returns:
        A tuple ``(positive_count, negative_count)``.
    """
    # Set lookups are O(1); lists would be scanned once per token.
    if not isinstance(positive_words, (set, frozenset)):
        positive_words = set(positive_words)
    if not isinstance(negative_words, (set, frozenset)):
        negative_words = set(negative_words)

    positive_count = 0
    negative_count = 0
    for raw_token in text.lower().split():
        token = _EDGE_PUNCT_RE.sub('', raw_token)
        if not token:
            # Pure punctuation; must not match an accidental '' lexicon entry.
            continue
        if token in positive_words:
            positive_count += 1
        elif token in negative_words:
            negative_count += 1
    return positive_count, negative_count

# Review-type names, indexed by label: 0 = negative, 1 = neutral, 2 = positive
review_types = ["Negative reviews", "Neutral reviews", "Positive reviews"]

# One list of per-review word counts for each label
positive_counts = [[] for _ in range(3)]
negative_counts = [[] for _ in range(3)]

for idx, text in enumerate(reviews):
    review_label = labels[idx]
    if review_label not in (0, 1, 2):
        continue  # only labels 0, 1 and 2 are tallied
    n_positive, n_negative = count_words(text, positive_words, negative_words)
    positive_counts[review_label].append(n_positive)
    negative_counts[review_label].append(n_negative)

# Total number of positive and negative words found in each review type
total_positive_counts = list(map(sum, positive_counts))
total_negative_counts = list(map(sum, negative_counts))

In [None]:
# Create the stacked bar chart

# Bar geometry: one bar position per review type
width = 0.5
x = range(3)

fig, ax = plt.subplots()
ax.grid(zorder=0)

# Positive-word totals form the lower segment; negative-word totals are stacked on top
ax.bar(x, total_positive_counts, width=width, label='Positive words')
ax.bar(x, total_negative_counts, width=width, bottom=total_positive_counts, label='Negative words')

ax.set(
    xlabel='Review type',
    ylabel='Word count',
    title='Distribution of sentiment words in each type of review',
)
ax.set_xticks(x)
ax.set_xticklabels(review_types)
ax.legend()

plt.show()

### PoS analysis

In [18]:
# Get all PoS from each review

# Use the GPU when one is available, but fall back to the CPU instead of
# raising (spacy.require_gpu() aborts the cell on CPU-only runtimes).
# Called before the pipeline is loaded.
spacy.prefer_gpu()

nlp = spacy.load("ro_core_news_sm")

# PoS tag -> number of occurrences, one dict per review type
d_neg, d_neu, d_pos = {}, {}, {}

# Label 0 -> negative, label 1 -> neutral, any other label -> positive
_pos_counts_by_label = {0: d_neg, 1: d_neu}

# nlp.pipe processes the reviews in batches and yields the docs in input order,
# which avoids one full pipeline call per review.
for doc, review_label in zip(nlp.pipe(reviews), labels):
    pos_counts = _pos_counts_by_label.get(review_label, d_pos)
    for token in doc:
        pos_counts[token.pos_] = pos_counts.get(token.pos_, 0) + 1

# Sort the PoS from each review type in descending order by their number of appearances
d_neg = dict(sorted(d_neg.items(), key=lambda item: item[1], reverse=True))
d_neu = dict(sorted(d_neu.items(), key=lambda item: item[1], reverse=True))
d_pos = dict(sorted(d_pos.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Intra-review type analysis

def _pos_ratios(pos_counts):
    """Return the NOUN/ADV, NOUN/VERB, NOUN/ADJ, ADV/VERB, ADV/ADJ and VERB/ADJ count ratios."""
    noun, adv, verb, adj = (pos_counts[tag] for tag in ('NOUN', 'ADV', 'VERB', 'ADJ'))
    return noun / adv, noun / verb, noun / adj, adv / verb, adv / adj, verb / adj


_ratios_neg = _pos_ratios(d_neg)
_ratios_neu = _pos_ratios(d_neu)
_ratios_pos = _pos_ratios(d_pos)

# Keep the individual ratios available under their per-type names
noun_adv_neg, noun_verb_neg, noun_adj_neg, adv_verb_neg, adv_adj_neg, verb_adj_neg = _ratios_neg
noun_adv_neu, noun_verb_neu, noun_adj_neu, adv_verb_neu, adv_adj_neu, verb_adj_neu = _ratios_neu
noun_adv_pos, noun_verb_pos, noun_adj_pos, adv_verb_pos, adv_adj_pos, verb_adj_pos = _ratios_pos

# One printed line per comparison: negative vs neutral, negative vs positive,
# neutral vs positive. Each line holds the six ratios divided element-wise.
for _upper, _lower in (
    (_ratios_neg, _ratios_neu),
    (_ratios_neg, _ratios_pos),
    (_ratios_neu, _ratios_pos),
):
    print(*(a / b for a, b in zip(_upper, _lower)))

In [None]:
# Inter-review type analysis

# Ratio of raw PoS counts between review types, one value per line in the tag
# order below. The three groups are separated by a blank line and appear as:
# negative / neutral, negative / positive, neutral / positive.
pos_tags = ('NOUN', 'AUX', 'ADP', 'PRON', 'ADV', 'DET', 'VERB', 'ADJ', 'CCONJ', 'SCONJ', 'NUM')
comparisons = ((d_neg, d_neu), (d_neg, d_pos), (d_neu, d_pos))

for group_idx, (numerator, denominator) in enumerate(comparisons):
    if group_idx > 0:
        print()
    for tag in pos_tags:
        print(numerator[tag] / denominator[tag])