In [None]:
# @title Load the Drive helper and mount

from google.colab import drive
# Mount Google Drive at /content/drive (Colab runtime only) so that files stored
# there can be opened by path in the cells below.
drive.mount('/content/drive')

In [3]:
# @title Imports

import matplotlib.pyplot as plt
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

In [4]:
# @title We load the dataset

# Read the raw dataset CSV from the mounted Drive folder into `df`.
# Later cells use its 'review', 'label' and 'genre' columns and rewrite 'id'.
df = pd.read_csv("/content/drive/MyDrive/Colab/Facultate/Master/Anul II/NLP2/dataset.csv")

In [None]:
# @title Cleaning the dataset

def remove_outliers(data_cleaned, reviews=None, contamination=0.05, random_state=42,
                    model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """Drop the reviews whose sentence embeddings an Isolation Forest flags as anomalous.

    Args:
        data_cleaned: DataFrame with a 'review' text column. It is not modified;
            a filtered copy is returned.
        reviews: Optional sequence of review texts, one per row of
            ``data_cleaned`` and in the same row order. When None, the texts
            are read from the 'review' column.
        contamination: Expected fraction of anomalies, forwarded to
            ``IsolationForest``.
        random_state: Seed forwarded to ``IsolationForest`` for reproducibility.
        model_name: Name of the pre-trained SentenceTransformer model used to
            embed the reviews.

    Returns:
        A tuple ``(filtered, embeddings, anomaly_labels)`` where ``filtered`` is
        ``data_cleaned`` without the rows flagged as anomalies (original index
        labels are kept), ``embeddings`` holds one embedding per *input* row
        (flagged rows included), and ``anomaly_labels`` holds one flag per
        input row: -1 for anomaly, 1 for normal.

    Raises:
        ValueError: If ``reviews`` is given but its length differs from the
            number of rows in ``data_cleaned``.
    """
    if reviews is None:
        reviews = data_cleaned['review'].tolist()
    else:
        reviews = list(reviews)
        if len(reviews) != len(data_cleaned):
            raise ValueError(
                f"reviews has {len(reviews)} items but data_cleaned has {len(data_cleaned)} rows"
            )

    # Convert text to embeddings
    model = SentenceTransformer(model_name)  # Pre-trained model
    embeddings = model.encode(reviews)

    # Detect anomalies using Isolation Forest
    clf = IsolationForest(random_state=random_state, contamination=contamination)
    anomaly_labels = clf.fit_predict(embeddings)  # -1 for anomaly, 1 for normal

    # The flags are positional (one per input row), so the surviving rows are
    # selected by position. Dropping by index label would hit the wrong rows, or
    # raise KeyError, whenever the frame does not carry a default 0..n-1 index.
    kept_positions = [position for position, flag in enumerate(anomaly_labels) if flag != -1]
    data_cleaned = data_cleaned.iloc[kept_positions]

    return data_cleaned, embeddings, anomaly_labels

# Work on a copy so the raw frame in `df` stays untouched.
data_cleaned = df.copy()

# Step 1: Manually remove reviews that contain only a link to a YouTube video for that review
# (the two index labels were picked by hand; drop() raises KeyError if either is missing)
data_cleaned = data_cleaned.drop([3109, 3974])

# Step 2: Drop the rows with missing values in the "review" column
data_cleaned = data_cleaned.dropna(subset=['review'])

# Step 3: Standardize the reviews by getting rid of leading and trailing whitespaces
data_cleaned.loc[:, 'review'] = data_cleaned['review'].str.strip()

# Step 4: Drop duplicate reviews
data_cleaned = data_cleaned.drop_duplicates(subset=['review'])
# Plain Python lists of texts / integer labels, in row order. Reading the columns
# directly avoids building a full row Series for every single element.
reviews = data_cleaned['review'].tolist()
labels = [int(label) for label in data_cleaned['label']]

# Step 5: Remove outliers
# Reset to a 0..n-1 index first so that row labels line up with the positions of
# `reviews`, `labels` and the embeddings / anomaly flags returned below.
data_cleaned = data_cleaned.reset_index(drop=True)
data_cleaned_after_outliers, embeddings, anomaly_labels = remove_outliers(data_cleaned, reviews)

# Update entries ids: renumber the surviving rows consecutively from 0
data_cleaned_after_outliers = data_cleaned_after_outliers.reset_index(drop=True)
data_cleaned_after_outliers['id'] = range(len(data_cleaned_after_outliers))

In [None]:
# @title We visualize details about the cleaned dataset

# Number of samples per sentiment label in the outlier-free dataset
label_counts = data_cleaned_after_outliers['label'].value_counts()
print(label_counts)
print()
# Number of samples per genre
genre_counts = data_cleaned_after_outliers['genre'].value_counts()
print(genre_counts)
print()
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data_cleaned_after_outliers.info()
print()
# First rows as a quick sanity check of the cleaned content
print(data_cleaned_after_outliers.head())

In [None]:
# @title Saving the cleaned dataset

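# NOTE: saving is currently disabled. The line below would write `data_cleaned`, i.e. the frame
# *before* outlier removal; use `data_cleaned_after_outliers` to persist the outlier-free dataset.
# data_cleaned.to_csv("/content/drive/MyDrive/Colab/Facultate/Master/Anul II/NLP2/cleaned_dataset.csv", index=False)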

In [16]:
# @title Plotting the dataset samples during outliers removal

def plot_outliers(X, y, mode, mode_str):
    """Scatter-plot a 2-component projection of the samples, coloured by label.

    Args:
        X: Sample features handed to the projector's ``fit_transform``.
        y: Per-sample labels, indexed by sample position: 2 = positive,
            1 = neutral, 0 = negative, -1 = outlier.
        mode: Projector class; it is instantiated as ``mode(n_components=2)``
            and must provide ``fit_transform`` (e.g. ``PCA``).
        mode_str: Name of the projection, used as the prefix of the plot title.

    Returns:
        None. The figure is drawn with pyplot and shown via ``plt.show()``.
    """
    palette = ['tab:orange', 'tab:purple', 'tab:green', 'tab:red']

    projected = mode(n_components=2).fit_transform(X)

    # Keep the grid behind the markers (note: this updates matplotlib's rc settings)
    plt.rc('axes', axisbelow=True)
    plt.grid()

    def scatter_group(group_label, color):
        # Draw every projected sample whose label equals `group_label`
        members = [idx for idx in range(len(projected)) if y[idx] == group_label]
        xs = [projected[idx][0] for idx in members]
        ys = [projected[idx][1] for idx in members]
        return plt.scatter(xs, ys, marker='.', color=color)

    # Draw order: positive, neutral, negative, then outliers on top
    positive = scatter_group(2, palette[2])
    neutral = scatter_group(1, palette[1])
    negative = scatter_group(0, palette[0])
    outlier = scatter_group(-1, palette[3])

    handles = (negative, neutral, positive, outlier)
    captions = ('Negative', 'Neutral', 'Positive', 'Outlier')
    plt.legend(handles, captions, scatterpoints=1, loc='best', ncol=4, fontsize=8)

    plt.title(f"{mode_str} analysis of samples after outliers removal")
    plt.xlabel("x coord")
    plt.ylabel("y coord")

    plt.show()

# Labels used for colouring the plot: start from each sample's sentiment label and
# overwrite it with -1 wherever the sample was flagged as an outlier.
plot_labels = list(labels)
for position, flag in enumerate(anomaly_labels):
    if flag == -1:
        plot_labels[position] = -1

plot_outliers(embeddings, plot_labels, PCA, 'PCA')