# 1. Install and Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NeighbourhoodCleaningRule, RandomUnderSampler
from sentence_transformers import SentenceTransformer

from google.colab import drive
from tqdm import tqdm
import os

# 2. Environment Settings and Initialization

## 2.1. Set Seed for Reproducibility

In [None]:
# One seed reused by the stratified splits and the random oversampler.
SEED: int = 42

## 2.2. Manage Google Drive

### 2.2.1. Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this
# notebook all live under that mount point.
drive.mount('/content/drive')

### 2.2.2. Set File & Folder Path

In [None]:
# Folder that receives every exported split / re-sampled CSV.
SAVE_FOLDER_PATH = '/content/drive/MyDrive/Bach_Thesis/Dataset/'

# Source dataset read at the start of the notebook.
READ_FILE_PATH = '/content/drive/MyDrive/Bach_Thesis/Dataset/emoji_words.csv'

## 2.3. Load Dataset

In [None]:
# Load the source CSV into memory and preview the first rows.
df = pd.read_csv(filepath_or_buffer=READ_FILE_PATH)
df.head()

In [None]:
# Keep only the text and its label; .copy() detaches the result from the
# full frame so later column assignments do not touch a view.
df = df.loc[:, ['cleaned_content', 'sentiment']].copy()
df.head()

In [None]:
# Class distribution of the raw labels.
df.loc[:, 'sentiment'].value_counts()

## 2.4. Embedding Model



In [None]:
# Sentence encoder used to turn the texts into vectors for the NCL step.
embedding_model = SentenceTransformer(
    model_name_or_path='LazarusNLP/all-Indo-e5-small-v4',
)

# 3. Resampling Dataset

## 3.1. Relabel the data

In [None]:
# String label -> integer id for the three-class setting.
label_3lab = dict(positive=2, neutral=1, negative=0)

In [None]:
# Independent copy so relabelling does not modify `df`.
df_3lab = df.copy(deep=True)

In [None]:
# Replace the string labels with their integer ids (labels missing from
# label_3lab become NaN, as with any Series.map on a dict).
df_3lab = df_3lab.assign(sentiment=df_3lab['sentiment'].map(label_3lab))
df_3lab.head()

In [None]:
# String label -> integer id for the binary (positive / negative) setting.
label_2lab = dict(positive=1, negative=0)

In [None]:
# Binary dataset: drop every row labelled 'neutral', keep the rest as a copy.
df_2lab = df.loc[df['sentiment'] != 'neutral'].copy()

In [None]:
# Replace the string labels with their integer ids (labels missing from
# label_2lab become NaN, as with any Series.map on a dict).
df_2lab = df_2lab.assign(sentiment=df_2lab['sentiment'].map(label_2lab))
df_2lab.head()

In [None]:
# Number of rows left after removing the neutral class.
df_2lab.shape[0]

## 3.2. Split the Dataset

In [None]:
# 70% train / 30% held out, stratified on the label so both parts keep the
# same class proportions.
train_set, temp = train_test_split(
    df_2lab,
    test_size=0.3,
    stratify=df_2lab['sentiment'],
    random_state=SEED,
)

In [None]:
# Halve the held-out part into validation and test, again stratified.
val_set, test_set = train_test_split(
    temp,
    test_size=0.5,
    stratify=temp['sentiment'],
    random_state=SEED,
)

In [None]:
# Row counts of the three splits, in train / validation / test order.
tuple(len(part) for part in (train_set, val_set, test_set))

In [None]:
# Labels and features of the training split. The text is kept both as a
# Series (for the CSV export) and as a one-column DataFrame (X_train).
y_train = train_set['sentiment']
X_train = train_set[['cleaned_content']]
X_train_ori = train_set['cleaned_content']

# Two-column frame (text, label) that is written to disk.
df_train = pd.concat(
    [X_train_ori, y_train],
    axis=1,
    keys=['cleaned_content', 'sentiment'],
)

output_path_train = f"{SAVE_FOLDER_PATH}Train2lab.csv"
df_train.to_csv(path_or_buf=output_path_train, index=False, encoding='utf-8-sig')
print(f"✅ Train set saved in '{output_path_train}'")

In [None]:
# Labels and text of the validation split.
y_val = val_set['sentiment']
X_val = val_set['cleaned_content']

# Two-column frame (text, label) that is written to disk.
df_val = pd.concat(
    [X_val, y_val],
    axis=1,
    keys=['cleaned_content', 'sentiment'],
)

output_path_val = f"{SAVE_FOLDER_PATH}Validation2lab.csv"
df_val.to_csv(path_or_buf=output_path_val, index=False, encoding='utf-8-sig')
print(f"✅ Validation set saved in '{output_path_val}'")

In [None]:
# Labels and text of the test split.
y_test = test_set['sentiment']
X_test = test_set['cleaned_content']

# Two-column frame (text, label) that is written to disk.
df_test = pd.concat(
    [X_test, y_test],
    axis=1,
    keys=['cleaned_content', 'sentiment'],
)

output_path_test = f"{SAVE_FOLDER_PATH}Test2lab.csv"
df_test.to_csv(path_or_buf=output_path_test, index=False, encoding='utf-8-sig')
print(f"✅ Test set saved in '{output_path_test}'")

### 3.1.1. ROS on Minority-Class Texts

In [None]:
# Per-class sample counts in the training labels, and the size of the
# largest class (the target size for oversampling).
class_counts = Counter(y_train)
majority_class_count = max(count for count in class_counts.values())

In [None]:
# Every class is resampled up to the size of the largest class.
sampling_strategy = dict.fromkeys(class_counts, majority_class_count)
print("Sampling strategy:", sampling_strategy)

In [None]:
# Random oversampling of the training split to the per-class targets above;
# seeded so the duplicated rows are reproducible.
ros = RandomOverSampler(
    sampling_strategy=sampling_strategy,
    random_state=SEED,
)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [None]:
# Recombine the oversampled text and labels into one two-column frame.
df_ros = pd.concat(
    [X_ros['cleaned_content'], y_ros],
    axis=1,
    keys=['cleaned_content', 'sentiment'],
)

In [None]:
# Write the oversampled training set to the save folder.
output_path_ros = f"{SAVE_FOLDER_PATH}Train_ROS2lab.csv"

df_ros.to_csv(path_or_buf=output_path_ros, index=False, encoding='utf-8-sig')
print(f"✅ ROS dataset saved in '{output_path_ros}'")

### 3.1.2. ROS + NCL (0.8 ratio)

In [None]:
# Encode every oversampled text into a vector; the result is requested as
# a NumPy array and a progress bar is shown while encoding.
X_embedded = embedding_model.encode(
    X_ros.loc[:, 'cleaned_content'].to_list(),
    show_progress_bar=True,
    convert_to_numpy=True,
)

In [None]:
# Neighbourhood size: floor(sqrt(n_samples)) clamped to the range [3, 5].
sqrt_n = int(np.sqrt(len(X_embedded)))
n_neighbors = max(3, sqrt_n) if sqrt_n < 5 else 5

# Neighbourhood Cleaning Rule applied to all classes in embedding space.
ncl = NeighbourhoodCleaningRule(
    sampling_strategy='all',
    n_neighbors=n_neighbors,
)

X_final_emb, y_final = ncl.fit_resample(X_embedded, y_ros)

In [None]:
# Positions (into the oversampled data) of the samples NCL selected; use
# them to pull the matching texts and labels back out of X_ros / y_ros.
kept_indices = ncl.sample_indices_
df_ros_ncl = X_ros.iloc[kept_indices].assign(
    sentiment=y_ros.iloc[kept_indices].to_numpy(),
)

In [None]:
# Write the ROS + NCL training set to the save folder.
output_path_ros_ncl = f"{SAVE_FOLDER_PATH}Train_ROS_NCL2lab.csv"

df_ros_ncl.to_csv(output_path_ros_ncl, index=False, encoding='utf-8-sig')
# Name the ROS-NCL variant explicitly so this log line cannot be confused
# with the plain ROS export (Train_ROS2lab.csv).
print(f"✅ ROS-NCL dataset saved in '{output_path_ros_ncl}'")

In [None]:
# Label distribution of the original, ROS and ROS-NCL training sets.
ct_ori, ct_ros, ct_ros_ncl = (
    frame['sentiment'].value_counts()
    for frame in (df_train, df_ros, df_ros_ncl)
)

In [None]:
# Side-by-side comparison: one column of class counts per training variant.
pd.concat(
    {'Original': ct_ori, 'ROS': ct_ros, 'ROS-NCL': ct_ros_ncl},
    axis=1,
)