In [55]:
import pandas as pd

In [56]:
# Load the combined reviews dataset; the path is relative to this notebook's directory.
df = pd.read_csv(filepath_or_buffer='../../../datasets/combined_dataset.csv')

In [57]:
# Cast the rating column to integers before it is mapped to a sentiment label.
# NOTE(review): astype truncates fractional values and fails on missing ones —
# confirm the raw 'Rating' column is complete and holds whole numbers.
df['Rating'] = df['Rating'].astype('int')

In [58]:
def label_sentiment(Rating, positive_min=7, neutral_min=5):
    """Map a numeric rating to a sentiment label.

    The bands are contiguous, so fractional ratings are labelled consistently:
    anything in ``[neutral_min, positive_min)`` is neutral. The earlier
    equality checks (``== 5 or == 6``) sent values such as 5.5 or 6.5 to
    'negative'.

    Args:
        Rating: Numeric rating to classify.
        positive_min: Lowest rating labelled 'positive' (default 7).
        neutral_min: Lowest rating labelled 'neutral' (default 5).

    Returns:
        'positive', 'neutral' or 'negative'. A NaN rating compares False
        against both thresholds and therefore yields 'negative'.
    """
    if Rating >= positive_min:
        return 'positive'
    if Rating >= neutral_min:
        return 'neutral'
    return 'negative'

In [59]:
# Derive the sentiment label for every row from its rating.
df['Sentiment'] = df['Rating'].map(label_sentiment)

In [62]:
# Persist the labelled frame without the index column.
# NOTE(review): this file is written one level above the datasets/ folder used
# by every other read/write in this notebook — confirm the location is intended.
df.to_csv(path_or_buf='../../../labeled_dataset.csv', index=False)

In [63]:
# Show how many rows fall into each sentiment class.
print(df.loc[:, 'Sentiment'].value_counts())

Sentiment
positive    10668
negative     7805
neutral      2408
Name: count, dtype: int64


In [64]:
# Build a two-class balanced set: draw 7,800 rows from each polar class
# (the counts printed above show 7,805 negatives, the smaller of the two).
positive = df.loc[df['Sentiment'].eq('positive')].sample(n=7800, random_state=42)
negative = df.loc[df['Sentiment'].eq('negative')].sample(n=7800, random_state=42)
# Neutral rows are kept aside here; they are not part of the balanced set below.
neutral = df.loc[df['Sentiment'].eq('neutral')]

# Stack both classes, then shuffle all rows with a fixed seed.
balanced_df = pd.concat([positive, negative], ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42)

balanced_df.to_csv('../../../datasets/balanced.csv', index=False)

In [65]:
# Reload the balanced file from disk and print the per-class row counts.
df_bal = pd.read_csv(filepath_or_buffer='../../../datasets/balanced.csv')
print(df_bal.loc[:, 'Sentiment'].value_counts())

Sentiment
negative    7800
positive    7800
Name: count, dtype: int64


In [66]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced data for testing. Stratifying on the label keeps
# the class proportions the same in both splits; the seed makes it repeatable.
train_df, test_df = train_test_split(
    df_bal,
    test_size=0.2,
    random_state=42,
    stratify=df_bal['Sentiment'],
)

# Report split sizes and the class distribution inside each split.
print(f"Train size: {len(train_df)} | Test size: {len(test_df)}")
print("Train class counts:\n", train_df.loc[:, 'Sentiment'].value_counts())
print("Test class counts:\n", test_df.loc[:, 'Sentiment'].value_counts())

Train size: 12480 | Test size: 3120
Train class counts:
 Sentiment
positive    6240
negative    6240
Name: count, dtype: int64
Test class counts:
 Sentiment
negative    1560
positive    1560
Name: count, dtype: int64


In [67]:
import re
def clean_text(text):
    """Normalise a raw review for later tokenisation.

    The text is lowercased, every character other than ASCII letters, digits,
    whitespace and the punctuation marks ``. , ! ?`` is removed, runs of
    whitespace are collapsed to a single space, and the result is stripped.

    Args:
        text: The raw review. Missing values (None, NaN, pd.NA) produce an
            empty string instead of raising; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned review as a ``str``.
    """
    if not isinstance(text, str):
        # pandas represents empty CSV cells as float NaN, which has no .lower().
        if pd.isna(text):
            return ''
        text = str(text)
    text = text.lower()
    # Keep only letters, digits, whitespace and basic sentence punctuation.
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    # Collapse tabs, newlines and repeated spaces into single spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Add a 'cleaned_review' column to both splits using the same cleaning function.
for _split_df in (train_df, test_df):
    _split_df['cleaned_review'] = _split_df['Review'].apply(clean_text)

In [None]:
import nltk
from nltk.corpus import stopwords
import spacy

# Fetch the NLTK stopword corpus only when it is not already installed, so a
# re-run of the notebook does not need network access.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Same for the spaCy English model: load it if present, otherwise download it
# once and load it afterwards.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# A set gives constant-time membership checks during stopword filtering.
stop_words = set(stopwords.words('english'))


In [None]:
def lemmantize_and_remove_stopwords(text):
    """Lemmatise ``text`` and drop stopwords and non-alphabetic tokens.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    only if its lemma is not in the module-level ``stop_words`` collection and
    the token is purely alphabetic.

    Args:
        text: The string to process.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        lemma = tok.lemma_
        if lemma in stop_words or not tok.is_alpha:
            continue
        kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)
    
# Add a 'final_review' column to both splits from the cleaned text.
for _split_df in (train_df, test_df):
    _split_df['final_review'] = _split_df['cleaned_review'].apply(lemmantize_and_remove_stopwords)

In [43]:
# Write the cleaned text and label of each split to its own CSV (no index column).
# NOTE(review): only 'cleaned_review' is exported; the 'final_review' column
# computed above is not saved — confirm this is intentional.
for _split_df, _csv_path in (
    (train_df, '../../../datasets/train.csv'),
    (test_df, '../../../datasets/test.csv'),
):
    _split_df[['cleaned_review', 'Sentiment']].to_csv(_csv_path, index=False)