<a href="https://colab.research.google.com/github/ashibullah/Romanian-Bangla-Sentiment-Analysis-NLP/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install imbalanced-learn xgboost transformers
!pip install --upgrade datasets



In [6]:
import pandas as pd
import numpy as np
import re

from datasets import load_dataset
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight


In [7]:
df = load_dataset("aplycaebous/BnSentMix" , split = "train")

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(df)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text.strip()



df['Sentence'] = df['Sentence'].astype(str).apply(clean_text)

# Optional: apply clean_vocab filtering if available
# clean_vocab = set(...)  # Load your clean vocab here if you want
# def filter_text(text):
#     return ' '.join([word for word in text.split() if word in clean_vocab])
# df['Cleaned_Sentence'] = df['Sentence'].apply(filter_text)

 # If no clean_vocab, fallback to cleaned text
df['Label'] = df['Label'].astype(int)

In [None]:
# Vectorizers
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2), min_df=3)
bow = CountVectorizer(max_features=10000, ngram_range=(1,2), min_df=3)
hash_vec = HashingVectorizer(n_features=10000, ngram_range=(1,2))

# Vectorized datasets
X_tfidf = tfidf.fit_transform(df['Sentence'])
X_bow = bow.fit_transform(df['Sentence'])
X_hash = hash_vec.fit_transform(df['Sentence'])
y = df['Label'].values

# ML models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='linear', probability=True),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(objective='multi:softmax', num_class=4, eval_metric='mlogloss', random_state=42),
    "Naive Bayes": MultinomialNB()
}

# Vectorizer dictionary
vectorizers = {
    "TF-IDF": X_tfidf,
    "Bag of Words": X_bow,
    "HashingVectorizer": X_hash
}

samplers = {
    "Oversampling": RandomOverSampler(sampling_strategy='auto', random_state=42),
    "Undersampling": RandomUnderSampler(sampling_strategy='auto', random_state=42),
    "SMOTE": SMOTE(sampling_strategy='auto', random_state=42)
}


for vec_name, X_vec in vectorizers.items():
    print(f"\n{'='*40}\nUsing Vectorizer: {vec_name}\n{'='*40}")

    for sampler_name, sampler in samplers.items():
        print(f"\n{'-'*20}\nSampling Strategy: {sampler_name}\n{'-'*20}")

        # Apply sampling
        X_sampled, y_sampled = sampler.fit_resample(X_vec, y)
        print("After Sampling:", Counter(y_sampled))

        # Split
        X_train, X_test, y_train, y_test = train_test_split(
            X_sampled, y_sampled,
            test_size=0.15,
            stratify=y_sampled,
            random_state=42
        )

        # Run models
        for model_name, model in models.items():
            print(f"\n----- Model: {model_name} -----")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            print(classification_report(y_test, y_pred, target_names=["Positive", "Negative", "Neutral", "Mixed"]))



Using Vectorizer: TF-IDF

--------------------
Sampling Strategy: Oversampling
--------------------
After Sampling: Counter({np.int64(3): 6619, np.int64(2): 6619, np.int64(1): 6619, np.int64(0): 6619})

----- Model: Logistic Regression -----
              precision    recall  f1-score   support

    Positive       0.82      0.83      0.83       993
    Negative       0.75      0.74      0.74       993
     Neutral       0.73      0.71      0.72       993
       Mixed       0.87      0.88      0.87       993

    accuracy                           0.79      3972
   macro avg       0.79      0.79      0.79      3972
weighted avg       0.79      0.79      0.79      3972


----- Model: SVM -----
