<a href="https://colab.research.google.com/github/ashibullah/Romanian-Bangla-Sentiment-Analysis-NLP/blob/main/RomanianBanglaSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data collection + libraries

In [None]:
!pip install datasets --upgrade

In [None]:
!pip install nltk

In [None]:
import nltk
nltk.download('punkt_tab')  # For tokenizer
nltk.download('stopwords')  # For tokenizer

In [9]:
nltk.download('punkt_tab')  # For tokenizerfrom datasets import load_dataset
from textblob import TextBlob
import re
from nltk.tokenize import word_tokenize
import pandas as pd

import json

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
from datasets import load_dataset

df = load_dataset("aplycaebous/BnSentMix" , split = "train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# **Preprocessing**

In [2]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [3]:
def correct_text(text):
    return str(TextBlob(text).correct())

In [5]:
# Create DataFrame
df = pd.DataFrame(df)
# Convert 'Sentence' column to lowercase
df['Sentence'] = df['Sentence'].str.lower()

# remove urls
df['Sentence'] = df['Sentence'].apply(clean_text)


NameError: name 'pd' is not defined

In [None]:
import re

def reduce_repeated_letters(word, max_repeats=2):
    # This will limit repeated letters to max_repeats (e.g., 'ooooo' -> 'oo')
    return re.sub(r'(.)\1{'+str(max_repeats)+',}', r'\1' * max_repeats, word)

In [None]:
def reduce_repeats_in_sentence(sentence):
    return ' '.join([reduce_repeated_letters(word) for word in sentence.split()])

df['Sentence'] = df['Sentence'].apply(reduce_repeats_in_sentence)


In [None]:
dict_path = '/content/drive/MyDrive/Colab Notebooks/RomanianBanglaUtilities/normalization_dict.json'
with open(dict_path, 'r', encoding='utf-8') as f:
    normalization_dict = json.load(f)

print(f"Loaded normalization dictionary with {len(normalization_dict)} entries")


# Function to normalize text using your dictionary
def normalize_text(text, norm_dict):
    tokens = text.split()
    reverse_map = {}
    for std_word, variants in norm_dict.items():
        reverse_map[std_word] = std_word
        for var in variants:
            reverse_map[var] = std_word
    normalized_tokens = [reverse_map.get(token.lower(), token.lower()) for token in tokens]
    return ' '.join(normalized_tokens)
# run
df['Sentence'] = df['Sentence'].apply(lambda x: normalize_text(x, normalization_dict))


In [None]:
abbrev_path = '/content/drive/MyDrive/Colab Notebooks/RomanianBanglaUtilities/abbreviation.json'
with open(abbrev_path, 'r', encoding='utf-8') as f:
    abbreviation_dict = json.load(f)

print(f"Loaded abbreviation dictionary with {len(abbreviation_dict)} entries")

# Function to normalize text using abbreviation dictionary
def normalize_abbreviations(text, abbr_dict):
    tokens = text.split()
    normalized_tokens = [abbr_dict.get(token.lower(), token) for token in tokens]
    return ' '.join(normalized_tokens)

    # run
    df['Sentence'] = df['Sentence'].apply(lambda x: normalize_abbreviations(x, abbreviation_dict))

# **Feature Extraction Starts Here**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

# Step 1: Vectorize and extract vocabulary
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(df['Sentence'])

# Step 2: Get raw vocab
raw_vocab = vectorizer.get_feature_names_out()

# Step 3: Filter to keep only clean alphabetic words
def is_clean_word(word):
    return re.fullmatch(r'[a-zA-Z]+', word) is not None

clean_vocab = [word for word in raw_vocab if is_clean_word(word)]

print(f"Cleaned vocabulary size: {len(clean_vocab)}")

# Step 4: Save only cleaned vocab to txt file
with open("clean_vocab.txt", "w", encoding="utf-8") as f:
    for word in clean_vocab:
        f.write(f"{word}\n")

print("Cleaned vocabulary exported to clean_vocab.txt")

# Preview first 20 words
# print(clean_vocab[:20])


Cleaned vocabulary size: 26637
Cleaned vocabulary exported to clean_vocab.txt


# **DL ALGO's**

LSTM

In [None]:
!pip install tensorflow nltk

import numpy as np
import pandas as pd
import re
import nltk
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
texts = df['Sentence'].tolist()
labels = df['Label'].tolist()

# Tokenize and pad
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')

y = np.array(labels)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)


In [None]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

In [None]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')  # 4 sentiment classes
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)


X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train).astype(int)
y_test = np.array(y_test).astype(int)
from collections import Counter
print("Training label distribution:", Counter(y_train))


Training label distribution: Counter({np.int64(0): 5626, np.int64(3): 5626, np.int64(2): 5626, np.int64(1): 5626})


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model.fit(
    X_train, y_train,
    epochs=8,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop]
)


Epoch 1/8


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=['Positive', 'Negative', 'Neutral', 'Mixed']))


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def clean_UserInput(text):
    import re
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r'[^\w\s]', '', text)               # remove punctuation
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)         # reduce repeated letters
    return text.strip()


def predict_user_input(text):
    # Clean and prepare input
    text = clean_UserInput(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=100, padding='post')

    # Predict
    pred = model.predict(padded)
    pred_class = np.argmax(pred)

    # Class label mapping
    label_map = {
        0: "Positive",
        1: "Negative",
        2: "Neutral",
        3: "Mixed"
    }

    print(f"Input: {text}")
    print(f"Predicted Sentiment: {label_map[pred_class]}")


In [None]:
predict_user_input("weather forecast dekhe khushi hoye berolam, ekhon dekhi brishti")

Under Sampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Check original distribution
print("Original:", Counter(y))

# Apply undersampling
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Check new distribution
print("Undersampled:", Counter(y_resampled))


In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)

# Train your LSTM (you don't need class_weight here)
model.fit(
    X_train, y_train,
    epochs=8,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop]
)


In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=['Positive', 'Negative', 'Neutral', 'Mixed']))


cnn for text

In [None]:
!pip install imbalanced-learn

import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping




In [None]:
texts = df['Sentence'].tolist()
labels = df['Label'].tolist()

# Tokenization
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')
y = np.array(labels).astype(int)

In [None]:
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

print("After Oversampling:", Counter(y_resampled))


After Oversampling: Counter({np.int64(3): 6619, np.int64(2): 6619, np.int64(1): 6619, np.int64(0): 6619})


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)


In [None]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')  # 4 sentiment classes
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [None]:
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model.fit(X_train, y_train,
          epochs=8,
          batch_size=64,
          validation_data=(X_test, y_test),
          callbacks=[early_stop])


Epoch 1/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 125ms/step - accuracy: 0.5252 - loss: 1.0656 - val_accuracy: 0.8029 - val_loss: 0.5188
Epoch 2/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 128ms/step - accuracy: 0.8651 - loss: 0.3870 - val_accuracy: 0.8477 - val_loss: 0.4141
Epoch 3/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 122ms/step - accuracy: 0.9373 - loss: 0.1907 - val_accuracy: 0.8567 - val_loss: 0.4398
Epoch 4/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 119ms/step - accuracy: 0.9647 - loss: 0.1092 - val_accuracy: 0.8494 - val_loss: 0.5301


<keras.src.callbacks.history.History at 0x7961e5316850>

In [None]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=["Positive", "Negative", "Neutral", "Mixed"]))


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step
              precision    recall  f1-score   support

    Positive       0.87      0.90      0.89       993
    Negative       0.80      0.77      0.78       993
     Neutral       0.80      0.77      0.78       993
       Mixed       0.92      0.95      0.93       993

    accuracy                           0.85      3972
   macro avg       0.85      0.85      0.85      3972
weighted avg       0.85      0.85      0.85      3972



In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
print("After undersampling:", Counter(y_resampled))


After undersampling: Counter({np.int64(0): 1871, np.int64(1): 1871, np.int64(2): 1871, np.int64(3): 1871})


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)

model.fit(
    X_train, y_train,
    epochs=8,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop]
)


Epoch 1/8
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 132ms/step - accuracy: 0.9407 - loss: 0.1958 - val_accuracy: 0.9305 - val_loss: 0.2001
Epoch 2/8
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 130ms/step - accuracy: 0.9718 - loss: 0.0961 - val_accuracy: 0.9252 - val_loss: 0.2150
Epoch 3/8
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 129ms/step - accuracy: 0.9886 - loss: 0.0476 - val_accuracy: 0.9190 - val_loss: 0.2525


<keras.src.callbacks.history.History at 0x7961e619ae50>

In [None]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=["Positive", "Negative", "Neutral", "Mixed"]))


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
              precision    recall  f1-score   support

    Positive       0.91      0.92      0.91       281
    Negative       0.94      0.91      0.93       280
     Neutral       0.92      0.92      0.92       281
       Mixed       0.95      0.98      0.96       281

    accuracy                           0.93      1123
   macro avg       0.93      0.93      0.93      1123
weighted avg       0.93      0.93      0.93      1123



RCNN (Recurrent CNN)

In [None]:
!pip install imbalanced-learn

import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping






In [None]:
texts = df['Sentence'].tolist()
labels = df['Label'].tolist()

tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')
y = np.array(labels).astype(int)


In [None]:
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)
print("Oversampled:", Counter(y_ros))


Oversampled: Counter({np.int64(3): 6619, np.int64(2): 6619, np.int64(1): 6619, np.int64(0): 6619})


In [None]:
X_used, y_used = X_ros, y_ros
X_train, X_test, y_train, y_test = train_test_split(
    X_used, y_used,
    test_size=0.15,
    stratify=y_used,
    random_state=42
)


In [None]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=True)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [None]:
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model.fit(X_train, y_train,
          epochs=8,
          batch_size=64,
          validation_data=(X_test, y_test),
          callbacks=[early_stop])


Epoch 1/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 366ms/step - accuracy: 0.5219 - loss: 1.0389 - val_accuracy: 0.8129 - val_loss: 0.5037
Epoch 2/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 374ms/step - accuracy: 0.8821 - loss: 0.3541 - val_accuracy: 0.8406 - val_loss: 0.4508
Epoch 3/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 362ms/step - accuracy: 0.9422 - loss: 0.1833 - val_accuracy: 0.8366 - val_loss: 0.5041
Epoch 4/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 361ms/step - accuracy: 0.9608 - loss: 0.1181 - val_accuracy: 0.8537 - val_loss: 0.5469


<keras.src.callbacks.history.History at 0x7961e52c7bd0>

In [None]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=["Positive", "Negative", "Neutral", "Mixed"]))


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 106ms/step
              precision    recall  f1-score   support

    Positive       0.84      0.91      0.87       993
    Negative       0.82      0.77      0.79       993
     Neutral       0.79      0.76      0.77       993
       Mixed       0.91      0.93      0.92       993

    accuracy                           0.84      3972
   macro avg       0.84      0.84      0.84      3972
weighted avg       0.84      0.84      0.84      3972



In [None]:
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)
print("Undersampled:", Counter(y_rus))


Undersampled: Counter({np.int64(0): 1871, np.int64(1): 1871, np.int64(2): 1871, np.int64(3): 1871})


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_used, y_used,
    test_size=0.15,
    stratify=y_used,
    random_state=42
)
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=True)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model.fit(X_train, y_train,
          epochs=8,
          batch_size=64,
          validation_data=(X_test, y_test),
          callbacks=[early_stop])




Epoch 1/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 370ms/step - accuracy: 0.5000 - loss: 1.0645 - val_accuracy: 0.8127 - val_loss: 0.5107
Epoch 2/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 357ms/step - accuracy: 0.8714 - loss: 0.3694 - val_accuracy: 0.8353 - val_loss: 0.4503
Epoch 3/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 368ms/step - accuracy: 0.9349 - loss: 0.1885 - val_accuracy: 0.8462 - val_loss: 0.5001
Epoch 4/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 364ms/step - accuracy: 0.9599 - loss: 0.1162 - val_accuracy: 0.8477 - val_loss: 0.5632


<keras.src.callbacks.history.History at 0x7961e5b21e10>

In [None]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=["Positive", "Negative", "Neutral", "Mixed"]))


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 41ms/step
              precision    recall  f1-score   support

    Positive       0.84      0.91      0.87       993
    Negative       0.78      0.76      0.77       993
     Neutral       0.79      0.74      0.77       993
       Mixed       0.92      0.92      0.92       993

    accuracy                           0.84      3972
   macro avg       0.83      0.84      0.83      3972
weighted avg       0.83      0.84      0.83      3972



CNN + LSTM Model

In [None]:
!pip install imbalanced-learn

import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from imblearn.over_sampling import RandomOverSampler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

texts = df['Sentence'].tolist()
labels = df['Label'].tolist()

tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')
y = np.array(labels).astype(int)



In [None]:
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

print("Oversampled class counts:", Counter(y_resampled))


Oversampled class counts: Counter({np.int64(3): 6619, np.int64(2): 6619, np.int64(1): 6619, np.int64(0): 6619})


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')  # 4 sentiment classes
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [None]:
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model.fit(
    X_train, y_train,
    epochs=8,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop]
)


Epoch 1/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 278ms/step - accuracy: 0.2433 - loss: 1.3874 - val_accuracy: 0.2500 - val_loss: 1.3848
Epoch 2/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 262ms/step - accuracy: 0.2471 - loss: 1.3848 - val_accuracy: 0.2533 - val_loss: 1.3853
Epoch 3/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 260ms/step - accuracy: 0.2544 - loss: 1.3844 - val_accuracy: 0.2550 - val_loss: 1.3810
Epoch 4/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 249ms/step - accuracy: 0.2474 - loss: 1.3835 - val_accuracy: 0.2538 - val_loss: 1.3799
Epoch 5/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 257ms/step - accuracy: 0.2504 - loss: 1.3818 - val_accuracy: 0.2550 - val_loss: 1.3789
Epoch 6/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 267ms/step - accuracy: 0.2486 - loss: 1.3804 - val_accuracy: 0.2555 - val_loss: 1.3786
Epoch 7/8


In [None]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=["Positive", "Negative", "Neutral", "Mixed"]))


# Bert Multilingual

In [None]:
!pip install transformers datasets scikit-learn torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)

# Train/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Sentence'].tolist(),
    df['Label'].tolist(),
    test_size=0.15,
    stratify=df['Label'],
    random_state=42
)

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
import torch

class BanglishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx]) for key, val in self.encodings.items()
        } | {'labels': torch.tensor(self.labels[idx])}
    def __len__(self):
        return len(self.labels)

train_dataset = BanglishDataset(train_encodings, train_labels)
test_dataset = BanglishDataset(test_encodings, test_labels)


In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=4  # 4 sentiment classes
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    eval_strategy="epoch",  # Corrected argument name
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

def compute_metrics(eval_pred):
    from sklearn.metrics import classification_report
    import numpy as np
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    report = classification_report(labels, preds, output_dict=True)
    return {
        "accuracy": report["accuracy"],
        "f1_macro": report["macro avg"]["f1-score"],
        "f1_weighted": report["weighted avg"]["f1-score"]
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mashibullahbhai[0m ([33mashibullahbhai-port-city-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [None]:
results = trainer.evaluate()
print(results)


#  **ML ALGOS**

✅ Logistic Regression (with class weight balancing)

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# Clean text
df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), min_df=3)
X = vectorizer.fit_transform(df['Sentence'])
y = df['Label'].values

from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

print("After Oversampling:", Counter(y_resampled))

# Then split and train as usual
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

After Oversampling: Counter({np.int64(3): 6619, np.int64(2): 6619, np.int64(1): 6619, np.int64(0): 6619})


In [None]:
y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['Positive', 'Negative', 'Neutral', 'Mixed']))


              precision    recall  f1-score   support

    Positive       0.84      0.83      0.84       993
    Negative       0.75      0.75      0.75       993
     Neutral       0.70      0.71      0.71       993
       Mixed       0.87      0.87      0.87       993

    accuracy                           0.79      3972
   macro avg       0.79      0.79      0.79      3972
weighted avg       0.79      0.79      0.79      3972



In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
# Clean text
df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), min_df=3)
X = vectorizer.fit_transform(df['Sentence'])
y = df['Label'].values

from imblearn.over_sampling import RandomOverSampler
from collections import Counter

rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

print("After undersampling:", Counter(y_resampled))

# Then split and train as usual
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

After undersampling: Counter({np.int64(0): 1871, np.int64(1): 1871, np.int64(2): 1871, np.int64(3): 1871})


In [None]:
y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['Positive', 'Negative', 'Neutral', 'Mixed']))

              precision    recall  f1-score   support

    Positive       0.70      0.69      0.70       281
    Negative       0.60      0.66      0.63       280
     Neutral       0.62      0.72      0.67       281
       Mixed       0.73      0.56      0.63       281

    accuracy                           0.66      1123
   macro avg       0.66      0.66      0.66      1123
weighted avg       0.66      0.66      0.66      1123



# ✅ SVM + TF-IDF + Oversampling

In [None]:
!pip install imbalanced-learn

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

df['Sentence'] = df['Sentence'].astype(str)
df['Label'] = df['Label'].astype(int)

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), min_df=3)
X = vectorizer.fit_transform(df['Sentence'])
y = df['Label'].values

ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

print("After Oversampling:", Counter(y_resampled))

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)
svm_model = SVC(kernel='linear', probability=True)  # 'linear' is best for text data
svm_model.fit(X_train, y_train)


After Oversampling: Counter({np.int64(3): 6619, np.int64(2): 6619, np.int64(1): 6619, np.int64(0): 6619})


In [None]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["Positive", "Negative", "Neutral", "Mixed"]))

              precision    recall  f1-score   support

    Positive       0.84      0.86      0.85       993
    Negative       0.76      0.75      0.75       993
     Neutral       0.74      0.71      0.73       993
       Mixed       0.88      0.92      0.90       993

    accuracy                           0.81      3972
   macro avg       0.81      0.81      0.81      3972
weighted avg       0.81      0.81      0.81      3972



In [1]:
from imblearn.under_sampling import RandomUnderSampler
df['Sentence'] = df['Sentence'].astype(str)

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), min_df=3)
X = vectorizer.fit_transform(df['Sentence'])
y = df['Label'].values

rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
print("After undersampling:", Counter(y_resampled))

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)
svm_model = SVC(kernel='linear', probability=True)  # 'linear' is best for text data
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["Positive", "Negative", "Neutral", "Mixed"]))

NameError: name 'df' is not defined