<a href="https://colab.research.google.com/github/ashibullah/Romanian-Bangla-Sentiment-Analysis-NLP/blob/main/RomanianBanglaSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data collection + libraries

In [1]:
!pip install datasets --upgrade

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency res

In [2]:
!pip install nltk



In [3]:
import nltk
nltk.download('punkt_tab')  # For tokenizer
nltk.download('stopwords')  # For tokenizer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
nltk.download('punkt_tab')  # For tokenizerfrom datasets import load_dataset
from textblob import TextBlob
import re
from nltk.tokenize import word_tokenize
import pandas as pd

import json

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from datasets import load_dataset

df = load_dataset("aplycaebous/BnSentMix" , split = "train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

dataset.csv:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20015 [00:00<?, ? examples/s]

# **Preprocessing**

In [7]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [8]:
def correct_text(text):
    return str(TextBlob(text).correct())

In [9]:
# Create DataFrame
df = pd.DataFrame(df)
# Convert 'Sentence' column to lowercase
df['Sentence'] = df['Sentence'].str.lower()

# remove urls
df['Sentence'] = df['Sentence'].apply(clean_text)


In [10]:
import re

def reduce_repeated_letters(word, max_repeats=2):
    # This will limit repeated letters to max_repeats (e.g., 'ooooo' -> 'oo')
    return re.sub(r'(.)\1{'+str(max_repeats)+',}', r'\1' * max_repeats, word)

In [11]:
def reduce_repeats_in_sentence(sentence):
    return ' '.join([reduce_repeated_letters(word) for word in sentence.split()])

df['Sentence'] = df['Sentence'].apply(reduce_repeats_in_sentence)


In [12]:
dict_path = '/content/drive/MyDrive/Colab Notebooks/RomanianBanglaUtilities/normalization_dict.json'
with open(dict_path, 'r', encoding='utf-8') as f:
    normalization_dict = json.load(f)

print(f"Loaded normalization dictionary with {len(normalization_dict)} entries")


# Function to normalize text using your dictionary
def normalize_text(text, norm_dict):
    tokens = text.split()
    reverse_map = {}
    for std_word, variants in norm_dict.items():
        reverse_map[std_word] = std_word
        for var in variants:
            reverse_map[var] = std_word
    normalized_tokens = [reverse_map.get(token.lower(), token.lower()) for token in tokens]
    return ' '.join(normalized_tokens)
# run
df['Sentence'] = df['Sentence'].apply(lambda x: normalize_text(x, normalization_dict))


Loaded normalization dictionary with 59 entries


In [13]:
abbrev_path = '/content/drive/MyDrive/Colab Notebooks/RomanianBanglaUtilities/abbreviation.json'
with open(abbrev_path, 'r', encoding='utf-8') as f:
    abbreviation_dict = json.load(f)

print(f"Loaded abbreviation dictionary with {len(abbreviation_dict)} entries")

# Function to normalize text using abbreviation dictionary
def normalize_abbreviations(text, abbr_dict):
    tokens = text.split()
    normalized_tokens = [abbr_dict.get(token.lower(), token) for token in tokens]
    return ' '.join(normalized_tokens)

    # run
    df['Sentence'] = df['Sentence'].apply(lambda x: normalize_abbreviations(x, abbreviation_dict))

Loaded abbreviation dictionary with 72 entries


# **Feature Extraction Starts Here**

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
import re

# Step 1: Vectorize and extract vocabulary
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(df['Sentence'])

# Step 2: Get raw vocab
raw_vocab = vectorizer.get_feature_names_out()

# Step 3: Filter to keep only clean alphabetic words
def is_clean_word(word):
    return re.fullmatch(r'[a-zA-Z]+', word) is not None

clean_vocab = [word for word in raw_vocab if is_clean_word(word)]

print(f"Cleaned vocabulary size: {len(clean_vocab)}")

# Step 4: Save only cleaned vocab to txt file
with open("clean_vocab.txt", "w", encoding="utf-8") as f:
    for word in clean_vocab:
        f.write(f"{word}\n")

print("Cleaned vocabulary exported to clean_vocab.txt")

# Preview first 20 words
# print(clean_vocab[:20])


Cleaned vocabulary size: 26637
Cleaned vocabulary exported to clean_vocab.txt


In [18]:
# df['Tokens'] = df['Sentence'].apply(word_tokenize)

# df

TD IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1,3),
    max_features=15000,
    min_df=3,
    sublinear_tf=True
)

X = tfidf.fit_transform(df['Sentence'])  # raw text, not tokenized


In [20]:
# TfidfVectorizer(
#     ngram_range=(1,3),
#     max_features=15000,
#     min_df=3,
#     sublinear_tf=True
# )


# **Modeling**

In [21]:
y = df['Label']  # or df['Label'] depending on your column name


In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,          # maintains label ratio
    random_state=42,     # ensures same shuffle every time
    shuffle=True         # force shuffling — important for your case
)


In [23]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)


In [24]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.724956282787909
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      1071
           1       0.74      0.74      0.74      1234
           2       0.71      0.74      0.73      1324
           3       0.58      0.58      0.58       374

    accuracy                           0.72      4003
   macro avg       0.70      0.70      0.70      4003
weighted avg       0.73      0.72      0.73      4003



In [25]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC()
svm_clf.fit(X_train, y_train)


In [26]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.724956282787909
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      1071
           1       0.74      0.74      0.74      1234
           2       0.71      0.74      0.73      1324
           3       0.58      0.58      0.58       374

    accuracy                           0.72      4003
   macro avg       0.70      0.70      0.70      4003
weighted avg       0.73      0.72      0.73      4003



# **DL ALGO's**

LSTM

In [27]:
!pip install tensorflow nltk

import numpy as np
import pandas as pd
import re
import nltk
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [28]:
texts = df['Sentence'].tolist()
labels = df['Label'].tolist()

# Tokenize and pad
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')

y = np.array(labels)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)


In [29]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

In [30]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')  # 4 sentiment classes
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




oversampling

In [31]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)


X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train).astype(int)
y_test = np.array(y_test).astype(int)
from collections import Counter
print("Training label distribution:", Counter(y_train))


Training label distribution: Counter({np.int64(0): 5626, np.int64(3): 5626, np.int64(2): 5626, np.int64(1): 5626})


In [33]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model.fit(
    X_train, y_train,
    epochs=8,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop]
)


Epoch 1/8


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=['Positive', 'Negative', 'Neutral', 'Mixed']))


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def clean_UserInput(text):
    import re
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r'[^\w\s]', '', text)               # remove punctuation
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)         # reduce repeated letters
    return text.strip()


def predict_user_input(text):
    # Clean and prepare input
    text = clean_UserInput(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=100, padding='post')

    # Predict
    pred = model.predict(padded)
    pred_class = np.argmax(pred)

    # Class label mapping
    label_map = {
        0: "Positive",
        1: "Negative",
        2: "Neutral",
        3: "Mixed"
    }

    print(f"Input: {text}")
    print(f"Predicted Sentiment: {label_map[pred_class]}")


In [None]:
predict_user_input("weather forecast dekhe khushi hoye berolam, ekhon dekhi brishti")

Under Sampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Check original distribution
print("Original:", Counter(y))

# Apply undersampling
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Check new distribution
print("Undersampled:", Counter(y_resampled))


In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)

# Train your LSTM (you don't need class_weight here)
model.fit(
    X_train, y_train,
    epochs=8,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop]
)


In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=['Positive', 'Negative', 'Neutral', 'Mixed']))


cnn for text

In [34]:
!pip install imbalanced-learn

import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping




In [35]:
texts = df['Sentence'].tolist()
labels = df['Label'].tolist()

# Tokenization
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')
y = np.array(labels).astype(int)

In [38]:
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

print("After Oversampling:", Counter(y_resampled))


After Oversampling: Counter({np.int64(3): 6619, np.int64(2): 6619, np.int64(1): 6619, np.int64(0): 6619})


In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)


In [40]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')  # 4 sentiment classes
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [41]:
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model.fit(X_train, y_train,
          epochs=8,
          batch_size=64,
          validation_data=(X_test, y_test),
          callbacks=[early_stop])


Epoch 1/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 125ms/step - accuracy: 0.5252 - loss: 1.0656 - val_accuracy: 0.8029 - val_loss: 0.5188
Epoch 2/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 128ms/step - accuracy: 0.8651 - loss: 0.3870 - val_accuracy: 0.8477 - val_loss: 0.4141
Epoch 3/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 122ms/step - accuracy: 0.9373 - loss: 0.1907 - val_accuracy: 0.8567 - val_loss: 0.4398
Epoch 4/8
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 119ms/step - accuracy: 0.9647 - loss: 0.1092 - val_accuracy: 0.8494 - val_loss: 0.5301


<keras.src.callbacks.history.History at 0x7961e5316850>

In [42]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=["Positive", "Negative", "Neutral", "Mixed"]))


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step
              precision    recall  f1-score   support

    Positive       0.87      0.90      0.89       993
    Negative       0.80      0.77      0.78       993
     Neutral       0.80      0.77      0.78       993
       Mixed       0.92      0.95      0.93       993

    accuracy                           0.85      3972
   macro avg       0.85      0.85      0.85      3972
weighted avg       0.85      0.85      0.85      3972



In [46]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
print("After undersampling:", Counter(y_resampled))


After undersampling: Counter({np.int64(0): 1871, np.int64(1): 1871, np.int64(2): 1871, np.int64(3): 1871})


In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.15,
    stratify=y_resampled,
    random_state=42
)

model.fit(
    X_train, y_train,
    epochs=8,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop]
)


Epoch 1/8
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 132ms/step - accuracy: 0.9407 - loss: 0.1958 - val_accuracy: 0.9305 - val_loss: 0.2001
Epoch 2/8
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 130ms/step - accuracy: 0.9718 - loss: 0.0961 - val_accuracy: 0.9252 - val_loss: 0.2150
Epoch 3/8
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 129ms/step - accuracy: 0.9886 - loss: 0.0476 - val_accuracy: 0.9190 - val_loss: 0.2525


<keras.src.callbacks.history.History at 0x7961e619ae50>

In [49]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=["Positive", "Negative", "Neutral", "Mixed"]))


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
              precision    recall  f1-score   support

    Positive       0.91      0.92      0.91       281
    Negative       0.94      0.91      0.93       280
     Neutral       0.92      0.92      0.92       281
       Mixed       0.95      0.98      0.96       281

    accuracy                           0.93      1123
   macro avg       0.93      0.93      0.93      1123
weighted avg       0.93      0.93      0.93      1123



RCNN (Recurrent CNN)

In [50]:
!pip install imbalanced-learn

import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping






In [51]:
X = pad_sequences(sequences, maxlen=maxlen, padding='post')
y = np.array(labels).astype(int)

In [52]:
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)
print("Oversampled:", Counter(y_ros))


Oversampled: Counter({np.int64(3): 6619, np.int64(2): 6619, np.int64(1): 6619, np.int64(0): 6619})


In [53]:
X_used, y_used = X_ros, y_ros
X_train, X_test, y_train, y_test = train_test_split(
    X_used, y_used,
    test_size=0.15,
    stratify=y_used,
    random_state=42
)


In [54]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=True)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [None]:
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model.fit(X_train, y_train,
          epochs=8,
          batch_size=64,
          validation_data=(X_test, y_test),
          callbacks=[early_stop])


Epoch 1/8
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 470ms/step - accuracy: 0.3946 - loss: 1.2685 - val_accuracy: 0.6492 - val_loss: 0.8519
Epoch 2/8
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 393ms/step - accuracy: 0.7771 - loss: 0.6077 - val_accuracy: 0.6901 - val_loss: 0.8088
Epoch 3/8
[1m 49/100[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m17s[0m 347ms/step - accuracy: 0.9149 - loss: 0.2586

In [None]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=["Positive", "Negative", "Neutral", "Mixed"]))
