<a href="https://colab.research.google.com/github/ashibullah/Romanian-Bangla-Sentiment-Analysis-NLP/blob/main/RomanianBanglaSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data collection + libraries

In [None]:
!pip install datasets --upgrade

In [None]:
!pip install nltk

In [None]:
import nltk
nltk.download('punkt_tab')  # For tokenizer
nltk.download('stopwords')  # For tokenizer

In [None]:
nltk.download('punkt_tab')  # For tokenizerfrom datasets import load_dataset
from textblob import TextBlob
import re
from nltk.tokenize import word_tokenize
import pandas as pd

import json

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset

df = load_dataset("aplycaebous/BnSentMix" , split = "train")

# **Preprocessing**

In [None]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [None]:
def correct_text(text):
    return str(TextBlob(text).correct())

In [None]:
# Create DataFrame
df = pd.DataFrame(df)
# Convert 'Sentence' column to lowercase
df['Sentence'] = df['Sentence'].str.lower()

# remove urls
df['Sentence'] = df['Sentence'].apply(clean_text)


In [None]:
import re

def reduce_repeated_letters(word, max_repeats=1):
    # This will limit repeated letters to max_repeats (e.g., 'ooooo' -> 'oo')
    return re.sub(r'(.)\1{'+str(max_repeats)+',}', r'\1' * max_repeats, word)


In [None]:
def reduce_repeats_in_sentence(sentence):
    return ' '.join([reduce_repeated_letters(word) for word in sentence.split()])

df['Sentence'] = df['Sentence'].apply(reduce_repeats_in_sentence)


In [None]:
# df = df['Sentence'].apply(correct_text)


In [None]:
dict_path = '/content/drive/MyDrive/Colab Notebooks/RomanianBanglaUtilities/normalization_dict.json'
with open(dict_path, 'r', encoding='utf-8') as f:
    normalization_dict = json.load(f)

print(f"Loaded normalization dictionary with {len(normalization_dict)} entries")


# Function to normalize text using your dictionary
def normalize_text(text, norm_dict):
    tokens = text.split()
    reverse_map = {}
    for std_word, variants in norm_dict.items():
        reverse_map[std_word] = std_word
        for var in variants:
            reverse_map[var] = std_word
    normalized_tokens = [reverse_map.get(token.lower(), token.lower()) for token in tokens]
    return ' '.join(normalized_tokens)
# run
df['Sentence'] = df['Sentence'].apply(lambda x: normalize_text(x, normalization_dict))


Loaded normalization dictionary with 59 entries


In [None]:
abbrev_path = '/content/drive/MyDrive/Colab Notebooks/RomanianBanglaUtilities/abbreviation.json'
with open(abbrev_path, 'r', encoding='utf-8') as f:
    abbreviation_dict = json.load(f)

print(f"Loaded abbreviation dictionary with {len(abbreviation_dict)} entries")

# Function to normalize text using abbreviation dictionary
def normalize_abbreviations(text, abbr_dict):
    tokens = text.split()
    normalized_tokens = [abbr_dict.get(token.lower(), token) for token in tokens]
    return ' '.join(normalized_tokens)

    # run
    df['Sentence'] = df['Sentence'].apply(lambda x: normalize_abbreviations(x, abbreviation_dict))

Loaded abbreviation dictionary with 72 entries


# **Feature Extraction Starts Here**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

# Step 1: Vectorize and extract vocabulary
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(df['Sentence'])

# Step 2: Get raw vocab
raw_vocab = vectorizer.get_feature_names_out()

# Step 3: Filter to keep only clean alphabetic words
def is_clean_word(word):
    return re.fullmatch(r'[a-zA-Z]+', word) is not None

clean_vocab = [word for word in raw_vocab if is_clean_word(word)]

print(f"Cleaned vocabulary size: {len(clean_vocab)}")

# Step 4: Save only cleaned vocab to txt file
with open("clean_vocab.txt", "w", encoding="utf-8") as f:
    for word in clean_vocab:
        f.write(f"{word}\n")

print("Cleaned vocabulary exported to clean_vocab.txt")

# Preview first 20 words
print(clean_vocab[:20])


Cleaned vocabulary size: 25086
Cleaned vocabulary exported to clean_vocab.txt
['ab', 'aba', 'abal', 'abalbebohar', 'abalchuda', 'abaler', 'abalra', 'abar', 'abaro', 'abas', 'abash', 'abdar', 'abded', 'abdi', 'abdulah', 'abe', 'abedon', 'abeg', 'aben', 'aber']


In [None]:
# df['Tokens'] = df['Sentence'].apply(word_tokenize)

# df

Unnamed: 0,Sentence,Label,Tokens
0,youtube ar volg gula boring hoye jaitase din d...,3,"[youtube, ar, volg, gula, boring, hoye, jaitas..."
1,your video making camera work is really good i...,3,"[your, video, making, camera, work, is, really..."
2,you made me nostalgic college life a ei dokan ...,3,"[you, made, me, nostalgic, college, life, a, e..."
3,workshop ta engaging but resources ta insuffic...,3,"[workshop, ta, engaging, but, resources, ta, i..."
4,win hoy nay but onek bhalo khelecu,3,"[win, hoy, nay, but, onek, bhalo, khelecu]"
...,...,...,...
20010,1 march use kortasi pocof5 kono problem nai ga...,0,"[1, march, use, kortasi, pocof5, kono, problem..."
20011,1 day beshi stay kora jae na,0,"[1, day, beshi, stay, kora, jae, na]"
20012,1 boro na 2 boro tushar bhai er mon boro,0,"[1, boro, na, 2, boro, tushar, bhai, er, mon, ..."
20013,1 boro na 2 boro sam bhai er mon boro,0,"[1, boro, na, 2, boro, sam, bhai, er, mon, boro]"


TD IDF

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1,3),
    max_features=15000,
    min_df=3,
    sublinear_tf=True
)

X = tfidf.fit_transform(df['Sentence'])  # raw text, not tokenized


In [None]:
# TfidfVectorizer(
#     ngram_range=(1,3),
#     max_features=15000,
#     min_df=3,
#     sublinear_tf=True
# )


# **Modeling**

In [34]:
y = df['Label']  # or df['Label'] depending on your column name


In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,          # maintains label ratio
    random_state=42,     # ensures same shuffle every time
    shuffle=True         # force shuffling — important for your case
)


In [43]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)


In [45]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7302023482388209
              precision    recall  f1-score   support

           0       0.78      0.75      0.76      1071
           1       0.74      0.74      0.74      1234
           2       0.72      0.74      0.73      1324
           3       0.57      0.59      0.58       374

    accuracy                           0.73      4003
   macro avg       0.71      0.71      0.71      4003
weighted avg       0.73      0.73      0.73      4003



In [30]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC()
svm_clf.fit(X_train, y_train)


In [31]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7304521608793405
              precision    recall  f1-score   support

           0       0.79      0.74      0.76      1071
           1       0.72      0.76      0.74      1234
           2       0.70      0.77      0.73      1324
           3       0.80      0.44      0.57       374

    accuracy                           0.73      4003
   macro avg       0.75      0.68      0.70      4003
weighted avg       0.74      0.73      0.73      4003



# **DL ALGO's**

In [47]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert text to lowercase if not already done
texts = df['Sentence'].astype(str).str.lower().tolist()
labels = df['Label'].tolist()

# Tokenize
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences
maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')
y = labels


In [48]:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)


In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax')  # 4 classes: 0, 1, 2, 3
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [None]:
history = model.fit(X_train, y_train,
                    epochs=5,
                    batch_size=64,
                    validation_data=(X_test, y_test))


Epoch 1/5
[1m224/251[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m9s[0m 335ms/step - accuracy: 0.4808 - loss: 1.1244

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=['Positive', 'Negative', 'Neutral', 'Mixed']))
