<a href="https://colab.research.google.com/github/ashibullah/Romanian-Bangla-Sentiment-Analysis-NLP/blob/main/RomanianBanglaSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data collection + libraries

In [None]:
!pip install datasets --upgrade

In [None]:
!pip install nltk

In [None]:
import nltk
nltk.download('punkt_tab')  # For tokenizer
nltk.download('stopwords')  # 'stopwords' resource (not the tokenizer) - presumably for stop-word filtering; unused in the cells shown here

In [4]:
# All notebook-wide imports live in this cell.
# Standard library
import json
import re

# Third-party
import pandas as pd
from datasets import load_dataset  # was fused into a trailing comment, so it never executed
from nltk.tokenize import word_tokenize
from textblob import TextBlob

nltk.download('punkt_tab')  # For tokenizer

In [5]:
# Mount Google Drive at /content/drive (Colab-only module). The JSON dictionaries
# loaded further down are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the "train" split of the aplycaebous/BnSentMix dataset.
# NOTE: at this point `df` is whatever load_dataset returns, not a pandas
# DataFrame; it is converted with pd.DataFrame(df) in the preprocessing section.
df = load_dataset("aplycaebous/BnSentMix" , split = "train")

# **Preprocessing**

In [7]:
def clean_text(text):
    """Strip URLs and punctuation from ``text``.

    URLs (``http://``, ``https://`` or ``www.`` prefixed, up to the next
    whitespace) are deleted first; afterwards every character that is neither
    a word character nor whitespace is deleted. Nothing is inserted in place
    of the removed text, so surrounding whitespace is left exactly as it was.
    """
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    return re.sub(r'[^\w\s]', '', without_urls)

In [8]:
def correct_text(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

In [49]:
# Turn the loaded dataset into a pandas DataFrame
df = pd.DataFrame(df)

# Lowercase every sentence, then strip URLs and punctuation with clean_text
df['Sentence'] = df['Sentence'].str.lower().apply(clean_text)


In [50]:
import re

def reduce_repeated_letters(word, max_repeats=1):
    """Collapse runs of one repeated letter to at most ``max_repeats`` copies.

    Examples: with the default ``max_repeats=1``, ``'goooood'`` becomes
    ``'god'``; with ``max_repeats=2``, ``'ooooo'`` becomes ``'oo'``.

    Only alphabetic characters are collapsed. Digits and underscores are left
    alone so that numbers such as ``'1000'`` are not silently turned into
    ``'10'``. Matching is case-sensitive (``'Oo'`` is not a run).

    Args:
        word: The string to process.
        max_repeats: Maximum number of consecutive identical letters to keep.
            Must be at least 1.

    Returns:
        ``word`` with every over-long letter run shortened.

    Raises:
        ValueError: If ``max_repeats`` is smaller than 1 (a value of 0 would
            otherwise delete the text instead of shortening it).
    """
    if max_repeats < 1:
        raise ValueError("max_repeats must be at least 1")
    # [^\W\d_] = a word character that is neither a digit nor an underscore,
    # i.e. a letter. The run must be longer than max_repeats to be rewritten.
    pattern = r'([^\W\d_])\1{' + str(max_repeats) + r',}'
    return re.sub(pattern, lambda run: run.group(1) * max_repeats, word)


In [51]:
def reduce_repeats_in_sentence(sentence):
    """Run ``reduce_repeated_letters`` over every whitespace-separated word.

    The processed words are re-joined with single spaces, so any run of
    whitespace in ``sentence`` collapses to one space and leading/trailing
    whitespace is dropped.
    """
    squeezed_words = map(reduce_repeated_letters, sentence.split())
    return ' '.join(squeezed_words)


# Squeeze repeated-character runs in every sentence of the corpus
df['Sentence'] = df['Sentence'].apply(reduce_repeats_in_sentence)


In [None]:
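# Spelling correction with correct_text is intentionally skipped.
# If it is ever re-enabled, assign the result to df['Sentence'] rather than df;
# otherwise the whole DataFrame is replaced by a single Series.
# df['Sentence'] = df['Sentence'].apply(correct_text)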

In [52]:
# Location of the spelling-normalization dictionary on the mounted Drive
dict_path = '/content/drive/MyDrive/Colab Notebooks/RomanianBanglaUtilities/normalization_dict.json'

# Parse the JSON file into a Python object
with open(dict_path, 'r', encoding='utf-8') as dict_file:
    normalization_dict = json.load(dict_file)

print(f"Loaded normalization dictionary with {len(normalization_dict)} entries")


# Normalize spelling variants to their standard form using the loaded dictionary
def _build_reverse_map(norm_dict):
    """Invert ``{standard: variants}`` into a lower-cased ``{spelling: standard}`` lookup."""
    reverse_map = {}
    for std_word, variants in norm_dict.items():
        # A bare string would otherwise be iterated character by character,
        # mapping every single letter of it to std_word.
        if isinstance(variants, str):
            variants = [variants]
        # Keys are lower-cased because lookups are done on lower-cased tokens;
        # a key containing capitals could never be matched otherwise.
        reverse_map[std_word.lower()] = std_word
        for var in variants:
            reverse_map[var.lower()] = std_word
    return reverse_map


def normalize_text(text, norm_dict):
    """Replace known spelling variants in ``text`` with their standard word.

    Args:
        text: Input sentence; it is split on whitespace.
        norm_dict: Mapping of a standard word to its variant spellings
            (an iterable of strings, or a single string).

    Returns:
        The tokens joined by single spaces. Each token is lower-cased and, if
        it is a known standard word or variant (compared case-insensitively),
        replaced by the standard word exactly as written in ``norm_dict``.
        Unknown tokens are returned lower-cased.
    """
    reverse_map = _build_reverse_map(norm_dict)
    lowered_tokens = (token.lower() for token in text.split())
    return ' '.join(reverse_map.get(token, token) for token in lowered_tokens)



Loaded normalization dictionary with 59 entries


In [54]:
# Map spelling variants in every sentence to their standard form
df['Sentence'] = df['Sentence'].apply(normalize_text, args=(normalization_dict,))

# **Feature Extraction Starts Here**

In [55]:
# Location of the abbreviation dictionary on the mounted Drive
abbrev_path = '/content/drive/MyDrive/Colab Notebooks/RomanianBanglaUtilities/abbreviation.json'

# Parse the JSON file into a Python object
with open(abbrev_path, 'r', encoding='utf-8') as abbrev_file:
    abbreviation_dict = json.load(abbrev_file)

print(f"Loaded abbreviation dictionary with {len(abbreviation_dict)} entries")

# Expand abbreviations in a sentence using the abbreviation dictionary
def normalize_abbreviations(text, abbr_dict):
    """Replace each whitespace-separated token that has an entry in ``abbr_dict``.

    The lookup uses the lower-cased token as the key. Tokens without an entry
    are kept exactly as they appeared (original casing preserved). The result
    is re-joined with single spaces.
    """
    expanded = []
    for word in text.split():
        expanded.append(abbr_dict.get(word.lower(), word))
    return ' '.join(expanded)

Loaded abbreviation dictionary with 72 entries


In [56]:
# Expand abbreviations in every sentence
df['Sentence'] = df['Sentence'].apply(normalize_abbreviations, args=(abbreviation_dict,))

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
import re

# Step 1: Fit a bag-of-words vectorizer purely to learn the corpus vocabulary
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(df['Sentence'])

# Step 2: Pull out every term the vectorizer learned
raw_vocab = vectorizer.get_feature_names_out()

# Step 3: Keep only terms made up purely of ASCII letters
def is_clean_word(word):
    """Return True if ``word`` is one or more ASCII letters and nothing else."""
    return bool(re.fullmatch(r'[a-zA-Z]+', word))

clean_vocab = list(filter(is_clean_word, raw_vocab))

print(f"Cleaned vocabulary size: {len(clean_vocab)}")

# Step 4: Write the filtered vocabulary to disk, one term per line
with open("clean_vocab.txt", "w", encoding="utf-8") as vocab_file:
    vocab_file.writelines(f"{word}\n" for word in clean_vocab)

print("Cleaned vocabulary exported to clean_vocab.txt")

# Preview first 20 words
print(clean_vocab[:20])


Cleaned vocabulary size: 25056
Cleaned vocabulary exported to clean_vocab.txt
['ab', 'aba', 'abal', 'abalbebohar', 'abalchuda', 'abaler', 'abalra', 'abar', 'abaro', 'abas', 'abash', 'abdar', 'abded', 'abdi', 'abdulah', 'abe', 'abedon', 'abeg', 'aben', 'aber']


In [17]:
# Split each preprocessed sentence into a list of tokens with NLTK's
# word_tokenize and store the lists in a new 'Tokens' column.
# (The 'punkt_tab' resource downloaded at the top of the notebook is commented
# there as being for the tokenizer.)
df['Tokens'] = df['Sentence'].apply(word_tokenize)

# Display the resulting frame.
df

Unnamed: 0,Sentence,Label,Tokens
0,youtube ar volg gula boring hoye jaitase din d...,3,"[youtube, ar, volg, gula, boring, hoye, jaitas..."
1,your video making camera work is really good i...,3,"[your, video, making, camera, work, is, really..."
2,you made me nostalgic college life a ei dokan ...,3,"[you, made, me, nostalgic, college, life, a, e..."
3,workshop ta engaging but resources ta insuffic...,3,"[workshop, ta, engaging, but, resources, ta, i..."
4,win hoy nay but onek bhalo khelecu,3,"[win, hoy, nay, but, onek, bhalo, khelecu]"
...,...,...,...
20010,1 march use kortasi pocof5 kono problem nai ga...,0,"[1, march, use, kortasi, pocof5, kono, problem..."
20011,1 day beshi stay kora jae na,0,"[1, day, beshi, stay, kora, jae, na]"
20012,1 boro na 2 boro tushar bhai er mon boro,0,"[1, boro, na, 2, boro, tushar, bhai, er, mon, ..."
20013,1 boro na 2 boro sam bhai er mon boro,0,"[1, boro, na, 2, boro, sam, bhai, er, mon, boro]"
