In [2]:
import re
import pandas as pd


### Preprocessing of Dataset (Roman Urdu Dataset)

In [4]:
def cleaner(word):
    word = re.sub(r'\#\.', '', word)
    word = re.sub(r'\n', '', word)
    word = re.sub(r',', '', word)
    word = re.sub(r'\-', ' ', word)
    word = re.sub(r'\.', '', word)
    word = re.sub(r'\\', ' ', word)
    word = re.sub(r'\\x\.+', '', word)
    word = re.sub(r'\d', '', word)
    word = re.sub(r'^_.', '', word)
    word = re.sub(r'_', ' ', word)
    word = re.sub(r'^ ', '', word)
    word = re.sub(r' $', '', word)
    word = re.sub(r'\?', '', word)
    return word.lower()

# Define the hashing function
def hashing(word):
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    word = re.sub(r'ee+', r'i', word)
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    if re.search(r'[rst]y', word) and word[-1] != 'y':
        word = re.sub(r'y', r'i', word)
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)
    word = re.sub(r'k', r'q', word)
    return word

# Clean the text data
def array_cleaner(array):
    X = []
    for sentence in array:
        clean_sentence = ''
        words = str(sentence).split(' ')
        for word in words:
            clean_sentence = clean_sentence + ' ' + cleaner(word)
        X.append(clean_sentence)
    return X



### Loading the Dataset

In [3]:
# Load your dataset (replace with your actual dataset)
data = pd.read_csv('/kaggle/input/sentiment-dataset/Roman Urdu DataSet.csv')  # Replace with your actual CSV path
# Manually assign column names
data.columns = ['Text', 'Sentiment', 'extra']
data = data.iloc[:,0:2]
# Check the first few rows to confirm
data.head(5)



Unnamed: 0,Text,Sentiment
0,sahi bt h,Positive
1,"Kya bt hai,",Positive
2,Wah je wah,Positive
3,Are wha kaya bat hai,Positive
4,Wah kya baat likhi,Positive


In [4]:
X_cleaned = array_cleaner(data['Text']) 

In [5]:
df = pd.DataFrame({
    'text': X_cleaned,  # Cleaned text data
    'label': data['Sentiment']  # Sentiment labels
})

df.head(5)

Unnamed: 0,text,label
0,sahi bt h,Positive
1,kya bt hai,Positive
2,wah je wah,Positive
3,are wha kaya bat hai,Positive
4,wah kya baat likhi,Positive


In [6]:

df['label'].value_counts()

label
Neutral     8929
Positive    6012
Negative    5286
Neative        1
Name: count, dtype: int64

In [7]:
df = df[df['label'] != 'Neative']

df['label'].value_counts()

label
Neutral     8929
Positive    6012
Negative    5286
Name: count, dtype: int64

In [8]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df['label'] = label_encoder.fit_transform(df['label'])



#### 1 as Neutrel, 2 as positive, 0 as negative

In [9]:
df['label'].value_counts()

label
1    8929
2    6012
0    5286
Name: count, dtype: int64

#### now Fine tuning model

1) First we will use auto-tokenizer class from transformer library. it will read the description from config file of model (i.e our model type: albert)
2) so it will use albert tokenizer for this model
3) then we will convert our df in hugging face compatible dataset
4) then we will tokenize our dataset according to the model in which max lenght is 128 it means there will be 128 tokens for each input.
5) then splitting

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("callmesan/indic-bert-roman-urdu-fine-grained")

# Convert your pandas DataFrame to a HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Tokenize the text column of the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the dataset into training and test sets (80% train, 20% test)
train_dataset = tokenized_datasets.shuffle(seed=42).select([i for i in range(0, int(0.8 * len(tokenized_datasets)))])  # 80% train
test_dataset = tokenized_datasets.shuffle(seed=42).select([i for i in range(int(0.8 * len(tokenized_datasets)), len(tokenized_datasets))])  # 20% test


1) Now i red in the config file that there are 5 labels in the model but we have only 3 labels in our dataset so we reset the number of labels to 3
2) it is using albert for sequence classification cos our model is based on albert

In [19]:
from transformers import AutoModelForSequenceClassification, AutoConfig

# Load the config and set num_labels=3
config = AutoConfig.from_pretrained(
    'callmesan/indic-bert-roman-urdu-fine-grained',
    num_labels=3
)

# Load model with the modified config and ignore mismatches
model = AutoModelForSequenceClassification.from_pretrained(
    'callmesan/indic-bert-roman-urdu-fine-grained',
    config=config,
    ignore_mismatched_sizes=True  # this tells Transformers to ignore the old classification head
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Check where the model is
print("Model device:", next(model.parameters()).device)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at callmesan/indic-bert-roman-urdu-fine-grained and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model device: cuda:0


now the training arguments in which 
epochs are 10 means the dataset will be pass 10 times through model
batch size is 16 means the model will take 16 sentences at once as input for learning 

In [46]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_strategy="steps",
    logging_steps=10,
    eval_strategy="epoch",     # evaluation strategy to adopt during training
    save_strategy="epoch",           # save checkpoint every epoch
    report_to="all",  # Show both console and progress bar
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False  # Ensure progress bars are visible
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
)

# Start the training



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [21]:
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.9674,0.981294
2,0.9311,0.92581
3,0.807,0.837916
4,0.6582,0.820168
5,0.4497,0.940775
6,0.3093,1.101908
7,0.3146,1.395361
8,0.0691,1.829128
9,0.0886,2.009142
10,0.0056,2.109982




TrainOutput(global_step=5060, training_loss=0.45969014363437005, metrics={'train_runtime': 1995.3972, 'train_samples_per_second': 81.092, 'train_steps_per_second': 2.536, 'total_flos': 966833525137920.0, 'train_loss': 0.45969014363437005, 'epoch': 10.0})

In [33]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "./results/checkpoint-4554"
tokenizer = AutoTokenizer.from_pretrained("callmesan/indic-bert-roman-urdu-fine-grained")
model = AutoModelForSequenceClassification.from_pretrained(model_path)


In [34]:
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

Device set to use cuda:0


In [35]:
label_map = {
    'LABEL_0': 'Negative',
    'LABEL_1': 'Neutral',
    'LABEL_2': 'Positive'
}



In [43]:
# Pretty print result
text = "tum kitne pyare ho"
result = classifier(text)[0]
label_name = label_map[result['label']]
score = result['score']

print(f"Text: {text}")
print(f"Predicted Sentiment: {label_name} ({score:.2f})")

Text: tum kitne pyare ho
Predicted Sentiment: Negative (0.90)


#### now again fine tuning on a new dataset (review 1 cleaned.csv)
i already cleaned it on google collab (Review 1 cleaning in google colab using svm.ipynb)

In [5]:
df2 = pd.read_csv('/kaggle/input/sentiment-dataset-2/review 1 cleaned.csv', encoding='ISO-8859-1')
df2.head()

Unnamed: 0,text,label
0,allah nahi usko bachana tha to us road sy hi...,negative
1,apnay bhai ki madad karo qatil,positive
2,bohot ghandi choice hai teri qatil,negative
3,kis kis ko pakistan buhat pasand hai ...,positive
4,ibrahim ki mama qatil ki bohot achi acting d...,positive


In [6]:
df2['label'].value_counts()

label
positive    10331
negative     9437
neutral      8321
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Encode string labels to numeric
le = LabelEncoder()
df2['label'] = le.fit_transform(df2['label'])

#### 1 as Neutrel, 2 as positive, 0 as negative (again)

In [8]:
df2['label'].value_counts()

label
2    10331
0     9437
1     8321
Name: count, dtype: int64

##### now again all training steps are same

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
tokenizer = AutoTokenizer.from_pretrained("callmesan/indic-bert-roman-urdu-fine-grained")

# Convert your pandas DataFrame to a HuggingFace Dataset
dataset = Dataset.from_pandas(df2)

# Tokenize the text column of the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the dataset into training and test sets (80% train, 20% test)
train_dataset = tokenized_datasets.shuffle(seed=42).select([i for i in range(0, int(0.8 * len(tokenized_datasets)))])  # 80% train
test_dataset = tokenized_datasets.shuffle(seed=42).select([i for i in range(int(0.8 * len(tokenized_datasets)), len(tokenized_datasets))])  # 20% test


In [10]:
from transformers import AutoModelForSequenceClassification, AutoConfig
import torch
# Load the config and set num_labels=3
config = AutoConfig.from_pretrained(
    '/kaggle/input/checkpoint-4554',
    num_labels=3
)

# Load model with the modified config and ignore mismatches
model = AutoModelForSequenceClassification.from_pretrained(
    '/kaggle/input/checkpoint-4554',
    config=config,
    ignore_mismatched_sizes=True  # this tells Transformers to ignore the old classification head
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Check where the model is
print("Model device:", next(model.parameters()).device)

2025-04-12 18:50:10.528217: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744483810.978906      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744483811.118389      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Model device: cuda:0


In [11]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_strategy="steps",
    logging_steps=10,
    eval_strategy="epoch",     # evaluation strategy to adopt during training
    save_strategy="epoch",           # save checkpoint every epoch
    report_to="all",  # Show both console and progress bar
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False  # Ensure progress bars are visible
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
)

# Start the training



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.8192,0.812374
2,0.6486,0.715568
3,0.5084,0.739749
4,0.3255,0.935292
5,0.1197,1.110428
6,0.1508,1.4505
7,0.0861,1.68989
8,0.0655,1.925538
9,0.0207,2.044914
10,0.0468,2.056302




TrainOutput(global_step=7030, training_loss=0.2738979932725076, metrics={'train_runtime': 2769.4232, 'train_samples_per_second': 81.14, 'train_steps_per_second': 2.538, 'total_flos': 1342668323550720.0, 'train_loss': 0.2738979932725076, 'epoch': 10.0})

In [35]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "/kaggle/input/final-tuning/model fine tuned on review 1"
tokenizer = AutoTokenizer.from_pretrained("callmesan/indic-bert-roman-urdu-fine-grained")
model = AutoModelForSequenceClassification.from_pretrained(model_path)


In [37]:
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=-1)

Device set to use cpu


In [38]:
label_map = {
    'LABEL_0': 'Negative',
    'LABEL_1': 'Neutral',
    'LABEL_2': 'Positive'
}



In [None]:

text = "tum bohat ache ho"
result = classifier(text)[0]
label_name = label_map[result['label']]
score = result['score']

print(f"Text: {text}")
print(f"Predicted Sentiment: {label_name} ({score:.2f})")

Text: tum bohat ache ho
Predicted Sentiment: Positive (0.94)
