In [30]:
pip install indic-nlp-library




In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
import re
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import re
try:
  from indicnlp.tokenize import indic_tokenize
except ImportError:
  print("indic-nlp library not found. Skipping tokenization.")


In [32]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [33]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
train_df = pd.read_csv("/content/drive/My Drive/Data/hindi_train.csv")
val_df = pd.read_csv("/content/drive/My Drive/Data/hindi_val.csv")
test_df = pd.read_csv("/content/drive/My Drive/Data/hindi_test.csv")

In [36]:
print("Train Dataset Sample:")
print(train_df.head())

print("\nValidation Dataset Sample:")
print(val_df.head())

print("\nTest Dataset Sample:")
print(test_df.head())

Train Dataset Sample:
   label                                       text
0      0              ‡§≠‡•Ä‡§°‡§º ‡§Æ‡•á‡§Ç  ‡§¨‡§π‡•Å‡§§  ‡§π‡§¨‡•ç‡§∏‡•Ä ‡§Æ‡§ø‡§≤‡•á‡§Ç‡§ó‡•á
1      0  ‡§∏‡§æ‡§≤‡•á ‡§¨‡•á‡§µ‡§ï‡•Ç‡§´ ‡§Ö‡§™‡§®‡•Ä ‡§Æ‡§æ‡§Ç ‡§Æ‡§ï‡•ç‡§ñ‡§ø‡§Ø‡§æ‡§Ç  ‡§§‡•ã ‡§π‡§ü‡§æ ‡§¶‡•á‡§Ç
2      0           ‡§¨‡•Å‡§∞ ‡§¶‡•á‡§¶‡•ã ‡§§‡•ã ‡§Æ‡•Å‡§π ‡§Æ‡•á‡§Ç ‡§≤‡§Ç‡§° ‡§≤‡•á ‡§≤‡•ã ‡§§‡•ã
3      0       ‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§µ‡§π‡§æ ‡§π‡•à ‡§ö‡§ø‡§≤‡•ç‡§≤‡§æ ‡§§‡•Ç ‡§ï‡•ç‡§Ø‡•ã‡§Ç ‡§∞‡§π‡§æ ‡§π‡•à
4      1  ‡§ö‡§æ‡§Ø ‡§®‡§π‡•Ä‡§Ç ‡§™‡•Ä‡§§‡§æ ‡§π‡•Ç‡§Ç ‡§Æ‡•à‡§Ç ‡§á‡§∏‡•Ä ‡§ï‡•ã ‡§õ‡•ã‡§°‡§º ‡§¶‡§ø‡§Ø‡§æ ok

Validation Dataset Sample:
   label                                               text
0      0                          Comment box ‡§ö‡§æ‡§≤‡•Ç ‡§ï‡§∞ ‡§ù‡§µ‡§æ‡§°‡•á
1      0  ‡§Ü‡§™‡§ï‡•á ‡§™‡§æ‡§∏ ‡§™‡•Å‡§ñ‡•ç‡§§‡§æ ‡§∏‡§¨‡•Ç‡§§ ‡§π‡•à, ‡§ï‡•ç‡§Ø‡§æ ‡§Ø‡§æ ‡§´‡§ø‡§∞ ‡§ï‡•Å‡§õ ‡§≠‡•Ä‡•§ ‡§â...
2      1  üë¨‡§¶‡•ã‡§∏‡•ç‡§§‡•Ä ‡§π‡•ã‡§§‡•Ä ‡§π‡•à ‚Äì One Time üôÉ ‡§π‡§Æ ‡§®‡§ø‡§≠‡

In [37]:
print("Train Dataset Columns:")
print(train_df.columns)

print("\nValidation Dataset Columns:")
print(val_df.columns)

print("\nTest Dataset Columns:")
print(test_df.columns)


Train Dataset Columns:
Index(['label', 'text'], dtype='object')

Validation Dataset Columns:
Index(['label', 'text'], dtype='object')

Test Dataset Columns:
Index(['label', 'text'], dtype='object')


In [38]:
stopwords_file = '/content/drive/My Drive/Data/final_stopwords.txt'  # Replace with the path to your stopwords file
with open(stopwords_file, 'r', encoding='utf-8') as file:
    stop_words_list = [line.strip() for line in file]
stop_words_hindi = set(stop_words_list)

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words_hindi])

train_df['text'] = train_df['text'].apply(remove_stopwords)
train_df['tokens'] = train_df['text'].apply(indic_tokenize.trivial_tokenize)

In [39]:
def preprocess_text(text):
  text = text.lower()  # Lowercase
  text = re.sub(r'[^\w\s\u0900-\u097F]', ' ', text)  # Keeps Hindi characters, whitespace, and alphanumeric
  text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
  # Apply stopword removal if stopwords_file is defined
  if stopwords_file:
      text = remove_stopwords(text)
  # Tokenization (using indic_nlp if available)
  try:
      text = indic_tokenize.trivial_tokenize(text)
      text = ' '.join(text)  # Join tokens back into string
  except ImportError:
      print("indic_nlp library not found. Skipping tokenization.")
  return text

In [40]:
print("Preprocessing Training Data:")
train_df["text_preprocessed"] = train_df["text"].apply(preprocess_text)
print("Sample Preprocessed Training Text:")
print(train_df["text_preprocessed"].head())

Preprocessing Training Data:
Sample Preprocessed Training Text:
0                  ‡§≠‡•Ä‡§°‡§º ‡§π‡§¨‡•ç‡§∏‡•Ä ‡§Æ‡§ø‡§≤‡•á‡§Ç‡§ó‡•á
1    ‡§∏‡§æ‡§≤‡•á ‡§¨‡•á‡§µ‡§ï‡•Ç‡§´ ‡§Æ‡§æ‡§Ç ‡§Æ‡§ï‡•ç‡§ñ‡§ø‡§Ø‡§æ‡§Ç ‡§π‡§ü‡§æ ‡§¶‡•á‡§Ç
2              ‡§¨‡•Å‡§∞ ‡§¶‡•á‡§¶‡•ã ‡§Æ‡•Å‡§π ‡§≤‡§Ç‡§° ‡§≤‡•á ‡§≤‡•ã
3                ‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§µ‡§π‡§æ ‡§ö‡§ø‡§≤‡•ç‡§≤‡§æ ‡§§‡•Ç
4           ‡§ö‡§æ‡§Ø ‡§®‡§π‡•Ä‡§Ç ‡§™‡•Ä‡§§‡§æ ‡§π‡•Ç‡§Ç ‡§õ‡•ã‡§°‡§º ok
Name: text_preprocessed, dtype: object


In [41]:
print("Preprocessing Validation Data:")
val_df["text_preprocessed"] = val_df["text"].apply(preprocess_text)
print("Sample Preprocessed Validation Text:")
print(val_df["text_preprocessed"].head())

Preprocessing Validation Data:
Sample Preprocessed Validation Text:
0                               comment box ‡§ö‡§æ‡§≤‡•Ç ‡§ù‡§µ‡§æ‡§°‡•á
1    ‡§Ü‡§™‡§ï‡•á ‡§™‡§æ‡§∏ ‡§™‡•Å‡§ñ‡•ç‡§§‡§æ ‡§∏‡§¨‡•Ç‡§§ ‡§≠‡•Ä ‡•§ ‡§Ü‡§ß‡§æ‡§∞ ‡§ï‡§æ‡§∞‡•ç‡§° ‡§ï‡•á ‡§ï‡§æ‡§∞‡§£ ‡§µ...
2    ‡§¶‡•ã‡§∏‡•ç‡§§‡•Ä one time ‡§®‡§ø‡§≠‡§æ‡§§‡•á some time ‡§Ø‡§æ‡§¶ ‡§ï‡§∞‡•ã any t...
3    ‡§ö‡§æ‡§Ø ‡§≤‡§µ‡§∞ ‡§°‡§æ‡•Ö ‡§õ‡•ã‡§°‡§º ‡§¶‡•á‡§ó‡•á or mo ‡§õ‡•ã‡§°‡§º ‡§®‡§π‡•Ä‡§Ç ‡§ì‡§∞ m bhi...
4    ‡§Ö‡§®‡§™‡§¢‡§º ‡§≤‡•ã‡§ó ‡§¶‡§ø‡§® ‡§∞‡§æ‡§§ ‡§π‡§ø‡§Ç‡§¶‡•Ç ‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ ‡§∞‡§π‡§§‡•á ‡§∏‡•ã‡§ö‡§®‡§æ ‡§¶‡•á‡§ñ...
Name: text_preprocessed, dtype: object


In [42]:
print("Preprocessing Test Data:")
test_df["text_preprocessed"] = test_df["text"].apply(preprocess_text)
print("Sample Preprocessed Test Text:")
print(test_df["text_preprocessed"].head())

Preprocessing Test Data:
Sample Preprocessed Test Text:
0                       ‡§®‡§π‡•Ä ‡§∏‡•ã‡§ö ‡§®‡§ø‡§ï‡§≤‡•á ‡§∏‡•ã‡§ö ‡§´‡§Ç‡§∏‡§æ
1                 ‡§¶‡§ø‡§µ‡§æ‡§≤‡•Ä ‡§¶‡•á‡§∂ ‡§™‡§°‡§º‡§æ‡§ï‡§æ ‡§®‡§π‡•Ä‡§Ç ‡§´‡•ã‡§°‡§æ‡§§
2       ‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§¨‡§ø‡§≤‡•ç‡§≤‡•Ä ‡§™‡§æ‡§≤ ‡§≤‡•á‡§®‡§æ ‡§ó‡§≤‡§§ ‡§´‡§π‡§Æ‡•Ä ‡§®‡§π‡•Ä‡§Ç ‡•§
3    ‡§§‡•á‡§∞‡•Ä ‡§ó‡§æ‡§Ç‡§° ‡§™‡•ç‡§Ø‡§æ‡§ú ‡§ï‡§æ‡§ü ‡§¶‡•á‡§ó‡§æ ‡§ó‡•Å‡§ú‡•ç‡§ú‡§∞ ‡§≠‡•ã‡§∏‡§°‡§º‡•Ä ‡§ï‡•á
4                  ‡§¨‡§Ç‡§ó‡§æ‡§≤‡•Ä ‡§∏‡§æ‡§°‡§º‡•Ä ‡§®‡§π‡•Ä‡§Ç ‡§™‡§π‡§®‡§æ ‡§¶‡•Ä‡§¶‡•Ä
Name: text_preprocessed, dtype: object


In [43]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# Dataset class
class ToxicCommentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }



In [45]:
# Creating Datasets and Dataloaders
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = ToxicCommentDataset(
        texts=df["text_preprocessed"].tolist(),
        labels=df["label"].tolist(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=True)

BATCH_SIZE = 16
MAX_LEN = 128

train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)


In [46]:
# Training and Evaluation Functions
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(data_loader)

def evaluate(model, data_loader, device):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.extend(batch['labels'].numpy())
            outputs = model(input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
    return accuracy_score(labels, preds)




In [47]:
# Training Loop
EPOCHS = 5

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_data_loader, optimizer, device)
    val_accuracy = evaluate(model, val_data_loader, device)
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss: {train_loss}')
    print(f'Validation accuracy: {val_accuracy}')


Epoch 1/5
Train loss: 0.4767965018194281
Validation accuracy: 0.8317479191438764
Epoch 2/5
Train loss: 0.35736179521835737
Validation accuracy: 0.8315992865636147
Epoch 3/5
Train loss: 0.28990627370520883
Validation accuracy: 0.8434898929845422
Epoch 4/5
Train loss: 0.230724264415964
Validation accuracy: 0.8347205707491082
Epoch 5/5
Train loss: 0.17441970582242625
Validation accuracy: 0.832936979785969


In [48]:
# Evaluate on Test Set
test_accuracy = evaluate(model, test_data_loader, device)
print(f'Test accuracy: {test_accuracy}')


Test accuracy: 0.8376932223543401


In [49]:
# Function to get predictions for a sample of data
def get_predictions(model, data_loader, device):
    model.eval()
    texts = []
    preds = []
    labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.extend(batch['labels'].cpu().numpy())
            outputs = model(input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            # Decode input_ids to get the original text
            texts.extend([tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids])
    return texts, preds, labels

# Get predictions for the test set
test_texts, test_preds, test_labels = get_predictions(model, test_data_loader, device)

# Display a sample of the predictions
sample_size = 10  # Number of samples to display
print(f'Displaying {sample_size} samples from the test set predictions:\n')

for i in range(sample_size):
    print(f'Text: {test_texts[i]}')
    print(f'Actual Label: {test_labels[i]}')
    print(f'Predicted Label: {test_preds[i]}')
    print('---')


Displaying 10 samples from the test set predictions:

Text: ‡§™‡•ç‡§Ø‡§æ‡§∞‡•Ä ‡§≤‡§ó‡§∞‡§π‡•Ä ‡§§‡•Å‡§Æ‡•ç‡§π‡•á ‡§™‡•ç‡§Ø‡§æ‡§∞ ‡§∏‡§®‡§Æ ‡§Ü‡§Ø ‡§≤‡§µ‡•ç‡§π ‡§Ø‡•Å ‡§∏‡§Ç‡§ö‡§ø‡§§‡§æ
Actual Label: 1
Predicted Label: 1
---
Text: ‡§Ö‡§∞‡•á ‡§™‡§æ‡§ó‡§≤ ‡§î‡§∞‡§§ ‡§π‡§∞ ‡§µ‡§ø‡§°‡§ø‡§Ø‡•ã ‡§è‡§ï‡§ü‡§ø‡§ó‡§Ç ‡§ï‡§∞‡•á‡§ó‡•Ä ‡§ï ‡§Ö‡§≤‡§ó ‡§≤‡•á
Actual Label: 0
Predicted Label: 0
---
Text: ‡§¶‡•ã‡§∏‡•ç‡§§‡•ã‡§Ç ‡§¶‡•Ä‡§™‡§æ‡§µ‡§≤‡•Ä ‡§π‡§æ‡§∞‡•ç‡§¶‡§ø‡§ï ‡§∂‡•Å‡§≠‡§ï‡§æ‡§Æ‡§®‡§æ‡§è‡§Ç ‡§à‡§∂‡•ç‡§µ‡§∞ ‡§™‡•ç‡§∞‡§æ‡§∞‡•ç‡§•‡§®‡§æ ‡§µ‡•ã ‡§Ü‡§™‡§ï‡•ã ‡§π‡§Æ‡•á‡§∂‡§æ ‡§∏‡•ç‡§µ‡§∏‡•ç‡§• ‡§ä‡§∞‡•ç‡§ú‡§æ‡§µ‡§æ‡§®
Actual Label: 1
Predicted Label: 1
---
Text: ‡§≤‡§Ç‡§ó‡§∞ mae ‡§¨‡•ã‡§ü‡•Ä ‡§®‡§π‡•Ä‡§Ç ‡§Æ‡§ø‡§≤‡§§‡•Ä chutiye
Actual Label: 0
Predicted Label: 0
---
Text: ‡§≠‡§æ‡§à ‡§≤‡•ã‡§ó ‡§ß‡•ç‡§Ø‡§æ‡§® ‡§¶‡•á‡§ñ‡•ã 2 ‡§ò‡•ã‡§°‡§º‡•á ‡§ó‡§ß‡§æ
Actual Label: 0
Predicted Label: 0
---
Text: ‡§á‡§® ‡•§ ‡§™‡•Å‡§≤‡§ø‡§∏ ‡§µ‡§æ‡§≤‡•ã ‡§ï‡•Å‡§§‡•ç‡§§‡•ã‡§Ç ‡§á‡§§‡§®‡•Ä ‡§π‡§ø‡§Æ‡•ç‡§Æ‡§§ ‡§∏‡§Æ‡§ù ‡§≤‡•