<a href="https://colab.research.google.com/github/anika-tahsin4152/Senior_Design/blob/main/Sentence_Classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Hugging Face Transformers into the kernel's own environment
# (`%pip` targets the running kernel; `!pip` may hit a different interpreter).
# NOTE(review): the version is not pinned; this notebook imports `AdamW` from
# `transformers`, which newer releases may no longer export - confirm and pin.
%pip install -q transformers

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the labelled comment spreadsheet into a DataFrame.
dataset_path = "/content/drive/MyDrive/Research/NLP/Project/Identifying_offensive_text_with_Bengla_language_from_social_media.xlsx"
df = pd.read_excel(dataset_path)

# Class balance: count the rows labelled 'yes' and the rows labelled 'no'.
is_yes = df.threat_label == 'yes'
is_no = df.threat_label == 'no'
length_yes = len(df[is_yes])
length_no = len(df[is_no])
print(length_yes, length_no)

947 3976


In [None]:
# Longest entry in the 'comment' column, measured in characters (not tokens).
comment_lengths = df['comment'].str.len()
max_length = comment_lengths.max()

print('Maximum text length:', max_length)

Maximum text length: 1296


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import random
import time

# Tokenizer for the 'bert-base-multilingual-uncased' checkpoint (the same
# checkpoint name the classifier loads).
# NOTE(review): `truncation=True` is normally an encode-time argument; passed to
# `from_pretrained` it presumably has no effect - confirm. The dataset's own
# `encode_plus` call sets `truncation=True` itself.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', truncation=True)

class BanglaNewsDataset(Dataset):
    """Map-style dataset that tokenizes the `comment` column of a DataFrame.

    Args:
        dataframe: frame with a `comment` column (text) and a `threat_label`
            column (target). Any index is accepted; rows are addressed by
            position.
        tokenizer: object exposing `encode_plus(...)` that returns a mapping
            with `input_ids`, `attention_mask` and `token_type_ids` tensors.
        max_length: length every example is padded / truncated to.

    Each item is `(input_ids, attention_mask, token_type_ids, label)`. The
    label is returned exactly as stored in `threat_label`; no encoding is
    applied here.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment
        self.targets = self.data.threat_label
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize and return the example at integer position `index`."""
        # Positional lookup (.iloc): a DataLoader hands out positions 0..len-1,
        # which need not match the frame's index labels (e.g. after a
        # train/test split without reset_index).
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (newlines, tabs, ...) to one space.
        text = " ".join(text.split())
        labels = self.targets.iloc[index]

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        # Drop only the leading batch axis added by return_tensors='pt'
        # (assumed shape (1, max_length)); squeeze(0) keeps the sequence axis
        # even when max_length == 1.
        return ids.squeeze(0), mask.squeeze(0), token_type_ids.squeeze(0), labels

# Token budget per example handed to the tokenizer (pad / truncate target).
# This rebinds `max_length`, which an earlier cell used for the longest
# comment's character count.
max_length = 256
# Number of examples per DataLoader batch.
batch_size = 16

In [None]:
# Hold out 10% of the rows for testing, then take 20% of the remainder for
# validation (roughly 72% / 18% / 10% train / val / test overall).
# NOTE(review): the split is not stratified even though the classes are
# imbalanced. Changing it would alter which rows are held out; presumably the
# saved weights were trained on exactly this split - confirm before touching it.
train_data, test_data = train_test_split(df, test_size=0.10, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.20, random_state=42)

# Re-number each split from 0 so integer positions and index labels coincide.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

train_dataset = BanglaNewsDataset(train_data, tokenizer, max_length)
val_dataset = BanglaNewsDataset(val_data, tokenizer, max_length)
test_dataset = BanglaNewsDataset(test_data, tokenizer, max_length)

# Training: shuffled, fixed-size batches (the last partial batch is dropped).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# Evaluation: keep the final partial batch so every held-out example is scored;
# drop_last=True here would silently discard up to batch_size - 1 examples.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

In [None]:
class BanglaNewsClassifier(torch.nn.Module):
    """Wrapper module around `BertForSequenceClassification`.

    `forward` returns the logits produced by the wrapped model. The `dropout`
    and `classifier` attributes are created here but never called in
    `forward`; they are kept because `classifier` contributes parameters to
    this module's state dict.
    """

    def __init__(self, num_classes):
        """Build the wrapped BERT model with `num_classes` output labels."""
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'bert-base-multilingual-uncased',
            num_labels=num_classes,
        )
        # Not used by forward(); retained so the module's parameter set is unchanged.
        self.dropout = torch.nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the wrapped model and return its `logits`."""
        bert_output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return bert_output.logits

In [None]:
# Number of distinct values in `threat_label`; passed to the classifier as its class count.
distinct_labels = df.threat_label.unique()
num_classes = len(distinct_labels)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
save_model = BanglaNewsClassifier(num_classes).to(device)
# This model is only used for prediction below: put it in inference mode so
# dropout layers are disabled.
save_model.eval()
# Load the saved model weights. map_location remaps the stored tensors onto
# `device`, so a checkpoint saved on a GPU still loads on a CPU-only runtime.
# Kept as the last expression so the key-matching report is still displayed.
save_model.load_state_dict(torch.load('/content/drive/MyDrive/model_weights.pth', map_location=device))

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

<All keys matched successfully>

In [None]:
def predict_class(save_model, tokenizer, device, sentence, max_length=128):
    """Return the predicted class index for a single sentence.

    Args:
        save_model: callable module taking `(input_ids, attention_mask,
            token_type_ids)` and returning logits with one row per input.
        tokenizer: object exposing `encode_plus(...)` that returns
            `input_ids`, `attention_mask` and `token_type_ids` tensors.
        device: torch device the encoded tensors are moved to; the model is
            expected to already live there.
        sentence: text to classify.
        max_length: length the encoded sentence is padded / truncated to.
            Defaults to 128, the value this function used before the
            parameter existed.

    Returns:
        int: index of the highest-scoring class.

    Side effects:
        Switches `save_model` to eval mode and leaves it there.
    """
    # Tokenize the input sentence. truncation=True is required for max_length
    # to act as a cap: with padding alone, longer inputs are only padded,
    # never cut, and can exceed what the model accepts.
    encoded_sent = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt'
    )

    # Move the input to the correct device
    input_ids = encoded_sent['input_ids'].to(device)
    attention_mask = encoded_sent['attention_mask'].to(device)
    token_type_ids = encoded_sent['token_type_ids'].to(device)

    # no_grad() only turns off gradient tracking; eval() is what disables
    # dropout, so set it explicitly to keep predictions deterministic.
    save_model.eval()

    # Make the prediction
    with torch.no_grad():
        outputs = save_model(input_ids, attention_mask, token_type_ids)
        predictions = torch.argmax(outputs, dim=1)

    return predictions.item()

In [None]:
# Read one sentence from the user and classify it.
sentence = input()
predicted_class = predict_class(save_model, tokenizer, device, sentence)
# Class index 1 is reported as a threat; any other index as not a threat.
if predicted_class == 1:
    verdict = "This sentence is threatful!!!"
else:
    verdict = "This sentence is not threatful...!"
print(verdict)

আজকের খবর: বৃষ্টির কারণে ঢাকায় জলবায়ু উন্নয়ন সফর থাম
This sentence is not threatful...!
