In [1]:
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AdamW
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:
df=pd.read_excel('merged.xlsx')

In [3]:
print(df.head())

  articleID          domain             date   category  \
0         0  jagonews24.com  9/19/2018 17:48  Education   
1         0  jagonews24.com  9/19/2018 17:48   National   
2         3  jagonews24.com  9/19/2018 17:48   National   
3         4  jagonews24.com  9/19/2018 17:48      Crime   
4         5  jagonews24.com  9/19/2018 17:48   National   

                                            headline  \
0   হট্টগোল করায় বাকৃবিতে দুইজন বরখাস্ত, ৬ জনকে শোকজ   
1    মালয়েশিয়ায় কর্মী পাঠানোর ব্যবস্থা নেয়ার সুপারিশ   
2  প্রেমের প্রস্তাবে রাজি না হওয়ায় স্কুলছাত্রীকে ...   
3  মেডিয়েশনই মামলাজট নিরসনের পথ : বিচারপতি আহমেদ ...   
4         টকশোতে বক্তব্য দিতে গিয়ে জাপা নেতার মৃত্যু   

                                             content label  
0  গত ১৭ সেপ্টেম্বর বাংলাদেশ কৃষি বিশ্ববিদ্যালয়ে ...     0  
1  বাংলাদেশের বৃহৎ শ্রমবাজার মালয়েশিয়ায় আবার শ্রম...     0  
2  নরসিংদীর মনোহরদীতে প্রেমের প্রস্তাবে রাজি না হ...     0  
3  সুপ্রিম কোর্টের হাইকোর্ট বিভাগের বিচারপতি আহমে...     0  
4  

In [4]:
print(df.isnull().sum())

articleID     98
domain        97
date         109
category     110
headline     112
content      112
label        215
dtype: int64


In [5]:
df['label'] = pd.to_numeric(df['label'], errors='coerce', downcast='integer')
df.loc[~df['label'].isin([0, 1]), 'label'] = None

In [6]:
df.dropna(subset=['headline','label'],inplace=True)

preprocessing start

In [7]:
def preprocess_text(text):
  text=re.sub(r'[^a-zA-Z\u0980-\u09FF\s]','',text)
  text=re.sub(r'\s+',' ',text).strip()
  return text

In [8]:
df['clean_headline'] = df['headline'].apply(preprocess_text)
df['clean_domain'] = df['domain'].apply(preprocess_text)
df['clean_content'] = df['content'].apply(preprocess_text)
df['clean_category'] = df['category'].apply(preprocess_text)
df['clean_category'] = df['clean_category'].apply(preprocess_text)

In [9]:
df['combined_text'] = df['clean_headline'] + ' ' + df['clean_domain'] + ' ' + df['clean_content'] + ' ' + df['clean_category'] + ' ' + df['clean_category']
print(df['combined_text'].head())

0    হট্টগোল করায় বাকৃবিতে দুইজন বরখাস্ত ৬ জনকে শোক...
1    মালয়েশিয়ায় কর্মী পাঠানোর ব্যবস্থা নেয়ার সুপারি...
2    প্রেমের প্রস্তাবে রাজি না হওয়ায় স্কুলছাত্রীকে ...
3    মেডিয়েশনই মামলাজট নিরসনের পথ বিচারপতি আহমেদ সো...
4    টকশোতে বক্তব্য দিতে গিয়ে জাপা নেতার মৃত্যু jag...
Name: combined_text, dtype: object


In [10]:
headline_len=df['clean_content'].apply(len)
avg_len=headline_len.mean()
print(avg_len)

1740.707194835916


tokenization

In [11]:
tokenizer=AutoTokenizer.from_pretrained('sagorsarker/bangla-bert-base')
model=AutoModelForMaskedLM.from_pretrained('sagorsarker/bangla-bert-base')

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with anot

In [12]:
def tokenize(combined_text, max_length=512):
  return tokenizer(combined_text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')

In [13]:
print(df.columns)

Index(['articleID', 'domain', 'date', 'category', 'headline', 'content',
       'label', 'clean_headline', 'clean_domain', 'clean_content',
       'clean_category', 'combined_text'],
      dtype='object')


In [14]:
tokens=tokenize(df['combined_text'].tolist())

dataset preparation

In [15]:
x_train,x_test,y_train,y_test=train_test_split(
    tokens['input_ids'],df['label'],test_size=0.2,random_state=42
)

In [16]:
print(f"x_train shape: {x_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: torch.Size([39906, 512])
x_test shape: torch.Size([9977, 512])
y_train shape: (39906,)
y_test shape: (9977,)


In [17]:
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [18]:
x_train = torch.tensor(x_train)
x_test = torch.tensor(x_test)

  x_train = torch.tensor(x_train)
  x_test = torch.tensor(x_test)


In [19]:
print(f"x_train type: {type(x_train)}")
print(f"x_train shape: {x_train.shape}")

x_train type: <class 'torch.Tensor'>
x_train shape: torch.Size([39906, 512])


In [20]:
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)

In [21]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [22]:
model=AutoModelForSequenceClassification.from_pretrained(
    'sagorsarker/bangla-bert-base', num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training components

In [23]:
optimizer=AdamW(model.parameters(),lr=5e-5)



In [24]:
loss_fnc=torch.nn.CrossEntropyLoss()

In [25]:
device=torch.device('cuda')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(102025, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [26]:
epoch=3

In [None]:
for epoch in range(epoch):
    model.train()
    total_loss=0
    for batch in train_loader:
        inputs,labels=batch
        inputs, labels=inputs.to(device), labels.to(device)
        outputs=model(inputs)
        loss=loss_fnc(outputs.logits,labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
print(f"EPOCH : {epoch+1}, LOSS:{total_loss/len(train_loader)}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [28]:
model.eval()
all_preds, all_labels = [], []

In [29]:
with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs.logits, axis=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

In [30]:
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.9737396010824897
Precision: 0.9481688107162862
Recall: 0.9737396010824897
F1 Score: 0.9607840975539699


In [32]:
model.save_pretrained('saved_model/banglabert_model')
tokenizer.save_pretrained('saved_model/banglabert_tokenizer')

('saved_model/banglabert_tokenizer\\tokenizer_config.json',
 'saved_model/banglabert_tokenizer\\special_tokens_map.json',
 'saved_model/banglabert_tokenizer\\vocab.txt',
 'saved_model/banglabert_tokenizer\\added_tokens.json',
 'saved_model/banglabert_tokenizer\\tokenizer.json')