In [3]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [4]:
# Load the tokenizer and the pretrained BERT encoder from one checkpoint name
# so the two cannot drift apart.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [181]:
# Decode the CSV as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
data = pd.read_csv(
    "data.csv",
    encoding="ISO-8859-1",
)

In [205]:
# Keep only the two columns used downstream and map label 4 to 1.
# The remap is done with .assign() on the column subset, which returns a
# fresh frame: no column is written onto a derived selection, so pandas has
# no reason to raise SettingWithCopyWarning, and re-running the cell is safe.
data = data[["label", "text"]].assign(
    label=lambda frame: frame["label"].replace({4: 1})
)
data.head()

Unnamed: 0,label,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [219]:
# Class balance check: count the rows per label. The counts displayed below
# are clearly uneven, which is worth keeping in mind when reading the
# accuracy figure in the final report.
data["label"].value_counts()

label
0    799999
1    248576
Name: count, dtype: int64

In [206]:
from sklearn.model_selection import train_test_split 

# Stratified 70 / 15 / 15 split into train / validation / test, seeded so the
# partition is reproducible.
data_text, data_labels = data['text'], data['label']

# First cut: hold out 30% of the rows.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    data_text,
    data_labels,
    test_size=0.3,
    random_state=2021,
    stratify=data_labels,
)

# Second cut: divide the held-out rows evenly into validation and test.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    random_state=2021,
    stratify=temp_labels,
)


In [209]:
# Freeze every BERT weight in one call; only layers added on top of the
# encoder will receive gradient updates.
bert_model.requires_grad_(False)

# Classification head stacked on top of the BERT encoder
class BERTArchitecture(nn.Module):
    """BERT encoder followed by a two-layer feed-forward classification head.

    The head reads the hidden state at sequence position 0 (the [CLS] token),
    applies Linear -> ReLU -> Dropout -> Linear, and finishes with LogSoftmax,
    so ``forward`` returns log-probabilities rather than raw logits.

    Args:
        bert_model: Encoder module. Calling it with ``input_ids`` and
            ``attention_mask`` keywords must return an object exposing
            ``last_hidden_state``.
        hidden_dim: Width of the intermediate linear layer (default 512).
        num_classes: Number of output classes (default 2).
        dropout: Dropout probability applied after the ReLU (default 0.2).
    """

    def __init__(self, bert_model, hidden_dim=512, num_classes=2, dropout=0.2):
        super().__init__()
        self.bert = bert_model
        # Take the encoder width from its config when one is available so the
        # head also fits encoders that are not 768 wide; fall back to 768.
        encoder_dim = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(encoder_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        # The attribute keeps its original name, but note that this layer is
        # a LogSoftmax: it emits log-probabilities.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return per-class log-probabilities, one row per input sequence."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token output
        x = self.fc1(cls_output)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Build the classifier and move it to the GPU when one is available,
# otherwise keep it on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTArchitecture(bert_model).to(device)

In [210]:
# Define optimizer and loss function.
# Give Adam only the parameters that still require gradients; frozen weights
# never receive a gradient, so tracking them in the optimizer is pointless.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=1e-5)

# The model's forward() already ends in LogSoftmax, so its output is
# log-probabilities. NLLLoss is the criterion that expects that input;
# CrossEntropyLoss expects raw logits and would apply log-softmax again.
criterion = nn.NLLLoss()

In [211]:
# Tokenize data
def tokenize_data(texts, labels, max_length=17, text_tokenizer=None):
    """Tokenize ``texts`` and bundle them with ``labels`` into a TensorDataset.

    Args:
        texts: Object with a ``.tolist()`` method yielding the raw strings
            (e.g. a pandas Series).
        labels: Object with a ``.tolist()`` method yielding one label per
            text, in the same order as ``texts``.
        max_length: Longer token sequences are truncated to this length.
            Defaults to 17, the value this notebook originally hard-coded.
        text_tokenizer: Optional callable used in place of the notebook-level
            ``tokenizer``. It must accept the same keyword arguments and
            return a mapping with ``input_ids`` and ``attention_mask``.

    Returns:
        ``torch.utils.data.TensorDataset`` whose items are
        ``(input_ids, attention_mask, label)``.
    """
    encode = tokenizer if text_tokenizer is None else text_tokenizer
    # Call the tokenizer directly: batch_encode_plus is deprecated in favour
    # of __call__, which accepts the same arguments for a list of strings.
    tokens = encode(
        texts.tolist(),
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return torch.utils.data.TensorDataset(
        tokens['input_ids'],
        tokens['attention_mask'],
        torch.tensor(labels.tolist()),
    )

# Tokenize each split once, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    tokenize_data(split_text, split_labels)
    for split_text, split_labels in (
        (train_text, train_labels),
        (val_text, val_labels),
        (test_text, test_labels),
    )
)

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

In [2]:
# Train the model
def train_model():
    """Run one pass over ``train_dataloader`` and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``device``. Gradients are clipped to a total norm of 1.0 before each
    optimizer step.

    Returns:
        float: Average of the per-batch losses, or ``nan`` when the loader
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = (item.to(device) for item in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # The average used to be computed and then dropped; return it so the
    # caller can actually see how training went.
    return total_loss / num_batches if num_batches else float("nan")

# As the last expression in the cell, the returned average loss is displayed.
train_model()

In [218]:
# Test the model
def test_model():
    """Score ``model`` on ``test_dataloader`` and print a classification report.

    Runs in eval mode with gradient tracking disabled. The predicted class
    for each row is the argmax over the model's output along dim 1.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            scores = model(input_ids, attention_mask)
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    print(classification_report(expected, predicted))

# Example usage
test_model()


              precision    recall  f1-score   support

           0       0.83      0.93      0.88    120000
           1       0.64      0.39      0.49     37287

    accuracy                           0.80    157287
   macro avg       0.74      0.66      0.68    157287
weighted avg       0.79      0.80      0.79    157287

