In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
from transformers import BertConfig, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from torch.utils.tensorboard import SummaryWriter
import copy

class TokenData(Dataset):
    def __init__(self, data, train = True):
        # data = (train_x, train_tokens, train_y)#
        if train:
            self.text_data = data[0]
            self.tokens = data[1]
            self.labels = data[2]
        else:
            self.text_data = data[0]
            self.tokens = data[1]
            self.labels = data[2]

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_.loc[len(df_)] = {
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent)
        }
    return data

2025-04-07 16:21:00.646742: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744035660.670502   13226 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744035660.677674   13226 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744035660.696684   13226 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744035660.696720   13226 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744035660.696723   13226 computation_placer.cc:177] computation placer alr

In [2]:
# Code for provided datasets using amazon reviews #

"""data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data = pd.read_csv("amazon_cells_labelled_LARGE_25K.txt", delimiter='\t', header=None)#
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)                             # pre-process
training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)
training_data = list(training_data)
testing_data = list(validation_data)
training_labels = list(training_labels)
testing_labels = list(validation_labels)"""

# Code for twitter dataset #

data = pd.read_csv("twitter_training.csv", delimiter=',', header=None)#
data.columns = ['id', 'context', 'Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['id', 'context' ,'Class', 'Sentence', 'index']                      # pre-process
training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('U'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)
training_data = list(training_data)
validation_data = list(validation_data)
training_labels = list(training_labels)
training_labels = [0 if v == 'Positive' else 1 if v == 'Negative' else 2 if v == 'Neutral' else 3 for v in training_labels]
validation_labels = list(validation_labels)
validation_labels = [0 if v == 'Positive' else 1 if v == 'Negative' else 2 if v == 'Neutral' else 3 for v in validation_labels]

data = pd.read_csv("twitter_test.csv", delimiter=',', header=None)#
data.columns = ['id', 'context', 'Class', 'Sentence']
data['index'] = data.index                                          # add new column index
columns = ['id', 'context' ,'Class', 'Sentence', 'index']     

testing_data = list(data['Sentence'])
testing_labels = list(data['Class'])
testing_labels = [0 if v == 'Positive' else 1 if v == 'Negative' else 2 if v == 'Neutral' else 3 for v in testing_labels]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_data = (training_data, tokenizer(training_data, padding = True, truncation=True), training_labels)
val_data = (validation_data, tokenizer(validation_data, padding=True,  truncation=True), validation_labels)
test_data = (testing_data, tokenizer(testing_data, padding = True, truncation=True), testing_labels)

batch_size = 64
train_dataset = TokenData(train_data, train = True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

val_dataset = TokenData(val_data, train = True)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)
test_dataset = TokenData(test_data ,train = False)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)
print("training_loader_size:", len(train_loader), "validation_loader_size:", len(val_loader), "test_loader_size:", len(test_loader))

training_loader_size: 1051 validation_loader_size: 117 test_loader_size: 16


In [3]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs, batch_size, device):
    # For each epoch
    writer = SummaryWriter(log_dir="/tf/logs")
    best_validation_loss = 1000
    best_training_loss = 1000
    best_training_accuracy = 0
    best_validation_accuracy = 0
    for i in range(num_epochs):
        
        model.train()
        total_validation_loss = 0
        total_training_loss = 0 
        correct_train = 0
        correct_validation = 0
        for batch_nr, batch in enumerate(train_loader):
            
            input_ids, token_type_ids, attention_mask, labels = batch.values()
            
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.type(torch.LongTensor)
            labels = labels.to(device)
            output = model(ids = input_ids, mask = attention_mask) 
            prediction = output.logits
            training_loss = criterion(prediction, labels) 
            total_training_loss += training_loss.item()
            optimizer.zero_grad()
            
            training_loss.backward()
           
            optimizer.step() 
                
            #Print the epoch, batch, and loss
            print(
                '\rEpoch {} [{}/{}] - Loss: {}'.format(
                    i+1, batch_nr+1, len(train_loader), training_loss
                ),
                end=''
            )
            correct_train += torch.sum((torch.max(F.softmax(prediction, dim=1),dim=1)[1]==labels)).item()
            
        writer.add_scalar('Loss/Train', total_training_loss, i)
        
        model.eval()
        with torch.no_grad():
            for validation_nr, batch in enumerate(val_loader):
                
                input_ids, token_type_ids, attention_mask, labels = batch.values()
            
                input_ids = input_ids.to(device)
                token_type_ids = token_type_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.type(torch.LongTensor)
                labels = labels.to(device)
                
                output = model(ids = input_ids, mask = attention_mask) 
                prediction = output.logits
                
                validation_loss = criterion(prediction, labels)
                total_validation_loss += validation_loss.item()
                correct_validation += torch.sum((torch.max(F.softmax(prediction, dim=1),dim=1)[1]==labels)).item()
            if correct_validation/(len(val_loader)*batch_size) > best_validation_accuracy:
                best_validation_accuracy = correct_validation/(len(val_loader)*batch_size)
            if total_validation_loss > best_validation_loss:
                pass
            else:
                best_model = copy.deepcopy(model)
        del input_ids, token_type_ids, attention_mask, labels, prediction, output, validation_loss, training_loss
        writer.add_scalar("Loss/Validation", total_validation_loss, i)
        writer.flush()
    
    writer.close()
    return best_model

In [None]:
# Non pretrained #

class CustomBERTModel(nn.Module):
    def __init__(self):
          super(CustomBERTModel, self).__init__()
          self.config = BertConfig()
          self.bert = BertModel(self.config)
        
          ### Classification head:
          self.dropout = nn.Dropout(p=0.1, inplace=False)
          self.classifier = nn.Linear(768, 4)

    def forward(self, ids, mask):
          output_bert = self.bert(ids, attention_mask=mask)
          dropout_output = self.dropout(output_bert.pooler_output)
          logits = self.classifier(dropout_output)
          return SequenceClassifierOutput(
            logits=logits
          )

model = CustomBERTModel()

# Pre trained # 
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased') # Pre-trained model
#model.classifier = nn.Linear(model.classifier.in_features, 4)
#model.classifier.weight = nn.init.normal_(model.classifier.weight, mean=0.0, std=0.01)    
#model.classifier.bias = nn.init.zeros_(model.classifier.bias)

optimizer = AdamW(model.parameters(), lr=1e-5) # Optimization function
criterion = torch.nn.CrossEntropyLoss() # Loss function
epochs = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) # Transfer model to GPU if available
trained_model = train_model(model, criterion, optimizer, train_loader, val_loader, epochs, batch_size, device)
torch.save(trained_model, "./models/BertTransformerThingV1NoPreTraining")

Epoch 2 [771/1051] - Loss: 6.940901221241802e-0528

In [None]:
#model = torch.load("./models/BertTransformerThingV1",weights_only = False)
model = torch.load("./models/BertTransformerThingV1NoPreTraining",weights_only = False)
model.to(device)
correct = 0
total = 0
with torch.no_grad():
    for validation_nr, batch in enumerate(val_loader):
                
        input_ids, token_type_ids, attention_mask, labels = batch.values()
    
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        # We don't need gradients for testing
        with torch.no_grad():
            output = model(ids = input_ids, mask = attention_mask)
        
        logits = output.logits
    
        correct += (logits.argmax(1) == labels).sum().item()
        total += labels.size(0)
print("Testing accuracy: ",correct/(total))
# TESTING BLOCK ENDS