In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
from transformers import BertConfig, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import nltk
import time
from nltk.corpus import stopwords
from nltk import word_tokenize
from torch.utils.tensorboard import SummaryWriter
import copy
import pickle

class VectorData(Dataset):
    """Map-style dataset over pre-computed feature vectors and their labels.

    `data` is indexed positionally: `data[0]` holds the vectors and `data[1]`
    the labels. Each item is returned as a `(float32 tensor, long tensor)` pair.
    """

    def __init__(self, data, train = True):
        # `train` is accepted for call-site compatibility; nothing in this
        # class reads it.
        self.vectors, self.labels = data[0], data[1]

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        features = torch.tensor(self.vectors[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        # squeeze() without a dim drops every size-1 axis of the feature tensor.
        return features.squeeze(), target

class VectorTokenData(Dataset):
    """Dataset that turns raw text into a vector lazily, one item at a time.

    `data[0]` is the sequence of texts and `data[1]` the matching labels.
    `vectorizer` must expose `transform(list_of_str)` whose result offers
    `.todense()`. English stop words are removed before vectorizing.
    """

    def __init__(self, data, vectorizer):
        self.text_data, self.labels = data[0], data[1]
        self.vectorizer = vectorizer
        # A set gives constant-time membership checks during filtering.
        self.stop_words = set(stopwords.words('english'))

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        raw_text = str(self.text_data[idx])

        # Keep only tokens whose lower-cased form is not a stop word.
        kept_tokens = []
        for token in word_tokenize(raw_text):
            if token.lower() in self.stop_words:
                continue
            kept_tokens.append(token)

        dense_row = self.vectorizer.transform([' '.join(kept_tokens)]).todense()
        # Drop the leading axis of the single-document result.
        x = torch.tensor(dense_row, dtype=torch.float32).squeeze(0)

        # Stored label 1 becomes class index 0; every other label becomes 1.
        if self.labels[idx] == 1:
            class_index = 0
        else:
            class_index = 1

        return x, torch.tensor(class_index, dtype=torch.long)

2025-04-13 10:36:09.891168: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744533369.909645   57259 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744533369.915309   57259 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744533369.930361   57259 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744533369.930385   57259 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744533369.930387   57259 computation_placer.cc:177] computation placer alr

In [1]:
# For the regular ANN
# Code for the Hugging Face dataset - multi-category sentiment analysis

def preprocess_pandas(data, columns): #Removes unnecessary words
    """Return a new frame with English stop words stripped from each sentence.

    Args:
        data: DataFrame providing at least the 'Sentence', 'Class' and 'index'
            columns; these are the only ones read.
        columns: column labels of the returned frame. Only 'index', 'Class'
            and 'Sentence' are filled; any other requested column is NaN.

    Returns:
        A DataFrame with one row per input row, numbered 0..n-1, whose
        'Sentence' holds the remaining tokens joined by single spaces.
    """
    # Build the stop-word set once: re-reading the NLTK list for every token
    # and scanning it linearly dominated the runtime of the original loop.
    stop_words = set(stopwords.words('english'))

    # Collect plain records and build the frame in one go; growing a DataFrame
    # with .loc row by row copies data on every insertion.
    records = []
    for _, row in data.iterrows():
        word_tokens = word_tokenize(str(row['Sentence']))
        # Compare lower-cased tokens so capitalised stop words ("The", "Is")
        # are removed too, matching VectorTokenData's filtering.
        filtered_sent = [w for w in word_tokens if w.lower() not in stop_words]
        records.append({
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent),
        })

    return pd.DataFrame(records, columns=columns)

"""# get data, pre-process and split
train_df = pd.read_csv("train_df.csv", delimiter=',')
val_df = pd.read_csv("val_df.csv", delimiter=',')
test_df = pd.read_csv("test_df.csv", delimiter=',')

#training and validation
train_df.columns = ['id', 'Sentence', 'Class', 'sentiment']
train_df['index'] = train_df.index    # add new column index
train_df = preprocess_pandas(train_df, train_df.columns)                           

val_df.columns = ['id', 'Sentence', 'Class', 'sentiment']
val_df['index'] = val_df.index    # add new column index
val_df = preprocess_pandas(val_df, val_df.columns)                                 

test_df.columns = ['id', 'Sentence', 'Class', 'sentiment']
test_df['index'] = test_df.index    # add new column index                    
test_df = preprocess_pandas(test_df, test_df.columns) 

word_vectorizer = TfidfVectorizer(analyzer='word', 
                                  ngram_range=(1,2), 
                                  max_features=50000, 
                                  max_df=0.5, 
                                  use_idf=True, 
                                  norm='l2')

training_data = train_df['Sentence'].values.astype('U')
training_labels = train_df['Class'].values.astype('int32')
training_data = word_vectorizer.fit_transform(training_data)
training_data = training_data.todense()   
vocab_size = len(word_vectorizer.vocabulary_)

validation_data = val_df['Sentence'].values.astype('U')
validation_labels = val_df['Class'].values.astype('int32')
validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()

testing_data = test_df['Sentence'].values.astype('U')
testing_labels = test_df['Class'].values.astype('int32')
test_data = word_vectorizer.transform(testing_data)        # transform texts to sparse matrix
test_data = test_data.todense()

train_data = (list(train_df['Sentence'].values.astype('U')), list(train_df['Class'].values.astype('int32')))
val_data = (list(val_df['Sentence'].values.astype('U')), list(val_df['Class'].values.astype('int32')))
test_data = (list(test_df['Sentence'].values.astype('U')), list(test_df['Class'].values.astype('int32')))

batch_size = 20

train_dataset = VectorData(train_data, train = True)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

val_dataset = VectorData(val_data, train = True)

val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)

test_dataset = VectorData(test_data ,train = False)

test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)
print(next(enumerate(test_loader)))

print("training_loader_size:", len(train_loader), "validation_loader_size:", len(val_loader), "test_loader_size:", len(test_loader))
"""

# Dataset and loaders for massive dataset #

# Load the pre-fitted TF-IDF vectorizer; it defines how raw text is turned
# into feature vectors for the datasets below.
# NOTE(security): pickle.load executes arbitrary code embedded in the file --
# only load a pickle that you produced yourself or otherwise trust.
with open('TFIDFVectorizerPreFit.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

batch_size = 100
SPLIT_SEED = 42  # fixes the train/validation partition across kernel restarts

data = pd.read_csv("train_massive.csv", delimiter=',', header=None)
data.columns = ['Class', 'Sentence', 'Body']
train_data = list(data['Sentence'])
train_labels = list(data['Class'])
train_data = (train_data, train_labels)
train_val_dataset = VectorTokenData(train_data, vectorizer)

# 80/20 split with a dedicated seeded generator so the same rows land in the
# validation set on every run.
train_dataset, val_dataset = torch.utils.data.random_split(
    train_val_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
# Evaluation loaders are not shuffled: summed loss and accuracy do not depend
# on batch order.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

data = pd.read_csv("test_massive.csv", delimiter=',', header=None)
data.columns = ['Class', 'Sentence', 'Body']
testing_data = list(data['Sentence'])
testing_labels = list(data['Class'])

test_data = (testing_data, testing_labels)
test_dataset = VectorTokenData(test_data, vectorizer)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Sanity check: pull ONE batch and show its shapes instead of dumping the
# whole batch tensor (and vectorizing a second batch just to take its len).
sample_vectors, sample_labels = next(iter(test_loader))
print("sample batch shapes:", tuple(sample_vectors.shape), tuple(sample_labels.shape))

print("training_loader_size:", len(train_loader), "validation_loader_size:", len(val_loader), "test_loader_size:", len(test_loader))

NameError: name 'pickle' is not defined

In [3]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs, batch_size, device,
                save_path="./models/ANNIndependentMassive1", log_dir=None):
    """Train `model`, log losses to TensorBoard and checkpoint the best epoch.

    Args:
        model: module called as `model(vector)` on each batch.
        criterion: loss called as `criterion(prediction, labels)`.
        optimizer: optimizer stepping the model's parameters.
        train_loader: iterable of `(vector, labels)` training batches; must
            support `len()`.
        val_loader: iterable of `(vector, labels)` validation batches; must
            support `len()`.
        num_epochs: number of passes over `train_loader`.
        batch_size: kept for backward compatibility; no longer used (it was
            previously misused as the per-epoch step stride for logging).
        device: device the batches are moved to.
        save_path: where the whole model is written with `torch.save` each
            time the summed validation loss improves.
        log_dir: TensorBoard log directory; defaults to a timestamped folder
            under /tf/logs.

    Returns:
        The same `model` object, holding the weights of the LAST epoch. The
        best-validation checkpoint is the file at `save_path`.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    if log_dir is None:
        log_dir = "/tf/logs/Lab1ANN_Massive2" + str(time.time())

    # Make sure the checkpoint directory exists before the first save.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    writer = SummaryWriter(log_dir=log_dir)
    best_validation_loss = float('inf')
    train_batches = len(train_loader)
    val_batches = len(val_loader)

    try:
        for epoch in range(num_epochs):
            # ---- training pass ----
            model.train()
            total_training_loss = 0.0
            for batch_nr, (vector, labels) in enumerate(train_loader):
                vector = vector.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                prediction = model(vector)
                training_loss = criterion(prediction, labels)
                training_loss.backward()
                optimizer.step()

                batch_loss = training_loss.item()
                total_training_loss += batch_loss
                # The stride must be the number of batches per epoch; using
                # batch_size made steps of consecutive epochs overlap.
                writer.add_scalar('Loss Batch / Train', batch_loss, epoch * train_batches + batch_nr)

                #Print the epoch, batch, and loss
                print(
                    '\rEpoch {} [{}/{}] - Loss: {}'.format(
                        epoch + 1, batch_nr + 1, train_batches, batch_loss
                    ),
                    end=''
                )
            print()

            writer.add_scalar('Loss/Train', total_training_loss, epoch)

            # ---- validation pass ----
            model.eval()
            total_validation_loss = 0.0
            with torch.no_grad():
                for validation_nr, (vector, labels) in enumerate(val_loader):
                    vector = vector.to(device)
                    labels = labels.to(device)

                    prediction = model(vector)
                    batch_loss = criterion(prediction, labels).item()
                    total_validation_loss += batch_loss
                    writer.add_scalar('Loss Batch / Validation', batch_loss, epoch * val_batches + validation_nr)

            # Keep the checkpoint with the lowest summed validation loss.
            if total_validation_loss < best_validation_loss:
                torch.save(model, save_path)
                best_validation_loss = total_validation_loss

            writer.add_scalar("Loss/Validation", total_validation_loss, epoch)
            # One flush per epoch is enough; flushing per batch only adds I/O.
            writer.flush()
    finally:
        # Close the event file even if training is interrupted or fails.
        writer.close()

    return model

In [5]:
class Net(nn.Module):
    """Feed-forward classifier: vocab_size -> 100 -> 20 -> 2, ReLU in between."""

    def __init__(self, vocab_size):
        super().__init__()
        # Layers are listed in execution order; their positions become the
        # numeric keys of the Sequential container.
        layers = [
            nn.Linear(vocab_size, 100),
            nn.ReLU(),
            nn.Linear(100, 20),
            nn.ReLU(),
            nn.Linear(20, 2),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, input):
        # Output of the last linear layer is returned as-is (no softmax here).
        logits = self.network(input)
        return logits

# Prefer the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The input layer is as wide as the vectorizer's vocabulary.
model = Net(len(vectorizer.vocabulary_))

criterion = torch.nn.CrossEntropyLoss() # Loss function
optimizer = AdamW(
    model.parameters(),
    lr=0.001,
    weight_decay = 0.01,
) # Optimization function
epochs = 100

#trained_model = train_model(model, criterion, optimizer, train_loader, val_loader, epochs, batch_size, device)
model.to(device) # Transfer model to GPU if available


Net(
  (network): Sequential(
    (0): Linear(in_features=50000, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
)

In [6]:
# Load the best checkpoint written during training and measure test accuracy.
# NOTE(security): weights_only=False unpickles a full Python object and can
# execute arbitrary code -- only load checkpoints you created or trust.
# map_location lets a checkpoint saved on a GPU be opened on a CPU-only host.
model = torch.load("./models/ANNIndependentMassive1", map_location=device, weights_only = False)
model.to(device)
model.eval()  # inference mode for any train/eval-sensitive layers

correct = 0
total = 0
with torch.no_grad():
    for test_nr, batch in enumerate(test_loader):

        vector, labels = batch

        vector = vector.to(device)
        labels = labels.to(device)

        prediction = model(vector)

        # argmax over dim 1 picks the predicted class of each sample.
        correct += (prediction.argmax(1) == labels).sum().item()
        total += labels.size(0)

# Avoid ZeroDivisionError when the test loader yields no samples.
accuracy = correct / total if total else float("nan")
print("Testing accuracy: ", accuracy)

Testing accuracy:  0.8316525


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.