# Overview

Load the tensors and configuration through the `Config` class. Build the BERT model locally, with the option to load another pre-trained model instead.

Features

* Initialize from scratch
* Initialize from pre-trained model
* Set freezable parameters
* Load in tensor data
* Create data loaders
* Predict function (nn inputs and input_ids/attention mask)
* Predict function (nn inputs and text)
* Evaluation function(some data loader)
* Train one epoch
* Train multiple epochs
    * Calls "train one epoch" for every epoch and adds overall reporting stats; may return a history object
* Save model


# Imports

In [None]:
import utils
import constants
import time

# Wall-clock start of the run; read again in the final notification cell to
# report how long the whole notebook took.
start_time = time.time()

In [None]:
# Experiment name: used to derive the output folder and passed to the Config.
filename = 'part2model2testconfigs'

# The helper returns the folder to write into plus a number that is forwarded
# to the Config as ``num_iteration``.
output_folder, num = utils.get_next_output_folder(f'outputs/{filename}_output')

# Settings overridden for this run; everything else keeps the Config defaults.
config_kwargs = dict(
    output_folder=output_folder,
    num_iteration=num,
    epochs=5,
    load_tensor_path='outputs/part2preprocessing_output_8',
)
config = constants.Config(filename, **config_kwargs)

print("Output folder: ", output_folder)

In [None]:
# Import Libraries

# --- Standard library ---
import os
import io
import json
import sys
import re
import time
import datetime
import subprocess
import traceback

# --- Third-party ---
import pandas as pd
import kagglehub
import torch
import torch.nn as nn
import torchmetrics
from torchmetrics.functional import f1_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import transformers
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, get_linear_schedule_with_warmup, BertModel
from transformers import AutoTokenizer
from IPython.display import display, clear_output
import dask.dataframe as dd
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import seaborn as sns
import GPUtil
import geopandas as gpd
import spacy
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

# --- One-time resource downloads ---
# These two downloads were never guarded, so a failure still propagates.
nltk.download('universal_tagset')
nltk.download('stopwords')

# The remaining NLTK resources are best-effort (some names only exist in
# particular NLTK releases). A failure is reported instead of being silently
# swallowed, and KeyboardInterrupt/SystemExit are no longer caught.
for _nltk_resource in ('averaged_perceptron_tagger',
                       'averaged_perceptron_tagger_eng',
                       'punkt',
                       'punkt_tab'):
    try:
        nltk.download(_nltk_resource)
    except Exception as _download_error:
        print(f"Warning: could not download NLTK resource '{_nltk_resource}': {_download_error}")

# Download the spaCy English model. The project venv interpreter is used when
# it exists (the original behaviour); otherwise fall back to the interpreter
# running this notebook so the cell also works from another working directory.
_venv_python = './venv/bin/python'
_python_bin = _venv_python if os.path.exists(_venv_python) else sys.executable
_spacy_download = subprocess.run(
    [_python_bin, '-m', 'spacy', 'download', 'en_core_web_sm'],
    check=False,
)
if _spacy_download.returncode != 0:
    print(f"Warning: spaCy model download exited with code {_spacy_download.returncode}")


# Define tensors

In [None]:
# Overview

class BertClassifier(nn.Module):
    """Binary classifier combining BERT text features with extra numeric features.

    The pooled output of ``bert-base-uncased`` is concatenated with the output
    of a small feed-forward network applied to ``nn_input``; a final
    feed-forward head turns the concatenation into one logit per sample.

    Besides the network, the class bundles the tokenizer, optimizer, scheduler
    and loss, plus helpers for loading tensors, building data loaders,
    training, evaluating and checkpointing.

    NOTE: the default argument values are read from the module-level ``config``
    object once, when this class statement is executed.
    """

    def __init__(self,
                 load_path=config.load_in_model_path,
                 batch_size=config.batch_size,
                 epochs=config.epochs,
                 tokenizer_max_length=config.tokenizer_max_length,
                 nn_input_size=config.nn_input_size):
        """Build the network, tokenizer, optimizer, scheduler and loss.

        Args:
            load_path: Checkpoint file written by ``save_model``. When truthy,
                its weights and states are loaded after construction.
            batch_size: Batch size used by ``create_dataloaders``.
            epochs: Number of epochs run by ``train_model``.
            tokenizer_max_length: Padded/truncated token length used by
                ``predict_single``.
            nn_input_size: Number of numeric features per sample fed to the
                auxiliary feed-forward network.
        """
        super().__init__()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Running on device: ", self.device)

        # Stored first so every helper invoked below can rely on them.
        self.batch_size = batch_size
        self.epochs = epochs
        self.tokenizer_max_length = tokenizer_max_length

        print("Constructing model...")
        self.bert = BertModel.from_pretrained('bert-base-uncased').to(self.device)
        self.bert_output_size = self.bert.config.hidden_size

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Auxiliary network over the numeric features.
        self.nn_output_size = 12
        self.nn_layers = nn.Sequential(
            nn.Linear(nn_input_size, 12),
            nn.ReLU(),
            nn.Linear(12, self.nn_output_size),
            nn.ReLU()
        ).to(self.device)

        # Head over [BERT pooled output | auxiliary output]; emits one logit.
        self.final_layers = nn.Sequential(
            nn.Linear(self.bert_output_size + self.nn_output_size, 12),
            nn.ReLU(),
            nn.Linear(12, 1)
        ).to(self.device)

        # Placeholder training state so a checkpoint can be loaded into it;
        # ``train_model`` builds fresh instances before training.
        self.optimizer = torch.optim.AdamW(self.parameters(), lr=2e-5, eps=1e-8)
        self.scheduler = self.get_scheduler(self.optimizer, None)
        self.criterion = nn.BCEWithLogitsLoss()

        if load_path:
            print("Loading in model...")
            self.load_model(load_path)

    def forward(self, input_ids, attention_mask, nn_input):
        """Return one raw logit per sample.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            nn_input: Numeric features; must be 2-D so it can be concatenated
                with the pooled BERT output along dimension 1.

        Returns:
            Tensor of logits (no sigmoid applied) with one column.
        """
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.pooler_output

        nn_outputs = self.nn_layers(nn_input)
        final_input = torch.cat((pooled_output, nn_outputs), dim=1)
        return self.final_layers(final_input)

    def train_model(self, train_dataloader: DataLoader,
                    val_dataloader: DataLoader):
        """Train for ``self.epochs`` epochs, evaluating after each one.

        A checkpoint is written through ``save_model`` every time the
        validation accuracy improves on the best value seen so far.

        Returns:
            Four lists with one entry per epoch: training losses, validation
            losses, training accuracies, validation accuracies.
        """
        print("Training model...")

        train_losses = []
        val_losses = []
        train_accuracies = []
        val_accuracies = []

        best_val_acc = 0.0

        # Fresh optimizer/scheduler: the schedule length depends on the loader.
        self.optimizer = torch.optim.AdamW(self.parameters(), lr=2e-5, eps=1e-8)
        self.scheduler = self.get_scheduler(self.optimizer, train_dataloader)
        self.criterion = nn.BCEWithLogitsLoss()

        for epoch in range(self.epochs):
            print(f"\n===== Epoch {epoch + 1}/{self.epochs} =====")

            train_loss, train_acc = self.train_epoch(train_dataloader)
            train_losses.append(train_loss)
            train_accuracies.append(train_acc)

            # 1-based epoch number so the report header matches the banner above.
            val_loss, val_acc = self.evaluate(val_dataloader, epoch + 1)
            val_losses.append(val_loss)
            val_accuracies.append(val_acc)

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                self.save_model()

        return train_losses, val_losses, train_accuracies, val_accuracies

    def train_epoch(self, train_dataloader: DataLoader):
        """Run one optimisation pass over ``train_dataloader``.

        Each batch is expected to be ``(input_ids, attention_mask, nn_input,
        labels)``, the order produced by ``create_dataloaders``.

        Returns:
            Tuple ``(mean batch loss, accuracy)`` as Python floats. Both are
            ``0.0`` when the loader yields no batches.
        """
        self.train()

        train_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        num_batches = len(train_dataloader)

        for step, batch in enumerate(train_dataloader):
            print(f"Progress: {step + 1}/{num_batches}", end='\r')

            self.optimizer.zero_grad()

            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_nn_input = batch[2].to(self.device)
            # BCEWithLogitsLoss needs float targets shaped like the logits.
            b_labels = batch[3].to(self.device).float().view(-1, 1)

            output = self(b_input_ids, b_input_mask, b_nn_input)
            loss = self.criterion(output, b_labels)

            loss.backward()
            self.optimizer.step()
            self.scheduler.step()

            train_loss += loss.item()

            preds = torch.round(torch.sigmoid(output))
            correct_predictions += int((preds == b_labels).sum().item())
            total_samples += b_labels.size(0)

        # Guard the averages so an empty loader does not divide by zero.
        train_loss = train_loss / num_batches if num_batches else 0.0
        train_acc = correct_predictions / total_samples if total_samples else 0.0

        print("Training Loss: ", train_loss)
        print("Train accuracy: ", train_acc)
        return train_loss, train_acc

    def _collect_logits(self, dataloader: DataLoader):
        """Run the model over ``dataloader`` without gradients.

        Returns:
            Tuple ``(logits, labels)`` of CPU tensors, both with a single
            column and one row per sample (zero rows for an empty loader).
        """
        self.eval()

        logit_chunks = []
        label_chunks = []
        num_batches = len(dataloader)

        with torch.no_grad():
            for step, batch in enumerate(dataloader):
                print(f"Progress: {step + 1}/{num_batches}", end='\r')

                b_input_ids = batch[0].to(self.device)
                b_input_mask = batch[1].to(self.device)
                b_nn_input = batch[2].to(self.device)

                logit_chunks.append(self(b_input_ids, b_input_mask, b_nn_input).cpu())
                # Labels are only compared on the CPU, so they never need the device.
                label_chunks.append(batch[3].cpu())

        if not logit_chunks:
            return torch.empty((0, 1)), torch.empty((0, 1))

        logits = torch.cat(logit_chunks, dim=0).reshape(-1, 1)
        labels = torch.cat(label_chunks, dim=0).reshape(-1, 1)
        return logits, labels

    def predict(self, test_dataloader: DataLoader):
        """Predict a hard 0/1 label for every sample in ``test_dataloader``.

        Returns:
            Tuple ``(predictions, labels)`` of NumPy arrays, each with one
            column and one row per sample.
        """
        logits, labels = self._collect_logits(test_dataloader)
        preds = torch.round(torch.sigmoid(logits))
        return preds.numpy(), labels.numpy()

    def predict_single(self, text: str, nn_input: torch.Tensor):
        """Predict the 0/1 label for one text and its numeric features.

        Args:
            text: Raw text to classify.
            nn_input: Numeric features for this sample, either 1-D or shaped
                ``(1, n_features)``.

        Returns:
            The predicted label as a Python float (``0.0`` or ``1.0``).
        """
        self.eval()

        # ``padding='max_length'`` already pads to ``max_length``; the
        # deprecated ``pad_to_max_length`` flag is therefore not passed.
        encoded_text = self.tokenizer(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.tokenizer_max_length,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        input_ids = encoded_text['input_ids'].to(self.device)
        attention_mask = encoded_text['attention_mask'].to(self.device)

        # The features must live on the model's device and carry a batch axis.
        nn_input = nn_input.to(self.device, dtype=torch.float32)
        if nn_input.dim() == 1:
            nn_input = nn_input.unsqueeze(0)

        with torch.no_grad():
            output = self(input_ids, attention_mask, nn_input)
            pred = torch.round(torch.sigmoid(output))

        return pred.item()

    def evaluate(self, test_dataloader: DataLoader, epoch=None):
        """Print classification metrics and return ``(loss, accuracy)``.

        The loss is computed from the raw logits, the same quantity the
        criterion sees during training.

        Args:
            test_dataloader: Loader yielding ``(input_ids, attention_mask,
                nn_input, labels)`` batches.
            epoch: Optional epoch number shown in the report header.

        Raises:
            ValueError: If the loader yields no samples.
        """
        logits, labels_tensor = self._collect_logits(test_dataloader)
        if labels_tensor.numel() == 0:
            raise ValueError("Cannot evaluate on an empty dataloader")

        preds_tensor = torch.round(torch.sigmoid(logits))
        preds = preds_tensor.numpy()
        labels = labels_tensor.numpy()

        # ``is not None`` so that an epoch number of 0 is still shown.
        epoch_suffix = f" Epoch {epoch}" if epoch is not None else ""
        print("Classification Report" + epoch_suffix + ":")
        print(classification_report(labels.ravel(), preds.ravel()))
        print("Number of examples: ", len(labels))
        print("Number of correct predictions: ", (preds == labels).sum())

        flat_preds = preds_tensor.reshape(-1).long()
        flat_labels = labels_tensor.reshape(-1).long()
        num_classes = 2
        print("Accuracy: ", torchmetrics.functional.accuracy(flat_preds, flat_labels, task='binary').item())
        # With task='binary' the ``average`` argument is ignored, so the
        # averaged scores are computed over both classes with task='multiclass'.
        for average in ('macro', 'micro', 'weighted'):
            score = f1_score(flat_preds, flat_labels, task='multiclass',
                             num_classes=num_classes, average=average).item()
            print(f"{average.capitalize()} F1 score: ", score)

        loss = self.criterion(logits, labels_tensor.float()).item()
        acc = float((preds == labels).mean())

        print("Accuracy: ", acc)
        print("Loss: ", loss)

        return loss, acc

    def load_data_tensors(self, which='train', tensor_path=config.load_tensor_path):
        """Load the pre-computed tensors of one dataset split.

        Reads ``all_tensors_percentage_1_{which}.pth`` from ``tensor_path``;
        the file must hold a mapping with the keys ``input_ids``,
        ``attention_masks``, ``nn_inputs`` and ``outputs``.

        Returns:
            Tuple ``(input_ids, attention_masks, nn_inputs, outputs)``.
        """
        print(f"Loading in tensors from {which} dataset...")
        tensor_file = os.path.join(tensor_path, f'all_tensors_percentage_1_{which}.pth')
        # Keep the dataset on the CPU; batches are moved to the device one at a time.
        tensors = torch.load(tensor_file, map_location='cpu')

        print("Number of samples: ", len(tensors['input_ids']))
        print("Dims of input_ids: ", tensors['input_ids'].shape)
        print("Dims of attention_masks: ", tensors['attention_masks'].shape)
        print("Dims of nn_inputs: ", tensors['nn_inputs'].shape)
        print("Dims of outputs: ", tensors['outputs'].shape)

        return tensors['input_ids'], tensors['attention_masks'], tensors['nn_inputs'], tensors['outputs']

    def create_dataloaders(self,
                           input_ids: torch.Tensor,
                           attention_masks: torch.Tensor,
                           nn_inputs: torch.Tensor,
                           outputs: torch.Tensor,
                           sampler: type):
        """Wrap the tensors in a ``DataLoader`` with ``self.batch_size``.

        Args:
            input_ids, attention_masks, nn_inputs, outputs: Tensors sharing
                the same first dimension (one row per sample).
            sampler: Sampler *class* (e.g. ``RandomSampler`` or
                ``SequentialSampler``); it is instantiated with the dataset.
        """
        dataset = TensorDataset(input_ids, attention_masks, nn_inputs, outputs)

        print("Creating dataloaders...")
        print("Number of samples: ", len(dataset))

        return DataLoader(
            dataset,
            sampler=sampler(dataset),
            batch_size=self.batch_size
        )

    def get_scheduler(self, optimizer, train_dataloader=None):
        """Create a linear-decay learning-rate schedule without warmup.

        The schedule spans ``len(train_dataloader) * self.epochs`` steps, or a
        placeholder of 100000 steps when no loader is given.
        """
        if train_dataloader is None:
            total_steps = 100000
        else:
            total_steps = len(train_dataloader) * self.epochs

        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

    def freeze_layers(self, num_layers_left_unfrozen: int):
        """Freeze BERT except for its last ``num_layers_left_unfrozen`` encoder layers.

        Embeddings and pooler are always frozen. ``0`` freezes the whole BERT
        model; a value larger than the number of encoder layers leaves every
        encoder layer trainable.

        Raises:
            ValueError: If ``num_layers_left_unfrozen`` is negative.
        """
        if num_layers_left_unfrozen < 0:
            raise ValueError("num_layers_left_unfrozen must be non-negative")

        for param in self.bert.parameters():
            param.requires_grad = False

        # ``[-0:]`` would select every layer, so zero is handled separately.
        if num_layers_left_unfrozen > 0:
            for param in self.bert.encoder.layer[-num_layers_left_unfrozen:].parameters():
                param.requires_grad = True

    def save_model(self, output_folder=config.output_folder):
        """Write the checkpoint and tokenizer into ``output_folder``.

        The checkpoint (``bert_hybrid.pth``) holds the state dicts of the
        three sub-networks, the optimizer, the scheduler and the criterion.
        The tokenizer is saved with ``save_pretrained`` under the name
        ``tokenizer.pth``.

        Returns:
            Tuple ``(checkpoint path, tokenizer path)``.
        """
        os.makedirs(output_folder, exist_ok=True)
        save_path = os.path.join(output_folder, 'bert_hybrid.pth')
        tokenizer_save_path = os.path.join(output_folder, 'tokenizer.pth')
        torch.save({
            'bert': self.bert.state_dict(),
            'nn_layers': self.nn_layers.state_dict(),
            'final_layers': self.final_layers.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'criterion': self.criterion.state_dict()
        }, save_path)

        self.tokenizer.save_pretrained(tokenizer_save_path)
        print("Model saved to: ", save_path)
        print("Tokenizer saved to: ", tokenizer_save_path)

        return save_path, tokenizer_save_path

    def load_model(self, model_path=config.load_in_model_path):
        """Restore weights, tokenizer and training state from a checkpoint.

        Args:
            model_path: Checkpoint file written by ``save_model``. The
                tokenizer is read from ``tokenizer.pth`` in the same folder.
        """
        print("Loading in model from : ", model_path)
        try:
            checkpoint = torch.load(model_path, map_location=self.device)
        except Exception as load_error:
            # Fallback kept from the original code: read the file into memory
            # and load from the buffer. Only ordinary exceptions are caught,
            # and the first failure is reported.
            print("Direct load failed, retrying from an in-memory buffer: ", load_error)
            with open(model_path, "rb") as f:
                buffer = io.BytesIO(f.read())

            checkpoint = torch.load(buffer, map_location=self.device)
        print("Successfully loaded big tensor...")

        # Derive the tokenizer location from the folder, not by string replacement,
        # so checkpoints with another file name still resolve correctly.
        tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer.pth')

        self.bert.load_state_dict(checkpoint['bert'])
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.nn_layers.load_state_dict(checkpoint['nn_layers'])
        self.final_layers.load_state_dict(checkpoint['final_layers'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.scheduler.load_state_dict(checkpoint['scheduler'])
        self.criterion.load_state_dict(checkpoint['criterion'])
        self.to(self.device)


# Create model and apply

In [None]:
# Build a fresh model plus the training and validation loaders.
model = BertClassifier()

input_ids, attention_masks, nn_inputs, outputs = model.load_data_tensors(which='train')
train_dataloader = model.create_dataloaders(input_ids, attention_masks, nn_inputs, outputs, RandomSampler)

input_ids, attention_masks, nn_inputs, outputs = model.load_data_tensors(which='val')
val_dataloader = model.create_dataloaders(input_ids, attention_masks, nn_inputs, outputs, SequentialSampler)

# Informational only: report the first GPU that GPUtil can see, if any.
GPUs = GPUtil.getGPUs()
if not GPUs:
    print("Training model on CPU")
else:
    gpu = GPUs[0]
    print(f"Training model on: {gpu.name}")

history = model.train_model(train_dataloader, val_dataloader)
train_losses, val_losses, train_accuracies, val_accuracies = history

for caption, values in (
    ("Train losses: ", train_losses),
    ("Val losses: ", val_losses),
    ("Train accuracies: ", train_accuracies),
    ("Val accuracies: ", val_accuracies),
):
    print(caption, values)

# One figure per metric: (title, training curve, validation curve, clear afterwards?).
plot_specs = (
    ("Losses Of Train and Validation",
     (train_losses, 'Training Loss'), (val_losses, 'Validation Loss'), True),
    ("Accs Of Train and Validation",
     (train_accuracies, 'Training Acc'), (val_accuracies, 'Validation Acc'), False),
)
for title, train_curve, val_curve, clear_after in plot_specs:
    for values, curve_label in (train_curve, val_curve):
        plt.plot(values, label=curve_label)
    plt.legend()
    # The file name is the title in lower case with underscores.
    file_name = title.replace(" ", "_").lower()
    plt.title(title)
    plt.savefig(os.path.join(config.output_folder, file_name))
    plt.show()
    if clear_after:
        plt.clf()

# NOTE(review): train_model already checkpoints the best-validation weights to
# this same default path; this call overwrites them with the final-epoch
# weights. Confirm that keeping the last epoch is intended.
save_path, tokenizer_save_path = model.save_model()

In [None]:
# Comment in/out to either use or not use this model
save_path = 'outputs/part2model2_output_16/bert_hybrid.pth'

# Rebuild the classifier and restore its state from the checkpoint above.
loaded_model = BertClassifier(load_path=save_path)

# Test split, kept in its stored order.
test_tensors = loaded_model.load_data_tensors(which='test')
input_ids, attention_masks, nn_inputs, outputs = test_tensors

test_dataloader = loaded_model.create_dataloaders(*test_tensors, SequentialSampler)


In [None]:
# Evaluate on the test loader: evaluate() prints its metric report, and the
# returned (loss, accuracy) tuple is printed afterwards.
print(loaded_model.evaluate(test_dataloader))

In [None]:
# Show each prediction next to its true label (prediction column first).
pred, labels = loaded_model.predict(test_dataloader)

side_by_side = np.hstack((pred, labels))
print(side_by_side)

In [None]:
# torch.tensor([])
# text = "Very nice staff, and the airport shuttle runs 24 hours a day which is convenient. The shuttle will also take you to places within a couple of miles of the hotel, so you can eat, shop, or go bar hopping and not have to call a cab! Next door is a very good looking Cuban breakfast/lunch spot which closes at 3pm. Will need to try that on my next visit."
# loaded_model.predict_single(text, torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]).view(1, -1))

In [None]:
# Notify that the run finished, including its duration and the configuration.
email_subject = f"✅ Finished {config.filename} execution"
email_body = f"Took {utils.get_time_from_start(start_time)}\nConfig: {config.__dict__}"
utils.send_email(email_subject, email_body)

# Persist the configuration attributes next to the other outputs of this run.
config_json_path = os.path.join(config.output_folder, 'config.json')
with open(config_json_path, 'w') as f:
    json.dump(config.__dict__, f, indent=4)

In [None]:
# import torchviz

# # Move tensors to the same device as the model
# device = loaded_model.device
# input_ids = test_dataloader.dataset.tensors[0][0].view(1, -1).to(device)
# attention_mask = test_dataloader.dataset.tensors[1][0].view(1, -1).to(device)
# nn_input = test_dataloader.dataset.tensors[2][0].view(1, -1).to(device)

# # Forward pass
# y = loaded_model.forward(input_ids, attention_mask, nn_input)

# # Visualize the model
# dot = torchviz.make_dot(y, params=dict(loaded_model.named_parameters()))
# dot.render("model_graph", format="png", cleanup=True)  # Save as PNG
# dot  # Display graph

In [None]:
# from torch.utils.tensorboard import SummaryWriter

# loaded_model
# writer = SummaryWriter()
# dummy_input = [input_ids, attention_mask, nn_input]
# writer.add_graph(loaded_model, dummy_input)
# writer.close()