In [None]:
# import standard libraries
import os
import numpy as np
import sys

import warnings
import pandas as pd

# Silence pandas' SettingWithCopyWarning for the whole notebook.
# The class is exposed as pandas.errors.SettingWithCopyWarning in newer pandas
# releases and as pandas.core.common.SettingWithCopyWarning in older ones, so
# try both locations instead of failing on import.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # this pandas build does not define the warning: nothing to silence
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [None]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset
from sklearn.metrics import f1_score

In [None]:
# make the project's notebook modules importable
# The directory defaults to the original WSL location; set the
# NN_NOTEBOOK_DIR environment variable to point somewhere else when the
# notebook is run on a different machine.
notebook_module_dir = os.environ.get(
    "NN_NOTEBOOK_DIR",
    '//wsl$/Ubuntu/home/zaidek/Part-II-Disertation/Neural Networks',
)
sys.path.insert(0, notebook_module_dir)

# import dataset and models used
from ipynb.fs.defs.datasets import BertProcessedTitleEmbeddingTitleAndTextDataset
from ipynb.fs.defs.models import FFNetworkBertEmbedding
from ipynb.fs.defs.FFtraining import train_model_bert
from ipynb.fs.defs.FFtraining import define_sampler

In [None]:
# load the pickled training split
# NOTE(review): pickle.load can run arbitrary code from the file, so these
# files must come from a trusted source.
with open("../data/data_train", "rb") as train_file:
    training_data = pickle.load(train_file)

# load the pickled validation split
with open("../data/data_valid", "rb") as valid_file:
    validation_data = pickle.load(valid_file)

In [None]:
# report which device torch would pick (cuda when available, otherwise cpu)
detected_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(detected_device)

# NOTE(review): training is pinned to the cpu regardless of what was detected
# above -- confirm this override is intentional.
device = "cpu"

In [None]:
# rebuild the index of both splits, dropping the old index instead of
# keeping it as a column
train_data_indexed, valid_data_indexed = (
    split.reset_index(drop=True) for split in (training_data, validation_data)
)

In [None]:
# columns of the indexed frames that are passed on to the dataset
req_cols_without_url = [
    "title",
    "text",
    "time",
]

In [None]:
# split each indexed frame into its score column and the model input columns
# .copy() makes every input frame independent of the frame it was selected
# from, so the NaN filling applied to these frames afterwards writes to its
# own data rather than to a selection of another frame (the case pandas
# reports as SettingWithCopyWarning).
scores = train_data_indexed["score"]
training_data_indexed = train_data_indexed[req_cols_without_url].copy()

validation_scores = valid_data_indexed["score"]
validation_data_indexed = valid_data_indexed[req_cols_without_url].copy()

In [None]:
# replace missing titles and texts with empty strings in both splits
# fillna with a per-column mapping fills only the listed columns and returns
# a new frame, so "title" and "text" each receive their own filled values
# and every other column is left as it is.
text_fill_values = {"title": "", "text": ""}

training_data_indexed = training_data_indexed.fillna(text_fill_values)
validation_data_indexed = validation_data_indexed.fillna(text_fill_values)

In [None]:
# settings reused further down: cutoff (samplers, datasets), batch size
# (loaders) and number of epochs (training call)
cutoff, batch_size, num_epochs = 20, 128, 10

# one sampler per split, built from that split's scores and the cutoff
train_sampler = define_sampler(scores, cutoff)
valid_sampler = define_sampler(validation_scores, cutoff)

In [None]:
# build one bert embedding dataset per split from its inputs, scores and the cutoff
dataset_cls = BertProcessedTitleEmbeddingTitleAndTextDataset
dataset_train = dataset_cls(training_data_indexed, scores, cutoff)
dataset_valid = dataset_cls(validation_data_indexed, validation_scores, cutoff)

In [None]:
# build the batched loaders for both splits
# use_sampler switches between drawing batches through the samplers defined
# earlier and plain shuffling of the datasets.
use_sampler = False

if use_sampler:
    train_order = {"sampler": train_sampler}
    valid_order = {"sampler": valid_sampler}
else:
    train_order = {"shuffle": True}
    valid_order = {"shuffle": True}

train_loader = DataLoader(dataset=dataset_train, batch_size=batch_size, **train_order)
valid_loader = DataLoader(dataset=dataset_valid, batch_size=batch_size, **valid_order)

In [None]:
# network sizes: a single output value and an input of two 768-wide vectors
# NOTE(review): presumably one 768-dim bert embedding each for title and
# text, concatenated -- confirm against the dataset class.
output_dim = 1
embedding_dim = 2 * 768

# build the feed-forward network that consumes the bert embeddings
model = FFNetworkBertEmbedding(output_dim, embedding_dim)

In [None]:
# step size handed to the optimizer
learning_rate = 0.01

# loss: binary cross-entropy that takes raw logits as input
loss_func = nn.BCEWithLogitsLoss()

# plain stochastic gradient descent over the model's parameters
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [None]:
# run training; the call returns the resulting model together with the
# training and validation losses
training_result = train_model_bert(
    model,
    train_loader,
    valid_loader,
    loss_func,
    optimizer,
    device,
    num_epochs,
)
model_final, train_losses, valid_losses = training_result