In [1]:
# import standard libraries
import os
import numpy as np
import sys

import warnings
import pandas as pd

# Recent pandas releases (1.5+) expose SettingWithCopyWarning from
# pandas.errors and no longer from pandas.core.common, so try the current
# location first and fall back to the legacy one for older installs.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # This pandas build does not define the warning; nothing to silence.
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [2]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset
from sklearn.metrics import f1_score

In [3]:
# import dataset and models used
# Directory added to the module search path before the ipynb.fs.defs imports
# below; presumably where the datasets / models / FFtraining notebooks live.
# NOTE(review): machine-specific WSL path — adjust when running elsewhere.
NOTEBOOK_DEFS_DIR = '//wsl$/Ubuntu/home/zaidek/Part-II-Disertation/Neural Networks'

# Only prepend once, so re-running this cell does not pile up duplicate entries.
if NOTEBOOK_DEFS_DIR not in sys.path:
    sys.path.insert(0, NOTEBOOK_DEFS_DIR)

from ipynb.fs.defs.datasets import BertTitleEmbeddingDataset
from ipynb.fs.defs.models import FFNetworkBertEmbedding
from ipynb.fs.defs.FFtraining import train_model_bert, define_sampler

In [4]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# import training and validation data (both stored as pickle files)
training_data = _read_pickle("../../data/data_train")
validation_data = _read_pickle("../../data/data_valid")

In [5]:
# try set gpu as training device
# Training is currently pinned to the CPU regardless of what is detected.
# FORCE_CPU makes that override explicit; set it to False to train on the
# detected device instead.
FORCE_CPU = True

# What the machine offers: CUDA when available, otherwise CPU.
detected_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The device handed to training. Kept as the plain string "cpu" when forced,
# so downstream code receives exactly the value it received before.
device = "cpu" if FORCE_CPU else detected_device

# Report both, so the log shows the device actually used rather than only the
# detected one.
print(f"detected: {detected_device}; training on: {device}")

cuda


In [6]:
# reindex data: reset the index on both splits, discarding the old one
train_data_indexed, valid_data_indexed = (
    split.reset_index(drop=True) for split in (training_data, validation_data)
)

In [7]:
# columns selected from each split as model inputs
req_cols_without_url = [
    "title",
    "text",
    "time",
]

In [8]:
# extract specific cols needed
# The score column of each split is taken before narrowing to the input columns.
scores = train_data_indexed.score
validation_scores = valid_data_indexed.score

# .copy() makes each feature frame independent of its parent frame, so later
# column assignments (e.g. filling NaNs) modify these frames directly instead
# of a slice of the parent, which is what triggers SettingWithCopyWarning.
training_data_indexed = train_data_indexed[req_cols_without_url].copy()
validation_data_indexed = valid_data_indexed[req_cols_without_url].copy()

In [9]:
# fill nan values in normal data
# Missing titles and texts become empty strings on both splits. Each column is
# filled from itself: the right-hand side must be a single column, not the
# whole frame.
training_data_indexed["title"] = training_data_indexed["title"].fillna("")
training_data_indexed["text"] = training_data_indexed["text"].fillna("")

validation_data_indexed["title"] = validation_data_indexed["title"].fillna("")
validation_data_indexed["text"] = validation_data_indexed["text"].fillna("")

In [10]:
# define loader parameters
cutoff, batch_size, num_epochs = 20, 128, 10

# one sampler per split, each built from that split's scores and the shared cutoff
train_sampler, valid_sampler = (
    define_sampler(split_scores, cutoff)
    for split_scores in (scores, validation_scores)
)

In [11]:
# create bert embedding datasets: one per split, from (features, scores, cutoff)
dataset_train, dataset_valid = (
    BertTitleEmbeddingDataset(features, labels, cutoff)
    for features, labels in (
        (training_data_indexed, scores),
        (validation_data_indexed, validation_scores),
    )
)

In [12]:
# define batched bert embedding loaders
use_sampler = True

# How each loader orders its samples: through the split's sampler when
# use_sampler is set, otherwise by plain shuffling.
_train_order = {"sampler": train_sampler} if use_sampler else {"shuffle": True}
_valid_order = {"sampler": valid_sampler} if use_sampler else {"shuffle": True}

train_loader = torch.utils.data.DataLoader(
    dataset=dataset_train, batch_size=batch_size, **_train_order
)
valid_loader = torch.utils.data.DataLoader(
    dataset=dataset_valid, batch_size=batch_size, **_valid_order
)

In [13]:
# define dimensions of bert model (embedding width in, output width out)
embedding_dim, output_dim = 768, 1

# instantiate bert model; note the constructor takes the output dim first
model = FFNetworkBertEmbedding(output_dim, embedding_dim)

In [14]:
# define loss function
loss_func = nn.BCEWithLogitsLoss()

# define parameters for the optimizer
# NOTE(review): the recorded run's loss settles near 0.693 (about ln 2) and
# stays there; it may be worth checking whether this learning rate is too high.
learning_rate = 0.01

# Adam over all of the model's parameters at the learning rate above
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [15]:
# train model
# The final string argument is presumably where the trained weights are saved
# (TODO confirm against train_model_bert).
model_final, train_losses, valid_losses = train_model_bert(
    model,
    train_loader,
    valid_loader,
    loss_func,
    optimizer,
    device,
    num_epochs,
    "../trained_models/bert_model_weighted.pth",
)

Epoch 0.
Iteration 100. Current Min Loss 3.078091785311699. Running F1 0.49092461144994587.
Iteration 200. Current Min Loss 0.7384689223766326. Running F1 0.551791154949958.
Iteration 300. Current Min Loss 0.6943956524133682. Running F1 0.5860464078882593.
Iteration 400. Current Min Loss 0.696184179186821. Running F1 0.32653047337009.
Iteration 500. Current Min Loss 0.6932570201158523. Running F1 0.31582003050507973.
Iteration 600. Current Min Loss 0.693211475610733. Running F1 0.0524114598358676.
Iteration 700. Current Min Loss 0.6933062469959259. Running F1 0.23311069202708568.
Iteration 800. Current Min Loss 0.6932355099916458. Running F1 0.3913681917083169.
Iteration 900. Current Min Loss 0.6931319665908814. Running F1 0.665865246862458.
Iteration 1000. Current Min Loss 0.6933571839332581. Running F1 0.4030491864611258.
Iteration 1100. Current Min Loss 0.6932243263721466. Running F1 0.3502269469314121.
Iteration 1200. Current Min Loss 0.6932786166667938. Running F1 0.09372429249554

KeyboardInterrupt: 

In [None]:
# persist the training losses returned by the training cell
with open("../losses/bert_weighted_losses", "wb") as fb:
    # File objects have no dump() method; pickling goes through
    # pickle.dump(obj, file).
    pickle.dump(train_losses, fb)