In [1]:
# import standard libraries
import os
import numpy as np
import sys

import warnings
import pandas as pd

# Silence pandas' SettingWithCopyWarning for the whole notebook.
# The class is exposed as `pandas.errors.SettingWithCopyWarning` on newer pandas
# releases, while older releases only provide it under `pandas.core.common`.
# Try the public location first, fall back to the legacy one, and skip the
# filter entirely if the installed pandas no longer defines the warning.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [2]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset
from sklearn.metrics import f1_score

In [3]:
# Put the project's "Neural Networks" directory at the front of sys.path, then
# import the dataset class, the model class and the training helpers through
# the `ipynb.fs.defs` loader.
# NOTE(review): this is an absolute, machine-specific WSL path; the notebook
# will not import these names on another machine without editing it.
nn_notebook_dir = '//wsl$/Ubuntu/home/zaidek/Part-II-Disertation/Neural Networks'
sys.path.insert(0, nn_notebook_dir)

from ipynb.fs.defs.datasets import NonEmbeddingDataset
from ipynb.fs.defs.models import FFNonEmbedding
from ipynb.fs.defs.FFtraining import define_sampler, train_model_bert

In [4]:
# Load the pickled training and validation splits from ../data.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# training split
training_data = _load_pickle("../data/data_train")

# validation split
validation_data = _load_pickle("../data/data_valid")

In [5]:
# Report which device torch would prefer (CUDA when available, otherwise CPU).
detected_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(detected_device)

# NOTE(review): training is pinned to the CPU regardless of what was detected
# above, so the printed device is informational only. Confirm this override is
# intentional before relying on GPU training.
device = "cpu"

cuda


In [6]:
# Give both splits a fresh 0..n-1 index; drop=True discards the old index
# instead of keeping it as a column.
train_data_indexed, valid_data_indexed = (
    split.reset_index(drop=True) for split in (training_data, validation_data)
)

In [7]:
# Columns selected from the raw frames as model inputs.
req_cols = [
    "title",
    "text",
    "url",
    "time",
    "descendants",
]

In [8]:
# Split each frame into its label column (`score`) and the feature columns
# listed in `req_cols`.
# The feature frames are explicit copies: later cells assign into them, and
# assigning into a bare `frame[req_cols]` selection is what makes pandas emit
# SettingWithCopyWarning.
scores = train_data_indexed["score"]
training_data_indexed = train_data_indexed[req_cols].copy()

validation_scores = valid_data_indexed["score"]
validation_data_indexed = valid_data_indexed[req_cols].copy()

In [9]:
# Replace missing values column by column: empty string for title/text/url and
# 0 for time/descendants.
# `DataFrame.fillna` with a dict only touches the listed columns and returns a
# new frame, so this cell is safe to re-run and does not assign into a slice
# (the pattern that triggers SettingWithCopyWarning).
fill_values = {
    "title": "",
    "text": "",
    "url": "",
    "time": 0,
    "descendants": 0,
}

training_data_indexed = training_data_indexed.fillna(value=fill_values)
validation_data_indexed = validation_data_indexed.fillna(value=fill_values)

In [10]:
# Hyper-parameters shared by the samplers, datasets, loaders and training run.
# `cutoff` is handed to both define_sampler and NonEmbeddingDataset
# (presumably the score threshold used to label examples; confirm in those
# definitions).
cutoff, batch_size, num_epochs = 20, 64, 5

# One sampler per split, built from that split's scores. They are only passed
# to the DataLoaders when `use_sampler` is switched on.
train_sampler, valid_sampler = (
    define_sampler(split_scores, cutoff)
    for split_scores in (scores, validation_scores)
)

In [11]:
# Wrap each split's features and scores in a NonEmbeddingDataset, passing the
# shared `cutoff` to both.
dataset_train = NonEmbeddingDataset(
    training_data_indexed,
    scores,
    cutoff,
)
dataset_valid = NonEmbeddingDataset(
    validation_data_indexed,
    validation_scores,
    cutoff,
)

In [12]:
# Build the batched loaders for both splits.
# With `use_sampler` on, batches are drawn through the samplers defined
# earlier; otherwise each loader simply reshuffles its dataset every epoch.
use_sampler = False

if use_sampler:
    train_order = {"sampler": train_sampler}
    valid_order = {"sampler": valid_sampler}
else:
    train_order = {"shuffle": True}
    valid_order = {"shuffle": True}

train_loader = DataLoader(dataset=dataset_train, batch_size=batch_size, **train_order)
valid_loader = DataLoader(dataset=dataset_valid, batch_size=batch_size, **valid_order)

In [13]:
# Model dimensions: 8 inputs per example, a single output.
input_dim, output_dim = 8, 1

# Instantiate the FFNonEmbedding model. Note the argument order: the output
# dimension is passed first, then the input dimension.
# NOTE(review): no RNG seed is set in the visible cells, so the initial weights
# (and the loader shuffling) will differ from run to run.
model = FFNonEmbedding(output_dim, input_dim)

In [14]:
# Step size used by the optimizer.
learning_rate = 0.01

# Binary cross-entropy computed on raw logits (the sigmoid is applied inside
# the loss).
loss_func = nn.BCEWithLogitsLoss()

# Adam over all of the model's parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [15]:
# Run training for `num_epochs` epochs on `device`. The call returns the
# trained model together with the training and validation loss histories.
training_outputs = train_model_bert(
    model,
    train_loader,
    valid_loader,
    loss_func,
    optimizer,
    device,
    num_epochs,
)
model_final, train_losses, valid_losses = training_outputs

Epoch 0.
Iteration 100. Current Min Loss 1.913975426480174. Running F1 0.18060826669778077.
Iteration 200. Current Min Loss 0.1577922039339319. Running F1 0.6025077634783513.
Iteration 300. Current Min Loss 0.13702458601677792. Running F1 0.674258992469518.
Iteration 400. Current Min Loss 0.09911089092027396. Running F1 0.7537086553797075.
Iteration 500. Current Min Loss 0.12762739507481455. Running F1 0.7290637053005469.
Iteration 600. Current Min Loss 0.10747203032951802. Running F1 0.7366678044178039.
Iteration 700. Current Min Loss 0.10366004472016357. Running F1 0.7918350538350534.
Iteration 800. Current Min Loss 0.12251333357504336. Running F1 0.7536077333742967.
Iteration 900. Current Min Loss 0.10238334556110203. Running F1 0.72246905218964.
Iteration 1000. Current Min Loss 0.1010041024396196. Running F1 0.7632970145609461.
Iteration 1100. Current Min Loss 0.10825303668156266. Running F1 0.7617843884701463.
Iteration 1200. Current Min Loss 0.0980793597176671. Running F1 0.78421

In [None]:
# Persist the training loss history to losses/LossWithComments.
# Create the target directory first: opening a file for writing inside a
# missing directory raises FileNotFoundError, which would discard the result
# of a long training run.
# NOTE(review): only `train_losses` is saved here; `valid_losses` and
# `model_final` are not written by this cell.
os.makedirs("losses", exist_ok=True)
with open("losses/LossWithComments", "wb") as fb:
    pickle.dump(train_losses, fb)