In [1]:
# import standard libraries
import os
import numpy as np
import sys

import warnings
import pandas as pd

from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [2]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset
from sklearn.metrics import f1_score

In [3]:
# import dataset and models used
sys.path.insert(0, '//wsl$/Ubuntu/home/zaidek/Part-II-Disertation/Neural Networks')
from ipynb.fs.defs.datasets import BertProcessedTitleEmbeddingDataset, BertProcessedTitleEmbeddingDataset2, BertProcessedTitleEmbeddingDataset2_reg
from ipynb.fs.defs.models import DecoderTransformer
from ipynb.fs.defs.FFtraining import train_model_bert, train_model_bert_reg
from ipynb.fs.defs.FFtraining import define_sampler

In [4]:
# import training data
with open("../../data/data_train", "rb") as fb:
    training_data = pickle.load(fb)

# import validation data
with open("../../data/data_valid", "rb") as fb:
    validation_data = pickle.load(fb)

In [5]:
# try set gpu as training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
# reindex data
train_data_indexed = training_data.reset_index(drop=True)
valid_data_indexed = validation_data.reset_index(drop=True)

In [7]:
# define cols which are used in model
req_cols_without_url = ["title", "text", "time", "url"]

In [8]:
# extract specific cols needed
scores = train_data_indexed.score.fillna(0)
training_data_indexed = train_data_indexed[req_cols_without_url]

validation_scores = valid_data_indexed.score.fillna(0)
validation_data_indexed = valid_data_indexed[req_cols_without_url]

In [9]:
# fill nan values in normal data
training_data_indexed.title = training_data_indexed.title.fillna("")
training_data_indexed.text = training_data_indexed.text.fillna("")
training_data_indexed.url = training_data_indexed.url.fillna("")
training_data_indexed.time = training_data_indexed.time.fillna(0)

validation_data_indexed.title = validation_data_indexed.title.fillna("")
validation_data_indexed.text = validation_data_indexed.text.fillna("")
validation_data_indexed.url = validation_data_indexed.url.fillna("")
validation_data_indexed.time = validation_data_indexed.time.fillna(0)

In [10]:
# define loader parameters
cutoff = 2
batch_size = 128
num_epochs = 10

train_sampler = define_sampler(scores, cutoff)
valid_sampler = define_sampler(validation_scores, cutoff)

In [11]:
# create bert embedding datasets
dataset_train = BertProcessedTitleEmbeddingDataset(training_data_indexed, scores, cutoff)
dataset_valid = BertProcessedTitleEmbeddingDataset(validation_data_indexed, validation_scores, cutoff)

In [12]:
# define batched bert embedding loaders
use_sampler = False
train_loader, valid_loader = None, None
train_loader_2, valid_loader_2 = None, None
if use_sampler:
    train_loader = torch.utils.data.DataLoader(dataset=dataset_train, batch_size=batch_size, sampler=train_sampler)
    valid_loader = torch.utils.data.DataLoader(dataset=dataset_valid, batch_size=batch_size, sampler=valid_sampler)
else:
    train_loader = torch.utils.data.DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(dataset=dataset_valid, batch_size=batch_size, shuffle=True)


In [13]:
with open("../data/embeddings/bert_train_embeddings", "rb") as fb:
    train_embed = pickle.load(fb)

with open("../data/embeddings/bert_valid_embeddings", "rb") as fb:
    valid_embed = pickle.load(fb)

In [14]:
dataset_train2 = BertProcessedTitleEmbeddingDataset2_reg(training_data_indexed, scores, cutoff, train_embed)
dataset_valid2 = BertProcessedTitleEmbeddingDataset2_reg(validation_data_indexed, validation_scores, cutoff, valid_embed)
train_loader2 = torch.utils.data.DataLoader(dataset=dataset_train2, batch_size=batch_size, shuffle=True)
valid_loader2 = torch.utils.data.DataLoader(dataset=dataset_valid2, batch_size=batch_size, shuffle=True)

In [18]:
# define dimensions of RNN model
embedding_dim = 768
output_dim = 1
num_heads = 32
num_layers = 64

# instantiate bert model
model = DecoderTransformer(embedding_dim, output_dim, num_heads, num_layers)

In [19]:
# define loss function
loss_func1 = nn.MSELoss()

# define paramters for optimizers
learning_rate = 0.01

# define basic optimizer class
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [20]:
# train model
output1 = train_model_bert_reg(model, train_loader2, valid_loader2, loss_func1, optimizer, device, num_epochs, "trans_model_1.pth")

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 0 Iteration 100. Current Loss 103573.55388305664
Epoch 0 Iteration 200. Current Loss 1877.2783251953124
Epoch 0 Iteration 300. Current Loss 1918.9999201202393
Epoch 0 Iteration 400. Current Loss 2244.76918674469
Epoch 0 Iteration 500. Current Loss 2519.0420249176027
Epoch 0 Iteration 600. Current Loss 1549.9229001617432
Epoch 0 Iteration 700. Current Loss 1697.536490020752
Epoch 0 Iteration 800. Current Loss 1548.0922674560547
Epoch 0 Iteration 900. Current Loss 1648.2868260192872
Epoch 0 Iteration 1000. Current Loss 2670.9844153594972
Epoch 0 Iteration 1100. Current Loss 1766.6497334957123
Epoch 0 Iteration 1200. Current Loss 2546.9122151184083
Epoch 0 Iteration 1300. Current Loss 1984.979910736084
Epoch 0 Iteration 1400. Current Loss 1797.704284210205
Epoch 0 Iteration 1500. Current Loss 1996.5664988899232
Epoch 0 Iteration 1600. Current Loss 1659.235429725647
Epoch 0 Iteration 1700. Current Loss 1585.9060993957519
Epoch 0 Iteration 1800. Current Loss 1913.2326282501222
Epoch 0

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 1 Iteration 100. Current Loss 2930.1646717834474
Epoch 1 Iteration 200. Current Loss 2245.931590576172
Epoch 1 Iteration 300. Current Loss 1689.203221130371
Epoch 1 Iteration 400. Current Loss 1632.6535629272462
Epoch 1 Iteration 500. Current Loss 1769.0015618896484
Epoch 1 Iteration 600. Current Loss 1694.148997039795
Epoch 1 Iteration 700. Current Loss 2197.4529917907716
Epoch 1 Iteration 800. Current Loss 1614.9022577667236
Epoch 1 Iteration 900. Current Loss 2042.320186920166
Epoch 1 Iteration 1000. Current Loss 1809.8966709899903
Epoch 1 Iteration 1100. Current Loss 2366.632477416992
Epoch 1 Iteration 1200. Current Loss 1608.729524230957
Epoch 1 Iteration 1300. Current Loss 1974.9254376220704
Epoch 1 Iteration 1400. Current Loss 1777.857006072998
Epoch 1 Iteration 1500. Current Loss 2080.9784188079834
Epoch 1 Iteration 1600. Current Loss 1814.9807354736329
Epoch 1 Iteration 1700. Current Loss 1901.5679830932618
Epoch 1 Iteration 1800. Current Loss 1770.7316779327393
Epoch 1 

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 2 Iteration 100. Current Loss 1901.8561046600341
Epoch 2 Iteration 200. Current Loss 2230.2563989257815
Epoch 2 Iteration 300. Current Loss 66866595074.70795
Epoch 2 Iteration 400. Current Loss 123835630.52313347
Epoch 2 Iteration 500. Current Loss 2432.776856536865
Epoch 2 Iteration 600. Current Loss 1736.779519882202
Epoch 2 Iteration 700. Current Loss 1259.1922903442382
Epoch 2 Iteration 800. Current Loss 1553.5589575195313
Epoch 2 Iteration 900. Current Loss 1707.8379899597169
Epoch 2 Iteration 1000. Current Loss 1879.4169651794434
Epoch 2 Iteration 1100. Current Loss 1507.6143553161621
Epoch 2 Iteration 1200. Current Loss 1381.2274520874023
Epoch 2 Iteration 1300. Current Loss 2136.9699224853516
Epoch 2 Iteration 1400. Current Loss 1740.3048405456543
Epoch 2 Iteration 1500. Current Loss 2864.837562255859
Epoch 2 Iteration 1600. Current Loss 1795.9465102386475
Epoch 2 Iteration 1700. Current Loss 2116.3399160766603
Epoch 2 Iteration 1800. Current Loss 1938.0806842041015
Epoch

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 3 Iteration 100. Current Loss 1703.7881631469727
Epoch 3 Iteration 200. Current Loss 1936.6833955383302
Epoch 3 Iteration 300. Current Loss 1643.4220068359375
Epoch 3 Iteration 400. Current Loss 1629.3242471313476
Epoch 3 Iteration 500. Current Loss 2561.2080168151856
Epoch 3 Iteration 600. Current Loss 1657.9714855194093
Epoch 3 Iteration 700. Current Loss 1622.9184841918945
Epoch 3 Iteration 800. Current Loss 2203.1206355285644
Epoch 3 Iteration 900. Current Loss 1945.8044847106933
Epoch 3 Iteration 1000. Current Loss 2072.052545928955
Epoch 3 Iteration 1100. Current Loss 2012.7938282775879
Epoch 3 Iteration 1200. Current Loss 1930.7905001831055
Epoch 3 Iteration 1300. Current Loss 2368.0141883850097
Epoch 3 Iteration 1400. Current Loss 2632.8247146606445
Epoch 3 Iteration 1500. Current Loss 1487.759104309082
Epoch 3 Iteration 1600. Current Loss 1880.0157418060303
Epoch 3 Iteration 1700. Current Loss 2563.6964851379394
Epoch 3 Iteration 1800. Current Loss 1664.7711729431153
Epo

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 4 Iteration 100. Current Loss 1812.7016822052
Epoch 4 Iteration 200. Current Loss 1764.0617043304444
Epoch 4 Iteration 300. Current Loss 1362.4637509155273
Epoch 4 Iteration 400. Current Loss 1526.0166744995117
Epoch 4 Iteration 500. Current Loss 1751.052608947754
Epoch 4 Iteration 600. Current Loss 1806.4171142578125
Epoch 4 Iteration 700. Current Loss 2161.7858015441893
Epoch 4 Iteration 800. Current Loss 2355.667545928955
Epoch 4 Iteration 900. Current Loss 1709.2313975524903
Epoch 4 Iteration 1000. Current Loss 2046.9478967285156
Epoch 4 Iteration 1100. Current Loss 1597.1304039001466
Epoch 4 Iteration 1200. Current Loss 2005.7264631652831
Epoch 4 Iteration 1300. Current Loss 2629.7787330627443
Epoch 4 Iteration 1400. Current Loss 1931.8144386291503
Epoch 4 Iteration 1500. Current Loss 1567.0761935424805
Epoch 4 Iteration 1600. Current Loss 2066.6089669799803
Epoch 4 Iteration 1700. Current Loss 2045.1845978546144
Epoch 4 Iteration 1800. Current Loss 1732.0966780090332
Epoch 

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 5 Iteration 100. Current Loss 1921.0307737731935
Epoch 5 Iteration 200. Current Loss 1885.4685522460939
Epoch 5 Iteration 300. Current Loss 1634.4609255981445
Epoch 5 Iteration 400. Current Loss 1965.6646715545655
Epoch 5 Iteration 500. Current Loss 1932.7210165405274
Epoch 5 Iteration 600. Current Loss 2120.126061401367
Epoch 5 Iteration 700. Current Loss 1814.7299739074706
Epoch 5 Iteration 800. Current Loss 2498.7688246154785
Epoch 5 Iteration 900. Current Loss 1701.6039819335938
Epoch 5 Iteration 1000. Current Loss 1964.7115675354005
Epoch 5 Iteration 1100. Current Loss 1917.7548678588867
Epoch 5 Iteration 1200. Current Loss 1782.4431748199463
Epoch 5 Iteration 1300. Current Loss 1925.1645681762695
Epoch 5 Iteration 1400. Current Loss 1774.2341038513184
Epoch 5 Iteration 1500. Current Loss 1567.4532152557374
Epoch 5 Iteration 1600. Current Loss 2249.694656829834
Epoch 5 Iteration 1700. Current Loss 1935.9840090942382
Epoch 5 Iteration 1800. Current Loss 1510.0232847595214
Epo

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 6 Iteration 100. Current Loss 1174.4696801757811
Epoch 6 Iteration 200. Current Loss 1598.4098725128174
Epoch 6 Iteration 300. Current Loss 1685.417801513672
Epoch 6 Iteration 400. Current Loss 2379.475118560791
Epoch 6 Iteration 500. Current Loss 1885.2468921661377
Epoch 6 Iteration 600. Current Loss 1812.48424118042
Epoch 6 Iteration 700. Current Loss 1303.7437992858886
Epoch 6 Iteration 800. Current Loss 2116.92976852417
Epoch 6 Iteration 900. Current Loss 1501.8496436309815
Epoch 6 Iteration 1000. Current Loss 2055.226533050537
Epoch 6 Iteration 1100. Current Loss 1438.2738018798827
Epoch 6 Iteration 1200. Current Loss 1817.9561529541015
Epoch 6 Iteration 1300. Current Loss 2411.236504745483
Epoch 6 Iteration 1400. Current Loss 2078.1367500305178
Epoch 6 Iteration 1500. Current Loss 2228.1750202178955
Epoch 6 Iteration 1600. Current Loss 1782.8249324798585
Epoch 6 Iteration 1700. Current Loss 1976.9664915466308
Epoch 6 Iteration 1800. Current Loss 1580.4935113525391
Epoch 6 I

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 7 Iteration 100. Current Loss 1636.759187927246
Epoch 7 Iteration 200. Current Loss 1612.8177693939208
Epoch 7 Iteration 300. Current Loss 1767.687681274414
Epoch 7 Iteration 400. Current Loss 2218.636951599121
Epoch 7 Iteration 500. Current Loss 1898.9251077270508
Epoch 7 Iteration 600. Current Loss 1958.3927219390869
Epoch 7 Iteration 700. Current Loss 2398.0738356018064
Epoch 7 Iteration 800. Current Loss 2154.3636840057375
Epoch 7 Iteration 900. Current Loss 1751.1670320129394
Epoch 7 Iteration 1000. Current Loss 2074.4287130737303
Epoch 7 Iteration 1100. Current Loss 1735.6760382080079
Epoch 7 Iteration 1200. Current Loss 1741.8933346557617
Epoch 7 Iteration 1300. Current Loss 1813.1460809326172
Epoch 7 Iteration 1400. Current Loss 1693.171841430664
Epoch 7 Iteration 1500. Current Loss 1869.0931117248535
Epoch 7 Iteration 1600. Current Loss 1455.5365823364257
Epoch 7 Iteration 1700. Current Loss 2058.6626184082033
Epoch 7 Iteration 1800. Current Loss 2907.379596862793
Epoch 

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 8 Iteration 100. Current Loss 2155.1449098205567
Epoch 8 Iteration 200. Current Loss 2209.8414389038085
Epoch 8 Iteration 300. Current Loss 1865.2110083007813
Epoch 8 Iteration 400. Current Loss 1597.6222231292725
Epoch 8 Iteration 500. Current Loss 1726.3500709533691
Epoch 8 Iteration 600. Current Loss 1890.663244781494
Epoch 8 Iteration 700. Current Loss 2599.9526986694336
Epoch 8 Iteration 800. Current Loss 1397.2021292114257
Epoch 8 Iteration 900. Current Loss 1726.3768812561036
Epoch 8 Iteration 1000. Current Loss 1602.4065759277344
Epoch 8 Iteration 1100. Current Loss 1856.863456954956
Epoch 8 Iteration 1200. Current Loss 1428.7755503082276
Epoch 8 Iteration 1300. Current Loss 1724.9766982269286
Epoch 8 Iteration 1400. Current Loss 1786.4548209381103
Epoch 8 Iteration 1500. Current Loss 1716.5426489257814
Epoch 8 Iteration 1600. Current Loss 1653.5596071624757
Epoch 8 Iteration 1700. Current Loss 1637.5786198425294
Epoch 8 Iteration 1800. Current Loss 2727.220723724365
Epoc

  0%|          | 0/3831 [00:00<?, ?it/s]

Epoch 9 Iteration 100. Current Loss 2166.794865951538
Epoch 9 Iteration 200. Current Loss 1478.504774017334
Epoch 9 Iteration 300. Current Loss 1338.1061249542236
Epoch 9 Iteration 400. Current Loss 1471.4172843170165
Epoch 9 Iteration 500. Current Loss 1920.4941537475586
Epoch 9 Iteration 600. Current Loss 1856.6884677124024
Epoch 9 Iteration 700. Current Loss 2218.5166842651365
Epoch 9 Iteration 800. Current Loss 1531.672385559082
Epoch 9 Iteration 900. Current Loss 2216.0451028442385
Epoch 9 Iteration 1000. Current Loss 2497.1751039123537
Epoch 9 Iteration 1100. Current Loss 1767.3694050598144
Epoch 9 Iteration 1200. Current Loss 1860.7228514099122
Epoch 9 Iteration 1300. Current Loss 1833.9656491088867
Epoch 9 Iteration 1400. Current Loss 1605.373692779541
Epoch 9 Iteration 1500. Current Loss 2013.9069773864746
Epoch 9 Iteration 1600. Current Loss 1640.810926055908
Epoch 9 Iteration 1700. Current Loss 3173.559717025757
Epoch 9 Iteration 1800. Current Loss 1840.6224784088135
Epoch 9

In [22]:
model, train_losses, valid_losses = output1

In [24]:
with open("../losses/transformer_train_loss1", "wb") as fb:
    pickle.dump(train_losses, fb)

with open("../losses/transformer_valid_loss1", "wb") as fb:
    pickle.dump(valid_losses, fb)