In [10]:
import numpy as np
import pandas as pd
from os.path import join
from datetime import datetime
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from utils.architectures import SectorModuleV2
from pytorch_lightning import loggers as pl_loggers

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything, LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint

from transformers import BertModel
from utils.data import RelevantDatasetV2

device = "cuda" if torch.cuda.is_available() else "cpu"

from transformers import BertModel
bert_id = "google/bert_uncased_L-2_H-128_A-2"

notebook_datetime = datetime.now()
seed_everything(1337)

Global seed set to 1337


1337

## Dataset

In [37]:
class RelevantDatasetV2(Dataset):
    def __init__(
        self,
        dataset: str,
        target_mode: str = "isrelevant",
        device: str = "cpu",
        dimensions: tuple = None,
        load_only_relevant: bool = False
    ):
        """Constructor Function
        Parameters
        ----------
        dataset : str
            Decides which dataset will be loaded. Can be either "train", "test" or "val".
        target_mode : str
            Decides which target is returned in the __getitem__ function.
            Can be either "isrelevant", "sentencetype" or "both".TODO:!!!!
        device : str
            Decides on which device the torch tensors will be returned.
        dimensions : tuple
            The dimensions to use for returning one hot encodings.
        load_only_relevant : bool
            If true the Dataset will only contain samples with the "relevant" target equal True.
        """ 

        if dataset == "train":
            joint_dataframe = pd.read_hdf("./preprocessed_data/train_joint.h5", key="s")
        if dataset == "val":
            joint_dataframe = pd.read_hdf("./preprocessed_data/validation_joint.h5", key="s")
            if not dimensions:
                raise TypeError("Dimensions attribute is required for dataset type \"validation\".")
        if dataset == "test":
            joint_dataframe = pd.read_hdf("./preprocessed_data/test_joint.h5", key="s")
            if not dimensions:
                raise TypeError("Dimensions attribute is required for dataset type \"test\".")
        if load_only_relevant:
            joint_dataframe = joint_dataframe[joint_dataframe["is_relevant"] == True]
        self.target_mode = target_mode
                      
        if target_mode == "isrelevant":
            self.X = joint_dataframe[["sentence_position",
                                      "sentence_length",
                                      "tokenized_sentence", 
                                      "project_name", 
                                      "country_code",
                                      "url",
                                      "text_length",
                                      "sentence_count",
                                      "bert_sum",
                                      "is_relevant"]].to_numpy()
            if dimensions is None:
                self.dimensions = ((1, (4, 
                                        len(set(self.X[:,3])), 
                                        len(set(self.X[:,4])), 
                                        len(set(self.X[:,5]))+1,
                                        len(self.X[0][-2]))
                                   ),
                                   1)
            else:
                self.dimensions = dimensions

        if target_mode == "sentencetype":
            self.X = joint_dataframe[joint_dataframe["is_relevant"] == 1][["sentence_position",
                                                                          "sentence_length",
                                                                          "tokenized_sentence", 
                                                                          "project_name", 
                                                                          "country_code",
                                                                          "url",
                                                                          "text_length",
                                                                          "sentence_count",
                                                                          "bert_sum",
                                                                          "is_relevant",
                                                                          "sector_ids"]]
            self.X.loc[self.X["sector_ids"].apply(len) == 0, "sector_ids"] = 11
            self.X["sector_ids"] = self.X["sector_ids"].apply(lambda x: x[0] if type(x) != int else x)
            self.X = self.X[self.X["is_relevant"] == 1].to_numpy()
            if dimensions is None:
                self.dimensions = ((1, (4, 
                                        len(set(joint_dataframe.loc[:, "project_name"])), 
                                        len(set(joint_dataframe.loc[:, "country_code"])),
                                        len(set(joint_dataframe.loc[:, "url"]))+1,
                                        len(self.X[0, -3])
                                       )
                                   ),
                                   len(set(self.X[:, -1])))
            else:
                self.dimensions = dimensions
            
        self.device = device
        
        
    def __len__(self):
        return len(self.X)
    
    
    def __getitem__(self, idx, x_one_hot = True, x_train_ready = True):
        
        """
        Note that x_train_ready implies x_one_hot
        """
        x_tmp = self.X[idx]
        bert_sum = torch.from_numpy(x_tmp[8]).to(self.device)
        metric_x = torch.tensor([x_tmp[0], x_tmp[1], x_tmp[6], x_tmp[7]], device=self.device)#numerical features
        metric_x = torch.cat((metric_x, bert_sum))
        sentence_x = torch.tensor(x_tmp[2], device=self.device, dtype=torch.long)#bert features
        sentence_x = torch.cat((sentence_x, 
                                torch.zeros(512 - sentence_x.shape[0],
                                            device=self.device, 
                                            dtype= torch.long)))
        #one hot features:
        project_name_x = torch.tensor(x_tmp[3], device=self.device, dtype=torch.long)
        country_code_x = torch.tensor(x_tmp[4], device=self.device, dtype=torch.long)
        url_x = torch.tensor(x_tmp[5], device=self.device)
        y = torch.tensor(x_tmp[-1], device=self.device, dtype=torch.long)
        if x_train_ready or x_one_hot:
            project_name_x = nn.functional.one_hot(project_name_x, num_classes = self.dimensions[0][1][1])
            country_code_x = nn.functional.one_hot(country_code_x, num_classes = self.dimensions[0][1][2])
            url_x = nn.functional.one_hot(url_x, num_classes = self.dimensions[0][1][3])
        if x_train_ready:
            x_other = torch.cat((metric_x, project_name_x, country_code_x, url_x), dim=0).float()
            return (sentence_x, x_other), y
        
        return (sentence_x, (metric_x, project_name_x, country_code_x, url_x)), y

In [29]:
train_ds = RelevantDatasetV2(dataset="train", 
                           target_mode="sentencetype"
                          )
valid_ds = RelevantDatasetV2(dataset="val", 
                           target_mode="sentencetype",
                           dimensions = train_ds.dimensions
                          )

In [30]:
import pickle

pickle.dump(train_ds, open("train_ds", "wb"))
pickle.dump(valid_ds, open("valid_ds", "wb"))

## Hyperparams

In [31]:
start_lr = 1e-4
batch_size = 16

## Loader

In [32]:
train_dl = DataLoader(train_ds,batch_size  = batch_size, shuffle=True)
valid_dl = DataLoader(train_ds, batch_size  = 64, shuffle=False)

## Model

In [33]:
model = SectorModuleV2(
    bert = BertModel.from_pretrained(bert_id).to(device),
    input_size = sum(train_ds.dimensions[0][1]), 
    output_size = train_ds.dimensions[1],
    start_lr=start_lr
)

## Logging

In [34]:
architecture_name = model.__class__.__name__
logdir = join("logs", architecture_name, notebook_datetime.strftime("%Y-%m-%dT%H-%M-%S"))
print(f"Logging to {logdir}")
Path(logdir).mkdir(parents=True, exist_ok=True)

tb_logger = pl_loggers.TensorBoardLogger(logdir, name="", version="")

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath=logdir,
    verbose=True,
    save_last=True,
    save_top_k=-1, #save all
    mode="min",
    filename='-{epoch:02d}-{val_loss:.2f}'
)

Logging to logs/SectorModuleV2/2021-04-28T12-20-45


## Trainer

In [35]:
trainer = Trainer(
#     gpus=1,
    gpus=0,
#     precision=16, 
    logger=tb_logger, 
    callbacks=[checkpoint_callback],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


## Executing run

In [36]:
trainer.fit(model, train_dl, valid_dl)


  | Name              | Type       | Params
-------------------------------------------------
0 | bert              | BertModel  | 4.4 M 
1 | linear_after_bert | Linear     | 33.0 K
2 | feed_forward      | Sequential | 845 K 
-------------------------------------------------
5.3 M     Trainable params
0         Non-trainable params
5.3 M     Total params
21.059    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Saving latest checkpoint...
Epoch 0, global step 21: val_loss reached 2.64347 (best 2.64347), saving model to "/home/loerinczy/Desktop/CHEERS_Challenge/CHEERS_challenge_round_1/logs/SectorModuleV2/2021-04-28T12-20-45/-epoch=00-val_loss=2.64.ckpt" as top 1


1