In [1]:
import numpy as np
import pandas as pd
from os.path import join
from datetime import datetime
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import loggers as pl_loggers

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything, LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint

from transformers import BertModel
from utils.data import RelevantDataset

device = "cuda" if torch.cuda.is_available() else "cpu"

from transformers import BertModel
bert_id = "google/bert_uncased_L-2_H-128_A-2"

notebook_datetime = datetime.now()
seed_everything(1337)

import torch
from torch import nn
import torch.nn.functional as F

from pytorch_lightning import LightningModule

from transformers import BertModel

Global seed set to 1337


# Loading the data

## Dataset

In [109]:
class RelevantDataset(Dataset):
    def __init__(
        self,
        dataset: str,
        target_mode: str = "isrelevant",
        device: str = "cpu",
        dimensions: tuple = None,
        load_only_relevant: bool = False
    ):
        """Constructor Function
        Parameters
        ----------
        dataset : str
            Decides which dataset will be loaded. Can be either "train", "test" or "val".
        target_mode : str
            Decides which target is returned in the __getitem__ function. Can be either "isrelevant", "sentencetype" or "both".TODO:!!!!
        device : str
            Decides on which device the torch tensors will be returned.
        dimensions : tuple
            The dimensions to use for returning one hot encodings.
        load_only_relevant : bool
            If true the Dataset will only contain samples with the "relevant" target equal True.
        """ 

        if dataset == "train":
            joint_dataframe = pd.read_hdf("./preprocessed_data/train_joint.h5", key="s")
        if dataset == "val":
            joint_dataframe = pd.read_hdf("./preprocessed_data/validation_joint.h5", key="s")
            if not dimensions:
                raise TypeError("Dimensions attribute is required for dataset type \"validation\".")
        if dataset == "test":
            joint_dataframe = pd.read_hdf("./preprocessed_data/test_joint.h5", key="s")
            if not dimensions:
                raise TypeError("Dimensions attribute is required for dataset type \"test\".")
        if load_only_relevant:
            joint_dataframe = joint_dataframe[joint_dataframe["is_relevant"] == True]
        self.target_mode = target_mode
    
    
    
        self.X = joint_dataframe[["sentence_position", "sentence_length", "tokenized_sentence", "project_name", "country_code", "url", "text_length", "sentence_count"]].to_numpy()
          
        if self.target_mode == "isrelevant":
            self.Y = joint_dataframe["is_relevant"].to_numpy()
            if dimensions is None:
                self.dimensions = ((1, (4, len(set(self.X[:,3])), len(set(self.X[:,4])), len(set(self.X[:,5])))), 1)
            else:
                self.dimensions = dimensions

        if self.target_mode == "sentencetype":
            self.Y = joint_dataframe["sector_ids"].to_numpy()
            if dimensions is None:
                self.dimensions = ((1, (4, len(set(self.X[:,3])), len(set(self.X[:,4])), len(set(self.X[:,5])))), 1)
            else:
                self.dimensions = dimensions
            
        self.device = device
        
    def __len__(self):
        return len(self.Y)

    
    def __getitem__(self, idx, x_one_hot = True, x_train_ready = True):
        
        """
        Note that x_train_ready implies x_one_hot
        """
        x_tmp = self.X[idx]
        metric_x = torch.tensor([x_tmp[0], x_tmp[1], x_tmp[6], x_tmp[7]], device=self.device)#numerical features
        sentence_x = torch.tensor(x_tmp[2], device=self.device, dtype=torch.long)#bert features
        sentence_x = torch.cat((sentence_x, torch.zeros(512 - sentence_x.shape[0], device=self.device, dtype= torch.long)))
        
        #one hot features:
        project_name_x = torch.tensor(x_tmp[3], device=self.device, dtype=torch.long)
        country_code_x = torch.tensor(x_tmp[4], device=self.device, dtype=torch.long)
        url_x = torch.tensor(x_tmp[5], device=self.device)
        
        y = torch.tensor(self.Y[idx], device=self.device, dtype=torch.long)

        if x_train_ready or x_one_hot:
            project_name_x = nn.functional.one_hot(project_name_x, num_classes = self.dimensions[0][1][1])
            country_code_x = nn.functional.one_hot(country_code_x, num_classes = self.dimensions[0][1][2])
            url_x = nn.functional.one_hot(url_x, num_classes = self.dimensions[0][1][3])
        if x_train_ready:
            x_other = torch.cat((metric_x, project_name_x, country_code_x, url_x), dim=0)
            if self.target_mode == "isrelevant":
                return (sentence_x, x_other), y
            if self.target_mode == "sentencetype":
                return (sentence_x, x_other), torch.tensor(float(len(y)))
        if self.target_mode == "isrelevant":
            return (sentence_x, (metric_x, project_name_x, country_code_x, url_x)), y
        if self.target_mode == "sentencetype":
            return (sentence_x, (metric_x, project_name_x, country_code_x, url_x)), torch.tensor(float(len(y)))

In [110]:
train_ds = RelevantDataset(dataset="train", 
                           target_mode="sentencetype"
                          )
valid_ds = RelevantDataset(dataset="val", 
                           target_mode="sentencetype",
                           dimensions = train_ds.dimensions
                          )

# Hyperparams

In [121]:
start_lr = 1e-4
batch_size = 16

## Loader

In [122]:
train_dl = DataLoader(train_ds,batch_size  = batch_size, shuffle=True)
valid_dl = DataLoader(train_ds, batch_size  = 64, shuffle=False)

In [123]:
import pickle

In [129]:
pickle.dump(train_dl, open("train_dl", "wb"))
pickle.dump(valid_dl, open("valid_dl", "wb"))

# Training

## Model def

In [113]:
class SectorModuleV00(LightningModule):
    """Simple implementation of a sector module

    Does not implment weighting for samples with multiple sector ids
    """

    def __init__(self, bert: BertModel, input_size: int, output_size: int, start_lr=1e-4):
        super().__init__()        
        self.bert = bert
        self.linear_after_bert = nn.Linear(bert.config.hidden_size, 256)
        self.feed_forward = nn.Sequential(
            nn.Linear(256 + input_size, 1024),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(64, output_size)
        )
        self.loss = nn.MSELoss()
        
        
        for m in self.feed_forward:
            if type(m) is nn.Linear:
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                nn.init.constant_(m.bias, 0)
        
        self.start_lr = start_lr

    def forward(self, x):
        x_bert = x[0]
        x_other = x[1]

        x_bert = self.bert(x_bert)["last_hidden_state"][:,0] #all batches but only clf output
        x_bert = self.linear_after_bert(x_bert)
        x_bert = torch.relu(x_bert)#is new (not sure if improves by much)

        x = torch.cat((x_bert, x_other), dim=1)#dim=1 is feature dimensions (0 is batch dim)

        return self.feed_forward(x)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.start_lr)
        #return optimizer
        return {
           'optimizer': optimizer,
           'lr_scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, factor =.2, patience =1, cooldown =2, min_lr =1e-6),
           'monitor': 'val_loss'
       }
       
    def training_step(self, train_batch, batch_idx):
        x, y = train_batch

        y_hat = self.forward(x)

        loss = self.loss(y_hat, y)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        y_hat = self.forward(x)
        print(y_hat, y)
        loss = self.loss(y_hat, y)
        acc = torch.sum(torch.argmax(y_hat, dim=1) == y) / len(y)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log("val_acc", acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)

In [114]:
# from utils.architectures import SectorModuleV00

model = SectorModuleV00(
    bert = BertModel.from_pretrained(bert_id).to(device),
    input_size = sum(train_ds.dimensions[0][1]), 
    output_size = train_ds.dimensions[1],
    start_lr=start_lr
)

In [115]:
architecture_name = model.__class__.__name__
logdir = join("logs", architecture_name, notebook_datetime.strftime("%Y-%m-%dT%H-%M-%S"))
print(f"Logging to {logdir}")
Path(logdir).mkdir(parents=True, exist_ok=True)

Logging to logs/SectorModuleV00/2021-04-20T20-12-04


In [116]:
tb_logger = pl_loggers.TensorBoardLogger(logdir, name="", version="")

In [117]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath=logdir,
    verbose=True,
    save_last=True,
    save_top_k=-1, #save all
    mode="min",
    filename='-{epoch:02d}-{val_loss:.2f}'
)

In [118]:
trainer = Trainer(
#     gpus=1,
    gpus=0,
#     precision=16, 
    logger=tb_logger, 
    callbacks=[checkpoint_callback],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


## Executing run

In [119]:
trainer.fit(model, train_dl, valid_dl)


  | Name              | Type       | Params
-------------------------------------------------
0 | bert              | BertModel  | 4.4 M 
1 | linear_after_bert | Linear     | 33.0 K
2 | feed_forward      | Sequential | 713 K 
3 | loss              | MSELoss    | 0     
-------------------------------------------------
5.1 M     Trainable params
0         Non-trainable params
5.1 M     Total params
20.528    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

tensor([[0.3840]]) tensor([0.])
tensor([[0.3428]]) tensor([0.])


Training: 0it [00:00, ?it/s]

Saving latest checkpoint...
Epoch 0, global step 25: val_loss reached 0.13251 (best 0.13251), saving model to "/home/loerinczy/Desktop/CHEERS_Challenge/CHEERS_challenge_round_1/logs/SectorModuleV00/2021-04-20T20-12-04/-epoch=00-val_loss=0.13.ckpt" as top 1


1

In [120]:
a = torch.tensor(float(3))

In [77]:
a.dtype

torch.float32

In [43]:
nn.MSELoss()(torch.Tensor([3]), torch.Tensor([5]))

tensor(4.)

In [52]:
type(len(torch.Tensor([3])))

int