In [204]:
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertConfig, BertTokenizer, BertForSequenceClassification
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

In [205]:
dir = "data/webis17/"

# 1. Webis Corpus

In [206]:
class Dataset:
    '''
    self.corpus: (post, text, truthMean)
    '''
    def __init__(self, path):
        self.train_file = path + 'input/instances.jsonl'
        self.truth_file = path + 'input/truth.jsonl'
        df_train = pd.read_json(self.train_file, lines=True)
        df_truth = pd.read_json(self.truth_file, lines=True)
        self.size = df_train.shape[0]

        truth_id, truth_mean = list(df_truth['id']), list(df_truth['truthMean'])
        truth_dict = {truth_id[i]:truth_mean[i] for i in range(self.size)}
        train_id, train_post, train_text = list(df_train['id']), list(df_train['postText']), list(df_train['targetParagraphs'])
        #? train_post[i] is a list
        self.corpus = [(train_post[i][0], ' '.join(para for para in train_text[i]), truth_dict[train_id[i]]) for i in range(self.size)]

# 2. Pre-processing

In [207]:
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

bert_tokenizer.save_pretrained(dir+'bert-base-uncased')
bert_model.save_pretrained(dir+'bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [208]:
# Create dataset class
data = Dataset(dir)

# extract data
title_all = [data[0] for data in data.corpus]
content_all = [data[1] for data in data.corpus]
score_all = torch.tensor([data[2] for data in data.corpus], requires_grad=True)


In [209]:
torch.save(score_all, dir+'scores.pt')

In [210]:
# title profiling
title_all_tokenized_raw = bert_tokenizer(title_all,return_token_type_ids=False, return_attention_mask=False)['input_ids']

print(f"Average # of tokens = {np.mean([len(lst) for lst in title_all_tokenized_raw])}")
print(f"max # of tokens = {max([len(lst) for lst in title_all_tokenized_raw])}")

title_all_tokenized = bert_tokenizer(title_all, padding=True,truncation=True,max_length=20, return_token_type_ids=False, return_attention_mask=False, return_tensors="pt")['input_ids']
print(title_all_tokenized.shape)
print(title_all_tokenized)

# Save tensors
torch.save(title_all_tokenized, dir+'titles_tokens.pt')


Average # of tokens = 17.628058143105743
max # of tokens = 104
torch.Size([19538, 20])
tensor([[  101,  2866,  1521,  ...,  2489,   102,     0],
        [  101,  2023,  2003,  ...,     0,     0,     0],
        [  101,  1996,  1000,  ...,  1996,  2047,   102],
        ...,
        [  101,  2413,  2015,  ...,  2112,  1997,   102],
        [  101,  2821,  5076,  ...,     0,     0,     0],
        [  101,  2957, 11011,  ...,     0,     0,     0]])


## Process in batches

In [211]:
title_all_tokenized = torch.load(dir+'titles_tokens.pt')
print(title_all_tokenized.shape)

torch.Size([19538, 20])


In [212]:
# import gc

# num_data = 19538
# extract_size = 800
# for i in range(num_data//800):
#     outputs = bert_model(title_all_tokenized[(extract_size*i):(extract_size*(i+1)), :])
#     title_all_embed = outputs[0]  # The last hidden-state is the first element of the output tuple
#     print(title_all_embed.shape) # batchsize x # tokens of sent x embed_dim
#     print(f"From size {str(extract_size*i)} to {str(extract_size*(i+1))}")
    
#     # save Data
#     torch.save(title_all_embed, dir+'/embeddings/titles_'+str(extract_size*i)+'_'+str(extract_size*(i+1)))
#     del outputs
#     del title_all_embed
#     gc.collect()

# last portion

# num_patchs = num_data//extract_size
# outputs = bert_model(title_all_tokenized[(extract_size*num_patchs):, :])
# title_all_embed = outputs[0] 
# print(title_all_embed.shape) 
# print(f"From size {str(extract_size*num_patchs)} to {str(num_data)}")

# # save Data
# torch.save(title_all_embed, dir+'/embeddings/titles_'+str(extract_size*num_patchs)+'_'+str(num_data))
# del outputs
# del title_all_embed
# gc.collect()

# Combine batches

# Xt = torch.zeros(num_data, 20, 768)
# for i in range(num_data//800):
#     Xt[extract_size*i:extract_size*(i+1), :,: ] = torch.load(dir+'/embeddings/titles_'+str(extract_size*i)+'_'+str(extract_size*(i+1)))
# Xt[extract_size*num_patchs:,:,:] = torch.load(dir+'/embeddings/titles_'+str(extract_size*num_patchs)+'_'+str(num_data))

# print(Xt.shape)

# # Save
# torch.save(Xt, dir+'/titles_all.pt')

## Load data

In [213]:
# Load CLS data
from torch.utils.data import TensorDataset, DataLoader

dir = "data/webis17/"

Xt_all = torch.load(dir+'/titles_all.pt')[0:1000]
yt_all = torch.load(dir+'/scores.pt')[0:1000]
print(Xt_all.shape)
print(yt_all.shape)

num_data = 1000
train_size = 800
# val_size = 2000
# test_size = num_data - train_size - val_size
test_size = num_data - train_size
batch_size = 64
train_set = TensorDataset(Xt_all[:train_size,0,:], yt_all[:train_size])
# val_set = TensorDataset(Xt_all[train_size:train_size+val_size,0,:], yt_all[train_size:train_size+val_size])
test_set = TensorDataset(Xt_all[train_size:,0,:], yt_all[train_size:])

train_dataloader = DataLoader(train_set, batch_size=batch_size)
# val_dataloader = DataLoader(val_set, batch_size=batch_size)
test_dataloader = DataLoader(test_set, batch_size=batch_size)

torch.Size([1000, 20, 768])
torch.Size([1000])


# 3 Simple LSTM model

In [214]:
import torch
import torch.nn as nn
import numpy as np

class LSTM(nn.Module):
    def __init__(self, batch_size, num_tokens, embed_dim, hidden_dim,  n_layers = 1, dropout = 0.0):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.lstm=nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.flatten = nn.Flatten(1)
        # self.fc1=nn.Linear(num_tokens*hidden_dim, 64)
        # self.fc1=nn.Linear(num_tokens*hidden_dim, 1)
        # take CLS token, birection
        self.fc1=nn.Linear(2*hidden_dim, 64)

        self.fc2=nn.Linear(64, 1)
        
    def forward(self, x, hidden):
        '''
            x: batch_size x num_tokens x embed_dim
        '''
        # take CLS token
        # print(x[:,0,:].unsqueeze(1).shape)
        lstm_out, hidden = self.lstm(x.unsqueeze(1), hidden) # batch_size x 1 x (2*hidden_dim)

        # flat = self.flatten(lstm_out) 
        flat = lstm_out.squeeze() # batch_size x hidden_dim

        out1 = self.fc1(flat) # batch_size x 64
        out2 = self.fc2(torch.relu(out1)) # batch_size x 1
        out = torch.sigmoid(out2)

        # # single layer
        # out = torch.sigmoid(out1)
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        # birections -> *2
        hidden = (weight.new(self.n_layers*2, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers*2, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

    def init_weights(m):
        '''
        Initialize weights
        '''
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.0)

In [215]:
# load GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))
print(torch.cuda.get_device_name(0))

Using cuda device
NVIDIA GeForce RTX 2070 SUPER


## Hyperparameters

In [216]:
hidden_dim = 10 # num of tokens is typically 20
_ , num_tokens, embed_dim = Xt_all.shape
# dropout = 0.0
dropout = 0.2

model = LSTM(batch_size, num_tokens, embed_dim, hidden_dim, n_layers=2, dropout = dropout).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

from torch.optim.lr_scheduler import ReduceLROnPlateau # learning rate scheduler
lr_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.25, patience=0, threshold=0.05,min_lr=3e-5, verbose=True)

model.init_weights

<bound method LSTM.init_weights of LSTM(
  (lstm): LSTM(768, 10, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=20, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)>

# 4 Training and testing

In [217]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from scipy.stats import pearsonr

### Training ###
def train(train_dataloader, y_truth, model, loss_fn, optimizer, mute = False):
    model.train()

    size = len(train_dataloader.dataset)

    y_pred_train = []
    for batch, (X, y) in enumerate(train_dataloader):
        hidden = model.init_hidden(X.shape[0])
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()

        # Compute prediction error
        pred, hidden = model(X, hidden)
        y_pred_train.extend(pred.squeeze().cpu())
        loss = loss_fn(pred.squeeze(), y)
        # Backpropagation

        loss.backward()
        optimizer.step()

        if batch % 20 == 0:
            loss, current = loss.item(), batch * len(X)
            if not mute:
                print(f"Training loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

    y_pred_train = torch.tensor(y_pred_train, dtype=float)
    performance = loss_fn(y_pred_train, y_truth)
    clf_performance = ((y_pred_train>0.5)==(y_truth>0.5)).float().mean()

    if not mute:
        print(f"Training Loss: {performance}")
        print(f"Training Classifier Accuracy: {clf_performance}")
    return y_pred_train

### Testing ###
def test(test_dataloader, y_truth, model, loss_fn, lr_scheduler, mute = False):
    hidden_val = model.init_hidden(batch_size)
    model.eval()

    y_pred_val = []
    for batch, (X, y) in enumerate(test_dataloader):
        hidden_val = model.init_hidden(X.shape[0])
        X, y = X.to(device), y.to(device)

        pred, hidden_val = model(X, hidden_val)
        y_pred_val.extend(pred.squeeze().cpu())

    y_pred_val = torch.tensor(y_pred_val, dtype=float)
    performance = loss_fn(y_pred_val, y_truth)
 
    clf_performance = ((y_pred_val>0.5)==(y_truth>0.5)).float().mean()

    pre_performance = precision_score((y_truth>0.5).float().numpy(), (y_pred_val>0.5).float().numpy())
    rec_performance = recall_score((y_truth>0.5).float().numpy(), (y_pred_val>0.5).float().numpy())

    f1_performance = f1_score((y_pred_val>0.5).float().numpy(), (y_truth>0.5).float().numpy())
    p_performance = pearsonr(y_pred_val.detach().numpy(), y_truth.detach().numpy())[0]
    if not mute:
        print(f"Test Precision: {pre_performance}")
        print(f"Test Recall: {rec_performance}")
        print(f"Test Accuracy: {clf_performance}")
        print(f"Test F1 Score: {f1_performance}")
        print(f"Test Pearson Coefficient: {p_performance}")

    return performance  

In [None]:
## Training & validation

# epochs = 50
epochs = 20

model.train()

best_test_performance = 1.0 # any number works
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, yt_all[:train_size], model, loss_fn, optimizer)
    test_performance = test(test_dataloader, yt_all[train_size:train_size+test_size], model, loss_fn, lr_scheduler)

    if test_performance < best_test_performance:
        best_test_performance = test_performance
        print(f'NEW BEST MODEL! Performance: {best_test_performance}')
        torch.save(model, dir+'/best_model')
print("Done!")

torch.save(model, dir+'model_CLS_10_bi')

# 5. Evaluation

In [219]:
import torch

dir = 'data/webis17/'

hidden_dim = 10 # num of tokens is typically 20
_ , num_tokens, embed_dim = Xt_all.shape
# dropout = 0.0
dropout = 0.2

model = LSTM(batch_size, num_tokens, embed_dim, hidden_dim, n_layers=2, dropout = dropout).to(device)
model = torch.load(dir+'/best_model')

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

from torch.optim.lr_scheduler import ReduceLROnPlateau # learning rate scheduler
lr_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.25, patience=0, threshold=0.05,min_lr=3e-5, verbose=True)

In [220]:
_ = test(test_dataloader, yt_all[train_size:], model, loss_fn, lr_scheduler)

Test Precision: 0.92
Test Recall: 0.48936170212765956
Test Accuracy: 0.8700000047683716
Test F1 Score: 0.6388888888888888
Test Pearson Coefficient: 0.684032037329564
