In [1]:
import os, json
import pandas as pd

import numpy as np

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BASE_MODEL = 'bert-base-uncased'
LEARNING_RATE = 2e-4
max_length_num = 512
BATCH_SIZE = 4
EPOCHS = 20

model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})    
    model.resize_token_embeddings(len(tokenizer))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
data = []
pure_data = []
path_to_json = './data/press_data/'

for file_name in [file for file in os.listdir(path_to_json) if file.endswith('.json')]:
    with open(path_to_json + file_name, encoding="utf8") as json_file:
        file = json.load(json_file)
        data.extend(file)
for x in data:
    pure_data.extend(x['data'])
    

In [4]:
x = [f'{x[1]} {x[3]}' for x in pure_data]
y = [x[4] for x in pure_data]


x = tokenizer(x, padding='max_length', max_length=max_length_num)
comb = {'input_ids': x['input_ids'], 'labels': y}

dataset = Dataset.from_dict(comb)


In [5]:

def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)
    
    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    single_squared_errors = ((logits - labels).flatten()**2).tolist()
    
    # Compute accuracy 
    # Based on the fact that the rounded score = true score only if |single_squared_errors| < 0.5
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
    
    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

In [6]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

In [7]:
import torch

class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs,return_outputs=False):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [8]:
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Mse,Mae,R2,Accuracy
1,6.9719,7.145626,7.145626,1.790703,-0.001194,0.233693
2,6.8211,7.13983,7.13983,1.788685,-0.000381,0.236256
3,7.1214,7.159552,7.159552,1.794901,-0.003145,0.234488
4,7.5326,7.166811,7.166811,1.792089,-0.004162,0.235991
5,7.7539,7.207006,7.207006,1.807733,-0.009794,0.230599
6,7.3817,7.262967,7.262967,1.822303,-0.017635,0.221054



KeyboardInterrupt



In [2]:
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()  

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  5% |  5% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  8% |  6% |
