In [1]:
import os
import pandas as pd
import numpy as np
import transformers

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from datasets import Dataset, load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from transformers import TrainingArguments, Trainer

In [2]:
SEED = 42
DATAPATH = '/home/alexuvarovskyi/course_competition/data'
VAL_SIZE = 0.2
TARGET = 'target'

In [3]:
data = pd.read_csv(os.path.join(DATAPATH,'train.csv'))
data = data[['excerpt', TARGET]]
data = data.rename(columns={TARGET: "labels"})
data

Unnamed: 0,excerpt,labels
0,When the young people returned to the ballroom...,-0.340259
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,"As Roger had predicted, the snow departed as q...",-0.580118
3,And outside before the palace a great garden w...,-1.054013
4,Once upon a time there were Three Bears who li...,0.247197
...,...,...
2829,When you think of dinosaurs and where they liv...,1.711390
2830,So what is a solid? Solids are usually hard be...,0.189476
2831,The second state of matter we will discuss is ...,0.255209
2832,Solids are shapes that you can actually touch....,-0.215279


In [4]:
np.random.seed(SEED)
train_df, val_df = train_test_split(data, test_size=VAL_SIZE, random_state=SEED)

In [5]:
scaler = StandardScaler()
train_df['labels'] = scaler.fit_transform(train_df[['labels']])
train_df

Unnamed: 0,excerpt,labels
2743,The building of rotary presses for printing il...,-0.534778
2347,The idea of a trip on Bob's yacht suited every...,0.401209
2387,"Seeing the front door wide open, the enchanter...",0.744448
2202,"The widow she cried over me, and called me a p...",-0.066997
786,"Jacobitism was (and, to a much smaller extent,...",-0.734861
...,...,...
1638,The steam is supplied by two circular return t...,-2.263625
1095,Living things are different from things that a...,0.721784
1130,"I'd always longed for adventures. You see, my ...",-0.598527
1294,In these times one dread lies heavy on heart a...,-1.033246


In [6]:
train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
train_dataset = train_dataset.train_test_split(test_size=0.2)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False) 

In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', model_max_length=350)


def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

def tokenize(batch):
    return tokenizer(batch['excerpt'], padding='max_length', truncation=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    metric = rmse(labels, predictions)
    return {"rmse": metric}


train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["excerpt"])
val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=["excerpt"])

model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=1 # for regression
)

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Map:   0%|          | 0/1813 [00:00<?, ? examples/s]

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Map:   0%|          | 0/454 [00:00<?, ? examples/s]

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    output_dir="trainer",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    num_train_epochs=10,
    save_strategy='epoch',
    seed=SEED,
    report_to='none',
    load_best_model_at_end=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset["train"],
    eval_dataset=train_dataset["test"],
    compute_metrics=compute_metrics
)

In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rmse
1,0.5984,0.456918,0.675957
2,0.29,0.335988,0.579645
3,0.1995,0.380812,0.617099
4,0.1498,0.379211,0.615801
5,0.1134,0.321509,0.567018
6,0.0893,0.404412,0.635934
7,0.07,0.34246,0.585201
8,0.0522,0.387628,0.622598
9,0.047,0.377313,0.614258
10,0.0425,0.412217,0.642042


TrainOutput(global_step=1140, training_loss=0.16520738267062002, metrics={'train_runtime': 177.1852, 'train_samples_per_second': 102.322, 'train_steps_per_second': 6.434, 'total_flos': 3260851975353000.0, 'train_loss': 0.16520738267062002, 'epoch': 10.0})

In [10]:
val_preds = trainer.predict(val_dataset)
val_preds= scaler.inverse_transform(pd.DataFrame(val_preds[0].reshape(1,-1)[0]))
val_preds

array([[-4.3812343e-01],
       [-1.6082102e+00],
       [-1.4449103e+00],
       [-5.6164759e-01],
       [-5.1779324e-01],
       [-2.1134017e+00],
       [-1.7098380e+00],
       [-1.8827658e+00],
       [-6.1090457e-01],
       [-7.3406857e-01],
       [ 1.5641138e-01],
       [-1.8350234e+00],
       [-1.7790058e-01],
       [-7.6512104e-01],
       [ 6.7615885e-01],
       [-2.1710427e+00],
       [ 3.6201796e-01],
       [-2.0438979e+00],
       [ 6.0814971e-01],
       [-1.1078633e+00],
       [ 8.9532870e-01],
       [-5.3136170e-01],
       [-2.1636393e+00],
       [ 2.5572297e-01],
       [-2.1151176e-01],
       [-1.6092489e+00],
       [-1.9419369e-01],
       [ 4.4963667e-01],
       [-6.6336942e-01],
       [-1.3294612e+00],
       [-2.0123670e+00],
       [-3.1438000e+00],
       [ 1.7343971e-01],
       [-1.6630870e+00],
       [ 3.2031938e-01],
       [ 5.2647954e-01],
       [-7.1092355e-01],
       [-9.2119932e-01],
       [-1.8183249e+00],
       [-1.3941526e+00],


In [13]:
print(f'RMSE: {rmse(val_df["labels"].values, val_preds.reshape(1,-1)[0])}')


RMSE: 0.5643916897042569


In [24]:
from transformers import AutoModelForSequenceClassification
import torch


eval_model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=1)
eval_model.load_state_dict(torch.load('/home/alexuvarovskyi/course_competition/trainer/checkpoint-1140/pytorch_model.bin'))
eval_model.to('cuda:0')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [25]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', model_max_length=350)
def preprocess_text(text, tokenizer=tokenizer, max_len=350, n_tokens=20):
    inputs = tokenizer.encode_plus(
        text,
        None,
        truncation=True,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=True,
    )
    inputs['input_ids']=torch.cat((torch.full((1, n_tokens), 500).resize(n_tokens),torch.tensor(inputs['input_ids'], dtype=torch.long)))
    inputs['attention_mask'] = torch.cat((torch.full((1, n_tokens), 1).resize(n_tokens), torch.tensor(inputs['attention_mask'], dtype=torch.long)))
    return inputs

In [36]:
text = """With trembling hands the lad took the shavings from Jack's hand. Carefully shielding the tiny flame from possible draughts of air, the boy held the point of one of the thin pieces of wood over the flare. In a moment it had caught fire. Licking up the curl, the flame gradually leaped from one piece of wood to another until the entire handful was ablaze. The dancing light played upon the three faces and sent a glow out into the surrounding blackness. Harry deposited the burning shavings upon the floor, where the fire was soon transmitted to the larger piece of wood Jack had used in whittling.
As the boys saw that the matter of fire was assured, they glanced first at each other, then let their gaze wander about the apartment.
""Goodness, the rats don't seem to be much afraid of fire!"" exclaimed Jack, pointing toward a horde of rodents swarming about the place."""
model_input = preprocess_text(text)

with torch.no_grad():
    result = eval_model(model_input['input_ids'].unsqueeze(0).to('cuda:0'), model_input['attention_mask'].unsqueeze(0).to('cuda:0'))

print(result["logits"].item())

0.5918722748756409




In [37]:
from transformers import pipeline
pipe = pipeline('text-classification', model=eval_model, tokenizer=tokenizer, device=0)

In [39]:
text = """
It was a wet day, so none of the plans for seeing all the sights of London that can be seen for nothing could be carried out. Everyone had been thinking all the morning about the wonderful adventures of the day before, when Jane had held up the charm and it had turned into an arch, through which they had walked straight out of the present time and the Regent's Park into the land of Egypt eight thousand years ago. The memory of yesterday's happenings was still extremely fresh and frightening, so that everyone hoped that no one would suggest another excursion into the past, for it seemed to all that yesterday's adventures were quite enough to last for at least a week. Yet each felt a little anxious that the others should not think it was afraid, and presently Cyril, who really was not a coward, began to see that it would not be at all nice if he should have to think himself one. So he said—
‘I say—about that charm—Jane—come out. We ought to talk about it, anyhow.
"""

pipe(text)

[{'label': 'LABEL_0', 'score': 0.6146688461303711}]