In [1]:
from datasets import load_dataset, Dataset, DatasetDict 
from transformers import AutoTokenizer, DataCollatorWithPadding
import pandas as pd

dataset = load_dataset('sentiment140')
dictionary = {}
mapping_dict = {0:0, 2: 1, 4: 2}

test_dataset = dataset["test"].to_pandas()
train_negative_df = test_dataset.query("sentiment == 0")[:50]
train_neutral_df  = test_dataset.query("sentiment == 2")[:50]
train_positive_df = test_dataset.query("sentiment == 4")[:50]

dictionary["train"] = pd.concat([train_negative_df, train_neutral_df, train_positive_df])

dictionary["train"]["sentiment"] = dictionary["train"]["sentiment"].map(mapping_dict)

validation_dataset = dataset["test"].to_pandas()
validation_negative_df = validation_dataset.query("sentiment == 0")[50:100]
validation_neutral_df  = validation_dataset.query("sentiment == 2")[50:100]
validation_positive_df = validation_dataset.query("sentiment == 4")[50:100]

dictionary["validation"] = pd.concat([validation_negative_df, validation_neutral_df, validation_positive_df])

dictionary["validation"]["sentiment"] = dictionary["validation"]["sentiment"].map(mapping_dict)

train_dataset = Dataset.from_dict(dictionary["train"])
validation_dataset = Dataset.from_dict(dictionary["validation"])
dataset = DatasetDict({"train":train_dataset, "validation":validation_dataset})

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], padding=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_datasets = tokenized_datasets.rename_column("sentiment", "label")

Map: 100%|██████████| 150/150 [00:00<00:00, 16560.82 examples/s]
Map: 100%|██████████| 150/150 [00:00<00:00, 25132.65 examples/s]


In [3]:
from transformers import TrainingArguments

output_dir = "./output"
training_args = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=3,  # Number of training epochs
  per_device_train_batch_size=8,  # Batch size per GPU
  per_device_eval_batch_size=8,   # Batch size for evaluation per GPU
  warmup_steps=400,   # Number of warmup steps for learning rate scheduler
  weight_decay=0.01,   # Strength of weight decay
  logging_dir="./logs",   # Directory for storing logs
  logging_steps=100,   # Log every N steps
  evaluation_strategy="steps",   # Evaluation strategy during training
  eval_steps=200,   # Run evaluation every N steps
  save_total_limit=2,   # Only save the last N checkpoints
  save_steps=400,   # Save checkpoint every N steps
  load_best_model_at_end=True,   # Load the best model at the end of training
  metric_for_best_model="accuracy",   # Metric to use for the best model
)

print(training_args.output_dir)
print(training_args.num_train_epochs)

./output
3


In [41]:
from transformers import AutoModelForSequenceClassification, BertConfig, BertModel
import torch.nn as nn

class MyBertForSequenceClassification(nn.Module):
    def __init__(self, config):
        super(MyBertForSequenceClassification, self).__init__()
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs['pooler_output']
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            print(logits)
            print(logits.view(-1, self.num_labels))
            loss = loss_fct(logits, labels)
            #print(f"Logits view: {logits.view(-1, self.num_labels)}")
            #print(f"Labels view: {labels.view(-1)}")
            #print(f"Labels: {labels}")
            #print(f"Logits: {logits}")
            print(f"Loss: {loss}")
            return {"loss": loss} 
        else:
            return {"logits": logits}


config = BertConfig.from_pretrained(checkpoint)
config.num_labels = 3
model = MyBertForSequenceClassification(config)

In [42]:
from transformers import Trainer
import numpy as np
import evaluate
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [43]:
trainer.train()

  4%|▎         | 2/57 [00:00<00:03, 14.04it/s]

8
tensor([[-0.4291,  0.0012, -0.4895],
        [-0.4646, -0.0839,  0.0284],
        [-0.1326, -0.2471, -0.1731],
        [-0.1289,  0.1501, -0.2593],
        [-0.2558,  0.2687, -0.2446],
        [-0.1555,  0.0538, -0.2914],
        [-0.3496,  0.0566, -0.1932],
        [-0.2588, -0.0428, -0.2448]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.4291,  0.0012, -0.4895],
        [-0.4646, -0.0839,  0.0284],
        [-0.1326, -0.2471, -0.1731],
        [-0.1289,  0.1501, -0.2593],
        [-0.2558,  0.2687, -0.2446],
        [-0.1555,  0.0538, -0.2914],
        [-0.3496,  0.0566, -0.1932],
        [-0.2588, -0.0428, -0.2448]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.1019532680511475
8
tensor([[-0.2998, -0.0330, -0.2948],
        [-0.5283, -0.0890, -0.3729],
        [-0.0803, -0.1065, -0.2383],
        [-0.1691, -0.4643, -0.1672],
        [-0.2517, -0.0080, -0.2059],
        [-0.7316, -0.0677, -0.0522],
        [-0.1975, -0.2162, -0.1926],
        [-0.3478,  0.0727,

  7%|▋         | 4/57 [00:00<00:04, 12.85it/s]

8
tensor([[-0.1272,  0.1044, -0.3857],
        [-0.1188,  0.0340, -0.0746],
        [-0.3326,  0.0920, -0.2519],
        [-0.3268,  0.0505, -0.0943],
        [-0.2655, -0.0747, -0.2208],
        [-0.1288,  0.0127, -0.3124],
        [-0.1745,  0.0300, -0.1289],
        [-0.3354, -0.0559, -0.0037]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.1272,  0.1044, -0.3857],
        [-0.1188,  0.0340, -0.0746],
        [-0.3326,  0.0920, -0.2519],
        [-0.3268,  0.0505, -0.0943],
        [-0.2655, -0.0747, -0.2208],
        [-0.1288,  0.0127, -0.3124],
        [-0.1745,  0.0300, -0.1289],
        [-0.3354, -0.0559, -0.0037]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.1360270977020264
8
tensor([[-0.2611, -0.0384, -0.0553],
        [-0.3478, -0.0095, -0.1853],
        [-0.3869, -0.0582, -0.1329],
        [-0.3819,  0.0711, -0.1593],
        [-0.4075,  0.1803, -0.2976],
        [-0.1606, -0.1210, -0.1194],
        [-0.6244, -0.1225, -0.6483],
        [-0.2849,  0.0435,

 14%|█▍        | 8/57 [00:00<00:03, 12.51it/s]

8
tensor([[-0.1371, -0.3315, -0.1291],
        [-0.2819, -0.3054, -0.0147],
        [-0.2600, -0.0537, -0.1190],
        [-0.4066, -0.1338, -0.2002],
        [-0.2245,  0.0119, -0.4504],
        [-0.3333, -0.0210,  0.0100],
        [-0.6158,  0.1247, -0.3895],
        [-0.1855, -0.0178, -0.4278]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.1371, -0.3315, -0.1291],
        [-0.2819, -0.3054, -0.0147],
        [-0.2600, -0.0537, -0.1190],
        [-0.4066, -0.1338, -0.2002],
        [-0.2245,  0.0119, -0.4504],
        [-0.3333, -0.0210,  0.0100],
        [-0.6158,  0.1247, -0.3895],
        [-0.1855, -0.0178, -0.4278]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.0590060949325562
8
tensor([[-0.2731, -0.1379, -0.3491],
        [-0.3701, -0.1024, -0.0272],
        [-0.5073, -0.0763, -0.2562],
        [-0.3852,  0.0005, -0.1996],
        [-0.4163,  0.0220, -0.2669],
        [-0.0805,  0.1417, -0.0658],
        [-0.3388,  0.1065, -0.0757],
        [-0.1968, -0.1266,

 21%|██        | 12/57 [00:00<00:03, 12.70it/s]

8
tensor([[-0.3577,  0.0223, -0.2016],
        [-0.1482, -0.1506, -0.2924],
        [-0.2147,  0.0677, -0.1810],
        [-0.0224,  0.0266, -0.1663],
        [-0.0215, -0.1301, -0.1951],
        [-0.2368, -0.3307, -0.0972],
        [-0.1398, -0.2050, -0.1424],
        [-0.3916,  0.1322,  0.0875]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.3577,  0.0223, -0.2016],
        [-0.1482, -0.1506, -0.2924],
        [-0.2147,  0.0677, -0.1810],
        [-0.0224,  0.0266, -0.1663],
        [-0.0215, -0.1301, -0.1951],
        [-0.2368, -0.3307, -0.0972],
        [-0.1398, -0.2050, -0.1424],
        [-0.3916,  0.1322,  0.0875]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.1868081092834473
8
tensor([[-0.3011,  0.0563, -0.2324],
        [-0.3846, -0.3444, -0.2746],
        [-0.0265,  0.1351, -0.5055],
        [-0.1307, -0.0954, -0.0590],
        [-0.3581, -0.3620, -0.2839],
        [-0.2271, -0.1997, -0.2742],
        [-0.3632,  0.0668, -0.0814],
        [ 0.0687, -0.1456,

 25%|██▍       | 14/57 [00:01<00:03, 12.69it/s]

8
tensor([[ 0.0552, -0.2019, -0.0196],
        [ 0.0155, -0.6368, -0.1823],
        [ 0.0936, -0.1907, -0.2740],
        [-0.0564, -0.2322, -0.1805],
        [-0.1221, -0.0437, -0.1357],
        [-0.2143, -0.2167, -0.2899],
        [-0.3623, -0.1313, -0.2956],
        [-0.2156,  0.0162, -0.3875]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[ 0.0552, -0.2019, -0.0196],
        [ 0.0155, -0.6368, -0.1823],
        [ 0.0936, -0.1907, -0.2740],
        [-0.0564, -0.2322, -0.1805],
        [-0.1221, -0.0437, -0.1357],
        [-0.2143, -0.2167, -0.2899],
        [-0.3623, -0.1313, -0.2956],
        [-0.2156,  0.0162, -0.3875]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.118346929550171
8
tensor([[ 0.2266, -0.1434,  0.0956],
        [-0.2007, -0.3799, -0.1263],
        [ 0.0262, -0.2361, -0.2867],
        [-0.3833, -0.3510, -0.2094],
        [-0.2034, -0.2518, -0.3607],
        [-0.0520, -0.2324,  0.0026],
        [-0.0638, -0.2066, -0.0503],
        [ 0.0875, -0.0215, 

 32%|███▏      | 18/57 [00:01<00:03, 12.63it/s]

8
tensor([[-0.2736, -0.3556, -0.2617],
        [ 0.2462, -0.2647, -0.2933],
        [-0.3552, -0.3143, -0.2569],
        [-0.0555, -0.2722, -0.1209],
        [-0.1521, -0.2540, -0.0878],
        [-0.3275, -0.0462, -0.2422],
        [-0.0433, -0.4421, -0.3331],
        [-0.3306, -0.5699, -0.1332]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.2736, -0.3556, -0.2617],
        [ 0.2462, -0.2647, -0.2933],
        [-0.3552, -0.3143, -0.2569],
        [-0.0555, -0.2722, -0.1209],
        [-0.1521, -0.2540, -0.0878],
        [-0.3275, -0.0462, -0.2422],
        [-0.0433, -0.4421, -0.3331],
        [-0.3306, -0.5699, -0.1332]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.207238793373108
8
tensor([[-0.2072, -0.2139, -0.1160],
        [-0.0674, -0.4434, -0.1161],
        [-0.0773, -0.2727, -0.3119],
        [-0.1537, -0.3280, -0.0131],
        [-0.1642, -0.1244, -0.1093],
        [-0.0727, -0.2095, -0.0237],
        [-0.0869, -0.5552, -0.2122],
        [ 0.0976, -0.4865, 

 35%|███▌      | 20/57 [00:01<00:02, 12.82it/s]

6
tensor([[-0.0720, -0.3575, -0.0963],
        [-0.0526, -0.2285, -0.0857],
        [-0.1796, -0.4044, -0.2247],
        [-0.1931, -0.3443, -0.0095],
        [-0.2645, -0.2949, -0.1244],
        [-0.2194, -0.2360, -0.2248]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.0720, -0.3575, -0.0963],
        [-0.0526, -0.2285, -0.0857],
        [-0.1796, -0.4044, -0.2247],
        [-0.1931, -0.3443, -0.0095],
        [-0.2645, -0.2949, -0.1244],
        [-0.2194, -0.2360, -0.2248]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.140224814414978
8
tensor([[-9.2994e-02, -5.1364e-03, -4.6468e-01],
        [-2.1883e-01, -2.4547e-01, -4.5573e-01],
        [-9.2309e-02, -5.0782e-01,  3.8321e-02],
        [-2.4293e-01, -2.9197e-01, -1.5006e-01],
        [-1.4986e-01, -3.5560e-01, -3.1610e-01],
        [-2.5221e-01, -3.2173e-01, -4.0031e-01],
        [-2.6805e-04, -3.9675e-02, -6.0668e-02],
        [-2.0046e-01, -3.4198e-01, -9.4797e-02]], device='cuda:0',
       grad_fn=<AddmmBac

 39%|███▊      | 22/57 [00:01<00:02, 12.72it/s]

8
tensor([[-1.2978e-01, -1.4177e-01,  5.5598e-02],
        [-1.7175e-01, -2.6660e-01, -6.7745e-02],
        [-6.2709e-01, -8.5476e-02, -2.9438e-01],
        [-1.2589e-02, -2.0141e-01, -1.2513e-01],
        [-3.3185e-01, -1.3345e-01,  5.8271e-02],
        [-8.8956e-02, -1.6475e-01, -1.9394e-01],
        [-4.7768e-01, -7.1242e-02, -1.9823e-01],
        [-1.6021e-01, -5.6694e-04,  7.4183e-02]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-1.2978e-01, -1.4177e-01,  5.5598e-02],
        [-1.7175e-01, -2.6660e-01, -6.7745e-02],
        [-6.2709e-01, -8.5476e-02, -2.9438e-01],
        [-1.2589e-02, -2.0141e-01, -1.2513e-01],
        [-3.3185e-01, -1.3345e-01,  5.8271e-02],
        [-8.8956e-02, -1.6475e-01, -1.9394e-01],
        [-4.7768e-01, -7.1242e-02, -1.9823e-01],
        [-1.6021e-01, -5.6694e-04,  7.4183e-02]], device='cuda:0',
       grad_fn=<ViewBackward0>)
Loss: 1.0018610954284668
8
tensor([[-0.4536,  0.0684, -0.2546],
        [-0.4211, -0.3359, -0.1663],
        [-0.

 46%|████▌     | 26/57 [00:02<00:02, 12.70it/s]

8
tensor([[-0.2401,  0.0863, -0.2803],
        [-0.3581, -0.1566, -0.3779],
        [-0.2244, -0.1804, -0.2686],
        [-0.4068, -0.0568, -0.2074],
        [-0.0603, -0.0016, -0.5107],
        [-0.2108, -0.1115, -0.3762],
        [-0.4650, -0.2578, -0.2062],
        [-0.5433, -0.1124, -0.4969]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.2401,  0.0863, -0.2803],
        [-0.3581, -0.1566, -0.3779],
        [-0.2244, -0.1804, -0.2686],
        [-0.4068, -0.0568, -0.2074],
        [-0.0603, -0.0016, -0.5107],
        [-0.2108, -0.1115, -0.3762],
        [-0.4650, -0.2578, -0.2062],
        [-0.5433, -0.1124, -0.4969]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.0329804420471191
8
tensor([[-0.2267,  0.1971, -0.3587],
        [-0.3341, -0.2051, -0.2749],
        [-0.1810, -0.1277, -0.3551],
        [-0.2828, -0.0932, -0.2271],
        [-0.1225,  0.0104, -0.0548],
        [-0.2084, -0.2541, -0.2041],
        [ 0.0013,  0.1638, -0.5008],
        [-0.1385,  0.2255,

 53%|█████▎    | 30/57 [00:02<00:02, 12.65it/s]

8
tensor([[ 0.0032,  0.1128, -0.5245],
        [-0.1633,  0.1293, -0.4542],
        [-0.1534, -0.0877, -0.7422],
        [-0.4507,  0.0661, -0.5640],
        [-0.1369,  0.1656, -0.5552],
        [-0.3214,  0.0052, -0.3350],
        [-0.3157, -0.1340, -0.2351],
        [-0.3175,  0.1167, -0.4557]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[ 0.0032,  0.1128, -0.5245],
        [-0.1633,  0.1293, -0.4542],
        [-0.1534, -0.0877, -0.7422],
        [-0.4507,  0.0661, -0.5640],
        [-0.1369,  0.1656, -0.5552],
        [-0.3214,  0.0052, -0.3350],
        [-0.3157, -0.1340, -0.2351],
        [-0.3175,  0.1167, -0.4557]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.229061245918274
8
tensor([[-0.0217, -0.0393, -0.2043],
        [-0.1202,  0.0261, -0.1319],
        [-0.2029,  0.1423, -0.2569],
        [-0.2760, -0.0415, -0.0813],
        [-0.3649,  0.0942, -0.6007],
        [-0.0244, -0.0094, -0.3583],
        [-0.2731,  0.0431, -0.4261],
        [-0.3927, -0.0982, 

 56%|█████▌    | 32/57 [00:02<00:01, 12.60it/s]

8
tensor([[-0.2940, -0.0915, -0.4170],
        [-0.1461,  0.1048, -0.4985],
        [-0.0024, -0.0781, -0.1993],
        [-0.5173,  0.1871, -0.3975],
        [ 0.0782,  0.3864, -0.7500],
        [ 0.0648,  0.3707, -0.3835],
        [-0.3518, -0.0864, -0.0241],
        [-0.0604, -0.0425, -0.5179]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.2940, -0.0915, -0.4170],
        [-0.1461,  0.1048, -0.4985],
        [-0.0024, -0.0781, -0.1993],
        [-0.5173,  0.1871, -0.3975],
        [ 0.0782,  0.3864, -0.7500],
        [ 0.0648,  0.3707, -0.3835],
        [-0.3518, -0.0864, -0.0241],
        [-0.0604, -0.0425, -0.5179]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.147819995880127
8
tensor([[-0.4367,  0.0557, -0.2342],
        [-0.0391,  0.1975, -0.3117],
        [ 0.0495,  0.1475, -0.1748],
        [-0.1048, -0.1612, -0.3011],
        [-0.1154, -0.1160, -0.4988],
        [-0.2010,  0.0331, -0.3889],
        [-0.4187, -0.0587, -0.5093],
        [-0.1185, -0.0062, 

 63%|██████▎   | 36/57 [00:02<00:01, 12.59it/s]

8
tensor([[-0.2770,  0.0356, -0.4808],
        [-0.2479,  0.1342, -0.2821],
        [-0.3062,  0.0073, -0.4986],
        [-0.3274, -0.2393, -0.4285],
        [-0.3942, -0.0265, -0.0577],
        [-0.0914,  0.0184, -0.3191],
        [-0.3866,  0.1678, -0.3593],
        [-0.1569,  0.1166, -0.7120]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.2770,  0.0356, -0.4808],
        [-0.2479,  0.1342, -0.2821],
        [-0.3062,  0.0073, -0.4986],
        [-0.3274, -0.2393, -0.4285],
        [-0.3942, -0.0265, -0.0577],
        [-0.0914,  0.0184, -0.3191],
        [-0.3866,  0.1678, -0.3593],
        [-0.1569,  0.1166, -0.7120]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.3297476768493652
8
tensor([[-0.2405,  0.1265, -0.3269],
        [-0.2169, -0.2257,  0.0205],
        [-0.3345, -0.0027, -0.1407],
        [-0.2415,  0.0691, -0.2459],
        [-0.0550, -0.2850, -0.2340],
        [-0.6420,  0.0496, -0.3222],
        [-0.1755, -0.3136, -0.4699],
        [-0.4938,  0.1531,

 67%|██████▋   | 38/57 [00:02<00:01, 12.87it/s]

8
tensor([[-0.1616, -0.1088, -0.0066],
        [-0.1384,  0.0294, -0.4140],
        [-0.1355, -0.0984, -0.4232],
        [-0.3353, -0.0708, -0.3965],
        [-0.2310, -0.1174, -0.1449],
        [-0.2623, -0.1431, -0.0362],
        [-0.4171, -0.1279, -0.2736],
        [-0.0817, -0.1443, -0.1935]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.1616, -0.1088, -0.0066],
        [-0.1384,  0.0294, -0.4140],
        [-0.1355, -0.0984, -0.4232],
        [-0.3353, -0.0708, -0.3965],
        [-0.2310, -0.1174, -0.1449],
        [-0.2623, -0.1431, -0.0362],
        [-0.4171, -0.1279, -0.2736],
        [-0.0817, -0.1443, -0.1935]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.1226252317428589
6
tensor([[-0.4998,  0.0990, -0.1933],
        [-0.1584, -0.1365,  0.0405],
        [-0.3606,  0.0028, -0.2457],
        [-0.3078, -0.2552, -0.3621],
        [-0.1316, -0.2104, -0.3324],
        [ 0.0274, -0.0236, -0.1843]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.

 74%|███████▎  | 42/57 [00:03<00:01, 12.73it/s]

8
tensor([[-0.2128, -0.2508, -0.0912],
        [-0.1200, -0.3308, -0.2308],
        [-0.1884, -0.1073, -0.4808],
        [ 0.0876, -0.4249, -0.2894],
        [ 0.0032, -0.2996, -0.4278],
        [-0.4217, -0.3391, -0.2597],
        [-0.0220, -0.4042,  0.2532],
        [-0.1332, -0.2553, -0.1424]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.2128, -0.2508, -0.0912],
        [-0.1200, -0.3308, -0.2308],
        [-0.1884, -0.1073, -0.4808],
        [ 0.0876, -0.4249, -0.2894],
        [ 0.0032, -0.2996, -0.4278],
        [-0.4217, -0.3391, -0.2597],
        [-0.0220, -0.4042,  0.2532],
        [-0.1332, -0.2553, -0.1424]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.1337308883666992
8
tensor([[-0.1299, -0.4629, -0.1619],
        [-0.1814, -0.4981, -0.4850],
        [-0.2340, -0.5060, -0.2585],
        [-0.0652, -0.3482,  0.1109],
        [-0.2935, -0.2395, -0.0458],
        [-0.1156, -0.1284, -0.2812],
        [ 0.0017, -0.3230, -0.1884],
        [-0.0299, -0.0705,

 77%|███████▋  | 44/57 [00:03<00:01, 12.65it/s]

8
tensor([[-0.1062, -0.0220, -0.2196],
        [-0.1108, -0.4118, -0.2330],
        [ 0.1449, -0.2423, -0.5816],
        [-0.0243,  0.2039, -0.2626],
        [-0.0836, -0.2494, -0.2701],
        [ 0.1059, -0.2155, -0.0327],
        [-0.0347, -0.0233, -0.1346],
        [-0.0406, -0.0879, -0.1535]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.1062, -0.0220, -0.2196],
        [-0.1108, -0.4118, -0.2330],
        [ 0.1449, -0.2423, -0.5816],
        [-0.0243,  0.2039, -0.2626],
        [-0.0836, -0.2494, -0.2701],
        [ 0.1059, -0.2155, -0.0327],
        [-0.0347, -0.0233, -0.1346],
        [-0.0406, -0.0879, -0.1535]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.1271593570709229
8
tensor([[ 0.0728, -0.1557, -0.4173],
        [-0.1897, -0.1977, -0.0653],
        [ 0.1790, -0.1579, -0.1736],
        [-0.3128, -0.2967, -0.5350],
        [ 0.0269, -0.2982, -0.3377],
        [-0.2415, -0.3481, -0.2796],
        [ 0.3458, -0.5340, -0.2701],
        [-0.0447, -0.1965,

 84%|████████▍ | 48/57 [00:03<00:00, 12.57it/s]

8
tensor([[ 0.0780, -0.1267, -0.3473],
        [-0.0978, -0.5156, -0.1665],
        [-0.1502, -0.2868, -0.1610],
        [-0.1199, -0.4890, -0.1282],
        [ 0.0140,  0.0388, -0.0209],
        [-0.1747, -0.1942, -0.2425],
        [-0.1128, -0.2012, -0.0672],
        [-0.0369,  0.0364, -0.1793]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[ 0.0780, -0.1267, -0.3473],
        [-0.0978, -0.5156, -0.1665],
        [-0.1502, -0.2868, -0.1610],
        [-0.1199, -0.4890, -0.1282],
        [ 0.0140,  0.0388, -0.0209],
        [-0.1747, -0.1942, -0.2425],
        [-0.1128, -0.2012, -0.0672],
        [-0.0369,  0.0364, -0.1793]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.0798081159591675
8
tensor([[ 0.0281, -0.3588, -0.3099],
        [-0.1742, -0.2490, -0.6168],
        [-0.0893, -0.2247, -0.3486],
        [ 0.0198, -0.0623, -0.1470],
        [-0.4549, -0.1010, -0.2090],
        [-0.1198,  0.0528, -0.2090],
        [-0.0569, -0.2108, -0.2922],
        [-0.1500, -0.2118,

 88%|████████▊ | 50/57 [00:03<00:00, 12.60it/s]

8
tensor([[-0.1352, -0.2172, -0.2551],
        [-0.1333, -0.1731, -0.3366],
        [-0.3829, -0.2379, -0.2417],
        [ 0.2146, -0.1275, -0.3133],
        [ 0.1481,  0.0919, -0.1779],
        [-0.1812,  0.0205, -0.2092],
        [-0.1250, -0.3689, -0.0110],
        [-0.1604,  0.0818, -0.1900]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.1352, -0.2172, -0.2551],
        [-0.1333, -0.1731, -0.3366],
        [-0.3829, -0.2379, -0.2417],
        [ 0.2146, -0.1275, -0.3133],
        [ 0.1481,  0.0919, -0.1779],
        [-0.1812,  0.0205, -0.2092],
        [-0.1250, -0.3689, -0.0110],
        [-0.1604,  0.0818, -0.1900]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.062215805053711
8
tensor([[-0.2468,  0.1041, -0.3863],
        [-0.3127, -0.2863, -0.2145],
        [-0.3311,  0.0791, -0.3290],
        [-0.4619, -0.1671, -0.3037],
        [-0.4243,  0.0058, -0.3911],
        [-0.2164,  0.0049, -0.0898],
        [-0.2629, -0.2656, -0.5027],
        [-0.4205, -0.3247, 

 95%|█████████▍| 54/57 [00:04<00:00, 12.56it/s]

8
tensor([[-0.3249, -0.1357, -0.3053],
        [-0.1720, -0.2816, -0.1530],
        [-0.2485, -0.0569, -0.1174],
        [-0.4143,  0.0129, -0.2127],
        [-0.3964,  0.0227, -0.0484],
        [-0.4383,  0.0065, -0.2524],
        [-0.1741, -0.1931, -0.2307],
        [-0.3556, -0.0847, -0.2112]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.3249, -0.1357, -0.3053],
        [-0.1720, -0.2816, -0.1530],
        [-0.2485, -0.0569, -0.1174],
        [-0.4143,  0.0129, -0.2127],
        [-0.3964,  0.0227, -0.0484],
        [-0.4383,  0.0065, -0.2524],
        [-0.1741, -0.1931, -0.2307],
        [-0.3556, -0.0847, -0.2112]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.1386696100234985
8
tensor([[-0.6372, -0.3778,  0.0142],
        [-0.2812, -0.1301, -0.2013],
        [-0.6250,  0.0910, -0.0954],
        [-0.4539, -0.2829, -0.0335],
        [-0.2970,  0.0256, -0.2049],
        [-0.4467,  0.0638, -0.2424],
        [-0.2521,  0.0264, -0.1708],
        [-0.3217,  0.0611,

 98%|█████████▊| 56/57 [00:04<00:00, 12.47it/s]

8
tensor([[-0.4318, -0.2000, -0.0414],
        [-0.5001, -0.1103, -0.1526],
        [-0.3307, -0.1567, -0.0401],
        [-0.4163, -0.0111,  0.2162],
        [-0.3269, -0.1868, -0.1301],
        [-0.6385, -0.4182,  0.0234],
        [-0.3238, -0.0111,  0.0472],
        [-0.3288, -0.1761,  0.1160]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[-0.4318, -0.2000, -0.0414],
        [-0.5001, -0.1103, -0.1526],
        [-0.3307, -0.1567, -0.0401],
        [-0.4163, -0.0111,  0.2162],
        [-0.3269, -0.1868, -0.1301],
        [-0.6385, -0.4182,  0.0234],
        [-0.3238, -0.0111,  0.0472],
        [-0.3288, -0.1761,  0.1160]], device='cuda:0', grad_fn=<ViewBackward0>)
Loss: 1.0919084548950195
8
tensor([[-0.5317, -0.3676, -0.2356],
        [-0.4939, -0.2925,  0.2196],
        [-0.5475, -0.2722, -0.1556],
        [-0.6220, -0.5013,  0.1364],
        [-0.2905,  0.1567, -0.0502],
        [-0.2920, -0.3755,  0.1940],
        [-0.4147, -0.2794,  0.1395],
        [-0.4008, -0.2951,

                                               
100%|██████████| 57/57 [00:04<00:00, 12.58it/s]

{'train_runtime': 4.5312, 'train_samples_per_second': 99.311, 'train_steps_per_second': 12.579, 'train_loss': 1.1224767450700726, 'epoch': 3.0}





TrainOutput(global_step=57, training_loss=1.1224767450700726, metrics={'train_runtime': 4.5312, 'train_samples_per_second': 99.311, 'train_steps_per_second': 12.579, 'train_loss': 1.1224767450700726, 'epoch': 3.0})

In [46]:
predictions = trainer.predict(tokenized_datasets["validation"])
#print(predictions.predictions.shape, predictions.label_ids.shape)
predictions

45it [00:11,  3.01it/s]

8
tensor([[-0.2115, -0.4553,  0.0560],
        [-0.2710, -0.5812,  0.0960],
        [-0.2317, -0.4152,  0.0072],
        [-0.2819, -0.4031,  0.0526],
        [-0.2043, -0.3498,  0.0839],
        [-0.2934, -0.4640,  0.0550],
        [-0.2862, -0.4561,  0.0661],
        [-0.2165, -0.4650,  0.0917]], device='cuda:0')
tensor([[-0.2115, -0.4553,  0.0560],
        [-0.2710, -0.5812,  0.0960],
        [-0.2317, -0.4152,  0.0072],
        [-0.2819, -0.4031,  0.0526],
        [-0.2043, -0.3498,  0.0839],
        [-0.2934, -0.4640,  0.0550],
        [-0.2862, -0.4561,  0.0661],
        [-0.2165, -0.4650,  0.0917]], device='cuda:0')
Loss: 1.1596969366073608
8
tensor([[-0.2680, -0.4033,  0.0797],
        [-0.2604, -0.4825,  0.0770],
        [-0.2412, -0.3884,  0.0039],
        [-0.2991, -0.3717,  0.0626],
        [-0.2842, -0.5506,  0.0834],
        [-0.3248, -0.4259,  0.1072],
        [-0.3862, -0.3194,  0.1026],
        [-0.3880, -0.3094, -0.0078]], device='cuda:0')
tensor([[-0.2680, -0.4033,  0

57it [00:12,  5.91it/s]

8
tensor([[-0.3463, -0.4157,  0.0689],
        [-0.3520, -0.4393,  0.0252],
        [-0.2741, -0.4432,  0.0638],
        [-0.3301, -0.3863,  0.0469],
        [-0.3013, -0.4050,  0.0753],
        [-0.3685, -0.3282,  0.0345],
        [-0.3369, -0.3155,  0.0093],
        [-0.3720, -0.3308, -0.0025]], device='cuda:0')
tensor([[-0.3463, -0.4157,  0.0689],
        [-0.3520, -0.4393,  0.0252],
        [-0.2741, -0.4432,  0.0638],
        [-0.3301, -0.3863,  0.0469],
        [-0.3013, -0.4050,  0.0753],
        [-0.3685, -0.3282,  0.0345],
        [-0.3369, -0.3155,  0.0093],
        [-0.3720, -0.3308, -0.0025]], device='cuda:0')
Loss: 1.2747195959091187
8
tensor([[-0.2872, -0.4694,  0.0636],
        [-0.3378, -0.3716,  0.0590],
        [-0.2941, -0.4148,  0.1319],
        [-0.2994, -0.3850,  0.0459],
        [-0.3070, -0.4340,  0.0725],
        [-0.2707, -0.7005,  0.1009],
        [-0.3053, -0.4711,  0.0676],
        [-0.3120, -0.4848,  0.0427]], device='cuda:0')
tensor([[-0.2872, -0.4694,  0

ValueError: attempt to get argmax of an empty sequence

57it [00:28,  5.91it/s]