In [1]:
import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
from transformers import Trainer, TrainingArguments
from transformers import BertForSequenceClassification, BertTokenizer

### Define a Pre-Trained Model and Tokenizer

In [3]:
# Hub checkpoint that both the tokenizer and the model weights are loaded from.
pretrained_model = "microsoft/MiniLM-L12-H384-uncased"

tokenizer = BertTokenizer.from_pretrained(pretrained_model)

# A single output label: the head emits one continuous score per input.
# The classifier weights are not part of the checkpoint, so they start out
# freshly initialised and must be trained before predictions are meaningful.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model,
    num_labels=1,
)

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: all CUDA-capable devices are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

### Data

In [None]:
# Read the three rating splits plus the table that holds the question/reply text.
tr = pd.read_csv('si630w22-hw3-train.csv')
de = pd.read_csv('si630w22-hw3-dev.csv')
te = pd.read_csv('si630w22-hw3-test.public.csv')
og = pd.read_csv('si630w22-hw3-data.csv')

# Concatenate question and reply into a single string.
og['concat'] = og['question_text'] + ' [SEP] ' + og['reply_text']

# Attach the concatenated text to each split, matching split `id` to `question_id`.
train = tr.merge(og, left_on='id', right_on='question_id')[['id', 'rating', 'concat']]
dev   = de.merge(og, left_on='id', right_on='question_id')[['id', 'rating', 'concat']]
test  = te.merge(og, left_on='id', right_on='question_id')[['id', 'concat']]

# Collapse duplicate texts to one row each.
# Only the column that is actually needed is aggregated: calling .median() on
# the whole frame would also hit the string `id` column, which pandas >= 2.0
# rejects with a TypeError instead of silently dropping it.
# NOTE(review): rows whose `concat` is NaN (missing question or reply text)
# are dropped by groupby — confirm that no test ids are lost this way.
train = train.groupby('concat', as_index=False)['rating'].median()
dev   = dev.groupby('concat', as_index=False)['rating'].median()
# The test split has no rating; keep one (the smallest) id per unique text.
test  = test.groupby('concat', as_index=False)['id'].min()

train

In [None]:
# Frequency of each (per-text median) rating value in the training split.
train['rating'].value_counts()

In [None]:
class Data(Dataset):
  """Map-style dataset wrapping tokenizer output and optional labels.

  Args:
    features: Mapping from feature name to an indexable sequence with one
      entry per example. It must contain the key 'input_ids', which is used
      to determine the dataset length.
    labels: Optional indexable sequence (list, NumPy array, tensor, ...)
      with one label per example. Leave as None for unlabeled data.
  """

  def __init__(self, features, labels=None):
    self.X = features
    self.y = labels

  def __getitem__(self, index):
    """Return example `index` as a dict of tensors.

    Every feature is converted with `torch.tensor`. When labels were
    supplied, the matching label is added under the key 'labels'.
    """
    item = {key: torch.tensor(val[index]) for key, val in self.X.items()}
    # Compare against None rather than relying on truthiness: `if self.y`
    # raises for NumPy arrays / tensors holding more than one element.
    if self.y is not None:
      item['labels'] = torch.tensor(self.y[index])
    return item

  def __len__(self):
    """Number of examples, taken from the length of the 'input_ids' feature."""
    return len(self.X['input_ids'])

In [None]:
# Pull the raw texts (and ratings, where the split has them) out of each frame.
train_X, train_y = list(train['concat']), list(train['rating'])
dev_X, dev_y = list(dev['concat']), list(dev['rating'])
test_X = list(test['concat'])

# For a quick smoke test, slice the four train/dev lists above to their
# first 64 entries before tokenizing.

# One shared set of tokenizer options so every split is encoded the same way:
# pad within the call, and truncate anything longer than 512 tokens.
tokenizer_kwargs = dict(padding=True, truncation=True, max_length=512)
X_train_tokenized = tokenizer(train_X, **tokenizer_kwargs)
X_dev_tokenized = tokenizer(dev_X, **tokenizer_kwargs)
X_test_tokenized = tokenizer(test_X, **tokenizer_kwargs)

# Wrap the encodings as datasets; the test split carries no labels.
train_data, dev_data = Data(X_train_tokenized, train_y), Data(X_dev_tokenized, dev_y)
test_data = Data(X_test_tokenized)

### Define the Evaluation Metric and Trainer

In [8]:
def compute_metrics(eval_pred):
  """Compute the root-mean-squared error for an evaluation pass.

  Args:
    eval_pred: A `(predictions, labels)` pair (anything that unpacks into
      two array-likes). Predictions may carry a trailing singleton
      dimension, e.g. shape (N, 1) against labels of shape (N,).

  Returns:
    dict: `{"rmse": <float>}`.

  Raises:
    ValueError: If predictions and labels hold a different number of values.
  """
  predictions, labels = eval_pred
  # Flatten both sides so (N, 1) model outputs line up element-wise with
  # (N,) labels instead of broadcasting to an (N, N) matrix.
  predictions = np.asarray(predictions, dtype=np.float64).reshape(-1)
  labels = np.asarray(labels, dtype=np.float64).reshape(-1)
  if predictions.shape != labels.shape:
    raise ValueError(
        f"predictions and labels differ in size: {predictions.size} vs {labels.size}"
    )
  # Computed directly with NumPy: the `squared=False` switch of
  # sklearn's mean_squared_error is deprecated/removed in recent releases.
  rmse = float(np.sqrt(np.mean((predictions - labels) ** 2)))
  return {"rmse": rmse}

In [9]:
args = TrainingArguments(
    output_dir='outputs',
    seed=0,
    report_to="none",

    # Optimisation hyper-parameters
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Training loss is logged every 50 steps
    logging_strategy='steps',
    logging_steps=50,

    # Validation loss / RMSE are computed every 50 steps
    evaluation_strategy="steps",
    eval_steps=50,

    # Checkpoints are written on a step schedule, and the best checkpoint is
    # restored once training finishes. `metric_for_best_model` is left unset
    # here (an 'rmse' setting was tried and disabled).
    save_strategy="steps",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=dev_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [10]:
# Fine-tune the model; loss logging and dev-set evaluation both run every
# 50 steps, as configured in `args`.
trainer.train()

***** Running training *****
  Num examples = 3779
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1185


Step,Training Loss,Validation Loss,Rmse
50,13.8937,10.147826,3.185565
100,7.9945,5.968771,2.443107
150,5.0647,3.812878,1.952659
200,3.2471,2.434822,1.560392
250,2.0235,1.609923,1.268827
300,1.4482,1.159115,1.076622
350,0.951,1.177468,1.085112
400,0.8806,0.748262,0.865022
450,0.7501,0.688461,0.829735
500,0.6532,0.679099,0.824075


***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
***** Running Evaluation *****
  Num examples = 811
  Batch size = 16
Saving model checkpoint to si630/outputs/checkpoint-500
Configuration saved in si630/outputs/checkpoint-500/config.json
Model weights saved in si630/outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in si630/outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in

TrainOutput(global_step=1185, training_loss=1.8771577086629747, metrics={'train_runtime': 386.3759, 'train_samples_per_second': 48.903, 'train_steps_per_second': 3.067, 'total_flos': 1244642887971840.0, 'train_loss': 1.8771577086629747, 'epoch': 5.0})

### Load Trained Model and Predict on Test Data

In [11]:
# model_path = "outputs/checkpoint-500"
# model = BertForSequenceClassification.from_pretrained(model_path, num_labels=1)

In [12]:
# Separate Trainer built around the fine-tuned model with no explicit
# TrainingArguments (library defaults apply); used below for prediction only.
test_trainer = Trainer(model)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [13]:
# Only the predictions are needed. Reading the named field avoids unpacking
# the unused label ids / metrics into `_`, which would rebind IPython's
# "last output" variable in the notebook namespace.
raw_pred = test_trainer.predict(test_data).predictions

***** Running Prediction *****
  Num examples = 810
  Batch size = 8


In [14]:
# Inspect the raw model outputs (one single-element row per test example).
raw_pred

array([[2.0367732],
       [4.585798 ],
       [2.9006007],
       [2.808524 ],
       [2.4595993],
       [4.572204 ],
       [2.005933 ],
       [4.560387 ],
       [2.8498402],
       [4.5778923],
       [3.7874427],
       [4.5866413],
       [3.8238869],
       [4.4681277],
       [4.584654 ],
       [4.5653195],
       [2.7019272],
       [3.4341083],
       [4.2867327],
       [4.315602 ],
       [4.358562 ],
       [2.6866388],
       [4.5731516],
       [4.522634 ],
       [4.2166677],
       [4.2451696],
       [2.6221614],
       [4.0889087],
       [3.580111 ],
       [4.527705 ],
       [4.584757 ],
       [3.5545015],
       [4.5612535],
       [4.3354197],
       [4.5800247],
       [4.58098  ],
       [3.994193 ],
       [4.576596 ],
       [4.41212  ],
       [4.541747 ],
       [4.5744476],
       [4.071423 ],
       [4.3537583],
       [4.5866036],
       [4.5754275],
       [4.5838304],
       [4.3288403],
       [4.586869 ],
       [4.3787756],
       [4.3301992],


In [15]:
# Snap every continuous prediction to a whole number (np.round sends exact
# halves to the nearest even value), then convert to plain Python ints.
processed = list(np.round(raw_pred.flatten()))
ints = list(map(int, processed))

# Peek at the first ten rounded ratings.
ints[:10]

[2, 5, 3, 3, 2, 5, 2, 5, 3, 5]

In [16]:
from collections import Counter

# Tally how many test predictions fall on each rounded rating.
counts = Counter(ints)

# One count per line, from rating 5 down to rating 1; a rating that never
# occurs is reported as 0 by the Counter.
print(*(counts[value] for value in range(5, 0, -1)), sep='\n')

232
310
217
51
0


In [22]:
# Build the submission table: each id paired with its raw (unrounded)
# prediction. Predictions are in the same row order as `test`, since the
# texts that were tokenized were taken from this frame.
# A new frame is created with .assign() instead of writing a column into
# `test` in place, so the cell can be re-run without a SettingWithCopyWarning
# once `test` has already been reduced to a column subset.
test = test[['id']].assign(predicted=list(raw_pred.flatten()))
test

Unnamed: 0,id,predicted
0,t3_n32uyo,2.036773
1,t3_ndshv8,4.585798
2,t3_ne9u3b,2.900601
3,t3_nglct8,2.808524
4,t3_nlru4f,2.459599
...,...,...
805,t3_n94pj7,3.835932
806,t3_nby5bt,3.841792
807,t3_nm1fw7,3.503341
808,t3_nd91pj,3.116344


In [24]:
# Write the id / predicted table to the submission file, without the index column.
test.to_csv('g16_submission.csv', index=False)