In [None]:
!pip install -q transformers evaluate pyspark

In [3]:
!pip install --upgrade accelerate

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.20.3
[0m

In [None]:
import re
import torch
import evaluate
import pyspark.pandas as ps
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

!wget -O 'processed_comments.parquet' -q https://www.dropbox.com/s/l119hqyxavor6ns/processed_comments.parquet?dl=0
!wget -O 'processed_gt.csv' -q https://www.dropbox.com/s/wjyb80t1vdx0q7v/processed_gt.csv?dl=0

In [None]:
df = ps.read_parquet('processed_comments.parquet').dropna()
gt = ps.read_csv('processed_gt.csv').dropna()

In [None]:
y = df['is_catastrophizing']
X = df['tokens']

X_train, X_test, y_train, y_test = train_test_split(X.to_list(), y.to_list(), random_state=0)

In [None]:
y_gt = gt['is_catastrophizing'].to_list()
X_gt = gt['tokens'].to_list()

In [8]:
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')
model = AutoModelForSequenceClassification.from_pretrained('cointegrated/rubert-tiny2')

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2

In [9]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device);

In [10]:
device

device(type='cuda')

In [11]:
def tokenize_function(df):
    return tokenizer(df, padding=True, truncation=True)

In [12]:
tokenized_train = tokenize_function(X_train)
tokenized_test = tokenize_function(X_test)
tokenized_gt = tokenize_function(X_gt)

In [13]:
class CatastDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CatastDataset(tokenized_train, y_train)
test_dataset = CatastDataset(tokenized_test, y_test)
gt_dataset = CatastDataset(tokenized_gt, y_gt)

In [14]:
training_args = TrainingArguments(
    output_dir='./results',
    save_strategy='steps',
    evaluation_strategy='steps',
    save_steps=100,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True
)

In [15]:
metric1 = evaluate.load('precision')
metric2 = evaluate.load('recall')
metric3 = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = metric1.compute(predictions=predictions, references=labels)['precision']
    recall = metric2.compute(predictions=predictions, references=labels)['recall']
    accuracy = metric3.compute(predictions=predictions, references=labels)['accuracy']

    return {'precision': precision, 'recall': recall, 'accuracy': accuracy}

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [17]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Precision,Recall,Accuracy
100,0.6971,0.693904,0.496059,0.892905,0.499166
200,0.695,0.69284,0.537996,0.27027,0.525025
300,0.6948,0.694474,0.496792,0.784797,0.501168
400,0.6992,0.692963,0.502607,0.748986,0.51001
500,0.6939,0.693698,0.494409,0.836486,0.49683
600,0.6931,0.695023,0.498235,0.810811,0.503337
700,0.6959,0.693359,0.497116,0.786149,0.501668
800,0.6966,0.693239,0.484392,0.293581,0.49683
900,0.6953,0.69312,0.48918,0.290203,0.499833
1000,0.6945,0.694244,0.493736,0.998649,0.49366


TrainOutput(global_step=4496, training_loss=0.6892848014831543, metrics={'train_runtime': 1197.9762, 'train_samples_per_second': 30.021, 'train_steps_per_second': 3.753, 'total_flos': 408686558537808.0, 'train_loss': 0.6892848014831543, 'epoch': 2.0})

In [18]:
trainer.evaluate(gt_dataset)

{'eval_loss': 0.7564699649810791,
 'eval_precision': 1.0,
 'eval_recall': 0.1588785046728972,
 'eval_accuracy': 0.1588785046728972,
 'eval_runtime': 0.1532,
 'eval_samples_per_second': 698.598,
 'eval_steps_per_second': 91.405,
 'epoch': 2.0}

In [19]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.6928404569625854,
 'eval_precision': 0.5379959650302623,
 'eval_recall': 0.2702702702702703,
 'eval_accuracy': 0.525025025025025,
 'eval_runtime': 15.9703,
 'eval_samples_per_second': 375.322,
 'eval_steps_per_second': 46.962,
 'epoch': 2.0}

In [20]:
trainer.save_model('rubert-tiny2_catast')

In [24]:
!zip -r ./model.zip ./rubert-tiny2_catast

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: rubert-tiny2_catast/ (stored 0%)
  adding: rubert-tiny2_catast/pytorch_model.bin (deflated 8%)
  adding: rubert-tiny2_catast/training_args.bin (deflated 49%)
  adding: rubert-tiny2_catast/config.json (deflated 49%)


In [None]:
# !zip -r /content/checkpoint.zip /content/results/checkpoint-4100