In [None]:
!pip install -q transformers evaluate pyspark

In [2]:
!pip install --upgrade accelerate

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.20.3
[0m

In [None]:
import re
import torch
import evaluate
import pyspark.pandas as ps
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

!wget -O 'processed_comments.parquet' -q https://www.dropbox.com/s/t4hog3o4qf1cu7a/processed_comments%20%281%29.parquet?dl=0
!wget -O 'processed_gt.csv' -q https://www.dropbox.com/s/0n6w06m9g8tyjbl/processed_gt%20%281%29.csv?dl=0

In [None]:
df = ps.read_parquet('processed_comments.parquet').dropna()
gt = ps.read_csv('processed_gt.csv').dropna()

In [None]:
y = df['is_catastrophizing']
X = df['tokens']

X_train, X_test, y_train, y_test = train_test_split(X.to_list(), y.to_list(), random_state=0)

In [None]:
y_gt = gt['is_catastrophizing'].to_list()
X_gt = gt['tokens'].to_list()

In [7]:
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')
model = AutoModelForSequenceClassification.from_pretrained('cointegrated/rubert-tiny2')

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device);

In [9]:
device

device(type='cuda')

In [10]:
def tokenize_function(df):
    return tokenizer(df, padding=True, truncation=True)

In [11]:
tokenized_train = tokenize_function(X_train)
tokenized_test = tokenize_function(X_test)
tokenized_gt = tokenize_function(X_gt)

In [12]:
class CatastDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CatastDataset(tokenized_train, y_train)
test_dataset = CatastDataset(tokenized_test, y_test)
gt_dataset = CatastDataset(tokenized_gt, y_gt)

In [13]:
training_args = TrainingArguments(
    output_dir='./results',
    save_strategy='steps',
    evaluation_strategy='steps',
    save_steps=100,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True
)

In [14]:
metric1 = evaluate.load('precision')
metric2 = evaluate.load('recall')
metric3 = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = metric1.compute(predictions=predictions, references=labels)['precision']
    recall = metric2.compute(predictions=predictions, references=labels)['recall']
    accuracy = metric3.compute(predictions=predictions, references=labels)['accuracy']

    return {'precision': precision, 'recall': recall, 'accuracy': accuracy}

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [16]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Precision,Recall,Accuracy
100,0.6957,0.690293,0.606838,0.186719,0.525859
200,0.6764,0.661234,0.615669,0.547666,0.59693
300,0.6492,0.635257,0.684615,0.49737,0.628629
400,0.641,0.599897,0.680836,0.642341,0.665666
500,0.5749,0.581228,0.788496,0.572321,0.705038
600,0.5861,0.544917,0.729388,0.721236,0.722723
700,0.5523,0.539586,0.82287,0.603222,0.732733
800,0.5358,0.51831,0.788447,0.677515,0.744077
900,0.526,0.514437,0.774265,0.692308,0.741408
1000,0.5492,0.505659,0.761449,0.732413,0.747748


TrainOutput(global_step=4496, training_loss=0.4883415783851596, metrics={'train_runtime': 1280.2073, 'train_samples_per_second': 28.092, 'train_steps_per_second': 3.512, 'total_flos': 422154049693680.0, 'train_loss': 0.4883415783851596, 'epoch': 2.0})

In [17]:
trainer.evaluate(gt_dataset)

{'eval_loss': 0.5918805599212646,
 'eval_precision': 1.0,
 'eval_recall': 0.6261682242990654,
 'eval_accuracy': 0.6261682242990654,
 'eval_runtime': 0.1543,
 'eval_samples_per_second': 693.579,
 'eval_steps_per_second': 90.749,
 'epoch': 2.0}

In [18]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.4544934630393982,
 'eval_precision': 0.8768115942028986,
 'eval_recall': 0.6761998685075609,
 'eval_accuracy': 0.7874541207874541,
 'eval_runtime': 18.4443,
 'eval_samples_per_second': 324.979,
 'eval_steps_per_second': 40.663,
 'epoch': 2.0}

In [19]:
trainer.save_model('rubert-tiny2_catast')

In [20]:
!zip -r ./model.zip ./rubert-tiny2_catast

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: rubert-tiny2_catast/ (stored 0%)
  adding: rubert-tiny2_catast/config.json (deflated 49%)
  adding: rubert-tiny2_catast/pytorch_model.bin (deflated 8%)
  adding: rubert-tiny2_catast/training_args.bin (deflated 49%)


In [21]:
# !zip -r /content/checkpoint.zip /content/results/checkpoint-4100