In [15]:
# Install needed libraries
!pip install transformers datasets
!pip install sentencepiece
!pip install wandb



In [16]:

# Import all the needed libraries
import numpy as np
import pandas as pd
import torch
import functools
import wandb
import random
import os

from datasets import Dataset, DatasetDict, load_metric

from sklearn.metrics import classification_report, confusion_matrix

from keras.callbacks import EarlyStopping

from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 TrainingArguments, Trainer, pipeline, EarlyStoppingCallback, \
 EncoderDecoderModel, RobertaTokenizerFast

In [17]:
# Check that pyTorch is identifying the GPU
if torch.cuda.device_count() > 0:
  print(f'GPU detected. Currently using: "{torch.cuda.get_device_name(0)}"')
else:
  raise Exception('Currently using CPU, change the type of the runtime in the \'runtime\' tab')

GPU detected. Currently using: "NVIDIA GeForce RTX 4090"


In [18]:
'''Loading data'''
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split


df = pd.read_json(os.getcwd()+'/SubtaskA/subtaskA_train_monolingual.jsonl', lines=True)
# Just interested so far in text and label
df = df[['text', 'label']]

'So testing doesnt takes too much time processing, 10k seems ok'
df=df.sample(20000)

print('\nExample of dataframe (text|label)\n')
print(df.sample(5))
print(f'\nSize{df.shape}')
print(f'\nValue count \n')
print(df['label'].value_counts())

# Splitting the DataFrame into train, dev, and test sets
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)  # 60% training, 40% temporary
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)  # 20% development, 20% test

# Convert dataframes to datasets objects
train_dataset = Dataset.from_pandas(train_df, split='train')
valid_dataset = Dataset.from_pandas(dev_df, split='valid')
test_dataset = Dataset.from_pandas(test_df, split='test')

dataset = DatasetDict({'train': train_dataset, 'valid': valid_dataset, 'test': test_dataset})
dataset



Example of dataframe (text|label)

                                                     text  label
115179    Recent applications have proved that the Sur...      0
88021   You've misread the article that you linked. \n...      0
63746    Knowing your own strengths and what you care ...      0
77921   The Roman Catholic Diocese of Jaipur (Dioecesi...      0
756     How to Make Crepe Paper Peonies\n\nIf you're l...      1

Size(20000, 2)

Value count 

label
0    10406
1     9594
Name: count, dtype: int64


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 12000
    })
    valid: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4000
    })
})

In [19]:
accuracy = load_metric('accuracy')
f1 = load_metric('f1')

In [20]:
def compute_metric(eval_pred):
  predictions, labels = eval_pred

  predictions = np.argmax(predictions, axis=1)

  result_acc = accuracy.compute(predictions=predictions, references=labels)['accuracy']
  result_f1 = f1.compute(predictions=predictions, references=labels)['f1']

  return {'accuracy': result_acc, 'f1-score': result_f1}

In [21]:
model_checkpoint="bert-base-uncased"
model_name = model_checkpoint.split("/")[-1]
model_name

'bert-base-uncased'

In [22]:
n_labels=2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=n_labels)

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 570kB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:05<00:00, 75.8MB/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.12MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 4.29MB/s]


In [24]:
RANDOM_SEED = 42
BATCH_SIZE = 6
EPOCHS = 10
METRIC_FOR_BEST_MODEL = "eval_loss"
GENERATE_GRAPHIC = True
LEARNING_RATE = 2e-05
WEIGHT_DECAY = 0.01

# Define the training parameters
num_train_samples = dataset['train'].num_rows
train_dataset = dataset['train'].shuffle(seed=RANDOM_SEED).select(range(num_train_samples))
logging_steps = len(train_dataset) // (2 * BATCH_SIZE * EPOCHS)

# If the chosen metric for best model is 'eval_loss' we have to adjust
# a parameter so it chooses the smallest value.
if METRIC_FOR_BEST_MODEL == 'eval_loss':
  metric_condition = False
else:
  metric_condition = True

# Tell to the trainer wether it needs to generate the graphic or not
if GENERATE_GRAPHIC:
  report_option = 'wandb'
else:
  report_option = None

training_args = TrainingArguments(
    output_dir='results',
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=metric_condition,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=logging_steps,
    save_total_limit=3,
    report_to=report_option,
    push_to_hub=False
)

In [31]:
import tensorflow as tf

train_text=train_df['text'].tolist()
train_label=train_df['label'].tolist()

val_text=dev_df['text'].tolist()
val_label=dev_df['label'].tolist()

test_text=test_df['text'].tolist()
test_label=test_df['label'].tolist()

MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME )

train_encodings = tokenizer(train_text, truncation=True, padding=True)
val_encodings = tokenizer(val_text, truncation=True, padding=True)
test_encodings = tokenizer(test_text, truncation=True, padding=True)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_label
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_label
))


In [32]:
ES_PATIENCE = 3
# Create a Trainer object that will do the work for us
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metric,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=ES_PATIENCE)],
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

In [33]:
print(torch.cuda.memory_summary())


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 428786 KiB |    895 MiB |    895 MiB | 488176 KiB |
|       from large pool | 428288 KiB |    894 MiB |    894 MiB | 487680 KiB |
|       from small pool |    498 KiB |      0 MiB |      0 MiB |    496 KiB |
|---------------------------------------------------------------------------|
| Active memory         | 428786 KiB |    895 MiB |    895 MiB | 488176 KiB |
|       from large pool | 428288 KiB |    894 MiB |    894 MiB | 487680 KiB |
|       from small pool |    498 KiB |      0 MiB |      0 MiB |    496 KiB |
|---------------------------------------------------------------

In [34]:
next(model.parameters()).is_cuda

True

In [35]:
trainer.train()



TypeError: '_TensorSliceDataset' object is not subscriptable