In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset, DatasetDict


In [2]:



# Folder holding the competition CSVs; change it here if the data moves.
DATA_DIR = './data/shai-training-2024-a-level-2'

# Split name used downstream -> CSV file that backs it.
SPLIT_FILES = {'train': 'Train.csv', 'val': 'Valid.csv', 'test': 'Test.csv'}

# Each CSV is loaded on its own (the test file carries an extra `id` column)
# and read exactly once; `split='train'` makes load_dataset hand back a plain
# Dataset, which is then filed under the split name we actually want.
data = DatasetDict({
    split_name: load_dataset('csv', data_files=f'{DATA_DIR}/{file_name}', split='train')
    for split_name, file_name in SPLIT_FILES.items()
})

data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 5000
    })
})

In [3]:

# Copy the training split into a pandas DataFrame for the exploratory checks
# in the following cells.
df = data['train'].to_pandas()
df

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [4]:
# Summary statistics; with default settings describe() reports numeric columns
# only, so `label` is the only column shown even though `text` is selected.
df[['text', 'label']].describe()

Unnamed: 0,label
count,40000.0
mean,0.499525
std,0.500006
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [5]:
# Class balance check: number of training rows per label value.
df['label'].value_counts()

label
0    20019
1    19981
Name: count, dtype: int64

In [6]:
# Missing-value check: count of NaN entries in each column.
df.isna().sum()

text     0
label    0
dtype: int64

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

checkpoint = 'google-bert/bert-base-uncased'
# checkpoint = 'distilbert-base-uncased'

# Single place to change where downloaded weights/tokenizer files are cached.
WEIGHTS_CACHE_DIR = './models_weights'

# Truncation is requested when the tokenizer is *called* (the preprocessing
# cell passes truncation=True), not when it is loaded, so no `truncation=`
# argument is given here.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=WEIGHTS_CACHE_DIR)

# num_labels=2 attaches a fresh two-way classification head on top of the
# pretrained encoder.
# NOTE(review): ignore_mismatched_sizes=True looks unnecessary for this
# checkpoint (the load log reports the classifier weights as newly
# initialized, not mismatched) -- confirm before removing.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    ignore_mismatched_sizes=True,
    cache_dir=WEIGHTS_CACHE_DIR,
)

# Pads every training/eval batch to the length of its longest member.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# Pre-process Data

In [4]:
# Show the splits (column names and row counts) before tokenisation.
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 5000
    })
})

In [5]:
# Number of position embeddings in the loaded model config; the max_length
# used when tokenising in the next cell is set to this same value.
model.config.max_position_embeddings

512

In [6]:
def preprocess_data(batch):
    """Tokenise the `text` field of a batch of examples.

    Sequences longer than 512 tokens are truncated. No padding is added here:
    `data_collator` (a DataCollatorWithPadding) pads each training/eval batch
    to its own longest sequence, which is cheaper than padding every example
    to the longest text in a large `map` chunk.

    Args:
        batch: mapping with a 'text' entry holding a list of strings, as
            supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch; its fields are added to the
        dataset as new columns.
    """
    return tokenizer(batch['text'], truncation=True, max_length=512)

data = data.map(preprocess_data, batched=True)
data

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40000
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

# Trainer API

In [7]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_metric


def compute_metrics(eval_pred):
    """Return the accuracy of the predicted classes for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; ``logits`` has one row per
            example and one column per class, ``labels`` holds the reference
            class index of each example.

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is the fraction of rows
        whose arg-max class equals the label, as a plain Python float.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    # Computed directly with NumPy: the previous `load_metric('accuracy')`
    # call is deprecated in `datasets` and re-loaded the metric on every
    # evaluation round.
    return {'accuracy': float(np.mean(predicted_classes == np.asarray(labels)))}


# Evaluation, logging and checkpointing all run on the same step interval.
STEP_INTERVAL = 200

train_args = TrainingArguments(
    # --- outputs ---------------------------------------------------------
    output_dir='results_bert',
    logging_dir='logs',
    report_to=[],            # no external experiment trackers
    save_total_limit=3,

    # --- optimisation ----------------------------------------------------
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.005,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # optional toggles, currently disabled: fp16=True, use_cpu=True

    # --- step-based cadence ----------------------------------------------
    evaluation_strategy='steps',
    eval_steps=STEP_INTERVAL,
    logging_steps=STEP_INTERVAL,
    save_steps=STEP_INTERVAL,

    # --- model selection -------------------------------------------------
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

train_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=200,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_mode

In [8]:
# Early stopping with a patience of 3, spelled out by keyword.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, tokenised splits, batch collator and metric into one Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=data['train'],
    eval_dataset=data['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)



In [9]:
# Run fine-tuning. In the recorded run training ended at step 3800 of 12500
# (epoch 1.52), i.e. before the configured 5 epochs were completed.
trainer.train()

  0%|          | 0/12500 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.3811, 'learning_rate': 1.968e-05, 'epoch': 0.08}


  0%|          | 0/313 [00:00<?, ?it/s]

  metric = load_metric('accuracy')


{'eval_loss': 0.2778383791446686, 'eval_accuracy': 0.9018, 'eval_runtime': 26.2359, 'eval_samples_per_second': 190.579, 'eval_steps_per_second': 11.93, 'epoch': 0.08}
{'loss': 0.266, 'learning_rate': 1.936e-05, 'epoch': 0.16}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.22265322506427765, 'eval_accuracy': 0.9138, 'eval_runtime': 25.0642, 'eval_samples_per_second': 199.488, 'eval_steps_per_second': 12.488, 'epoch': 0.16}
{'loss': 0.246, 'learning_rate': 1.904e-05, 'epoch': 0.24}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.20689204335212708, 'eval_accuracy': 0.9262, 'eval_runtime': 25.6515, 'eval_samples_per_second': 194.92, 'eval_steps_per_second': 12.202, 'epoch': 0.24}
{'loss': 0.2491, 'learning_rate': 1.8720000000000004e-05, 'epoch': 0.32}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.2765049636363983, 'eval_accuracy': 0.9112, 'eval_runtime': 25.1381, 'eval_samples_per_second': 198.901, 'eval_steps_per_second': 12.451, 'epoch': 0.32}
{'loss': 0.2085, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.4}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.19890427589416504, 'eval_accuracy': 0.9262, 'eval_runtime': 25.2042, 'eval_samples_per_second': 198.38, 'eval_steps_per_second': 12.419, 'epoch': 0.4}
{'loss': 0.2158, 'learning_rate': 1.8080000000000003e-05, 'epoch': 0.48}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.20913510024547577, 'eval_accuracy': 0.9268, 'eval_runtime': 25.9327, 'eval_samples_per_second': 192.807, 'eval_steps_per_second': 12.07, 'epoch': 0.48}
{'loss': 0.1972, 'learning_rate': 1.7760000000000003e-05, 'epoch': 0.56}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.1917170286178589, 'eval_accuracy': 0.9338, 'eval_runtime': 26.6576, 'eval_samples_per_second': 187.564, 'eval_steps_per_second': 11.742, 'epoch': 0.56}
{'loss': 0.2115, 'learning_rate': 1.7440000000000002e-05, 'epoch': 0.64}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.21883085370063782, 'eval_accuracy': 0.9232, 'eval_runtime': 25.757, 'eval_samples_per_second': 194.122, 'eval_steps_per_second': 12.152, 'epoch': 0.64}
{'loss': 0.1987, 'learning_rate': 1.7120000000000002e-05, 'epoch': 0.72}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.2111566960811615, 'eval_accuracy': 0.9296, 'eval_runtime': 25.5346, 'eval_samples_per_second': 195.813, 'eval_steps_per_second': 12.258, 'epoch': 0.72}
{'loss': 0.197, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.8}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.210626021027565, 'eval_accuracy': 0.9346, 'eval_runtime': 25.4539, 'eval_samples_per_second': 196.434, 'eval_steps_per_second': 12.297, 'epoch': 0.8}
{'loss': 0.2095, 'learning_rate': 1.648e-05, 'epoch': 0.88}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.2183724194765091, 'eval_accuracy': 0.9204, 'eval_runtime': 25.6768, 'eval_samples_per_second': 194.728, 'eval_steps_per_second': 12.19, 'epoch': 0.88}
{'loss': 0.1959, 'learning_rate': 1.616e-05, 'epoch': 0.96}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.23029033839702606, 'eval_accuracy': 0.9278, 'eval_runtime': 25.4136, 'eval_samples_per_second': 196.745, 'eval_steps_per_second': 12.316, 'epoch': 0.96}
{'loss': 0.1759, 'learning_rate': 1.584e-05, 'epoch': 1.04}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.19758418202400208, 'eval_accuracy': 0.9378, 'eval_runtime': 25.3577, 'eval_samples_per_second': 197.179, 'eval_steps_per_second': 12.343, 'epoch': 1.04}
{'loss': 0.0994, 'learning_rate': 1.552e-05, 'epoch': 1.12}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.2523944675922394, 'eval_accuracy': 0.9356, 'eval_runtime': 26.339, 'eval_samples_per_second': 189.833, 'eval_steps_per_second': 11.884, 'epoch': 1.12}
{'loss': 0.1218, 'learning_rate': 1.5200000000000002e-05, 'epoch': 1.2}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.2643783688545227, 'eval_accuracy': 0.9328, 'eval_runtime': 26.8466, 'eval_samples_per_second': 186.243, 'eval_steps_per_second': 11.659, 'epoch': 1.2}
{'loss': 0.1422, 'learning_rate': 1.4880000000000002e-05, 'epoch': 1.28}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.21238921582698822, 'eval_accuracy': 0.938, 'eval_runtime': 25.4037, 'eval_samples_per_second': 196.821, 'eval_steps_per_second': 12.321, 'epoch': 1.28}
{'loss': 0.1321, 'learning_rate': 1.4560000000000001e-05, 'epoch': 1.36}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.24160243570804596, 'eval_accuracy': 0.9332, 'eval_runtime': 25.5888, 'eval_samples_per_second': 195.398, 'eval_steps_per_second': 12.232, 'epoch': 1.36}
{'loss': 0.1091, 'learning_rate': 1.4240000000000001e-05, 'epoch': 1.44}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.249823197722435, 'eval_accuracy': 0.935, 'eval_runtime': 26.3385, 'eval_samples_per_second': 189.836, 'eval_steps_per_second': 11.884, 'epoch': 1.44}
{'loss': 0.1308, 'learning_rate': 1.392e-05, 'epoch': 1.52}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.2804271876811981, 'eval_accuracy': 0.9276, 'eval_runtime': 24.5725, 'eval_samples_per_second': 203.479, 'eval_steps_per_second': 12.738, 'epoch': 1.52}
{'train_runtime': 1397.6418, 'train_samples_per_second': 143.098, 'train_steps_per_second': 8.944, 'train_loss': 0.19409427843595806, 'epoch': 1.52}


TrainOutput(global_step=3800, training_loss=0.19409427843595806, metrics={'train_runtime': 1397.6418, 'train_samples_per_second': 143.098, 'train_steps_per_second': 8.944, 'train_loss': 0.19409427843595806, 'epoch': 1.52})

In [10]:
# Score the validation split with the model the trainer now holds. With
# load_best_model_at_end=True the recorded numbers match the best logged
# evaluation (accuracy 0.938) rather than the last one before training stopped.
trainer.evaluate()

  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.21238921582698822,
 'eval_accuracy': 0.938,
 'eval_runtime': 26.1772,
 'eval_samples_per_second': 191.006,
 'eval_steps_per_second': 11.957,
 'epoch': 1.52}

In [11]:
# Drop the `label` column from the test split before predicting (with it gone,
# the prediction output below reports label_ids=None).
# NOTE(review): presumably the test labels are placeholders -- confirm.
# The membership check keeps this cell safe to re-run: without it a second
# execution fails because the column has already been removed.
if 'label' in data['test'].column_names:
    data['test'] = data['test'].remove_columns(['label'])

# Prediction

In [12]:
# Run inference on the test split; `preds.predictions` holds one row of two
# raw scores per test example (see the array in the output).
preds = trainer.predict(data['test'])
preds

  0%|          | 0/313 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[ 1.0800781, -1.7431641],
       [ 2.9902344, -3.4980469],
       [ 3.0214844, -3.5214844],
       ...,
       [ 3.0683594, -3.5585938],
       [ 3.1621094, -3.6484375],
       [ 2.5507812, -3.1152344]], dtype=float32), label_ids=None, metrics={'test_runtime': 14.173, 'test_samples_per_second': 352.783, 'test_steps_per_second': 22.084})

In [13]:
import os

# to_csv does not create missing folders; make sure the target directory
# exists so the export cannot fail after a long training run.
os.makedirs('submissions', exist_ok=True)

submission = pd.DataFrame({
    'id': data['test']['id'],
    # Predicted class = column index of the larger score in each row.
    'label': np.argmax(preds.predictions, axis=1)
})
submission.to_csv('submissions/submission_bert.csv', header=True, index=False)