In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset, DatasetDict


In [3]:



data = load_dataset('csv', data_files='./data/shai-training-2024-a-level-2/Train.csv', split='train')

data = DatasetDict({
    'train': load_dataset('csv', data_files='./data/shai-training-2024-a-level-2/Train.csv', split='train'),
    'val': load_dataset('csv', data_files='./data/shai-training-2024-a-level-2/Valid.csv', split='train'),
    'test':load_dataset('csv', data_files='./data/shai-training-2024-a-level-2/Test.csv', split='train')
})

data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 5000
    })
})

In [3]:
df = data['train'].to_pandas()
df

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [5]:
df[['text', 'label']].describe()

Unnamed: 0,label
count,40000.0
mean,0.499525
std,0.500006
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [4]:
df['label'].value_counts()

label
0    20019
1    19981
Name: count, dtype: int64

In [6]:
df.isna().sum()

text     0
label    0
dtype: int64

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

checkpoint = 'JamesH/Movie_review_sentiment_analysis_model'
# checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation=True, cache_dir='./models_weights')
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, cache_dir='./models_weights')

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
 

In [11]:
model.base_model_prefix

'deberta'

In [10]:
model.deberta

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0): DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermediate): Deb

# Pre-process Data

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 5000
    })
})

In [5]:
def preprocess_data(batch):
    #batch['text'] = [x1 + f' {tokenizer.sep_token} ' + x2 for x1,x2 in zip(batch['premise'],batch['hypothesis'])]
    return tokenizer(batch['text'])

data = data.map(preprocess_data, batched=True)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40000
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

# TrainerAPI

In [8]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_metric


def compute_metrics(eval_pred):
    metric = load_metric('accuracy')
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=labels)


train_args = TrainingArguments(
    output_dir='results_debertav2',
    logging_dir='logs',
    report_to=[],
    
    evaluation_strategy='steps',
    eval_steps=5000,
    logging_steps=5000,
    save_steps=5000,
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    learning_rate=1e-6,
    weight_decay=0.005,
    num_train_epochs=5,
    fp16=True,
    
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=3,
)

train_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=5000,
evaluation_strategy=steps,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_mode

In [9]:
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=data['train'],
    eval_dataset=data['val'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(3)]
)



In [10]:
trainer.train()

  0%|          | 0/200000 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.2975, 'learning_rate': 9.7504e-07, 'epoch': 0.12}


  0%|          | 0/5000 [00:00<?, ?it/s]

  metric = load_metric('accuracy')


{'eval_loss': 0.3068280816078186, 'eval_accuracy': 0.9508, 'eval_runtime': 33.8421, 'eval_samples_per_second': 147.745, 'eval_steps_per_second': 147.745, 'epoch': 0.12}
{'loss': 0.315, 'learning_rate': 9.500499999999999e-07, 'epoch': 0.25}


  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.30203741788864136, 'eval_accuracy': 0.9462, 'eval_runtime': 34.347, 'eval_samples_per_second': 145.573, 'eval_steps_per_second': 145.573, 'epoch': 0.25}
{'loss': 0.3069, 'learning_rate': 9.2506e-07, 'epoch': 0.38}


  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.3368346691131592, 'eval_accuracy': 0.9482, 'eval_runtime': 33.8452, 'eval_samples_per_second': 147.731, 'eval_steps_per_second': 147.731, 'epoch': 0.38}
{'loss': 0.2865, 'learning_rate': 9.0007e-07, 'epoch': 0.5}


  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.27460843324661255, 'eval_accuracy': 0.9564, 'eval_runtime': 33.9213, 'eval_samples_per_second': 147.4, 'eval_steps_per_second': 147.4, 'epoch': 0.5}
{'loss': 0.2461, 'learning_rate': 8.7508e-07, 'epoch': 0.62}


  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.26058799028396606, 'eval_accuracy': 0.9618, 'eval_runtime': 33.2865, 'eval_samples_per_second': 150.211, 'eval_steps_per_second': 150.211, 'epoch': 0.62}
{'loss': 0.2707, 'learning_rate': 8.5009e-07, 'epoch': 0.75}


  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.2539912462234497, 'eval_accuracy': 0.9624, 'eval_runtime': 33.2672, 'eval_samples_per_second': 150.298, 'eval_steps_per_second': 150.298, 'epoch': 0.75}
{'loss': 0.25, 'learning_rate': 8.25095e-07, 'epoch': 0.88}


  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.2723315358161926, 'eval_accuracy': 0.9588, 'eval_runtime': 34.5309, 'eval_samples_per_second': 144.798, 'eval_steps_per_second': 144.798, 'epoch': 0.88}
{'loss': 0.2643, 'learning_rate': 8.001049999999999e-07, 'epoch': 1.0}


  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.26927995681762695, 'eval_accuracy': 0.9572, 'eval_runtime': 33.5632, 'eval_samples_per_second': 148.973, 'eval_steps_per_second': 148.973, 'epoch': 1.0}
{'loss': 0.2056, 'learning_rate': 7.75115e-07, 'epoch': 1.12}


  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.2774675488471985, 'eval_accuracy': 0.959, 'eval_runtime': 33.6733, 'eval_samples_per_second': 148.486, 'eval_steps_per_second': 148.486, 'epoch': 1.12}
{'train_runtime': 2417.9789, 'train_samples_per_second': 82.714, 'train_steps_per_second': 82.714, 'train_loss': 0.2714110649956597, 'epoch': 1.12}


TrainOutput(global_step=45000, training_loss=0.2714110649956597, metrics={'train_runtime': 2417.9789, 'train_samples_per_second': 82.714, 'train_steps_per_second': 82.714, 'train_loss': 0.2714110649956597, 'epoch': 1.12})

In [11]:
trainer.evaluate()

  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.2539912462234497,
 'eval_accuracy': 0.9624,
 'eval_runtime': 33.9996,
 'eval_samples_per_second': 147.06,
 'eval_steps_per_second': 147.06,
 'epoch': 1.12}

In [12]:
data['test'] = data['test'].remove_columns(['label'])

# Prediction

In [13]:
preds = trainer.predict(data['test'])
preds

  0%|          | 0/5000 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[ 2.9101562, -3.203125 ],
       [ 4.59375  , -5.2226562],
       [ 4.84375  , -5.640625 ],
       ...,
       [ 4.6210938, -5.2070312],
       [ 4.7890625, -5.5507812],
       [ 4.0507812, -4.4492188]], dtype=float32), label_ids=None, metrics={'test_runtime': 33.0078, 'test_samples_per_second': 151.479, 'test_steps_per_second': 151.479})

In [16]:
preds.predictions

array([[ 2.9101562, -3.203125 ],
       [ 4.59375  , -5.2226562],
       [ 4.84375  , -5.640625 ],
       ...,
       [ 4.6210938, -5.2070312],
       [ 4.7890625, -5.5507812],
       [ 4.0507812, -4.4492188]], dtype=float32)

In [17]:
import pickle

with open('predictions/deberta_finetuned.pkl', 'wb') as f:
    pickle.dump(preds.predictions, f)

In [14]:
pd.DataFrame({
    'id': data['test']['id'],
    'label': np.argmax(preds.predictions, axis=1)
}).to_csv('submissions/submission_deberta_finetuned.csv', header=True, index=False)