In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

In [3]:
# Load each CSV split, drop the "essay_id" column and rename
# "full_text" -> "text" and "score" -> "label".
# Only the training split is shuffled (fixed seed for repeatability).
train_ds = (
    load_dataset("csv", data_files="../data/train_df.csv")
    .shuffle(seed=42)
    .remove_columns("essay_id")
    .rename_column("full_text", "text")
    .rename_column("score", "label")
)
valid_ds = (
    load_dataset("csv", data_files="../data/valid_df.csv")
    .remove_columns("essay_id")
    .rename_column("full_text", "text")
    .rename_column("score", "label")
)



In [4]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 13845
    })
})

In [5]:
valid_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3462
    })
})

In [6]:
from transformers import AutoTokenizer

In [7]:
# Tokenizer matching the "distilbert/distilbert-base-uncased" checkpoint used for the model below.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [8]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification

In [9]:
# Configuration object loaded from the same checkpoint name as the tokenizer.
config = DistilBertConfig.from_pretrained(
    "distilbert/distilbert-base-uncased"
)

In [10]:
# `from_pretrained` is a classmethod: call it on the class directly instead of
# first constructing (and immediately discarding) a randomly initialised model.
# num_labels=6 sizes the classification head; labels are shifted by -1 in
# preprocess_labels (presumably scores 1-6 -> class ids 0-5; confirm against the data).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=6,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [12]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook-level tokenizer.

    Truncation is enabled with a maximum length of 512.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=512, truncation=True)


def preprocess_labels(examples):
    """Return a new ``"label"`` column with every label in the batch reduced by one."""
    return {"label": [raw_label - 1 for raw_label in examples["label"]]}

In [13]:
# Tokenize both splits batch-wise (train first, then validation).
tokenized_train, tokenized_valid = (
    split.map(preprocess_function, batched=True)
    for split in (train_ds, valid_ds)
)

In [14]:
# Shift the labels of both splits with preprocess_labels.
# NOTE: this cell reassigns its own inputs, so running it a second time
# subtracts 1 from the labels again.
tokenized_train, tokenized_valid = (
    split.map(preprocess_labels, batched=True)
    for split in (tokenized_train, tokenized_valid)
)

In [15]:
import evaluate

2024-04-10 10:54:14.759762: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-10 10:54:14.803177: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# Metric object used by compute_metrics below.
accuracy = evaluate.load("accuracy")

In [17]:
import numpy as np

In [18]:
def compute_metrics(eval_pred):
    """Score an evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` is unpacked into two items: the first is reduced with an
    argmax along axis 1 to get predicted class ids, the second is passed
    through unchanged as the references.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)

In [19]:
from transformers import TrainingArguments, Trainer

In [20]:
# Fine-tuning hyperparameters. Evaluation and checkpointing both run once per
# epoch, which is what load_best_model_at_end needs to pick a checkpoint.
training_args = TrainingArguments(
    output_dir="clf",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    use_cpu=False,
)

# Each DatasetDict was built from a single CSV, so its only split is named
# "train" -- including the validation data (tokenized_valid["train"]).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train["train"],
    eval_dataset=tokenized_valid["train"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [21]:
# Run fine-tuning with the arguments configured above.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mayush-thakur[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1064,0.904034,0.608319
2,0.8685,0.870036,0.618718


TrainOutput(global_step=1732, training_loss=0.9487058097716017, metrics={'train_runtime': 628.635, 'train_samples_per_second': 44.048, 'train_steps_per_second': 2.755, 'total_flos': 3667088528105040.0, 'train_loss': 0.9487058097716017, 'epoch': 2.0})

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

In [3]:
# Load each CSV split, drop the "essay_id" column and rename
# "full_text" -> "text" and "score" -> "label".
# Only the training split is shuffled (fixed seed for repeatability).
train_ds = (
    load_dataset("csv", data_files="../data/train_df.csv")
    .shuffle(seed=42)
    .remove_columns("essay_id")
    .rename_column("full_text", "text")
    .rename_column("score", "label")
)
valid_ds = (
    load_dataset("csv", data_files="../data/valid_df.csv")
    .remove_columns("essay_id")
    .rename_column("full_text", "text")
    .rename_column("score", "label")
)



In [4]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 13845
    })
})

In [5]:
valid_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3462
    })
})

In [6]:
from transformers import AutoTokenizer

In [7]:
# Tokenizer matching the "distilbert/distilbert-base-uncased" checkpoint used for the model below.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [8]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification

In [9]:
# Configuration object loaded from the same checkpoint name as the tokenizer.
config = DistilBertConfig.from_pretrained(
    "distilbert/distilbert-base-uncased"
)

In [10]:
# `from_pretrained` is a classmethod: call it on the class directly instead of
# first constructing (and immediately discarding) a randomly initialised model.
# num_labels=6 sizes the classification head; labels are shifted by -1 in
# preprocess_labels (presumably scores 1-6 -> class ids 0-5; confirm against the data).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=6,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [12]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook-level tokenizer.

    Truncation is enabled with a maximum length of 512.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=512, truncation=True)


def preprocess_labels(examples):
    """Return a new ``"label"`` column with every label in the batch reduced by one."""
    return {"label": [raw_label - 1 for raw_label in examples["label"]]}

In [13]:
# Tokenize both splits batch-wise (train first, then validation).
tokenized_train, tokenized_valid = (
    split.map(preprocess_function, batched=True)
    for split in (train_ds, valid_ds)
)

In [14]:
# Shift the labels of both splits with preprocess_labels.
# NOTE: this cell reassigns its own inputs, so running it a second time
# subtracts 1 from the labels again.
tokenized_train, tokenized_valid = (
    split.map(preprocess_labels, batched=True)
    for split in (tokenized_train, tokenized_valid)
)

In [15]:
import evaluate

2024-04-10 10:54:14.759762: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-10 10:54:14.803177: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# Metric object used by compute_metrics below.
accuracy = evaluate.load("accuracy")

In [17]:
import numpy as np

In [18]:
def compute_metrics(eval_pred):
    """Score an evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` is unpacked into two items: the first is reduced with an
    argmax along axis 1 to get predicted class ids, the second is passed
    through unchanged as the references.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)

In [19]:
from transformers import TrainingArguments, Trainer

In [20]:
# Fine-tuning hyperparameters. Evaluation and checkpointing both run once per
# epoch, which is what load_best_model_at_end needs to pick a checkpoint.
training_args = TrainingArguments(
    output_dir="clf",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    use_cpu=False,
)

# Each DatasetDict was built from a single CSV, so its only split is named
# "train" -- including the validation data (tokenized_valid["train"]).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train["train"],
    eval_dataset=tokenized_valid["train"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [21]:
# Run fine-tuning with the arguments configured above.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mayush-thakur[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1064,0.904034,0.608319
2,0.8685,0.870036,0.618718


TrainOutput(global_step=1732, training_loss=0.9487058097716017, metrics={'train_runtime': 628.635, 'train_samples_per_second': 44.048, 'train_steps_per_second': 2.755, 'total_flos': 3667088528105040.0, 'train_loss': 0.9487058097716017, 'epoch': 2.0})