In [41]:
import datasets
import transformers
from datasets import load_dataset
from evaluate import load

In [40]:
train_dataset = load_dataset('json', data_files="data/naive_random.json", field="train",split="train")
val_dataset = load_dataset('json', data_files="data/naive_random.json", field="val",split="train")
test_dataset = load_dataset('json', data_files="data/naive_random.json", field="test",split="train")


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [47]:

test_dataset[0]

{'idx': 135430,
 'lengths_punctuation': [','],
 'number': 0,
 'clue': 'Achy shaking stopped by iodine, salt and kaolin',
 'soln': 'chinaclay',
 'across_or_down': '',
 'id': '',
 'creator': 'Arachne',
 'type': 'cryptic',
 'unique_clue_id': '',
 'orig_lengths': '5,4',
 'pos': [0, 0],
 'lengths': [5, 4],
 'dataset': '',
 'soln_with_spaces': 'china clay'}

In [49]:
# Concat clue, with length. 
def concat_length(example):

    example["clue"] = f'{example["clue"]}.  {example["orig_lengths"]}.'

    return example

In [50]:
train_dataset = train_dataset.map(concat_length)
val_dataset = val_dataset.map(concat_length)
test_dataset = test_dataset.map(concat_length)


Map:   0%|          | 0/28476 [00:00<?, ? examples/s]

Map:   0%|          | 0/28476 [00:00<?, ? examples/s]

In [53]:
train_dataset = train_dataset.select_columns(["clue", "soln"])
val_dataset = val_dataset.select_columns(["clue", "soln"])
test_dataset = test_dataset.select_columns(["clue", "soln"])


In [54]:
train_dataset[:2]

{'clue': ['Suffering to grasp edge of plant.  8.',
  'Honour Ben and Noel with new order.  7.'],
 'soln': ['agrimony', 'ennoble']}

In [56]:
model_checkpoint = "t5-small"
metric = load("rouge")

In [57]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
max_input_length = 1024
max_target_length = 32
def preprocess_function(examples):
    inputs = [doc for doc in examples["clue"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["soln"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [60]:
tokenized_train= train_dataset.map(preprocess_function, batched=True)
tokenized_val= val_dataset.map(preprocess_function, batched=True)
tokenized_test= test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/85428 [00:00<?, ? examples/s]

Map:   0%|          | 0/28476 [00:00<?, ? examples/s]

Map:   0%|          | 0/28476 [00:00<?, ? examples/s]

In [61]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


  warn(msg)


CUDA SETUP: CUDA runtime path found: /home/abdelrahman.sadallah/local/cuda-11.7/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/abdelrahman.sadallah/.conda/envs/nlp/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [67]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [70]:
train_batch_size = 256
val_batch_size = 128
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"experiments/{model_name}-finetuned-random",
    evaluation_strategy = "steps",
    save_strategy="steps",
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [71]:
trainer.train()

  0%|          | 0/13350 [00:00<?, ?it/s]

{'loss': 3.9054, 'learning_rate': 1.9253932584269666e-05, 'epoch': 0.37}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.623079299926758, 'eval_rouge1': 0.0702, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0702, 'eval_rougeLsum': 0.0702, 'eval_gen_len': 5.5641, 'eval_runtime': 70.2341, 'eval_samples_per_second': 405.444, 'eval_steps_per_second': 12.672, 'epoch': 0.37}
{'loss': 3.8716, 'learning_rate': 1.850486891385768e-05, 'epoch': 0.75}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.5834407806396484, 'eval_rouge1': 0.0737, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0737, 'eval_rougeLsum': 0.0737, 'eval_gen_len': 5.6181, 'eval_runtime': 72.676, 'eval_samples_per_second': 391.821, 'eval_steps_per_second': 12.246, 'epoch': 0.75}
{'loss': 3.8344, 'learning_rate': 1.7755805243445693e-05, 'epoch': 1.12}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.549156427383423, 'eval_rouge1': 0.0773, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0773, 'eval_rougeLsum': 0.0773, 'eval_gen_len': 5.6896, 'eval_runtime': 75.6224, 'eval_samples_per_second': 376.555, 'eval_steps_per_second': 11.769, 'epoch': 1.12}
{'loss': 3.7785, 'learning_rate': 1.700674157303371e-05, 'epoch': 1.5}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.524308919906616, 'eval_rouge1': 0.0927, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0927, 'eval_rougeLsum': 0.092, 'eval_gen_len': 5.6986, 'eval_runtime': 77.619, 'eval_samples_per_second': 366.869, 'eval_steps_per_second': 11.466, 'epoch': 1.5}
{'loss': 3.7476, 'learning_rate': 1.625917602996255e-05, 'epoch': 1.87}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.498232841491699, 'eval_rouge1': 0.0994, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0997, 'eval_rougeLsum': 0.0997, 'eval_gen_len': 5.7379, 'eval_runtime': 79.8816, 'eval_samples_per_second': 356.478, 'eval_steps_per_second': 11.141, 'epoch': 1.87}
{'loss': 3.7336, 'learning_rate': 1.5510112359550563e-05, 'epoch': 2.25}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.4806065559387207, 'eval_rouge1': 0.0948, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0962, 'eval_rougeLsum': 0.0962, 'eval_gen_len': 5.7307, 'eval_runtime': 80.5971, 'eval_samples_per_second': 353.313, 'eval_steps_per_second': 11.043, 'epoch': 2.25}
{'loss': 3.7191, 'learning_rate': 1.4761048689138577e-05, 'epoch': 2.62}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.463360071182251, 'eval_rouge1': 0.1124, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1138, 'eval_rougeLsum': 0.1131, 'eval_gen_len': 5.77, 'eval_runtime': 80.1386, 'eval_samples_per_second': 355.334, 'eval_steps_per_second': 11.106, 'epoch': 2.62}
{'loss': 3.6838, 'learning_rate': 1.4011985018726592e-05, 'epoch': 3.0}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.4470057487487793, 'eval_rouge1': 0.1054, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1068, 'eval_rougeLsum': 0.1054, 'eval_gen_len': 5.7878, 'eval_runtime': 80.642, 'eval_samples_per_second': 353.116, 'eval_steps_per_second': 11.036, 'epoch': 3.0}
{'loss': 3.679, 'learning_rate': 1.3264419475655432e-05, 'epoch': 3.37}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.4345221519470215, 'eval_rouge1': 0.1032, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1032, 'eval_rougeLsum': 0.1032, 'eval_gen_len': 5.7553, 'eval_runtime': 81.643, 'eval_samples_per_second': 348.787, 'eval_steps_per_second': 10.901, 'epoch': 3.37}
{'loss': 3.6409, 'learning_rate': 1.2515355805243448e-05, 'epoch': 3.75}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.4219400882720947, 'eval_rouge1': 0.1208, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1208, 'eval_rougeLsum': 0.1201, 'eval_gen_len': 5.7883, 'eval_runtime': 81.2521, 'eval_samples_per_second': 350.465, 'eval_steps_per_second': 10.954, 'epoch': 3.75}
{'loss': 3.6442, 'learning_rate': 1.1766292134831461e-05, 'epoch': 4.12}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.4110565185546875, 'eval_rouge1': 0.1208, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1208, 'eval_rougeLsum': 0.1201, 'eval_gen_len': 5.7823, 'eval_runtime': 80.4573, 'eval_samples_per_second': 353.927, 'eval_steps_per_second': 11.062, 'epoch': 4.12}
{'loss': 3.6309, 'learning_rate': 1.1017228464419476e-05, 'epoch': 4.49}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.4030439853668213, 'eval_rouge1': 0.1173, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1173, 'eval_rougeLsum': 0.1159, 'eval_gen_len': 5.83, 'eval_runtime': 87.2087, 'eval_samples_per_second': 326.527, 'eval_steps_per_second': 10.205, 'epoch': 4.49}
{'loss': 3.6197, 'learning_rate': 1.0269662921348315e-05, 'epoch': 4.87}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.395129442214966, 'eval_rouge1': 0.1243, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1243, 'eval_rougeLsum': 0.1236, 'eval_gen_len': 5.8093, 'eval_runtime': 83.5148, 'eval_samples_per_second': 340.97, 'eval_steps_per_second': 10.657, 'epoch': 4.87}
{'loss': 3.6153, 'learning_rate': 9.52059925093633e-06, 'epoch': 5.24}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.385956287384033, 'eval_rouge1': 0.1306, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1313, 'eval_rougeLsum': 0.1299, 'eval_gen_len': 5.7987, 'eval_runtime': 82.244, 'eval_samples_per_second': 346.238, 'eval_steps_per_second': 10.821, 'epoch': 5.24}
{'loss': 3.6006, 'learning_rate': 8.771535580524345e-06, 'epoch': 5.62}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.3795816898345947, 'eval_rouge1': 0.1334, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1349, 'eval_rougeLsum': 0.1327, 'eval_gen_len': 5.7979, 'eval_runtime': 83.2986, 'eval_samples_per_second': 341.855, 'eval_steps_per_second': 10.684, 'epoch': 5.62}
{'loss': 3.5937, 'learning_rate': 8.02247191011236e-06, 'epoch': 5.99}




  0%|          | 0/890 [00:00<?, ?it/s]

{'eval_loss': 3.37384295463562, 'eval_rouge1': 0.1454, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1457, 'eval_rougeLsum': 0.144, 'eval_gen_len': 5.8059, 'eval_runtime': 83.9837, 'eval_samples_per_second': 339.066, 'eval_steps_per_second': 10.597, 'epoch': 5.99}


KeyboardInterrupt: 

In [88]:
# test 

model_name = "t5-small-finetuned-random/checkpoint-8000"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

max_input_length = 512

inputs = ['With a degree, I leave this subject. 5.']
inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_title)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


etiquette
