# Solving Cryptic Crosswords with LLMs: Part 1
date created: 04.08.2023

## Installing modules

In [None]:
# Install transformers from source, pinned to the commit that the saved output
# of this cell resolved to, so a re-run installs the same code instead of
# whatever the main branch currently points at.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install git+https://github.com/huggingface/transformers@2bd7a27a671fd1d98059124024f580f8f5c0f3b5

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ruhom1kq
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-ruhom1kq
  Resolved https://github.com/huggingface/transformers to commit 2bd7a27a671fd1d98059124024f580f8f5c0f3b5
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.32.0.dev0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.32.0.dev0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━

In [None]:
# Versions are pinned to the ones recorded in this cell's saved output so the
# environment can be rebuilt. torch is left unpinned because its version does
# not appear in that output (NOTE(review): pin it once the version is known).
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install torch datasets==2.14.2 evaluate==0.4.0 accelerate==0.21.0 sentencepiece==0.1.99

Collecting datasets
  Downloading datasets-2.14.2-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# import modules
import pandas as pd
import numpy as np
import ast
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# import parameters
import parameters

## Data Extraction and Transformation

In [None]:
# Import data
# The path lives in parameters.py; only the module itself is imported, so the
# name must be qualified. The shuffle is seeded (same seed as the split cell)
# so that re-running the notebook yields the same row order.
clues_raw = (
    pd.read_csv(parameters.clues_path_raw)
    .dropna()
    .sample(frac=1, random_state=200)
)

# Transform columns into the format required by the trainer module:
#   id       - the row id as a string
#   question - the full clue text
#   context  - the definition part of the clue
#   answers  - dict holding the answer text and a start offset (always 0 here)
clues = (
    clues_raw[['rowid', 'clue', 'answer', 'definition']]
    .assign(
        id=lambda d: d['rowid'].astype(str),
        question=lambda d: d['clue'],
        context=lambda d: d['definition'],
        answers=lambda d: d['answer'].map(
            lambda text: {"text": [text], "answer_start": [0]}
        ),
    )
    [['id', 'question', 'context', 'answers']]
    .dropna()
)

# Print examples
clues.head()

Unnamed: 0,id,question,context,answers
17504,84889,Star man has not completed study (8),Star,"{'text': ['HESPERUS'], 'answer_start': [0]}"
54485,47857,Revised design in case of emissions prompts an...,anxiety,"{'text': ['EDGINESS'], 'answer_start': [0]}"
18038,148635,Rock and roll primarily lacking in training of...,Rock,"{'text': ['GNEISS'], 'answer_start': [0]}"
27748,160715,"Previously in unison (2,3,4)",Previously/in unison,"{'text': ['AT ONE TIME'], 'answer_start': [0]}"
56655,560759,New drapes due to be brought round (9),brought round,"{'text': ['PERSUADED'], 'answer_start': [0]}"


In [None]:
# Split data into train, validation, test
# 10% of the rows are held out as the test set; the remaining 90% is split
# again 90/10 into train and validation. Both draws use a fixed seed.
train_val = clues.sample(frac=0.9, random_state=200)
test = clues.drop(train_val.index)
train = train_val.sample(frac=0.9, random_state=200)
validation = train_val.drop(train.index)

# The three subsets must partition the full dataset exactly
assert len(train) + len(validation) + len(test) == len(clues)

# Save data
# Output paths live in parameters.py; only the module itself is imported, so
# the names must be qualified.
clues.to_csv(parameters.clues_path_processed, index=False)
train.to_csv(parameters.clues_path_train, index=False)
validation.to_csv(parameters.clues_path_validation, index=False)
test.to_csv(parameters.clues_path_test, index=False)

## Fine Tuning

N.B. The modelling parameters are defined in the `parameters.py` file.

In [None]:
# Fine-tune, evaluate and predict with the seq2seq QA example script.
# IPython only substitutes Python values into a shell line when they are
# wrapped in {...}; a bare word such as `lr` would reach the script as the
# literal text "lr". All values come from parameters.py. Path arguments are
# quoted so that paths containing spaces survive shell word-splitting.
! python run_seq2seq_qa.py \
  --model_name_or_path "{parameters.model_name_t5}" \
  --train_file "{parameters.clues_path_train}" \
  --validation_file "{parameters.clues_path_validation}" \
  --test_file "{parameters.clues_path_test}" \
  --question_column question \
  --context_column context \
  --answer_column answers \
  --do_train \
  --do_eval \
  --do_predict \
  --predict_with_generate \
  --version_2_with_negative \
  --per_device_train_batch_size {parameters.batch_size} \
  --learning_rate {parameters.lr} \
  --num_train_epochs {parameters.num_epochs} \
  --max_seq_length {parameters.max_seq_length} \
  --overwrite_output_dir {parameters.overwrite_dir} \
  --output_dir "{parameters.output_dir}"

08/02/2023 20:10:55 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=None,
generation_max_length=None,
generation_num_beams=None,
gradient_a

## Predictions

In [None]:
# Read prediction output file
# The path lives in parameters.py; only the module itself is imported, so the
# name must be qualified. The 'id' column is renamed to 'rowid' so it lines up
# with the key column of the raw clues table.
predictions = (
    pd.read_json(parameters.predictions_path)
    .rename(columns={'id': 'rowid'})
)

# Show some examples
predictions.head()

Unnamed: 0,rowid,prediction_text,no_answer_probability
0,160715,IN THE MAIL,0
1,501666,MARTINI,0
2,488088,PORT A CAKE,0
3,135879,ANIMAL,0
4,392986,PRIORUM,0


In [None]:
# Attach every prediction to its source clue (inner join on rowid) and keep
# only the columns needed to judge the prediction.
compare = pd.merge(clues_raw, predictions, on='rowid')[
    ['clue', 'definition', 'answer', 'prediction_text']
]

# Character counts of the predicted and the true answer
predicted_len = compare['prediction_text'].str.len()
answer_len = compare['answer'].str.len()

# 1/0 flags: same length, exact string match, length off by at most one
compare['correct_len'] = (predicted_len == answer_len).astype(int)
compare['correct'] = (compare['prediction_text'] == compare['answer']).astype(int)
compare['correct_len_1'] = ((predicted_len - answer_len).abs() <= 1).astype(int)

# Row counts for every combination of the three flags
compare.groupby(['correct', 'correct_len', 'correct_len_1']).count()

Unnamed: 0,clue,definition,answer,prediction_text,correct_len,correct,correct_len_1
0,"Previously in unison (2,3,4)",Previously/in unison,AT ONE TIME,IN THE MAIL,1,0,1
1,Massachusetts Republican: “Can I drink?” (7),drink,MARTINI,MARTINI,1,1,1
2,"Czar due to travel round holiday region (4,1’4)",holiday region,COTE D,PORT A CAKE,0,0,0
3,Awful fear that is brought by magical creature...,magical creature no longer,FAERIE,ANIMAL,1,0,1
4,Highest rising American with power over Britis...,Highest,T,PRIORUM,0,0,0


## Plot loss vs epochs

In [None]:
# Load the training log written by the trainer into the output directory.
# output_dir and lr live in parameters.py; only the module itself is imported,
# so the names must be qualified.
with open(f'{parameters.output_dir}/trainer_state.json', 'rb') as f:
    tr = json.load(f)

# Seed each series with a point at epoch 0: no loss has been logged yet, and
# the learning rate starts at its configured value.
epoch_list = [0]
loss_list = [None]
learning_rate_list = [parameters.lr]

# Collect the list of each metric.
# Only entries that carry a training 'loss' are used; entries without one
# (such as a closing summary record) are skipped instead of raising KeyError.
for x in tr['log_history']:
    if 'loss' not in x:
        continue
    epoch_list.append(x['epoch'])
    loss_list.append(x['loss'])
    learning_rate_list.append(x['learning_rate'])

In [None]:
# Tabular view of the collected metrics, one row per logged step
df = pd.DataFrame({
    'epoch': epoch_list,
    'loss': loss_list,
    'learning_rate': learning_rate_list,
})

# Single plot with two y-axes: loss on the primary, learning rate on the secondary
fig = make_subplots(specs=[[dict(secondary_y=True)]])

# Build the two traces first, then attach them to the figure
loss_trace = go.Scatter(
    x=epoch_list,
    y=loss_list,
    mode='lines',
    name='loss',
)
learning_rate_trace = go.Scatter(
    x=epoch_list,
    y=learning_rate_list,
    mode='lines+markers',
    name='learning_rate',
)
fig.add_trace(loss_trace)
fig.add_trace(learning_rate_trace, secondary_y=True)

# Axis titles; the final call is kept as the cell's last expression so the
# cell output is unchanged.
fig.update_xaxes(title_text="Epoch")
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="Learning Rate", secondary_y=True)
