In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from google.cloud import storage
from io import StringIO
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import re
import string

# Evaluation

## Generate Outputs

We will first generate results on our trained T5 model using difference numbers of beams.

In [2]:
def get_df_from_gcs_blob(blob, bucket='recipe-data-bucket'):
    # START: COPIED FROM https://github.com/googleapis/python-storage/blob/HEAD/samples/snippets/storage_fileio_write_read.py
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket)

    blob = bucket.blob(blob)
    blob = blob.download_as_string()
    blob = blob.decode()
    blob = StringIO(blob)  #tranform bytes to string here
    df = pd.read_csv(blob)
    return df
    # END: COPIED FROM https://github.com/googleapis/python-storage/blob/HEAD/samples/snippets/storage_fileio_write_read.py

In [3]:
DEVICE = 'cuda'
MOD_PATH = './inp_cal_ingred_cal/final'

model = T5ForConditionalGeneration.from_pretrained(MOD_PATH).to(DEVICE)
tokenizer = T5Tokenizer.from_pretrained(MOD_PATH)

oob_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(DEVICE)
oob_tokenizer = T5Tokenizer.from_pretrained('t5-small')

train_df = get_df_from_gcs_blob('train_only_cal.csv')
test_df = get_df_from_gcs_blob('test_only_cal.csv')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def get_output(inp, num_beams, max_len, model=model, tokenizer=tokenizer):
    input_ids = tokenizer(inp, return_tensors="pt").input_ids.to(DEVICE)
    outputs = model.generate(input_ids, max_length=max_len, num_beams=num_beams)
    out = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return out

In [12]:
train_df['gen_beams1'] = train_df['input'].map(lambda x: get_output(x, 1, train_df['output'].map(len).max()))
train_df['gen_beams2'] = train_df['input'].map(lambda x: get_output(x, 2, train_df['output'].map(len).max()))
train_df['gen_beams3'] = train_df['input'].map(lambda x: get_output(x, 3, train_df['output'].map(len).max()))

test_df['gen_beams1'] = test_df['input'].map(lambda x: get_output(x, 1, test_df['output'].map(len).max()))
test_df['gen_beams2'] = test_df['input'].map(lambda x: get_output(x, 2, test_df['output'].map(len).max()))
test_df['gen_beams3'] = test_df['input'].map(lambda x: get_output(x, 3, test_df['output'].map(len).max()))

In [5]:
train_df['gen_beams1_oob'] = train_df['input'].map(lambda x: get_output(x, 1, train_df['output'].map(len).max(), model=oob_model, tokenizer=oob_tokenizer))
train_df['gen_beams2_oob'] = train_df['input'].map(lambda x: get_output(x, 2, train_df['output'].map(len).max(), model=oob_model, tokenizer=oob_tokenizer))
train_df['gen_beams3_oob'] = train_df['input'].map(lambda x: get_output(x, 3, train_df['output'].map(len).max(), model=oob_model, tokenizer=oob_tokenizer))

test_df['gen_beams1_oob'] = test_df['input'].map(lambda x: get_output(x, 1, test_df['output'].map(len).max(), model=oob_model, tokenizer=oob_tokenizer))
test_df['gen_beams2_oob'] = test_df['input'].map(lambda x: get_output(x, 2, test_df['output'].map(len).max(), model=oob_model, tokenizer=oob_tokenizer))
test_df['gen_beams3_oob'] = test_df['input'].map(lambda x: get_output(x, 3, test_df['output'].map(len).max(), model=oob_model, tokenizer=oob_tokenizer))

Now, let's append this to our results from the `GPT` baseline.

In [7]:
train_df_gpt_results = get_df_from_gcs_blob('train_only_cal_gpt_results.csv')
test_df_gpt_results = get_df_from_gcs_blob('test_only_cal_gpt_results.csv')

In [12]:
train_df['output_gpt'] = train_df_gpt_results['gpt_out_formatted']
train_df = train_df.rename(columns={'output': 'output_gt', 'gen_beams1': 'output_t5_b1', 'gen_beams2': 'output_t5_b2', 'gen_beams3': 'output_t5_b3', 'gen_beams1_oob': 'output_t5_oob_b1', 'gen_beams2_oob': 'output_t5_oob_b2', 'gen_beams3_oob': 'output_t5_oob_b3'})

# upload to GCS
storage_client = storage.Client()
bucket = storage_client.get_bucket('recipe-data-bucket')
blob = bucket.blob('train_only_cal_final_results.csv')
csv_buffer = StringIO()
train_df.to_csv(csv_buffer, index=False)
blob.upload_from_string(csv_buffer.getvalue())

train_df.head()

Unnamed: 0,input,output_gt,output_t5_oob_b1,output_t5_oob_b2,output_t5_oob_b3,output_t5_b1,output_t5_b2,output_t5_b3,output_gpt
0,<ingredients>Spanish sherry vinegar (19 calori...,<title>Argentine Red Sauce<title><ingredients>...,ingredients>calories>2250calories>2250calories...,ingredients>calories>2250calories>2250calories...,ingredients>calories>2250calories>2250calories...,ingredients>1 tablespoon spanish sherry vinega...,"Spanish sherry vinegar, virgin olive oil, papr...","Spanish sherry vinegar, virgin olive oil, papr...",<title>Spicy Sherry Vinegar Chicken with Papri...
1,"<ingredients>mayonnaise (689 calories), basil ...",<title>Basil Aïoli<title><ingredients>3/4 cup ...,"), garlic (144 calories), lemon peel (2 calori...","), basil (24 calories), lemon juice (16 calori...","), basil (24 calories), lemon juice (16 calori...","I use 1 tablespoon of basil, 1 tablespoon of l...","1-2 hours mayonnaise, 2 tablespoons basil, 1 t...","6 ounces mayonnaise, 2 tablespoons basil, 1 ta...",<title>Creamy Lemon Basil Pasta with Grilled C...
2,"<ingredients>vanilla bean (296 calories), appl...",<title>Cider Sauce<title><ingredients>1 vanill...,calories>1360calories>1360calories>1360calorie...,calories>1360calories>1360calories>1360calorie...,"vanilla bean (296 calories), apple (96 calorie...","vanilla bean, apple, ed apple cider, ted butte...","vanilla bean, apple, ed apple cider, ted butte...","vanilla bean, apple, ed apple cider, ted butte...",<title>Vanilla-Apple Cider Cake with Browned B...
3,"<ingredients>olive oil (869 calories), lime ju...",<title>Mojo Sauce<title><ingredients>1 cup oli...,calories>1861calories>1861calories>1861calorie...,"(24 calories), orange juice (46 calories), cil...","ingredients>olive oil (869 calories), lime jui...","1 tablespoon olive oil, 1 tablespoon lime juic...","olive oil, lime juice, orange juice, cilantro,...","olive oil, lime juice, orange juice, cilantro,...",<title>Citrus-Marinated Grilled Shrimp with Ci...
4,"<ingredients>brandy (228 calories), crème de c...",<title>Brandy Alexander Ii<title><ingredients>...,"(406 calories), heavy cream (348 calories), nu...","(406 calories), heavy cream (348 calories), nu...","(406 calories), heavy cream (348 calories), nu...","Brandy, crème de cacao, heavy cream, nutmeging...","Brandy, crème de cacao, heavy cream, nutmeging...","Brandy, crème de cacao, heavy cream, nutmeging...",<title>Nutmeg Brandy Cream Delight<title><ingr...


In [16]:
test_df['output_gpt'] = test_df_gpt_results['gpt_out_formatted']
test_df = test_df.rename(columns={'output': 'output_gt', 'gen_beams1': 'output_t5_b1', 'gen_beams2': 'output_t5_b2', 'gen_beams3': 'output_t5_b3', 'gen_beams1_oob': 'output_t5_oob_b1', 'gen_beams2_oob': 'output_t5_oob_b2', 'gen_beams3_oob': 'output_t5_oob_b3'})

# upload to GCS
storage_client = storage.Client()
bucket = storage_client.get_bucket('recipe-data-bucket')

blob = bucket.blob('test_only_cal_final_results.csv')

csv_buffer = StringIO()
test_df.to_csv(csv_buffer, index=False)
blob.upload_from_string(csv_buffer.getvalue())

test_df.head()

Unnamed: 0,input,output_gt,output_t5_oob_b1,output_t5_oob_b2,output_t5_oob_b3,output_t5_b1,output_t5_b2,output_t5_b3,output_gpt
0,"<ingredients>pecan halves (707 calories), ted ...",<title>Sea Salt-Roasted Pecans<title><ingredie...,"), sea salt (0 calories)ingredients>1428calori...","), sea salt (0 calories)ingredients>1428calori...","), sea salt (0 calories)ingredients>1428calori...","Pecan halves, ted butter, sea saltingredients>...","Pecan halves, ted butter, sea saltingredients>...","Pecan halves, ted butter, sea saltingredients>...",<title>Salted Pecan Butter<title><ingredients>...
1,"<ingredients>olive oil (869 calories), red cab...",<title>Red Cabbage And Onions<title><ingredien...,calories>1286calories>1286calories>1286calorie...,"ingredients>olive oil (869 calories), red cabb...","ingredients>olive oil (869 calories), red cabb...","olive oil, red cabbage, red onions, thymeingre...","olive oil, red cabbage, red onions, thymeingre...","olive oil, red cabbage, red onions, thymeingre...",<title>Sautéed Red Cabbage with Caramelized Re...
2,"<ingredients>chocolate (540 calories), sugar (...",<title>Chocolate Almond Butter<title><ingredie...,"ingredients>chocolate (540 calories), sugar (1...","ingredients>chocolate (540 calories), sugar (1...","ingredients>chocolate (540 calories), sugar (1...","Chocolate, Sugar, Amaretto, ted butteringredie...","Chocolate, Sugar, Amaretto, ted butteringredie...","Chocolate, Sugar, Amaretto, ted butteringredie...",<title>Chocolate Amaretto Truffles<title><ingr...
3,"<ingredients>milk (51 calories), yellow corn m...",<title>Old-Fashioned Indian Pudding<title><ing...,"), salt (0 calories), molasses (287 calories)i...","), salt (0 calories), molasses (287 calories)i...","), salt (0 calories), molasses (287 calories)i...","1 cup yellow corn meal, 1 cup salt, 2 tablespo...","1 cup yellow corn meal, 1 cup salt, 2 tablespo...","1 cup yellow corn meal, 1 cup salt, 2 tablespo...",<title>Cornmeal Porridge with Molasses<title><...
4,"<ingredients>vodka (235 calories), lime juice ...",<title>Kamikaze<title><ingredients>2 ounces vo...,"), triple sec (270 calories)ingredients>calori...","), triple sec (270 calories)ingredients>519cal...","), triple sec (270 calories)ingredients>519cal...","vodka, lime juice, triple sec (270 calories)in...","vodka, lime juice, triple sec (270 calories)in...","vodka, lime juice, triple sec (270 calories)in...",<title>Vodka Lime Cooler\n\nServings: 1\nCalor...


## BLEU

Now, let's compute BLEU scores of each model on train set and test set.

In [17]:
def compute_bleu(target, pred):
    if type(pred) != str: return 0
    target = re.sub('<title>', '', target)
    target = re.sub('<ingredients>', '', target)
    target = re.sub('<directions>', '', target)
    target = re.sub(r"[<>,.;@#?!&$]+", ' ', target)
    target = [target.split()]

    pred = re.sub(r'[<]?title[>]?', '', pred)
    pred = re.sub(r'[<]?ingredients[>]?', '', pred)
    pred = re.sub(r'[<]?directions[>]?', '', pred)
    pred = re.sub(r"[<>,.;@#?!&$]+", ' ', pred)
    pred = pred.split()

    return sentence_bleu(target, pred)

In [29]:
print(f"Train BLEU Fine-Tuned T5 1 Beam: {train_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_b1']), axis=1).mean()}")
print(f"Train BLEU Fine-Tuned T5 2 Beams: {train_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_b2']), axis=1).mean()}")
print(f"Train BLEU Fine-Tuned T5 3 Beams: {train_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_b3']), axis=1).mean()}")

print(f"Train BLEU OOB T5 1 Beam: {train_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_oob_b1']), axis=1).mean()}")
print(f"Train BLEU OOB T5 2 Beams: {train_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_oob_b2']), axis=1).mean()}")
print(f"Train BLEU OOB T5 3 Beams: {train_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_oob_b3']), axis=1).mean()}")

print(f"Train BLEU GPT 3.5 Turbo: {train_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_gpt']), axis=1).mean()}")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Train BLEU Fine-Tuned T5 1 Beam: 0.0377206472850098
Train BLEU Fine-Tuned T5 2 Beams: 0.041059883217307296
Train BLEU Fine-Tuned T5 3 Beams: 0.04025010761502282
Train BLEU OOB T5 1 Beam: 0.0003713024583506109
Train BLEU OOB T5 2 Beams: 0.0004570537716064423
Train BLEU OOB T5 3 Beams: 0.0005293131735996155
Train BLEU GPT 3.5 Turbo: 0.0224524978018088


In [30]:
print(f"Test BLEU Fine-Tuned T5 1 Beam: {test_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_b1']), axis=1).mean()}")
print(f"Test BLEU Fine-Tuned T5 2 Beams: {test_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_b2']), axis=1).mean()}")
print(f"Test BLEU Fine-Tuned T5 3 Beams: {test_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_b3']), axis=1).mean()}")

print(f"Test BLEU OOB T5 1 Beam: {test_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_oob_b1']), axis=1).mean()}")
print(f"Test BLEU OOB T5 2 Beams: {test_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_oob_b2']), axis=1).mean()}")
print(f"Test BLEU OOB T5 3 Beams: {test_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_t5_oob_b3']), axis=1).mean()}")

print(f"Test BLEU GPT 3.5 Turbo: {test_df.apply(lambda x: compute_bleu(x['output_gt'], x['output_gpt']), axis=1).mean()}")

Test BLEU Fine-Tuned T5 1 Beam: 0.039166296968198344
Test BLEU Fine-Tuned T5 2 Beams: 0.04118350854708197
Test BLEU Fine-Tuned T5 3 Beams: 0.042099379393260555
Test BLEU OOB T5 1 Beam: 0.00041829887518731715
Test BLEU OOB T5 2 Beams: 0.0006481759741706717
Test BLEU OOB T5 3 Beams: 0.0006215952939715139
Test BLEU GPT 3.5 Turbo: 0.023625313134378906


## ROUGE

Let's compute ROUGE scores on train set and test set.

In [19]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def compute_rogue(target, pred, scorer=scorer):
    if type(pred) != str: return {key: 0 for key in scorer.rouge_types} 
    scores = scorer.score(target, pred)
    return {key: scores[key].fmeasure for key in scores}


In [31]:
rouge_summary = train_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_b1']), axis=1, result_type='expand').mean()
print(f"Train ROUGE-1 Fine-Tuned T5 1 Beam: {rouge_summary['rouge1'].mean()}")
print(f"Train ROUGE-2 Fine-Tuned T5 1 Beam: {rouge_summary['rouge2'].mean()}")
print(f"Train ROUGE-N Fine-Tuned T5 1 Beam: {rouge_summary['rougeL'].mean()}")

rouge_summary = train_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_b2']), axis=1, result_type='expand').mean()
print(f"Train ROUGE-1 Fine-Tuned T5 2 Beams: {rouge_summary['rouge1'].mean()}")
print(f"Train ROUGE-2 Fine-Tuned T5 2 Beams: {rouge_summary['rouge2'].mean()}")
print(f"Train ROUGE-N Fine-Tuned T5 2 Beams: {rouge_summary['rougeL'].mean()}")

rouge_summary = train_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_b3']), axis=1, result_type='expand').mean()
print(f"Train ROUGE-1 Fine-Tuned T5 3 Beams: {rouge_summary['rouge1'].mean()}")
print(f"Train ROUGE-2 Fine-Tuned T5 3 Beams: {rouge_summary['rouge2'].mean()}")
print(f"Train ROUGE-N Fine-Tuned T5 3 Beams: {rouge_summary['rougeL'].mean()}")

rouge_summary = train_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_oob_b1']), axis=1, result_type='expand').mean()
print(f"Train ROUGE-1 OOB T5 1 Beam: {rouge_summary['rouge1'].mean()}")
print(f"Train ROUGE-2 OOB T5 1 Beam: {rouge_summary['rouge2'].mean()}")
print(f"Train ROUGE-N OOB T5 1 Beam: {rouge_summary['rougeL'].mean()}")

rouge_summary = train_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_oob_b2']), axis=1, result_type='expand').mean()
print(f"Train ROUGE-1 OOB T5 2 Beams: {rouge_summary['rouge1'].mean()}")
print(f"Train ROUGE-2 OOB T5 2 Beams: {rouge_summary['rouge2'].mean()}")
print(f"Train ROUGE-N OOB T5 2 Beams: {rouge_summary['rougeL'].mean()}")

rouge_summary = train_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_oob_b3']), axis=1, result_type='expand').mean()
print(f"Train ROUGE-1 OOB T5 3 Beams: {rouge_summary['rouge1'].mean()}")
print(f"Train ROUGE-2 OOB T5 3 Beams: {rouge_summary['rouge2'].mean()}")
print(f"Train ROUGE-N OOB T5 3 Beams: {rouge_summary['rougeL'].mean()}")


rouge_summary = train_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_gpt']), axis=1, result_type='expand').mean()
print(f"Train ROUGE-1 GPT 3.5 Turbo: {rouge_summary['rouge1'].mean()}")
print(f"Train ROUGE-2 GPT 3.5 Turbo: {rouge_summary['rouge2'].mean()}")
print(f"Train ROUGE-N GPT 3.5 Turbo: {rouge_summary['rougeL'].mean()}")

Train ROUGE-1 Fine-Tuned T5 1 Beam: 0.3828935973634431
Train ROUGE-2 Fine-Tuned T5 1 Beam: 0.1355646813729195
Train ROUGE-N Fine-Tuned T5 1 Beam: 0.29665207741626554
Train ROUGE-1 Fine-Tuned T5 2 Beams: 0.3828813613723136
Train ROUGE-2 Fine-Tuned T5 2 Beams: 0.13648591808749316
Train ROUGE-N Fine-Tuned T5 2 Beams: 0.2975619283984136
Train ROUGE-1 Fine-Tuned T5 3 Beams: 0.37269723081940703
Train ROUGE-2 Fine-Tuned T5 3 Beams: 0.13361982130056063
Train ROUGE-N Fine-Tuned T5 3 Beams: 0.2909881027237875
Train ROUGE-1 OOB T5 1 Beam: 0.13902773564446413
Train ROUGE-2 OOB T5 1 Beam: 0.044143361496976195
Train ROUGE-N OOB T5 1 Beam: 0.13286119135584668
Train ROUGE-1 OOB T5 2 Beams: 0.16700907476867188
Train ROUGE-2 OOB T5 2 Beams: 0.05204422922917151
Train ROUGE-N OOB T5 2 Beams: 0.1589939229655112
Train ROUGE-1 OOB T5 3 Beams: 0.1802496406460773
Train ROUGE-2 OOB T5 3 Beams: 0.055393240622588845
Train ROUGE-N OOB T5 3 Beams: 0.17132130752898742
Train ROUGE-1 GPT 3.5 Turbo: 0.35583426819073904

In [32]:
rouge_summary = test_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_b1']), axis=1, result_type='expand').mean()
print(f"Test ROUGE-1 Fine-Tuned T5 1 Beam: {rouge_summary['rouge1'].mean()}")
print(f"Test ROUGE-2 Fine-Tuned T5 1 Beam: {rouge_summary['rouge2'].mean()}")
print(f"Test ROUGE-N Fine-Tuned T5 1 Beam: {rouge_summary['rougeL'].mean()}")

rouge_summary = test_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_b2']), axis=1, result_type='expand').mean()
print(f"Test ROUGE-1 Fine-Tuned T5 2 Beams: {rouge_summary['rouge1'].mean()}")
print(f"Test ROUGE-2 Fine-Tuned T5 2 Beams: {rouge_summary['rouge2'].mean()}")
print(f"Test ROUGE-N Fine-Tuned T5 2 Beams: {rouge_summary['rougeL'].mean()}")

rouge_summary = test_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_b3']), axis=1, result_type='expand').mean()
print(f"Test ROUGE-1 Fine-Tuned T5 3 Beams: {rouge_summary['rouge1'].mean()}")
print(f"Test ROUGE-2 Fine-Tuned T5 3 Beams: {rouge_summary['rouge2'].mean()}")
print(f"Test ROUGE-N Fine-Tuned T5 3 Beams: {rouge_summary['rougeL'].mean()}")

rouge_summary = test_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_oob_b1']), axis=1, result_type='expand').mean()
print(f"Test ROUGE-1 OOB T5 1 Beam: {rouge_summary['rouge1'].mean()}")
print(f"Test ROUGE-2 OOB T5 1 Beam: {rouge_summary['rouge2'].mean()}")
print(f"Test ROUGE-N OOB T5 1 Beam: {rouge_summary['rougeL'].mean()}")

rouge_summary = test_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_oob_b2']), axis=1, result_type='expand').mean()
print(f"Test ROUGE-1 OOB T5 2 Beams: {rouge_summary['rouge1'].mean()}")
print(f"Test ROUGE-2 OOB T5 2 Beams: {rouge_summary['rouge2'].mean()}")
print(f"Test ROUGE-N OOB T5 2 Beams: {rouge_summary['rougeL'].mean()}")

rouge_summary = test_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_t5_oob_b3']), axis=1, result_type='expand').mean()
print(f"Test ROUGE-1 OOB T5 3 Beams: {rouge_summary['rouge1'].mean()}")
print(f"Test ROUGE-2 OOB T5 3 Beams: {rouge_summary['rouge2'].mean()}")
print(f"Test ROUGE-N OOB T5 3 Beams: {rouge_summary['rougeL'].mean()}")

rouge_summary = test_df.apply(lambda x: compute_rogue(x['output_gt'], x['output_gpt']), axis=1, result_type='expand').mean()
print(f"Test ROUGE-1 GPT 3.5 Turbo: {rouge_summary['rouge1'].mean()}")
print(f"Test ROUGE-2 GPT 3.5 Turbo: {rouge_summary['rouge2'].mean()}")
print(f"Test ROUGE-N GPT 3.5 Turbo: {rouge_summary['rougeL'].mean()}")

Test ROUGE-1 Fine-Tuned T5 1 Beam: 0.38109328892877126
Test ROUGE-2 Fine-Tuned T5 1 Beam: 0.1373375837581973
Test ROUGE-N Fine-Tuned T5 1 Beam: 0.2964796316350115
Test ROUGE-1 Fine-Tuned T5 2 Beams: 0.38419610127429965
Test ROUGE-2 Fine-Tuned T5 2 Beams: 0.13911496231998474
Test ROUGE-N Fine-Tuned T5 2 Beams: 0.3010770935950453
Test ROUGE-1 Fine-Tuned T5 3 Beams: 0.37522020029991804
Test ROUGE-2 Fine-Tuned T5 3 Beams: 0.13689206583040714
Test ROUGE-N Fine-Tuned T5 3 Beams: 0.2941924512420922
Test ROUGE-1 OOB T5 1 Beam: 0.14063006779187417
Test ROUGE-2 OOB T5 1 Beam: 0.04421440654267187
Test ROUGE-N OOB T5 1 Beam: 0.1353385159250876
Test ROUGE-1 OOB T5 2 Beams: 0.1681159564346114
Test ROUGE-2 OOB T5 2 Beams: 0.05226845251931622
Test ROUGE-N OOB T5 2 Beams: 0.16085107172239202
Test ROUGE-1 OOB T5 3 Beams: 0.18153840293691165
Test ROUGE-2 OOB T5 3 Beams: 0.055981415742911364
Test ROUGE-N OOB T5 3 Beams: 0.1728848525490866
Test ROUGE-1 GPT 3.5 Turbo: 0.3516726324297726
Test ROUGE-2 GPT 3.5

## Recipe Component Tags (Title, Ingredients, Directions)

Now let's get an idea for how the different models capture the formatting tags (`<title>`, `<ingredients>`, `<directions>`). Note that we don't need to evaluate this on the GPT model since GPT didn't learn this - we manually appended these tags to the output to make the format consistent.

In [38]:
print(f"Prop of Train Examples with <title> tags in Fine-Tuned T5 1 Beam Output: {train_df['output_t5_b1'].str.contains(r'<title>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <title> tags in Fine-Tuned T5 2 Beam Output: {train_df['output_t5_b1'].str.contains(r'<title>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <title> tags in Fine-Tuned T5 3 Beam Output: {train_df['output_t5_b1'].str.contains(r'<title>').sum() / train_df.shape[0]}")

print(f"Prop of Train Examples with <title> tags in OOB T5 1 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<title>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <title> tags in OOB T5 2 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<title>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <title> tags in OOB T5 3 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<title>').sum() / train_df.shape[0]}")

print()

print(f"Prop of Test Examples with <title> tags in Fine-Tuned T5 1 Beam Output: {test_df['output_t5_b1'].str.contains(r'<title>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <title> tags in Fine-Tuned T5 2 Beam Output: {test_df['output_t5_b1'].str.contains(r'<title>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <title> tags in Fine-Tuned T5 3 Beam Output: {test_df['output_t5_b1'].str.contains(r'<title>').sum() / test_df.shape[0]}")

print(f"Prop of Test Examples with <title> tags in OOB T5 1 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<title>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <title> tags in OOB T5 2 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<title>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <title> tags in OOB T5 3 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<title>').sum() / test_df.shape[0]}")

Prop of Train Examples with <title> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Train Examples with <title> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Train Examples with <title> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Train Examples with <title> tags in OOB T5 1 Beam Output: 0.0
Prop of Train Examples with <title> tags in OOB T5 2 Beam Output: 0.0
Prop of Train Examples with <title> tags in OOB T5 3 Beam Output: 0.0

Prop of Test Examples with <title> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Test Examples with <title> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Test Examples with <title> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Test Examples with <title> tags in OOB T5 1 Beam Output: 0.0
Prop of Test Examples with <title> tags in OOB T5 2 Beam Output: 0.0
Prop of Test Examples with <title> tags in OOB T5 3 Beam Output: 0.0


In [39]:
print(f"Prop of Train Examples with partial <title> tags in Fine-Tuned T5 1 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?title[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <title> tags in Fine-Tuned T5 2 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?title[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <title> tags in Fine-Tuned T5 3 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?title[>]?').sum() / train_df.shape[0]}")

print(f"Prop of Train Examples with partial <title> tags in OOB T5 1 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?title[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <title> tags in OOB T5 2 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?title[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <title> tags in OOB T5 3 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?title[>]?').sum() / train_df.shape[0]}")

print()

print(f"Prop of Test Examples with partial <title> tags in Fine-Tuned T5 1 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?title[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <title> tags in Fine-Tuned T5 2 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?title[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <title> tags in Fine-Tuned T5 3 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?title[>]?').sum() / test_df.shape[0]}")

print(f"Prop of Test Examples with partial <title> tags in OOB T5 1 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?title[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <title> tags in OOB T5 2 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?title[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <title> tags in OOB T5 3 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?title[>]?').sum() / test_df.shape[0]}")

Prop of Train Examples with partial <title> tags in Fine-Tuned T5 1 Beam Output: 0.05103280680437424
Prop of Train Examples with partial <title> tags in Fine-Tuned T5 2 Beam Output: 0.05103280680437424
Prop of Train Examples with partial <title> tags in Fine-Tuned T5 3 Beam Output: 0.05103280680437424
Prop of Train Examples with partial <title> tags in OOB T5 1 Beam Output: 0.0
Prop of Train Examples with partial <title> tags in OOB T5 2 Beam Output: 0.0
Prop of Train Examples with partial <title> tags in OOB T5 3 Beam Output: 0.0

Prop of Test Examples with partial <title> tags in Fine-Tuned T5 1 Beam Output: 0.05928085519922255
Prop of Test Examples with partial <title> tags in Fine-Tuned T5 2 Beam Output: 0.05928085519922255
Prop of Test Examples with partial <title> tags in Fine-Tuned T5 3 Beam Output: 0.05928085519922255
Prop of Test Examples with partial <title> tags in OOB T5 1 Beam Output: 0.0
Prop of Test Examples with partial <title> tags in OOB T5 2 Beam Output: 0.0
Prop of 

Seems like the model was able to generate a few title tags partially. Let's look at some of these examples.

In [41]:
with pd.option_context('display.max_colwidth', None):
    display(train_df.loc[train_df['output_t5_b1'].str.contains(r'[<]?title[>]?'), 'output_t5_b1'].to_frame())

Unnamed: 0,output_t5_b1
12,"2 tablespoons jarlic, 1 tablespoon salt, 1 tablespoon ted butter, 1 tablespoon leaf parsley, 1 tablespoon thymetitle>ingredients>directions>1. Combine jarlic, salt, ted butter, parsley, and thyme in small bowl. Season with salt and pepper. Season with salt and pepper.directions>"
38,"Baby Red Potatoestitle>ingredients>1 1/2 cups baby red potatoes, 2 cups carrots, 3 ounces broccoli florets, 2 1/2 cups red bell peppersingredients>directions>1. Combine potatoes and carrots in small bowl. Season with salt and pepper. Season with salt and pepper.directions>"
76,"Castyl Alcoholtitle>ingredients>1 cup bottle gible, 1 cup chicken broth, 1 cup water, 1 cup celery rib, 1 cup carrotingredients>directions>1. Bring bottle gible, chicken broth, 1 cup celery rib to simmer, 1 cup carrot. Cover and chill until rib is tender, about 2 minutes. Season with salt and pepper.directions>"
78,"3 ounces roma tomatoes, 3 ounces calamata olives, 3 ounces red onion, 3 ounces caperstitle>ingredients>directions>1. Place a large pizza pan in a small bowl and serve hot sauce. Season with salt and pepper.directions>"
119,"3 tablespoons extra-virgin olive oil, 1 tablespoon Champagne vinegar, 1 tablespoon shallot, 1 tablespoon Dijon mustard, 1 tablespoon butter lettuce, 2 tablespoons radishestitle>ingredients>directions>1. In a large skillet, cook olive oil in a large skillet over medium heat until the sun is moistened. Season with salt and pepper. Season with salt and pepper., 2. Place butter lettuce in a large bowl and serve hot.directions>"
...,...
4072,"Child Pork Cheesetitle>ingredients>2 cheddar cheese, 1 ounce ruby Port, 1 ounce ted butter, 1 tablespoon Dijon mustard, 1 tablespoon black pepperingredients>directions>1. Place cheese in a small bowl and serve hot sauce. Season with salt and pepper.directions>"
4090,", 1 tablespoon extra-virgin olive oil, 1 tablespoon garlic, 1 tablespoon vine tomato, 1/2 teaspoon paprikatitle>ingredients>Italian bread, 2 tablespoons extra-virgin olive oil, 1 tablespoon garlic, 1 tablespoon vine tomato, 1/2 teaspoon paprikaingredients>directions>1. Preheat oven to 350°F., 2. Preheat oven to 350°F., 2. Place bread in oven and cook in a large oven-safe pan"
4092,"Rhubarb Sturgeonstitle>ingredients>1 1/2 cups rhubarb stalks, 1 1/2 cups water, 1 1/2 cups sugar, 1 1/2 cups gingeringredients>directions>1. Bring stalks and ginger to a boil in a small saucepan over medium heat, stirring occasionally, until rhubarb stalks are dissolved, stirring occasionally, about 2 minutes. Bring to a boil, stirring occasionally, about 2 minutes. Season with salt and pepper to taste"
4097,"1 ounce lobster, 2 shrimp, 1 cup mayonnaise, 1 cup lemon juice, 2 tablespoons dill, 1 ounce chivestitle>ingredients>directions>1. Combine lobster, shrimp, mayonnaise, lemon juice, dill, chives and chives in a 12-ounce glass container. Pour lemon juice into a 12-ounce glass container. Transfer chives to a 12-ounce glass container. Transfer to a 12-ounce glass container"


In [42]:
print(f"Prop of Train Examples with <ingredients> tags in Fine-Tuned T5 1 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <ingredients> tags in Fine-Tuned T5 2 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <ingredients> tags in Fine-Tuned T5 3 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")

print(f"Prop of Train Examples with <ingredients> tags in OOB T5 1 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <ingredients> tags in OOB T5 2 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <ingredients> tags in OOB T5 3 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")

print()

print(f"Prop of Test Examples with <ingredients> tags in Fine-Tuned T5 1 Beam Output: {test_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <ingredients> tags in Fine-Tuned T5 2 Beam Output: {test_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <ingredients> tags in Fine-Tuned T5 3 Beam Output: {test_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / test_df.shape[0]}")

print(f"Prop of Test Examples with <ingredients> tags in OOB T5 1 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <ingredients> tags in OOB T5 2 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <ingredients> tags in OOB T5 3 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / test_df.shape[0]}")

Prop of Train Examples with <ingredients> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Train Examples with <ingredients> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Train Examples with <ingredients> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Train Examples with <ingredients> tags in OOB T5 1 Beam Output: 0.0
Prop of Train Examples with <ingredients> tags in OOB T5 2 Beam Output: 0.0
Prop of Train Examples with <ingredients> tags in OOB T5 3 Beam Output: 0.0

Prop of Test Examples with <ingredients> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Test Examples with <ingredients> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Test Examples with <ingredients> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Test Examples with <ingredients> tags in OOB T5 1 Beam Output: 0.0
Prop of Test Examples with <ingredients> tags in OOB T5 2 Beam Output: 0.0
Prop of Test Examples with <ingredients> tags in OOB T5 3 Beam Output: 0.0


In [44]:
print(f"Prop of Train Examples with <ingredient(s)> tags in Fine-Tuned T5 1 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredient[s]?>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <ingredient(s)> tags in Fine-Tuned T5 2 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredient[s]?>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <ingredient(s)> tags in Fine-Tuned T5 3 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredient[s]?>').sum() / train_df.shape[0]}")

print(f"Prop of Train Examples with <ingredient(s)> tags in OOB T5 1 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredient[s]?>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <ingredient(s)> tags in OOB T5 2 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredient[s]?>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <ingredient(s)> tags in OOB T5 3 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredient[s]?>').sum() / train_df.shape[0]}")

print()

print(f"Prop of Test Examples with <ingredient(s)> tags in Fine-Tuned T5 1 Beam Output: {test_df['output_t5_b1'].str.contains(r'<ingredient[s]?>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <ingredient(s)> tags in Fine-Tuned T5 2 Beam Output: {test_df['output_t5_b1'].str.contains(r'<ingredient[s]?>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <ingredient(s)> tags in Fine-Tuned T5 3 Beam Output: {test_df['output_t5_b1'].str.contains(r'<ingredient[s]?>').sum() / test_df.shape[0]}")

print(f"Prop of Test Examples with <ingredient(s)> tags in OOB T5 1 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<ingredient[s]?>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <ingredient(s)> tags in OOB T5 2 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<ingredient[s]?>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <ingredient(s)> tags in OOB T5 3 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<ingredient[s]?>').sum() / test_df.shape[0]}")

Prop of Train Examples with <ingredient(s)> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Train Examples with <ingredient(s)> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Train Examples with <ingredient(s)> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Train Examples with <ingredient(s)> tags in OOB T5 1 Beam Output: 0.0
Prop of Train Examples with <ingredient(s)> tags in OOB T5 2 Beam Output: 0.0
Prop of Train Examples with <ingredient(s)> tags in OOB T5 3 Beam Output: 0.0

Prop of Test Examples with <ingredient(s)> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Test Examples with <ingredient(s)> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Test Examples with <ingredient(s)> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Test Examples with <ingredient(s)> tags in OOB T5 1 Beam Output: 0.0
Prop of Test Examples with <ingredient(s)> tags in OOB T5 2 Beam Output: 0.0
Prop of Test Examples with <ingredient(s)> tags in OOB T5 3 Beam Output: 0.0


In [43]:
print(f"Prop of Train Examples with partial <ingredient(s)> tags in Fine-Tuned T5 1 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <ingredient(s)> tags in Fine-Tuned T5 2 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <ingredient(s)> tags in Fine-Tuned T5 3 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / train_df.shape[0]}")

print(f"Prop of Train Examples with partial <ingredient(s)> tags in OOB T5 1 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <ingredient(s)> tags in OOB T5 2 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <ingredient(s)> tags in OOB T5 3 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / train_df.shape[0]}")

print()

print(f"Prop of Test Examples with partial <ingredient(s)> tags in Fine-Tuned T5 1 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <ingredient(s)> tags in Fine-Tuned T5 2 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <ingredient(s)> tags in Fine-Tuned T5 3 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / test_df.shape[0]}")

print(f"Prop of Test Examples with partial <ingredient(s)> tags in OOB T5 1 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <ingredient(s)> tags in OOB T5 2 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <ingredient(s)> tags in OOB T5 3 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?ingredient[s]?[>]?').sum() / test_df.shape[0]}")

Prop of Train Examples with partial <ingredient(s)> tags in Fine-Tuned T5 1 Beam Output: 0.9844471445929526
Prop of Train Examples with partial <ingredient(s)> tags in Fine-Tuned T5 2 Beam Output: 0.9844471445929526
Prop of Train Examples with partial <ingredient(s)> tags in Fine-Tuned T5 3 Beam Output: 0.9844471445929526
Prop of Train Examples with partial <ingredient(s)> tags in OOB T5 1 Beam Output: 0.8226002430133658
Prop of Train Examples with partial <ingredient(s)> tags in OOB T5 2 Beam Output: 0.8226002430133658
Prop of Train Examples with partial <ingredient(s)> tags in OOB T5 3 Beam Output: 0.8226002430133658

Prop of Test Examples with partial <ingredient(s)> tags in Fine-Tuned T5 1 Beam Output: 0.9795918367346939
Prop of Test Examples with partial <ingredient(s)> tags in Fine-Tuned T5 2 Beam Output: 0.9795918367346939
Prop of Test Examples with partial <ingredient(s)> tags in Fine-Tuned T5 3 Beam Output: 0.9795918367346939
Prop of Test Examples with partial <ingredient(s)> 

Once again, the models are able to generate ingredient tags partially but not fully.

In [45]:
print(f"Prop of Train Examples with <directions> tags in Fine-Tuned T5 1 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <directions> tags in Fine-Tuned T5 2 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <directions> tags in Fine-Tuned T5 3 Beam Output: {train_df['output_t5_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")

print(f"Prop of Train Examples with <directions> tags in OOB T5 1 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <directions> tags in OOB T5 2 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <directions> tags in OOB T5 3 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<ingredients>').sum() / train_df.shape[0]}")

print()

print(f"Prop of Test Examples with <directions> tags in Fine-Tuned T5 1 Beam Output: {test_df['output_t5_b1'].str.contains(r'<directions>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <directions> tags in Fine-Tuned T5 2 Beam Output: {test_df['output_t5_b1'].str.contains(r'<directions>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <directions> tags in Fine-Tuned T5 3 Beam Output: {test_df['output_t5_b1'].str.contains(r'<directions>').sum() / test_df.shape[0]}")

print(f"Prop of Test Examples with <directions> tags in OOB T5 1 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<directions>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <directions> tags in OOB T5 2 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<directions>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <directions> tags in OOB T5 3 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<directions>').sum() / test_df.shape[0]}")

Prop of Train Examples with <directions> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Train Examples with <directions> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Train Examples with <directions> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Train Examples with <directions> tags in OOB T5 1 Beam Output: 0.0
Prop of Train Examples with <directions> tags in OOB T5 2 Beam Output: 0.0
Prop of Train Examples with <directions> tags in OOB T5 3 Beam Output: 0.0

Prop of Test Examples with <directions> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Test Examples with <directions> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Test Examples with <directions> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Test Examples with <directions> tags in OOB T5 1 Beam Output: 0.0
Prop of Test Examples with <directions> tags in OOB T5 2 Beam Output: 0.0
Prop of Test Examples with <directions> tags in OOB T5 3 Beam Output: 0.0


In [46]:
print(f"Prop of Train Examples with <direction(s)> tags in Fine-Tuned T5 1 Beam Output: {train_df['output_t5_b1'].str.contains(r'<direction[s]?>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <direction(s)> tags in Fine-Tuned T5 2 Beam Output: {train_df['output_t5_b1'].str.contains(r'<direction[s]?>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <direction(s)> tags in Fine-Tuned T5 3 Beam Output: {train_df['output_t5_b1'].str.contains(r'<direction[s]?>').sum() / train_df.shape[0]}")

print(f"Prop of Train Examples with <direction(s)> tags in OOB T5 1 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<direction[s]?>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <direction(s)> tags in OOB T5 2 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<direction[s]?>').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with <direction(s)> tags in OOB T5 3 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'<direction[s]?>').sum() / train_df.shape[0]}")

print()

print(f"Prop of Test Examples with <direction(s)> tags in Fine-Tuned T5 1 Beam Output: {test_df['output_t5_b1'].str.contains(r'<direction[s]?>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <direction(s)> tags in Fine-Tuned T5 2 Beam Output: {test_df['output_t5_b1'].str.contains(r'<direction[s]?>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <direction(s)> tags in Fine-Tuned T5 3 Beam Output: {test_df['output_t5_b1'].str.contains(r'<direction[s]?>').sum() / test_df.shape[0]}")

print(f"Prop of Test Examples with <direction(s)> tags in OOB T5 1 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<direction[s]?>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <direction(s)> tags in OOB T5 2 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<direction[s]?>').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with <direction(s)> tags in OOB T5 3 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'<direction[s]?>').sum() / test_df.shape[0]}")

Prop of Train Examples with <direction(s)> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Train Examples with <direction(s)> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Train Examples with <direction(s)> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Train Examples with <direction(s)> tags in OOB T5 1 Beam Output: 0.0
Prop of Train Examples with <direction(s)> tags in OOB T5 2 Beam Output: 0.0
Prop of Train Examples with <direction(s)> tags in OOB T5 3 Beam Output: 0.0

Prop of Test Examples with <direction(s)> tags in Fine-Tuned T5 1 Beam Output: 0.0
Prop of Test Examples with <direction(s)> tags in Fine-Tuned T5 2 Beam Output: 0.0
Prop of Test Examples with <direction(s)> tags in Fine-Tuned T5 3 Beam Output: 0.0
Prop of Test Examples with <direction(s)> tags in OOB T5 1 Beam Output: 0.0
Prop of Test Examples with <direction(s)> tags in OOB T5 2 Beam Output: 0.0
Prop of Test Examples with <direction(s)> tags in OOB T5 3 Beam Output: 0.0


In [47]:
print(f"Prop of Train Examples with partial <direction(s)> tags in Fine-Tuned T5 1 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <direction(s)> tags in Fine-Tuned T5 2 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <direction(s)> tags in Fine-Tuned T5 3 Beam Output: {train_df['output_t5_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / train_df.shape[0]}")

print(f"Prop of Train Examples with partial <direction(s)> tags in OOB T5 1 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <direction(s)> tags in OOB T5 2 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / train_df.shape[0]}")
print(f"Prop of Train Examples with partial <direction(s)> tags in OOB T5 3 Beam Output: {train_df['output_t5_oob_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / train_df.shape[0]}")

print()

print(f"Prop of Test Examples with partial <direction(s)> tags in Fine-Tuned T5 1 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <direction(s)> tags in Fine-Tuned T5 2 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <direction(s)> tags in Fine-Tuned T5 3 Beam Output: {test_df['output_t5_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / test_df.shape[0]}")

print(f"Prop of Test Examples with partial <direction(s)> tags in OOB T5 1 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <direction(s)> tags in OOB T5 2 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / test_df.shape[0]}")
print(f"Prop of Test Examples with partial <direction(s)> tags in OOB T5 3 Beam Output: {test_df['output_t5_oob_b1'].str.contains(r'[<]?direction[s]?[>]?').sum() / test_df.shape[0]}")

Prop of Train Examples with partial <direction(s)> tags in Fine-Tuned T5 1 Beam Output: 0.9924665856622115
Prop of Train Examples with partial <direction(s)> tags in Fine-Tuned T5 2 Beam Output: 0.9924665856622115
Prop of Train Examples with partial <direction(s)> tags in Fine-Tuned T5 3 Beam Output: 0.9924665856622115
Prop of Train Examples with partial <direction(s)> tags in OOB T5 1 Beam Output: 0.0
Prop of Train Examples with partial <direction(s)> tags in OOB T5 2 Beam Output: 0.0
Prop of Train Examples with partial <direction(s)> tags in OOB T5 3 Beam Output: 0.0

Prop of Test Examples with partial <direction(s)> tags in Fine-Tuned T5 1 Beam Output: 0.9883381924198251
Prop of Test Examples with partial <direction(s)> tags in Fine-Tuned T5 2 Beam Output: 0.9883381924198251
Prop of Test Examples with partial <direction(s)> tags in Fine-Tuned T5 3 Beam Output: 0.9883381924198251
Prop of Test Examples with partial <direction(s)> tags in OOB T5 1 Beam Output: 0.0
Prop of Test Examples

## Ingredient Overlap

Now, let's find the overlap ratio of ingredient names between inputs and outputs.

In [72]:
def parse_ingred_inp(inp, fully_tokenize=False):
    inp = inp.split('<ingredients>')
    ingreds = inp[1].split(', ')
    ingreds = [i.split(' (')[0].lower() for i in ingreds]
    if fully_tokenize:
        return sum([i.split() for i in ingreds], [])
    return ingreds

def compute_ingred_overlap_ratio(inp, out, fully_tokenize_inp_ingreds=False):
    if type(out) != str: return 0
    ingreds = parse_ingred_inp(inp, fully_tokenize=fully_tokenize_inp_ingreds)
    ingreds_set = set(ingreds)

    out_cleaned = re.sub(r"[,.;@#?!&$<>]+\ *", " ", out).lower()
    out_cleaned_token_set = set(out_cleaned.split())

    return len(ingreds_set.intersection(out_cleaned_token_set)) / len(ingreds_set)

In [73]:
print(f"Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 1 Beam Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_b1'], fully_tokenize_inp_ingreds=False), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 2 Beams Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_b2'], fully_tokenize_inp_ingreds=False), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 3 Beams Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_b3'], fully_tokenize_inp_ingreds=False), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for OOB T5 1 Beam Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_oob_b1'], fully_tokenize_inp_ingreds=False), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for OOB T5 2 Beams Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_oob_b2'], fully_tokenize_inp_ingreds=False), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for OOB T5 3 Beams Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_oob_b3'], fully_tokenize_inp_ingreds=False), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for GPT 3.5 Turbo Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_gpt'], fully_tokenize_inp_ingreds=False), axis=1).mean()}")


Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 1 Beam Output: 0.37889428918590523
Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 2 Beams Output: 0.3844471445929526
Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 3 Beams Output: 0.3840745240988254
Train Avg. Ingredient Overlap Ratio for OOB T5 1 Beam Output: 0.3024746861077359
Train Avg. Ingredient Overlap Ratio for OOB T5 2 Beams Output: 0.36879303361684895
Train Avg. Ingredient Overlap Ratio for OOB T5 3 Beams Output: 0.3954718509518024
Train Avg. Ingredient Overlap Ratio for GPT 3.5 Turbo Output: 0.44374645605508306


Some input ingredients are very specific, for example "baby red potatoes: vs. "red potatoes" vs. "potatoes". In this case, it may be better to full tokenize the ingredient names and then compute overlap between the reference ingredients and generated ingredients.

In [74]:
print(f"Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 1 Beam Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_b1'], fully_tokenize_inp_ingreds=True), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 2 Beams Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_b2'], fully_tokenize_inp_ingreds=True), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 3 Beams Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_b3'], fully_tokenize_inp_ingreds=True), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for OOB T5 1 Beam Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_oob_b1'], fully_tokenize_inp_ingreds=True), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for OOB T5 2 Beams Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_oob_b2'], fully_tokenize_inp_ingreds=True), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for OOB T5 3 Beams Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_t5_oob_b3'], fully_tokenize_inp_ingreds=True), axis=1).mean()}")
print(f"Train Avg. Ingredient Overlap Ratio for GPT 3.5 Turbo Output: {train_df.apply(lambda x: compute_ingred_overlap_ratio(x['input'], x['output_gpt'], fully_tokenize_inp_ingreds=True), axis=1).mean()}")


Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 1 Beam Output: 0.8792292151617791
Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 2 Beams Output: 0.8930176152320747
Train Avg. Ingredient Overlap Ratio for Fine-Tuned T5 3 Beams Output: 0.8950335680924989
Train Avg. Ingredient Overlap Ratio for OOB T5 1 Beam Output: 0.6993107823727508
Train Avg. Ingredient Overlap Ratio for OOB T5 2 Beams Output: 0.8338215793926607
Train Avg. Ingredient Overlap Ratio for OOB T5 3 Beams Output: 0.8889839821310052
Train Avg. Ingredient Overlap Ratio for GPT 3.5 Turbo Output: 0.9534145991041496
