In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from google.cloud import storage
from io import StringIO
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import re
import string

# Evaluation

We will first generate results from our fine-tuned T5 model using different numbers of beams.

In [1]:
def get_df_from_gcs_blob(blob, bucket='recipe-data-bucket'):
    """Download a CSV object from Google Cloud Storage and parse it with pandas.

    Args:
        blob: Name of the object inside the bucket.
        bucket: Name of the bucket to read from.

    Returns:
        The DataFrame produced by ``pd.read_csv`` on the downloaded contents.
    """
    client = storage.Client()
    gcs_bucket = client.get_bucket(bucket)

    raw_bytes = gcs_bucket.blob(blob).download_as_string()
    # Transform the downloaded bytes into text and expose it as a file-like
    # object so pandas can read it without touching the local disk.
    csv_text = raw_bytes.decode()
    return pd.read_csv(StringIO(csv_text))

In [32]:
# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still runs (slowly) on a machine without CUDA instead of crashing.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MOD_PATH = './inp_cal_ingred_cal/final'

# Model and tokenizer loaded from the local checkpoint directory.
model = T5ForConditionalGeneration.from_pretrained(MOD_PATH).to(DEVICE)
tokenizer = T5Tokenizer.from_pretrained(MOD_PATH)

# The stock `t5-small` checkpoint, used as the "OOB" comparison.
oob_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(DEVICE)
oob_tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Train / test splits read from the GCS bucket.
train_df = get_df_from_gcs_blob('train_only_cal.csv')
test_df = get_df_from_gcs_blob('test_only_cal.csv')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:
def get_output(inp, num_beams, max_len, model=model, tokenizer=tokenizer):
    """Generate text for a single input string and return it decoded.

    Args:
        inp: Input text to tokenize and feed to the model.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.
        max_len: Forwarded to ``model.generate`` as ``max_length``.
        model: Model used for generation. Defaults to the module-level
            ``model`` that existed when this function was defined.
        tokenizer: Tokenizer used for encoding and decoding. Defaults to the
            module-level ``tokenizer`` that existed when this function was
            defined.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Put the inputs on the device of the model actually being used, so a
    # caller-supplied model on another device does not cause a device mismatch.
    input_ids = tokenizer(inp, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids, max_length=max_len, num_beams=num_beams)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [12]:
# Generation length cap: the longest reference output of each split. It is
# computed once per split here instead of once per row inside the lambda.
# NOTE(review): this is a character count (`len` of the string), while
# get_output forwards it to `generate` as `max_length` -- confirm that a
# character-based cap is what is intended.
train_max_len = train_df['output'].map(len).max()
test_max_len = test_df['output'].map(len).max()

# One generated column per beam width: gen_beams1, gen_beams2, gen_beams3.
for n_beams in (1, 2, 3):
    train_df[f'gen_beams{n_beams}'] = train_df['input'].map(
        lambda x: get_output(x, n_beams, train_max_len)
    )

for n_beams in (1, 2, 3):
    test_df[f'gen_beams{n_beams}'] = test_df['input'].map(
        lambda x: get_output(x, n_beams, test_max_len)
    )

In [38]:
# Generation length cap: the longest reference output of each split. It is
# computed once per split here instead of once per row inside the lambda.
# NOTE(review): this is a character count (`len` of the string), while
# get_output forwards it to `generate` as `max_length` -- confirm that a
# character-based cap is what is intended.
train_max_len = train_df['output'].map(len).max()
test_max_len = test_df['output'].map(len).max()

# Same sweep over beam widths, using the `oob_model` / `oob_tokenizer` pair:
# gen_beams1_oob, gen_beams2_oob, gen_beams3_oob.
for n_beams in (1, 2, 3):
    train_df[f'gen_beams{n_beams}_oob'] = train_df['input'].map(
        lambda x: get_output(x, n_beams, train_max_len, model=oob_model, tokenizer=oob_tokenizer)
    )

for n_beams in (1, 2, 3):
    test_df[f'gen_beams{n_beams}_oob'] = test_df['input'].map(
        lambda x: get_output(x, n_beams, test_max_len, model=oob_model, tokenizer=oob_tokenizer)
    )

Now, let's append this to our results from the `GPT` baseline.

In [43]:
# Fetch the GPT result files for the train and test splits (train first,
# then test) from the bucket.
train_df_gpt_results, test_df_gpt_results = map(
    get_df_from_gcs_blob,
    ('train_only_cal_gpt_results.csv', 'test_only_cal_gpt_results.csv'),
)

In [42]:
# Add the GPT output next to the T5 generations, then rename the ground-truth
# and generated columns so they all share the `output_*` prefix.
train_df = (
    train_df
    .assign(output_gpt=train_df_gpt_results['gpt_out_formatted'])
    .rename(columns={
        'output': 'output_gt',
        'gen_beams1': 'output_t5_b1',
        'gen_beams2': 'output_t5_b2',
        'gen_beams3': 'output_t5_b3',
        'gen_beams1_oob': 'output_t5_oob_b1',
        'gen_beams2_oob': 'output_t5_oob_b2',
        'gen_beams3_oob': 'output_t5_oob_b3',
    })
)

# Serialise the combined frame to CSV in memory (no index column written).
csv_buffer = StringIO()
train_df.to_csv(csv_buffer, index=False)

# upload to GCS
storage_client = storage.Client()
bucket = storage_client.get_bucket('recipe-data-bucket')
blob = bucket.blob('train_only_cal_final_results.csv')
blob.upload_from_string(csv_buffer.getvalue())

train_df.head()

Unnamed: 0,input,output_gt,output_t5_b1,output_t5_b2,output_t5_b3,output_gpt,output_t5_oob_b1,output_t5_oob_b2,output_t5_oob_b3
0,<ingredients>Spanish sherry vinegar (19 calori...,<title>Argentine Red Sauce<title><ingredients>...,ingredients>1 tablespoon spanish sherry vinega...,"Spanish sherry vinegar, virgin olive oil, papr...","Spanish sherry vinegar, virgin olive oil, papr...",<title>Spicy Sherry Vinegar Chicken with Papri...,ingredients>1 tablespoon spanish sherry vinega...,"Spanish sherry vinegar, virgin olive oil, papr...","Spanish sherry vinegar, virgin olive oil, papr..."
1,"<ingredients>mayonnaise (689 calories), basil ...",<title>Basil Aïoli<title><ingredients>3/4 cup ...,"I use 1 tablespoon of basil, 1 tablespoon of l...","1-2 hours mayonnaise, 2 tablespoons basil, 1 t...","6 ounces mayonnaise, 2 tablespoons basil, 1 ta...",<title>Creamy Lemon Basil Pasta with Grilled C...,"I use 1 tablespoon of basil, 1 tablespoon of l...","1-2 hours mayonnaise, 2 tablespoons basil, 1 t...","6 ounces mayonnaise, 2 tablespoons basil, 1 ta..."
2,"<ingredients>vanilla bean (296 calories), appl...",<title>Cider Sauce<title><ingredients>1 vanill...,"vanilla bean, apple, ed apple cider, ted butte...","vanilla bean, apple, ed apple cider, ted butte...","vanilla bean, apple, ed apple cider, ted butte...",<title>Vanilla-Apple Cider Cake with Browned B...,"vanilla bean, apple, ed apple cider, ted butte...","vanilla bean, apple, ed apple cider, ted butte...","vanilla bean, apple, ed apple cider, ted butte..."
3,"<ingredients>olive oil (869 calories), lime ju...",<title>Mojo Sauce<title><ingredients>1 cup oli...,"1 tablespoon olive oil, 1 tablespoon lime juic...","olive oil, lime juice, orange juice, cilantro,...","olive oil, lime juice, orange juice, cilantro,...",<title>Citrus-Marinated Grilled Shrimp with Ci...,"1 tablespoon olive oil, 1 tablespoon lime juic...","olive oil, lime juice, orange juice, cilantro,...","olive oil, lime juice, orange juice, cilantro,..."
4,"<ingredients>brandy (228 calories), crème de c...",<title>Brandy Alexander Ii<title><ingredients>...,"Brandy, crème de cacao, heavy cream, nutmeging...","Brandy, crème de cacao, heavy cream, nutmeging...","Brandy, crème de cacao, heavy cream, nutmeging...",<title>Nutmeg Brandy Cream Delight<title><ingr...,"Brandy, crème de cacao, heavy cream, nutmeging...","Brandy, crème de cacao, heavy cream, nutmeging...","Brandy, crème de cacao, heavy cream, nutmeging..."


In [45]:
# Add the GPT output next to the T5 generations, then rename the ground-truth
# and generated columns so they all share the `output_*` prefix.
test_df = (
    test_df
    .assign(output_gpt=test_df_gpt_results['gpt_out_formatted'])
    .rename(columns={
        'output': 'output_gt',
        'gen_beams1': 'output_t5_b1',
        'gen_beams2': 'output_t5_b2',
        'gen_beams3': 'output_t5_b3',
        'gen_beams1_oob': 'output_t5_oob_b1',
        'gen_beams2_oob': 'output_t5_oob_b2',
        'gen_beams3_oob': 'output_t5_oob_b3',
    })
)

# Serialise the combined frame to CSV in memory (no index column written).
csv_buffer = StringIO()
test_df.to_csv(csv_buffer, index=False)

# upload to GCS
storage_client = storage.Client()
bucket = storage_client.get_bucket('recipe-data-bucket')
blob = bucket.blob('test_only_cal_final_results.csv')
blob.upload_from_string(csv_buffer.getvalue())

test_df.head()

Unnamed: 0,input,output_gt,output_t5_b1,output_t5_b2,output_t5_b3,gen_beams1_oob,gen_beams2_oob,gen_beams3_oob,output_gpt
0,"<ingredients>pecan halves (707 calories), ted ...",<title>Sea Salt-Roasted Pecans<title><ingredie...,"Pecan halves, ted butter, sea saltingredients>...","Pecan halves, ted butter, sea saltingredients>...","Pecan halves, ted butter, sea saltingredients>...","Pecan halves, ted butter, sea saltingredients>...","Pecan halves, ted butter, sea saltingredients>...","Pecan halves, ted butter, sea saltingredients>...",<title>Salted Pecan Butter<title><ingredients>...
1,"<ingredients>olive oil (869 calories), red cab...",<title>Red Cabbage And Onions<title><ingredien...,"olive oil, red cabbage, red onions, thymeingre...","olive oil, red cabbage, red onions, thymeingre...","olive oil, red cabbage, red onions, thymeingre...","olive oil, red cabbage, red onions, thymeingre...","olive oil, red cabbage, red onions, thymeingre...","olive oil, red cabbage, red onions, thymeingre...",<title>Sautéed Red Cabbage with Caramelized Re...
2,"<ingredients>chocolate (540 calories), sugar (...",<title>Chocolate Almond Butter<title><ingredie...,"Chocolate, Sugar, Amaretto, ted butteringredie...","Chocolate, Sugar, Amaretto, ted butteringredie...","Chocolate, Sugar, Amaretto, ted butteringredie...","Chocolate, Sugar, Amaretto, ted butteringredie...","Chocolate, Sugar, Amaretto, ted butteringredie...","Chocolate, Sugar, Amaretto, ted butteringredie...",<title>Chocolate Amaretto Truffles<title><ingr...
3,"<ingredients>milk (51 calories), yellow corn m...",<title>Old-Fashioned Indian Pudding<title><ing...,"1 cup yellow corn meal, 1 cup salt, 2 tablespo...","1 cup yellow corn meal, 1 cup salt, 2 tablespo...","1 cup yellow corn meal, 1 cup salt, 2 tablespo...","1 cup yellow corn meal, 1 cup salt, 2 tablespo...","1 cup yellow corn meal, 1 cup salt, 2 tablespo...","1 cup yellow corn meal, 1 cup salt, 2 tablespo...",<title>Cornmeal Porridge with Molasses<title><...
4,"<ingredients>vodka (235 calories), lime juice ...",<title>Kamikaze<title><ingredients>2 ounces vo...,"vodka, lime juice, triple sec (270 calories)in...","vodka, lime juice, triple sec (270 calories)in...","vodka, lime juice, triple sec (270 calories)in...","vodka, lime juice, triple sec (270 calories)in...","vodka, lime juice, triple sec (270 calories)in...","vodka, lime juice, triple sec (270 calories)in...",<title>Vodka Lime Cooler\n\nServings: 1\nCalor...


Now, let's compute BLEU scores of each model.

In [None]:
def compute_bleu(target, pred):
    """Score one prediction against one reference with ``sentence_bleu``.

    Both strings have the formatting tags and a fixed set of punctuation
    characters removed, then are split on whitespace before scoring.

    Args:
        target: Reference text containing ``<title>``, ``<ingredients>`` and
            ``<directions>`` tags.
        pred: Predicted text. Anything that is not a string (for example a
            missing value) is scored as 0.

    Returns:
        0 when ``pred`` is not a string, otherwise the value returned by
        ``sentence_bleu`` for the single reference.
    """
    # isinstance (not an exact type comparison) so that str subclasses are
    # scored like ordinary strings instead of being silently given 0.
    if not isinstance(pred, str):
        return 0

    punctuation = r"[,.;@#?!&$]+"

    for tag in ('title', 'ingredients', 'directions'):
        # References carry well-formed tags, so a literal removal is enough.
        target = target.replace(f'<{tag}>', '')
        # In predictions either angle bracket is optional, so partial tags
        # such as "ingredients>" are removed too.
        # NOTE(review): this also removes the bare words "title",
        # "ingredients" and "directions" from the prediction, which is not
        # done for the reference -- confirm this asymmetry is acceptable.
        pred = re.sub(rf'[<]?{tag}[>]?', '', pred)

    reference_tokens = re.sub(punctuation, ' ', target).split()
    predicted_tokens = re.sub(punctuation, ' ', pred).split()

    # sentence_bleu takes a list of references; there is exactly one here.
    return sentence_bleu([reference_tokens], predicted_tokens)

In [None]:
# (label, prediction column) for every system scored against `output_gt`.
bleu_runs = [
    ('Fine-Tuned T5 1 Beam', 'output_t5_b1'),
    ('Fine-Tuned T5 2 Beams', 'output_t5_b2'),
    ('Fine-Tuned T5 3 Beams', 'output_t5_b3'),
    ('OOB T5 1 Beam', 'output_t5_oob_b1'),
    ('OOB T5 2 Beams', 'output_t5_oob_b2'),
    ('OOB T5 3 Beams', 'output_t5_oob_b3'),
    ('GPT 3.5 Turbo', 'output_gpt'),
]

# Mean per-row BLEU over the training frame for each system.
for label, pred_col in bleu_runs:
    mean_bleu = train_df.apply(
        lambda x: compute_bleu(x['output_gt'], x[pred_col]), axis=1
    ).mean()
    print(f"BLEU {label}: {mean_bleu}")

Let's compute ROUGE scores.

In [None]:
# Shared scorer reporting ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def compute_rogue(target, pred, scorer=scorer):
    """Return the F-measure of each configured ROUGE type for one prediction.

    Args:
        target: Reference text.
        pred: Predicted text. Anything that is not a string (for example a
            missing value) gets 0 for every ROUGE type.
        scorer: Scorer to use. Defaults to the module-level ``scorer`` that
            existed when this function was defined.

    Returns:
        A dict mapping each ROUGE type name to its F-measure, or to 0 when
        ``pred`` is not a string.
    """
    # isinstance (not an exact type comparison) so that str subclasses are
    # scored like ordinary strings instead of being silently given 0.
    if not isinstance(pred, str):
        return {key: 0 for key in scorer.rouge_types}
    # NOTE(review): the raw strings, including any formatting tags, are passed
    # to the scorer as-is; compute_bleu strips those tags first -- confirm the
    # difference is intended.
    scores = scorer.score(target, pred)
    return {key: score.fmeasure for key, score in scores.items()}


In [None]:
# (label, prediction column) for every system scored against `output_gt`.
rouge_runs = [
    ('Fine-Tuned T5 1 Beam', 'output_t5_b1'),
    ('Fine-Tuned T5 2 Beams', 'output_t5_b2'),
    ('Fine-Tuned T5 3 Beams', 'output_t5_b3'),
    ('OOB T5 1 Beam', 'output_t5_oob_b1'),
    ('OOB T5 2 Beams', 'output_t5_oob_b2'),
    ('OOB T5 3 Beams', 'output_t5_oob_b3'),
    ('GPT 3.5 Turbo', 'output_gpt'),
]

for label, pred_col in rouge_runs:
    # Expand the per-row score dicts into columns, then average each column.
    rouge_summary = train_df.apply(
        lambda x: compute_rogue(x['output_gt'], x[pred_col]),
        axis=1,
        result_type='expand',
    ).mean()
    # `rouge_summary` already holds the column means, so no further `.mean()`
    # is needed. The third metric is `rougeL`, hence the "ROUGE-L" label.
    print(f"ROUGE-1 {label}: {rouge_summary['rouge1']}")
    print(f"ROUGE-2 {label}: {rouge_summary['rouge2']}")
    print(f"ROUGE-L {label}: {rouge_summary['rougeL']}")

Now let's get an idea for how the different models capture the formatting tags (`<title>`, `<ingredients>`, `<directions>`).