# Debugging task prompts
This notebook can be used to debug task prompts.
To do that, we modify the template (the example here uses COPA) by creating a new template class (e.g. `COPAGPT3Template`) in `examples/few_shot/templates.py`, usually by inheriting from the default class (`COPATemplate`). We do not update the existing template in place, since earlier experiments may have been run with it.


In [1]:
%load_ext autoreload
%autoreload 2

import os    
import glob
from pathlib import Path
import datetime

We need to update the parameters below so that they correspond to our newly defined class:

In [2]:
# --- Model and predictor ------------------------------------------------------
# Alternative setting: model_name = "openai_ada", predictor_name = "CLMPromptingOpenaiApi"
model_name = "125M_gpt3_setting"
predictor_name = "clmprompting"

# --- Scoring and calibration --------------------------------------------------
scoring = "sum"
calibrator_name = None  # e.g. "average_option"
calibration_options = []  # e.g. ["sentence1::|sentence2::"]

# --- Task and template under debug --------------------------------------------
# Alternative setting: debug_task = "cb", template_name = "cb_gpt3_reproduce"
debug_task = "xnli"
template_name = "xnli_mt__bg"
languages = ["en"]
train_lang = "de"

# --- Output location ----------------------------------------------------------
# Each run gets its own directory: the name embeds the task, template, model
# and a second-resolution timestamp taken when this cell is executed.
current_user = os.environ.get("USER")
time_stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
results_dir = f"/checkpoint/{current_user}/few_shot/debug_prompts/pred_{debug_task}_{template_name}_{model_name}_{time_stamp}"

# Run experiment 
We run the experiment with small parameter values (one few-shot sample, a handful of evaluation samples, a single trial), since the prompt generation for few-shot learning usually depends on some logic in the predictor class.

In [3]:
from examples.few_shot.gpt3_eval import run_evaluations_from_model_name

# Keyword arguments for a deliberately tiny evaluation run. The groups below are
# added in sequence so the resulting dict keeps a stable key order.
args = {}

# What to evaluate.
args.update({
    "model_name": model_name,
    "tasks": [debug_task],
    f"{debug_task}_template": template_name,
})

# Keep the run small: one few-shot sample, a few eval samples, a single trial.
args.update({
    "nb_few_shot_samples_values": [1],
    "n_eval_samples": 3,  # set 1 for debug
    "num_trials": 1,
    "max_positions": 1024,
    "max_tokens": 1024,
})

# Scoring, calibration and evaluation languages for the debugged task.
args.update({
    "scoring": scoring,
    "calibrator_name": calibrator_name,
    f"{debug_task}_calibration_options": calibration_options,
    f"{debug_task}_languages": languages,
})

# Predictions and results are written to the same per-run directory.
args.update({
    "predictions_dump_dir": results_dir,
    "results_dir": results_dir,
})

# Ask for the extra metadata that the cells further down print.
args.update({
    "add_prompt_to_meta": True,
    "add_positional_scores_to_meta": True,
    "add_prompt_tokens_to_meta": True,
    "add_calib_meta": True,
})

# Prompt formatting, sampling and predictor selection.
# (A field_sep="\n" override is currently left disabled.)
# NOTE(review): "xnli_train_lang" is hard-coded to xnli, unlike the other
# per-task keys which follow `debug_task` -- confirm before debugging other tasks.
args.update({
    "train_sep": "\n",
    "xnli_train_lang": train_lang,
    "uniform_sampling": False,
    "predictor_name": predictor_name,  # "CLMPromptingOpenaiApi",
})

results = run_evaluations_from_model_name(**args)

model_name=125M_gpt3_setting
model_pretraining_valid_info={"epoch": 3, "valid_loss": "3.926", "valid_ppl": "15.2", "valid_wps": "857595", "valid_wpb": "216347", "valid_bsz": "105.7", "valid_num_updates": "572204", "valid_best_loss": "3.926", "time_stamp": "2020-11-04 04:05:02", "log_file": "/large_experiments/xlmg/models/dense/125M/few_shot.roberta+cc100.os.bm_none.tps2048.transformer_lm_gpt.share.adam.b2_0.98.eps1e-08.cl0.0.lr0.005.wu715.dr0.1.atdr0.1.wd0.01.ms4.uf2.mu572204.s1.ngpu32/train.log"}
distributed_training.distributed_port=-1
Loaded model
model_loading_time=4.0 seconds
model_loading_time_cuda=5.6 seconds
Changing max_positions from 2048 to 1024
task=xnli
eval_set=dev
eval language=en
train_set=test
train_lang=de
template=xnli_mt__bg
calibration_options=[]
nb_few_shot_samples=1
expected_max_tgt_len=193, max_positions=1024
Average number of train samples: 1.00
Predicting 3 samples with 9 prompts..
Before running model, bs=1, max_tgt_len=193 mem=0.24GB
Predictions dumped to /c

# Read predictions file
We will read the predictions file, which contains the generated prompts, the token ids and the token scores.

In [4]:
# Show what run_evaluations_from_model_name returned; as the last expression in
# the cell it is rendered as the cell output.
results

[{'model_name': '125M_gpt3_setting',
  'task': 'xnli',
  'language': 'en',
  'template': 'xnli_mt__bg',
  'nb_few_shot_samples': 1,
  'calibration_options': [],
  'calibrator_name': None,
  'train_set': 'test',
  'valid_set': None,
  'eval_set': 'dev',
  'train_lang': 'de',
  'valid_lang': None,
  'ppl_common_prefix': {'scores': [10.73302427927653],
   'mean': 10.73302427927653,
   'std': 0.0,
   'mean_confidence_interval': nan},
  'ppl_selected_candidate': {'scores': [1.0640958944956462],
   'mean': 1.0640958944956462,
   'std': 0.0,
   'mean_confidence_interval': nan},
  'ppl_full_selected_candidate': {'scores': [9.824786504109701],
   'mean': 9.824786504109701,
   'std': 0.0,
   'mean_confidence_interval': nan},
  'ppl_candidates_full_prompt__entailment': {'scores': [10.387644449869791],
   'mean': 10.387644449869791,
   'std': 0.0,
   'mean_confidence_interval': nan},
  'ppl_candidates_full_prompt__contradiction': {'scores': [9.824786504109701],
   'mean': 9.824786504109701,
   'st

In [5]:
# Collect the prediction dumps written into this run's results directory.
# glob returns paths in arbitrary order, so sort them to make both the listing
# and the file picked below deterministic.
predictions_files = sorted(glob.glob(f"{results_dir}/*.jsonl"))

# Fail here with a clear message instead of a NameError on `pred_file` later on.
if not predictions_files:
    raise FileNotFoundError(f"No *.jsonl prediction files found in {results_dir}")

print(*predictions_files, sep="\n")

# We will have more than one file if we run multiple runs; inspect the first one.
pred_file = predictions_files[0]

/checkpoint/tbmihaylov/few_shot/debug_prompts/pred_xnli_xnli_mt__bg_125M_gpt3_setting_2021-08-05-21-19-22/task.xnli_tmp.xnli_mt__bg_train.test.de_val.None.None_eval.dev.en_calib.None_fs1_seed0_predictions.jsonl


In [6]:
from collect_results import read_jsonl_file
import json

In [7]:
# Load the predictions dump referenced by `pred_file`.
# Presumably yields one parsed record per JSONL line (the next code cell indexes
# the result with [0]) -- confirm against collect_results.read_jsonl_file.
predictions = read_jsonl_file(pred_file)

# Prompts
Below we can print the prompts for the different choices. 

In [8]:
# Inspect the first evaluation example only.
prediction = predictions[0]

# Each entry of "candidates" unpacks into (candidate label, candidate info).
for cand, cand_info in prediction["candidates"]:
    cand_meta = cand_info["meta"]

    # The full prompt text stored for this candidate, followed by a separator.
    print(f"### Prompt for cand `{cand}`:")
    print(cand_meta["prompt"])
    print("#" * 10)

    # Calibration prompts are only stored under "calib_metas" when present.
    if "calib_metas" in cand_meta:
        print(f"### Calibrations prompts for cand `{cand}`:")
        for calib_id, calib_meta in enumerate(cand_meta["calib_metas"]):
            print(f"## calib option {calib_id} prompt:", calib_meta["prompt"], sep="\n")

    # One candidate is enough to check the template; stop after the first.
    break

### Prompt for cand `entailment`:
Параграф: The folks at L'academie Internationale des Arts et des Sciences Numeriques have innovated a clever variant on this trick. Въпрос: The people at the school just followed their lead. Вярно, грешно или нито едното? Грешно
Параграф: And he said, Mama, I'm home. Въпрос: He called his mom as soon as the school bus dropped him off. Вярно, грешно или нито едното? Вярно
##########
