diff --git a/promptsource/seqio_tasks/__init__.py b/promptsource/seqio_tasks/__init__.py deleted file mode 100644 index f3ba72430..000000000 --- a/promptsource/seqio_tasks/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Tools for loading prompted tasks in seqio.""" - -from . import tasks, utils diff --git a/promptsource/seqio_tasks/dataset_subset_template.csv b/promptsource/seqio_tasks/dataset_subset_template.csv deleted file mode 100644 index 0358d5202..000000000 --- a/promptsource/seqio_tasks/dataset_subset_template.csv +++ /dev/null @@ -1,445 +0,0 @@ -comment,do_eval,skip_train,dataset_subset_template,nontrivial_choices_given,nontrivial_choices_hidden,trivial_choices_given,trivial_choices_hidden,generative_non_true_task,generative_non_true_implausible,generative_true_task,negated_answers,counting,non_true_task_other,awkward_phrasing,ungrammatical,template_bug,long_distance,no_sep_2_sentences,verbose,answer_span_indices,non_natural_language -,,,adversarial_qa_dbert_adversarial_qa_dbert_1,,,,,,,,,,,,,,,,,, -,,,adversarial_qa_dbert_adversarial_qa_dbert_10,,,,,,,,,,,,,,,,,True,True -,,,adversarial_qa_dbert_adversarial_qa_dbert_2,,,,,,,,,,,,,,True,,,, -,,,adversarial_qa_dbert_adversarial_qa_dbert_3,,,,,,,,,,,,,,,,,, -,,,adversarial_qa_dbert_adversarial_qa_dbert_4,,,,,True,,,,,,,,,,,,, -,,,adversarial_qa_dbert_adversarial_qa_dbert_5,,,,,True,,,,,,,,,,,,, -,,,adversarial_qa_dbert_adversarial_qa_dbert_6,,,,,,,,,,,,,,,,True,, -,,,adversarial_qa_dbert_adversarial_qa_dbert_7,,,,,,,,,,,,,,,,,True, -,,,adversarial_qa_dbert_adversarial_qa_dbert_8,,,,,,,,,,,,,,,,,True, -,,,adversarial_qa_dbert_adversarial_qa_dbert_9,,,,,,,,,,,,,,,,,True, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_1,,,,,,,,,,,,,,,,,, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_10,,,,,,,,,,,,,,,,,True,True -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_2,,,,,,,,,,,,,,True,,,, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_3,,,,,,,,,,,,,,,,,, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_4,,,,,True,,,,,,,,,,,,, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_5,,,,,True,,,,,,,,,,,,, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_6,,,,,,,,,,,,,,,,True,, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_7,,,,,,,,,,,,,,,,,True, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_8,,,,,,,,,,,,,,,,,True, -,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_9,,,,,,,,,,,,,,,,,True, -,,,adversarial_qa_droberta_adversarial_qa_droberta_1,,,,,,,,,,,,,,,,,, -,,,adversarial_qa_droberta_adversarial_qa_droberta_10,,,,,,,,,,,,,,,,,True,True -,,,adversarial_qa_droberta_adversarial_qa_droberta_2,,,,,,,,,,,,,,True,,,, -,,,adversarial_qa_droberta_adversarial_qa_droberta_3,,,,,,,,,,,,,,,,,, -,,,adversarial_qa_droberta_adversarial_qa_droberta_4,,,,,True,,,,,,,,,,,,, -,,,adversarial_qa_droberta_adversarial_qa_droberta_5,,,,,True,,,,,,,,,,,,, -,,,adversarial_qa_droberta_adversarial_qa_droberta_6,,,,,,,,,,,,,,,,True,, -,,,adversarial_qa_droberta_adversarial_qa_droberta_7,,,,,,,,,,,,,,,,,True, -,,,adversarial_qa_droberta_adversarial_qa_droberta_8,,,,,,,,,,,,,,,,,True, -,,,adversarial_qa_droberta_adversarial_qa_droberta_9,,,,,,,,,,,,,,,,,True, -,,,ag_news_classify,,True,,,,,,,,,,,,,,,, -,,,ag_news_classify_with_choices,True,,,,,,,,,,,,,,,,, -,,,ag_news_recommend,True,,,,,,,,,,,,,,,,, -,,,ag_news_which_section,,True,,,,,,,,,,,,,,,, -,,,ag_news_which_section_choices,True,,,,,,,,,,,,,,,,, -,,,amazon_polarity_Template_1,,,True,,,,,,,,,,,,,,, -,,,amazon_polarity_Template_2,,,,True,,,,,,,,,,True,,,, -,,,amazon_polarity_Template_3,,,,True,,,,,,,,,,,,,, -,,,amazon_polarity_Template_4,,,,True,,,,,,,,,,True,,,, -,,,amazon_polarity_Template_5,,,True,,,,,,,,,,,,,,, -,,,amazon_polarity_Template_6,,,True,,,,,,,,,,,True,,,, -,True,True,anli_GPT_3_style_r1,True,,,,,,,,,,,,,,,,, -,True,True,anli_based_on_the_previous_passage_r1,True,,,,,,,,,,,,,,,,, -,True,True,anli_does_S1_contradict_S2__r1,,,,,,,,True,,True,,,,,,,, -,True,True,anli_does_S1_entail_S2__r1,True,,,,,,,,,,,,,,,,, -,True,True,anli_given_does_it_follow_that__r1,True,,,,,,,,,,,,,,,,, -,True,True,anli_given_it_must_be_true_that__r1,True,,,,,,,,,,,,,,,,, -,True,True,anli_GPT_3_style_r2,True,,,,,,,,,,,,,,,,, -,True,True,anli_based_on_the_previous_passage_r2,True,,,,,,,,,,,,,,,,, -,True,True,anli_does_S1_contradict_S2__r2,,,,,,,,True,,True,,,,,,,, -,True,True,anli_does_S1_entail_S2__r2,True,,,,,,,,,,,,,,,,, -,True,True,anli_given_does_it_follow_that__r2,True,,,,,,,,,,,,,,,,, -,True,True,anli_given_it_must_be_true_that__r2,True,,,,,,,,,,,,,,,,, -,True,True,anli_GPT_3_style_r3,True,,,,,,,,,,,,,,,,, -,True,True,anli_based_on_the_previous_passage_r3,True,,,,,,,,,,,,,,,,, -,True,True,anli_does_S1_contradict_S2__r3,,,,,,,,True,,True,,,,,,,, -,True,True,anli_does_S1_entail_S2__r3,True,,,,,,,,,,,,,,,,, -,True,True,anli_given_does_it_follow_that__r3,True,,,,,,,,,,,,,,,,, -,True,True,anli_given_it_must_be_true_that__r3,True,,,,,,,,,,,,,,,,, -,,,app_reviews_categorize_rating_using_review,,True,,,,,,,,,,,,,,,, -,,,app_reviews_convert_to_rating,True,,,,,,,,,,,,,,,,, -,,,app_reviews_convert_to_star_rating,,,,,,,,,,True,,,,,,,, -,,,app_reviews_generate_review,,,,,True,True,,,,,,,,,,,, -,,,ai2_arc_ARC_Challenge_answer_qn,,,,,True,True,,,,,,,,,,,, -,,,ai2_arc_ARC_Challenge_false,,,,,,,,True,,,,,,,,,, -,,,ai2_arc_ARC_Challenge_qa_options,True,,,,,,,,,,,,,,,,, -,,,ai2_arc_ARC_Challenge_test,True,,,,,,,,,,,,,,,,, -,,,ai2_arc_ARC_Easy_answer_qn,,,,,True,True,,,,,,,,,,,, -,,,ai2_arc_ARC_Easy_false,,,,,,,,True,,,,,,,,,, -,,,ai2_arc_ARC_Easy_qa_options,True,,,,,,,,,,,,,,,,, -,,,ai2_arc_ARC_Easy_test,True,,,,,,,,,,,,,,,,, -,True,,circa_goldstandard1_judgement,True,,,,,,,,,,True,,,,,,, -,True,,circa_goldstandard2_judgement,True,,,,,,,,,,True,,,,,,, -,,,circa_judgement,,True,,,,,,,,True,True,,,,,,, -,,,circa_possible_qn,,,,,True,,,,,,,,,,,,, -,,,circa_question_declarative,,,,,,,,,,True,,,,,,,, -,,,cnn_dailymail_3.0.0_generate_story,,,,,True,,,,,,,,,,,,, -,,,cnn_dailymail_3.0.0_news_card_view,,,,,,,True,,,,,,,True,,,, -,,,cnn_dailymail_3.0.0_news_stock,,,,,,,True,,,,,,,True,,,, -,,,cnn_dailymail_3.0.0_news_summary,,,,,,,True,,,,,,,True,,True,, -,,,cnn_dailymail_3.0.0_spice_up_story,,,,,True,,,,,,,,,,,,, -,,,codah_codah_answer_no_option,,True,,,,,,,,,,,,,,,, -,,,codah_codah_answer_with_option,True,,,,,,,,,,,,,,,,, -,,,codah_codah_answer_with_option_idx,True,,,,,,,,,,,,,,,,, -,,,codah_codah_answer_with_option_post,True,,,,,,,,,,,,,,,,, -,,,codah_codah_choose_from_list,True,,,,,,,,,,,,,,,,, -,,,codah_codah_finish_from_the_list,True,,,,,,,,,,,,,,,,, -,,,codah_codah_finish_from_the_list_post,True,,,,,,,,,,,,,,,,, -,,,codah_codah_finish_pre,,True,,,,,,,,,,,,,,,, -,,,codah_codah_question_category,,,,,,,,,,True,,,,,,,, -,,,codah_codah_question_category_bis,,,,,,,,,,True,,,,,,,, -,,,common_gen_Example_prompt,,,,,,,True,,,,,,,,,,, -,,,common_gen_Given_concepts,,,,,,,True,,,,,,,,,,, -,,,common_gen_Put_together,,,,,,,True,,,,,,,,,,, -,,,common_gen_choice_in_concept_centric_sentence_generation,,,,,,,True,,,,,,,,,,, -,,,common_gen_sentence_to_concepts,,,,,,,,,,True,,,,,,,, -,,,cos_e_v1.11_description_question_option_id,True,,,,,,,,,,,,,,,,, -,,,cos_e_v1.11_description_question_option_text,True,,,,,,,,,,,,,,,,, -,,,cos_e_v1.11_generate_explanation_given_text,True,,,,,,True,,,,,,True,,,,, -,,,cos_e_v1.11_generate_explanation_no_given_answer,,True,,,,,True,,,,,,,,,,, -,,,cos_e_v1.11_question_description_option_id,True,,,,,,,,,,,,,,,,, -,,,cos_e_v1.11_question_description_option_text,True,,,,,,,,,,,,,,,,, -,,,cos_e_v1.11_question_option_description_id,True,,,,,,,,,,,,,,,,, -,,,cos_e_v1.11_question_option_description_text,True,,,,,,,,,,,,,,,,, -revisit,,,cosmos_qa_context_description_question_answer_id,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_context_description_question_answer_text,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_context_description_question_text,,True,,,,,,,,,,,,,,,, -,,,cosmos_qa_context_question_answer_description_id,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_context_question_answer_description_text,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_context_question_description_answer_id,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_context_question_description_answer_text,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_context_question_description_text,,True,,,,,,,,,,,,,,,, -,,,cosmos_qa_description_context_question_answer_id,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_description_context_question_answer_text,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_description_context_question_text,,True,,,,,,,,,,,,,,,, -,,,cosmos_qa_no_prompt_id,True,,,,,,,,,,,,,,,,, -,,,cosmos_qa_no_prompt_text,True,,,,,,,,,,,,,,,,, -,,,dbpedia_14_dbpedia_1,,True,,,,,,,,,,,,,,,, -,,,dbpedia_14_dbpedia_10,True,,,,,,,,,,,,,,,,, -,,,dbpedia_14_dbpedia_3,,True,,,,,,,,,,,,,,,, -,,,dbpedia_14_dbpedia_5,,True,,,,,,,,,,,,,,,, -,,,dbpedia_14_dbpedia_7,,True,,,,,,,,,,,,,,,, -,,,dbpedia_14_dbpedia_8,,True,,,,,,,,,,,,,,,, -,,,dbpedia_14_dbpedia_9,True,,,,,,,,,,,,,,,,, -,,,dream_answer_to_dialogue,,,,,True,,,,,,,,,,,,, -,,,dream_baseline,True,,,,,,,,,,,,,,,,, -,,,dream_conversation,True,,,,,,,,,,,,,,,,, -,,,dream_generate_first_utterance,,,,,True,,,,,,,,,,,,, -,,,dream_generate_last_utterance,,,,,True,,,,,,,,,,,,, -,True,,emo_feeling,True,,,,,,,,,,,,,,,,, -,True,,emo_final_message,True,,,,,,,,,,,,,,,,, -,True,,emo_persons_describe,True,,,,,,,,,,,,,,,True,, -,True,,emo_persons_infer,True,,,,,,,,,,,,,,,,, -,True,,emo_spoke_last,True,,,,,,,,,,,,,,,,, -,,,freebase_qa_inference_chain_prompt,,,,,,,,,,True,,,,,,,, -,,,freebase_qa_inference_chain_prompt_context,,,,,,,,,,True,,,,,,,, -,,,freebase_qa_qa_context_1,,,,,,,,,,,,,,,,,, -,,,freebase_qa_qa_context_2,,,,,,,,,,,,,,,,,, -,,,freebase_qa_qa_template_basic,,,,,,,,,,,,,,,,,, -,,,gigaword_Document_,,,,,,,True,,,,,,,,,,, -,,,gigaword_Summarize_this_document_,,,,,,,True,,,,,,,,,,, -,,,gigaword_TLDR,,,,,,,True,,,,,,,,,,, -,,,gigaword_generate_summary_for_this,,,,,,,True,,,,,,,,,,, -,,,gigaword_in_a_nutshell,,,,,,,True,,,,,,,,,,, -,,,gigaword_reverse_writing,,,,,,,,,,True,,,,,,,, -,,,gigaword_reverse_writing_2,,,,,,,True,,,,,,,,,,, -,,,gigaword_summarize_,,,,,,,True,,,,,,,,,,, -,,,gigaword_write_one_sentence,,,,,,,True,,,,,,,,,,, -,True,True,glue_cola_Following_sentence_acceptable,True,,,,,,,,,,,,,,,,, -,True,True,glue_cola_Make_sense_yes_no,,,True,,,,,,,,,,,,,,, -,True,True,glue_cola_Previous_sentence_acceptable,,,,True,,,,,,,,,,,,,, -,True,True,glue_cola_editing,,,True,,,,,,,,,,,,,,, -,True,True,glue_cola_jinja_example,,,,True,,,,,,,,,,,,,, -,True,,glue_mrpc_equivalent,True,,,,,,,,,,,,,,True,,, -,True,,glue_mrpc_paraphrase,,,,True,,,,,,,,,,,,,, -,True,,glue_mrpc_replace,,,,True,,,,,,,,,,,,,, -,True,,glue_mrpc_same_thing,,,,True,,,,,,,,,,,True,,, -,True,,glue_mrpc_want_to_know,,,,True,,,,,,,,,,,True,,, -,,,glue_qqp_answer,,,,True,,,,,,,,,,,,,, -,,,glue_qqp_duplicate,,,,True,,,,,,,,,,,,,, -,,,glue_qqp_duplicate_or_not,True,,,,,,,,,,,,,,,,, -,,,glue_qqp_quora,,,,True,,,,,,,,,,,,True,, -,,,glue_qqp_same_thing,,,,True,,,,,,,,,,,,,, -,,,glue_sst2_following_positive_negative,True,,,,,,,,,,,,,,,,, -,,,glue_sst2_happy_or_mad,True,,,,,,,,,,,,,,,,, -,,,glue_sst2_positive_negative_after,True,,,,,,,,,,,,,,,,, -,,,glue_sst2_review,True,,,,,,,,,,,,,,,,, -,,,glue_sst2_said,True,,,,,,,,,,,,,,,,, -,,True,glue_stsb_examples,,,,,,,,,,,,,,,,,, -,,True,glue_stsb_rank,,,,,,,,,,,,,,,,,, -,,True,glue_stsb_rate,,,,,,,,,,,,,,,,,, -,,True,glue_stsb_score,,,,,,,,,,,,,,,,,, -,,True,glue_stsb_similarity,,,,,,,,,,,,,,,,,, -,True,True,hans_GPT_3_style,True,,,,,,,,,,,,,,,,, -,True,True,hans_Suppose_Can_we_infer_that_,,,,True,,,,,,,,,,,,,, -,True,True,hans_based_on_the_previous_passage,,,,True,,,,,,,,,,,,,, -,True,True,hans_does_S1_entail_S2_,,,True,,,,,,,,,,,,,,, -,True,True,hans_given_does_it_follow_that_,,,True,,,,,,,,,,,,,,, -,True,True,hans__does_the_previous_passage_support_the_claim_that,,,,True,,,,,,,,,,,,,, -,,,hellaswag_YesNo_0,,,True,,,,,,,,,,,,,,, -,,,hellaswag_YesNo_1,,,True,,,,,,,,,,,,,,, -,,,hellaswag_YesNo_2,,,True,,,,,,,,,,,,,,, -,,,hellaswag_YesNo_3,,,True,,,,,,,,,,,,,,, -,,,hellaswag_YesNo_reversed_0,,,True,,,,,,,,,,,,,,, -,,,hellaswag_YesNo_reversed_1,,,True,,,,,,,,,,,,,,, -,,,hellaswag_YesNo_reversed_2,,,True,,,,,,,,,,,,,,, -,,,hellaswag_YesNo_reversed_3,,,True,,,,,,,,,,,,,,, -,,,hellaswag_complete_first_then,True,,,,,,,,,,,,,,,,, -,,,hellaswag_first_then,True,,,,,,,,,,,,,,,,, -,,,hellaswag_how_ends,True,,,,,,,,,,,,,,,,, -,,,hellaswag_if_begins_how_continues,True,,,,,,,,,,,,,,,,, -,,,hellaswag_which_ending,True,,,,,,,,,,,,,,,,, -,,,imdb_imdb_1,,True,,,,,,,,,,,,,,,, -,,,imdb_imdb_2,,True,,,,,,True,,,,,,,,,, -,,,imdb_imdb_3,,True,,,,,,,,,,,,,,,, -,,,imdb_imdb_4,,True,,,,,,,,,,,,,,,, -,,,imdb_imdb_5,,True,,,,,,,,,,,,True,,,, -,,,imdb_imdb_6,,True,,,,,,,,,,,,,,,, -,,,imdb_imdb_7,,True,,,,,,,,,,,,,,,, -,,,imdb_imdb_8,,True,,,,,,,,,,,,,,,, -,,,imdb_imdb_9,,,,True,,,,,,,,,,,,,, -,True,,mc_taco_mc_taco_1,,,,True,,,,,,,,,,,,,, -,,,mc_taco_mc_taco_2,,,,,,,,,,True,,,,,,,, -,True,,mc_taco_mc_taco_3,,,True,,,,,,,,,,,True,,,, -,,,mc_taco_mc_taco_4,True,,,,,,,,,True,,,,,,,, -,,,mc_taco_mc_taco_5,,,,,True,,,,,,,,,,,,, -,,,mc_taco_mc_taco_6,,True,,,,,,,,,,,,,,,, -,True,True,nq_open_context_self_description,,,,,,,,,,,,,,,,,, -,,True,nq_open_guess_question,,,,,True,,,,,,,,,,,,, -,True,True,nq_open_question_answer,,,,,,,,,,,,,,,,,, -,True,True,nq_open_question_with_instruction,,,,,,,,,,,,,,,,,, -,,,onestop_english_ara_context,True,,,,,,,,,,,,,,,,, -,,,onestop_english_assess,True,,,,,,,,,,,,,True,,,, -,,,onestop_english_ats,True,,,,,,,,,,,,,,,,, -,,,onestop_english_esl_context,True,,,,,,,,,,,,,True,,,, -,,,onestop_english_esl_variation,True,,,,,,,,,,,,,True,,,, -,True,,openbookqa_main_choices,True,,,,,,,,,,,,,,,,, -,True,,openbookqa_main_choose_an_answer_with_options,True,,,,,,,,,,,,,,,,, -,True,,openbookqa_main_only_options,True,,,,,,,,,,,,,,,,, -,True,,openbookqa_main_pick_answer_with_options,True,,,,,,,,,,,,,,,,, -,True,,openbookqa_main_pick_using_id,True,,,,,,,,,,,,,,,,, -,True,,openbookqa_main_which_correct,True,,,,,,,,,,,,,,,,, -,,True,openbookqa_main_which_correct_inverse,True,,,,,,,,,,,,True,,,,, -,,,paws_labeled_final_Concatenation,,,True,,,,,,,,,,True,,,,, -,,,paws_labeled_final_Concatenation_no_label,,,,True,,,,,,,,,True,,,,, -,,,paws_labeled_final_Meaning,,,True,,,,,,,,,,True,,,,, -,,,paws_labeled_final_Meaning_no_label,,,,True,,,,,,,,,True,,,,, -,,,paws_labeled_final_PAWS_ANLI_GPT3,True,,,,,,,,,True,,,,,,,, -,,,paws_labeled_final_PAWS_ANLI_GPT3_no_label,,True,,,,,,,,True,,,,,,,, -,,,piqa_Correct_the_solution,,,,,True,,,,,,,,,,,,, -,,,piqa_Correct_the_solution_if_false_from_sol_1,,,,,True,,,,,,,,,,,,, -,,,piqa_Correct_the_solution_if_false_from_sol_2,,,,,True,,,,,,,,,,,,, -should use jinja choice,,,piqa_Does_this_solution_make_sense_sol1,,,,True,,,,,,,,,,,,,, -,,,piqa_Does_this_solution_make_sense_sol2,,,,True,,,,,,,,,,,,,, -,,,piqa_Generate_a_similar_but_wrong_solution,,,,,True,,,,,,,,,,,,, -,,,piqa_choose_the_most_appropriate_solution,True,,,,,,,,,,,,,,,,, -duplicate of above,,True,piqa_choose_the_most_appropriate_solution_reorder_solution,True,,,,,,,,,,,,,,,,, -,,,piqa_no_prompt_needed,,,,,True,,,,,,,,,,,,, -,,,qa_srl_aq,,,,,True,True,,,,,,,,,,,, -,,,qa_srl_context_answer,,,,,True,,,,,,,,,,,,, -,,,qa_srl_context_qn,,,,,True,,,,,,,,,,,,, -,,,qa_srl_predicate,,,,,,,,,,True,,,,,,,, -need non-naive metric,True,,qa_srl_qa,,,,,,,,,,,,,,,,,, -,,,qasc_is_correct_0,,,,True,,,,,,,,,,,,,, -,,,qasc_is_correct_1,,,,True,,,,,,,,,,,,,, -,,,qasc_qu_combined,True,,,,,,,,,,,,,,,,, -,,,qasc_sep_combined_can_tell,True,,,,,,,,,,,,,,,,, -,,,qasc_sep_qu,True,,,,,,,,,,,,,,,,, -,,,quail_context_description_question_answer_id,True,,,,,,,,,,,,,,,,, -,,,quail_context_description_question_answer_text,True,,,,,,,,,,,,,,,,, -,,,quail_context_description_question_text,,True,,,,,,,,,,,,,,,, -,,,quail_context_question_answer_description_id,True,,,,,,,,,,,,,,,,, -,,,quail_context_question_answer_description_text,True,,,,,,,,,,,,,,,,, -,,,quail_context_question_description_answer_id,True,,,,,,,,,,,,,,,,, -,,,quail_context_question_description_answer_text,True,,,,,,,,,,,,,,,,, -,,,quail_context_question_description_text,True,,,,,,,,,,,,,,,,, -,,,quail_description_context_question_answer_id,,True,,,,,,,,,,,,,,,, -,,,quail_description_context_question_answer_text,True,,,,,,,,,,,,,,,,, -,,,quail_description_context_question_text,,True,,,,,,,,,,,,,,,, -,,,quail_no_prompt_id,True,,,,,,,,,,,,,,,,, -,,,quail_no_prompt_text,True,,,,,,,,,,,,,,,,, -,,,quartz_para_question_1,True,,,,,,,,,,,,,,,,, -near duplicate of the above,,True,quartz_para_question_1_reverse,True,,,,,,,,,,,,,,,,, -,,,quartz_para_question_2,True,,,,,,,,,,,,,,,,, -,,,quartz_para_question_3_choices,True,,,,,,,,,,,,,,,,, -,,,quartz_para_question_4_choices,True,,,,,,,,,,,,,,,,, -,,,quartz_para_question_plain,True,,,,,,,,,,,,,,,,, -near duplicate of the above,,True,quartz_para_question_plain_reverse,True,,,,,,,,,,,,,,,,, -,,,quartz_question_para_1,True,,,,,,,,,,,,,,,,, -near duplicate of the above,,True,quartz_question_para_1_reverse,True,,,,,,,,,,,,,,,,, -,,,quartz_question_para_2,True,,,,,,,,,,,,,,,,, -,,,quartz_question_para_3,True,,,,,,,,,,,,,,,,, -near duplicate of the above,,True,quartz_question_para_3_reverse,True,,,,,,,,,,,,,,,,, -,,,quoref_Template_1,,,,,,,,,,,,,,,,,, -,,,quoref_Template_2,,,,,,,,,,,,,,True,,,, -,,,quoref_Template_3,,,,,True,,,,,,True,,,,,,, -,,,quoref_Template_4,,,,,,,,,,True,,,,,,,True, -,,,quoref_Template_5,,,,,,,,,,True,,,,,,,, -,,,race_high_Read_the_article_and_answer_the_question_no_option_,,True,,,,,,,,,,,,,,,, -,True,,race_high_Read_the_article_and_select_the_best_answer,True,,,,,,,,,,,,,,,,, -near duplicate of the above,,True,race_high_Read_the_article_and_select_the_best_answer2,True,,,,,,,,,,,,,,,,, -near duplicate of the above,,True,race_high_Read_the_article_and_select_the_best_answer3,True,,,,,,,,,,,,,,,,, -,,,race_high_Write_a_multi_choice_question_for_the_following_article,,,,,True,,,,,,,,,,,,, -,,,race_high_Write_a_multi_choice_question_for_the_following_article_2,,,,,True,,,,,,,,,,,,, -,,,race_middle_Read_the_article_and_answer_the_question_no_option_,,True,,,,,,,,,,,,,,,, -,True,,race_middle_Read_the_article_and_select_the_best_answer,True,,,,,,,,,,,,,,,,, -near duplicate of the above,,True,race_middle_Read_the_article_and_select_the_best_answer2,True,,,,,,,,,,,,,,,,, -near duplicate of the above,,True,race_middle_Read_the_article_and_select_the_best_answer3,True,,,,,,,,,,,,,,,,, -,,,race_middle_Write_a_multi_choice_question_for_the_following_article,,,,,True,,,,,,,,,,,,, -,,,race_middle_Write_a_multi_choice_question_for_the_following_article_2,,,,,True,,,,,,,,,,,,, -,,,ropes_funky_prompt,True,,,,,,,,,,,,,,,,, -,,,ropes_plain,True,,,,,,,,,,,,,,,,, -,,,ropes_plain_bottom_hint,True,,,,,,,,,,,,,True,,,, -,,,ropes_plain_no_background,True,,,,,,,,,True,,,,,,,, -,,,ropes_prompt_beginning,True,,,,,,,,,,,,,,,,, -,,,ropes_prompt_bottom_hint_beginning,True,,,,,,,,,,,,,,,,, -,,,ropes_prompt_bottom_no_hint,True,,,,,,,,,True,,,,,,,, -,,,ropes_prompt_mix,True,,,,,,,,,,,,,True,,,, -,,,rotten_tomatoes_rt_1,,True,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_10,True,,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_2,,True,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_3,,True,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_4,,True,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_5,,True,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_6,,True,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_7,,True,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_8,,True,,,,,,,,,,,,,,,, -,,,rotten_tomatoes_rt_9,,,,True,,,,,,,,,,,,,, -,,,sciq_Template_0,,True,,,,,,,,,,,True,,,,, -,,,sciq_Template_1,,True,,,,,,,,,,,True,,,,, -,True,,social_i_qa_social_i_qa1,True,,,,,,,,,,,,,,,,, -,,,social_i_qa_social_i_qa2,,True,,,,,,,,,,,,,,,, -select answer by ordinal word,True,,social_i_qa_social_i_qa3,True,,,,,,,,,,,,,,,,, -,,,social_i_qa_social_i_qa4,,,,,True,,,,,,,,,,,,, -4-way to binary classification,,,social_i_qa_social_i_qa5,,,,True,,,,,,,,,,,,,, -,,,squad_v2_Jeopardy_with_Context,,,,,True,,,,,,,,,,,,, -,,,squad_v2_Jeopardy_without_Context,,,,,True,,,,,True,,,,,,,, -,,,squad_v2_Questions_with_Context,True,,,,,,,,,,,,,,,,, -nicely randomnized prompt phrasing,,,squad_v2_Questions_with_Context_Without_Prompt_Keywords,True,,,,,,,,,,,,,,,,, -,,,squad_v2_Topic_Prediction_Context,,,,,,,,,,True,,,,,,,, -,,,squad_v2_Topic_Prediction_Context_with_randomized_prompt_options,,,,,,,,,,True,,,,,,,, -,,,squad_v2_Topic_Prediction_Context_with_randomized_prompt_options_placed_in_the_end,,,,,,,,,,True,,,,,,,, -,,,squad_v2_Topic_Prediction_Question_and_Answer_Pair,,,,,,,,,,True,,,,,,,, -,,,squad_v2_Trivia,,,,,,,,,,True,,,,,,,, -,True,,super_glue_boolq_GPT_3_Style,,,,True,,,,,,,,,,,,,, -,True,,super_glue_boolq_I_wonder_,,,,True,,,,,,,,,,,,,, -,True,,super_glue_boolq_based_on_the_following_passage,,,,True,,,,,,,,,,,,,, -,True,,super_glue_boolq_based_on_the_previous_passage,,,,True,,,,,,,,,,,,,, -,True,,super_glue_boolq_could_you_tell_me_,,,,True,,,,,,,,,,,,,, -,True,True,super_glue_cb_GPT_3_style,True,,,,,,,,,,,,,,,,, -,True,True,super_glue_cb_based_on_the_previous_passage,True,,,,,,,,,,,,,,,,, -contrapositive,True,True,super_glue_cb_does_S1_contradict_S2_,True,,,,,,,,,True,,,,,,,, -,True,True,super_glue_cb_does_S1_entail_S2_,True,,,,,,,,,,,,,,,,, -,True,True,super_glue_cb_given_does_it_follow_that_,True,,,,,,,,,,,,,,,,, -must/might/may be true,True,True,super_glue_cb_given_it_must_be_true_that_,True,,,,,,,,,,,,,,,,, -,True,,super_glue_copa_C1_or_C2_premise_so_because_,True,,,,,,,,,,,,,,,,, -effect examples,True,,super_glue_copa__As_a_result_C1_or_C2_,True,,,,,,,,,,,,,,,,, -effect examples,True,,super_glue_copa__What_could_happen_next_C1_or_C2_,True,,,,,,,,,,,,,,,,, -cause examples,True,,super_glue_copa__which_may_be_caused_by,True,,,,,,,,,,,,,,,,, -effect examples,True,,super_glue_copa__which_may_cause_C1_or_C2_,True,,,,,,,,,,,,,,,,, -cause examples,True,,super_glue_copa__why_C1_or_C2,True,,,,,,,,,,,,,,,,, -,True,,super_glue_multirc_I_was_going_to_say_,,,,True,,,,,,,,,,,,,, -,True,,super_glue_multirc_Would_it_be_good_to_answer_,,,,True,,,,,,,,,,,,,, -,True,,super_glue_multirc_is_a_correct_answer_,,,,True,,,,,,,,,,,,,, -,True,,super_glue_multirc_is_the_correct_answer_,,,,True,,,,,,,,,,,,,, -,True,,super_glue_multirc_paragraph_question_is_it_,,,,True,,,,,,,,,,,,,, -,True,,super_glue_record_Can_you_figure_out_,,True,,,,,,,,,,,,,,,, -,True,,super_glue_record_In_the_question_above_the_placeholder_stands_for,,True,,,,,,,,,,,,,,,, -,True,,super_glue_record_What_could_the_placeholder_be_,True,,,,,,,,,,,,,,,,, -no difference here?,True,,super_glue_record_Which_one_is_the_placeholder_,True,,,,,,,,,,,,,,,,, -,True,,super_glue_record_the_placeholder_refers_to_,,True,,,,,,,,,,,,,,,, -,True,True,super_glue_rte_GPT_3_style,True,,,,,,,,,,,,,,,,, -,True,True,super_glue_rte_Suppose_Can_we_infer_that_,,,,True,,,,,,,,,,,,,, -,True,True,super_glue_rte_based_on_the_previous_passage,,,,True,,,,,,,,,,,,,, -,True,True,super_glue_rte_does_S1_entail_S2_,,,True,,,,,,,,,,,,,,, -,True,True,super_glue_rte_given_does_it_follow_that_,,,,True,,,,,,,,,,,,,, -,True,True,super_glue_rte__Therefore_we_re_licensed_to_say_that_,,,,True,,,,,,,,,,,,,, -,True,True,super_glue_rte__does_the_previous_passage_support_the_claim_that,,,,True,,,,,,,,,,,,,, -,True,,super_glue_wic_GPT_3_prompt,,,,True,,,,,,,,,,,True,,, -,True,,super_glue_wic_GPT_3_prompt_with_label,,,True,,,,,,,,,,,,True,,, -,True,,super_glue_wic_question_context,,,,True,,,,,,,,,,,True,,, -,True,,super_glue_wic_question_context_meaning,,,,True,,,,,,,,,,,True,,, -,True,,super_glue_wic_question_context_meaning_with_label,,,True,,,,,,,,,,,,True,,, -,True,,super_glue_wic_similar_sense,,,,True,,,,,,,,,,,True,,, -,True,,super_glue_wsc.fixed_Here_p_stands_for_,,,,,,,,,,,,,,,,,, -,True,,super_glue_wsc.fixed_In_the_previous_sentence_the_pronoun_refers_to_,,,,,,,,,,,,,,,,,, -,True,,super_glue_wsc.fixed_Who_is_are_,,,,,,,,,,,,,,,,,, -,True,,super_glue_wsc.fixed_in_the_passage_above_the_pronoun_X_refers_to_,,,,,,,,,,,,,,,,,, -,True,,super_glue_wsc.fixed_passage_what_does_the_pronoun_refer_to_,,,,,,,,,,,,,,,,,, -cast 4-way classification as binary,,,swag_regular_YesNo_0,,,True,,,,,,,,,,,,,,, -,,,swag_regular_YesNo_1,,,True,,,,,,,,,,,,,,, -,,,swag_regular_YesNo_2,,,True,,,,,,,,,,,,,,, -,,,swag_regular_YesNo_3,,,True,,,,,,,,,,,,,,, -,,,swag_regular_YesNo_reversed_0,,,True,,,,,,,,,,,,,,, -,,,swag_regular_YesNo_reversed_1,,,True,,,,,,,,,,,,,,, -,,,swag_regular_YesNo_reversed_2,,,True,,,,,,,,,,,,,,, -,,,swag_regular_YesNo_reversed_3,,,True,,,,,,,,,,,,,,, -,,,swag_regular_complete_first_then,True,,,,,,,,,,,,,,,,, -,,,swag_regular_first_then,True,,,,,,,,,,,,,,,,, -,,,swag_regular_how_ends,True,,,,,,,,,,,,,,,,, -,,,swag_regular_if_begins_how_continues,True,,,,,,,,,,,,,,,,, -,,,swag_regular_which_ending,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_ABBR,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_ABBR_context_first,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_DESC,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_DESC_context_first,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_ENTY,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_ENTY_context_first,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_HUM,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_HUM_context_first,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_LOC,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_LOC_context_first,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_NUM,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_NUM_context_first,True,,,,,,,,,,,,,,,,, -,,,trec_fine_grained_open,,True,,,,,,,,,,,,,,,, -,,,trec_fine_grained_open_context_first,,True,,,,,,,,,,,,,,,, -answers are not what the questions ask for,,True,trec_gao_et_al_1,,,,,,,,,,,,True,,,,,, -answers are not what the questions ask for,,True,trec_gao_et_al_2,,,,,,,,,,,,True,,,,,, -,,,trec_trec1,True,,,,,,,,,,,,,,,,, -,,,trec_trec2,True,,,,,,,,,,,,,,,,, -,,,trivia_qa_rc_context_self_description,,,,,,,,,,,,,,,,,, -,,,trivia_qa_rc_guess_question,,,,,True,True,,,,True,,,,,,,, -,,,trivia_qa_rc_question_answer,,,,,,,,,,,,,,,,,, -,,,trivia_qa_rc_question_with_instruction,,,,,,,,,,,,,,,,,, -,,,trivia_qa_rc_reading_comprehension_1,,,,,,,,,,True,,,,,,,, -,,,trivia_qa_rc_reading_comprehension_2,,,,,,,,,,True,,,,,,,, -,,,web_questions_count_answers,,,,,,,,,True,,,,,,,,, -,,,web_questions_credible_question,,,,,True,,,,,,,,,,,,, -,,,web_questions_if_answers_what_question,,,,,True,,,,,,,,,,,,, -,,,web_questions_potential_correct_answer,,,,,,,,,,,True,,,,,,, -,,,web_questions_question_answer,,,,,,,,,,,,,,,,,, -,,,web_questions_suggest_question,,,,,True,,,,,,,,,,,,, -,,,wiki_bio_comprehension,,,,,,,,,,True,,,,,,,, -,,,wiki_bio_guess_person,,,,,,,,,,True,,,,,,,, -,,,wiki_bio_key_content,,,,,,,,,,True,,,,,,,, -,,,wiki_bio_what_content,,,,,,,,,,True,,,,,,,, -"should rephrase ""summarize""",,,wiki_bio_who,,,,,,,,,,,,,,,,,, -,,,wiki_hop_original_Choose_Best_Object_Candidate,,,,,,,,,,True,,,,,,,,True -,,,wiki_hop_original_Explain_Relation,,True,,,,,,,,True,,,,,,,, -,,,wiki_hop_original_Generate_Fact_Triple,,,,,,,,,,True,,,,,,,,True -,,,wiki_hop_original_Generate_Object_Answer,,,,,,,,,,True,,,,,,,,True -,,,wiki_hop_original_Generate_Subject_Answer,,,,,,,,,,True,,,,,,,,True -,,,wiki_hop_original_Indirect_Question_about_Birthplace_Citizenship_Place_of_Death,,,,,,,,,,,,,True,,,,, -,,,wiqa_effect_with_label_answer,True,,,,,,,,,,,,,,,,, -,,,wiqa_effect_with_string_answer,True,,,,,,,,,,,,,,,,, -,,,wiqa_impacting_the_process,,,,True,,,,,,,,,,,,,, -,,,wiqa_question_type,,,,,,,,,,True,,,,,,,, -,,,wiqa_remove_first_step,,,,,,,,,,True,,,,,,,, -,,,wiqa_remove_first_step_bis,,,,,,,,,,True,,,,,,,, -,,,wiqa_remove_last_step,,,,,,,,,,True,,,,,,,, -,,,wiqa_remove_last_step_bis,,,,,,,,,,True,,,,,,,, -,True,,xsum_Document_,,,,,,,,,,,,,,,,,, -,True,,xsum_Summarize_this_document_,,,,,,,,,,,,,,,,,, -,True,,xsum_TLDR,,,,,,,,,,,,,,,,,, -,True,,xsum_generate_summary_for_this,,,,,,,,,,,,,,,,,, -,True,,xsum_summarize_,,,,,,,,,,,,,,True,,,, -,True,,xsum_write_one_sentence,,,,,,,,,,,,,,,,,, -,,,yelp_review_full_based_on_that,,True,,,,,,,,,,,,,,,, -,,,yelp_review_full_format_rating,,True,,,,,,,,,,,,,,,, -,,,yelp_review_full_format_score,,True,,,,,,,,,,,,,,,, -,,,yelp_review_full_format_star,,True,,,,,,,,,,,,,,,, -,,,yelp_review_full_on_a_scale,,True,,,,,,,,,,,,,,,, -,,,yelp_review_full_so_i_would,,True,,,,,,,,,,,,,,,, -,,,yelp_review_full_this_place,,True,,,,,,,,,,,,,,,, diff --git a/promptsource/seqio_tasks/experiment_D4.csv b/promptsource/seqio_tasks/experiment_D4.csv deleted file mode 100644 index 71c8216cc..000000000 --- a/promptsource/seqio_tasks/experiment_D4.csv +++ /dev/null @@ -1,242 +0,0 @@ -HF_name,subset,task_by_convention,format,comment,seed_paper,september_check,do_train,do_eval,train_size,adjusted_train_size,D3_do_train,D3_do_eval,D3_adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,skip,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference -crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -super_glue,wsc.fixed,coreference,cls,,,,,TRUE,554,0,TRUE,TRUE,554,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012 -winograd_wsc,wsc273,coreference,ext,,GPT,,,TRUE,0,0,,,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,,Levesque et al. 2012 -winogrande,winogrande_xl,coreference,ext,,GPT,TRUE,,TRUE,40398,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020 -winogrande,winogrande_debiased,coreference,ext,"""debiased"" = adversarially filtered",GPT,TRUE,,TRUE,9248,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020 -glue,cola,grammatical_acceptability,cls,includes semantic acceptability too; to be replaced by blimp,,,,TRUE,8551,0,,TRUE,0,accuracy;matthews_corrcoef,,https://arxiv.org/pdf/1805.12471.pdf,,,glue-cola,cls/other,single sentence,,,Warstadt et al. 2019 -super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,TRUE,,TRUE,250,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019 -super_glue,rte,NLI,cls,,,TRUE,,TRUE,2490,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009 -anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,,TRUE,162865,0,,TRUE,0,accuracy,,https://arxiv.org/abs/1910.14599,,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020 -hans,,NLI,cls,,,TRUE,,TRUE,0,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1902.01007.pdf,,,,,sentence pair,syntax?,,McCoy et al. 2019 -super_glue,axb,NLI,cls,test set only,,TRUE,,TRUE,0,0,,,,,,,,,,,,,, -glue,mrpc,paraphrase,cls,,,,TRUE,TRUE,3668,3668,TRUE,TRUE,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005 -glue,qqp,paraphrase,cls,,,,TRUE,TRUE,363846,363846,TRUE,,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,,glue-qqp,cls/paraphrase,,,,(link) -paws,labeled_final,paraphrase,cls,,,,TRUE,,49401,49401,TRUE,,49401,,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019 -ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it -chooses the correct answer and 1/k if it reports a k-way tie -(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018 -ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it -chooses the correct answer and 1/k if it reports a k-way tie -(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (easy),Multiple choice,,,, -nq_open,,QA_closed_book,gen,,GPT,TRUE,,TRUE,87925,0,,TRUE,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,,Natural Questions (open domain),,,trivia,, -kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,,TRUE,,88869,88869,,,,,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018 -trivia_qa,unfiltered,QA_closed_book,gen,,GPT,TRUE,,TRUE,87622,0,TRUE,,87622,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,,Trivia QA,,,,, -web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,,,TRUE,3778,0,TRUE,,3778,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,,web questions,qa/closed-book qa,,,,Berant et al. 2013 -wiki_qa,,QA_closed_book,cls,,CrossFit,,TRUE,,20360,20360,,,,,,https://aclanthology.org/D15-1237.pdf,,,wiki qa,cls/other,,,,Yang et al. 2015 -adversarial_qa,dbidaf,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,https://aclanthology.org/2020.tacl-1.43/,,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020 -adversarial_qa,dbert,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,, -adversarial_qa,droberta,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,, -coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared -against n human answers resulting in n F1 scores, -the maximum of which is chosen as the prediction’s -F1.For each question, we average out F1 across -these n sets, both for humans and models. In our -final evaluation, we use n = 4 human answers for -every question (the original answer and 3 additionally collected answers). The articles a, an and the -and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,, -duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,60721,60721,,,,,,https://duorc.github.io/,,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018 -duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,69524,69524,,,,,,https://arxiv.org/pdf/1804.07927.pdf,,,DuoRC,paraphrased QA,,,,Saha et al. 2018 -ropes,,QA_extractive,ext,,,TRUE,TRUE,,10924,10924,TRUE,,10924,,,,modest,,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 2019 -squad_v2,,QA_extractive,ext,,GPT,,,TRUE,130319,0,TRUE,,130319,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018 -super_glue,record,QA_extractive,ext,,,TRUE,,TRUE,100730,0,TRUE,TRUE,100730,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018 -qa_srl,,QA_extractive,ext,"need non-naive metric (""If the predicted word is contained inside the annotated answer span it is considered a correct prediction.""); v2 not in HF https://aclanthology.org/P18-1191.pdf",Eval WG,,,TRUE,6414,0,TRUE,TRUE,6414,accuracy,TRUE,https://dada.cs.washington.edu/qasrl/#page-top,neutral,,qa srl,other,,semantic role,,He et al. 2015 -quac,,QA_extractive,ext,,GPT,,,TRUE,11567,,,,,"average_maximum_f1;HEQ-Q;HEQ-D: To make oracle human and system performance comparable, -given n references, we report the average of the -maximum F1 computed from each n − 1 subset -with respect to the heldout reference.",TRUE,https://arxiv.org/pdf/1808.07036.pdf,,,,,,dialogue,, -quoref,,QA_extractive,ext,,,TRUE,TRUE,,19399,19399,TRUE,,19399,,,https://aclanthology.org/D19-1606.pdf,,,Quoref,Extractive QA,,,,Dasigi et al. 2019 -tydiqa,,QA_extractive,ext,,Eval WG,,TRUE,,9211,9211,,,,,,,,,,,,,, -drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even when I have domain knowledge",GPT,TRUE,,TRUE,,,,,,exact_match; macro_average_f1,TRUE,https://aclanthology.org/N19-1246.pdf,,,DROP ,multi-hop quantitative reasoning; Abstractive QA,,numerical,Wikipedia crowd,Dua et al. 2019 -cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,TRUE,TRUE,,9741,9741,TRUE,,9741,,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019 -cosmos_qa,,QA_multiple_choice,cls,,,TRUE,TRUE,,25262,25262,TRUE,,25262,,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019 -dream,,QA_multiple_choice,cls,,,TRUE,TRUE,,6116,6116,TRUE,,6116,,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019 -openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it -chooses the correct answer and 1/k if it reports a k-way tie -(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018 -qasc,,QA_multiple_choice,cls,,,TRUE,TRUE,,8134,8134,TRUE,,8134,,,,given?,,qasc,qa/multiple-choice qa,,,,Khot et al. 2020 -quail,,QA_multiple_choice,cls,,,TRUE,TRUE,,10246,10246,TRUE,,10246,,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020 -quarel,,QA_multiple_choice,cls,,CrossFit,,TRUE,,1941,1941,,,,,,,,,quarel,qa/multiple-choice qa,,logical form,,Tafjord et al. 2019a -quartz,,QA_multiple_choice,cls,,,TRUE,TRUE,,2696,2696,TRUE,,2696,,,https://aclanthology.org/D19-1608.pdf,given?,,quartz-with knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b -race,high,QA_multiple_choice,cls,GPT-hard,GPT,,,TRUE,62445,0,TRUE,TRUE,62445,accuracy,,https://arxiv.org/pdf/1704.04683.pdff,neutral,,race-high,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017 -race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph level?",GPT,,,TRUE,25421,0,TRUE,TRUE,25421,accuracy,,https://arxiv.org/pdf/1704.04683.pdf,neutral,,race-middle,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017 -sciq,,QA_multiple_choice,cls,,,TRUE,TRUE,,11679,11679,TRUE,,11679,,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017 -social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,TRUE,TRUE,TRUE,33410,33410,TRUE,TRUE,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 2019 -super_glue,boolq,QA_multiple_choice,cls,,,TRUE,,TRUE,9427,0,TRUE,TRUE,9427,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,,superglue-boolq,,,knowledge-? reading comprehension,, -super_glue,copa,QA_multiple_choice,cls,,,TRUE,,TRUE,400,0,TRUE,TRUE,400,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012 -super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for defintion,,TRUE,,TRUE,27243,0,TRUE,TRUE,27243,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 2018 -wiki_hop,original,QA_multiple_choice,cls,,,TRUE,TRUE,,43738,43738,TRUE,,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,,WikiHop (Welbl et al. 2018),multi-hop QA,,,Wikipedia KB, -wiqa,,QA_multiple_choice,cls,,,TRUE,TRUE,,29808,29808,TRUE,,29808,,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019 -circa,,QA_multiple_choice,cls,revisit: problematic prompts,,,,TRUE,34268,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://arxiv.org/pdf/2010.03450.pdf,,,circa,cls/other,,pragmatics,,Louis et al. 2020 -mc_taco,,QA_multiple_choice,cls,no train set; variable number of answer_chocies; eval in paper is over set of possible candidates;,,,,TRUE,0,0,,TRUE,0,exact_match; f1_score,,https://arxiv.org/pdf/1909.03065.pdf,,,mc taco,qa/binary,,temporal cognition,,Zhou et al. 2019 -piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,,,TRUE,16113,0,TRUE,,16113,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020 -amazon_polarity,,sentiment,cls,,,TRUE,TRUE,,3600000,500000,TRUE,,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013 -app_reviews,,sentiment,cls,,,TRUE,TRUE,,288065,288065,TRUE,,288065,,,,,,app reviews,other/regression,,,,Missing -imdb,,sentiment,cls,,,TRUE,TRUE,,25000,25000,TRUE,,25000,,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011 -rotten_tomatoes,,sentiment,cls,,,TRUE,TRUE,,8530,8530,TRUE,,8530,,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005 -yelp_review_full,,sentiment,cls,no dev set,,TRUE,TRUE,,650000,500000,TRUE,,500000,,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link) -lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,, -craffel/openai_lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,, -story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,,TRUE,,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,, -hellaswag,,story_completion,cls,,GPT,,,TRUE,39905,0,TRUE,,39905,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019 -common_gen,,structure_to_text,gen,,,TRUE,TRUE,,67389,67389,TRUE,,67389,,,,,,common gen,other,,,,Lin et al. 2020b -wiki_bio,,structure_to_text,gen,,,TRUE,TRUE,,582659,500000,TRUE,,500000,,,,,,wiki bio,cg/other,,,,Lebret et al. 2016 -cnn_dailymail,3.0.0,summarization,gen,,,TRUE,TRUE,,287113,287113,TRUE,,287113,,,,,,,,,,, -gigaword,,summarization,gen,,,TRUE,TRUE,,3803957,500000,TRUE,,500000,,,,,,gigaword,cg/summarization,,,,Napoles et al. 2012 -multi_news,,summarization,gen,,CrossFit,,TRUE,,44972,44972,,,,,,,,,multi news,cg/summarization,,,,Fabbri et al. 2019 -samsum,,summarization,gen,,CrossFit,,TRUE,,14732,14732,,,,,,,,,samsum,cg/summarization,,,,Gliwa et al. 2019 -xsum,,summarization,gen,,,TRUE,TRUE,TRUE,204045,204045,TRUE,TRUE,204045,rouge,,https://arxiv.org/pdf/1808.08745.pdf,,,xsum,cg/summarization,,,,Narayan et al. 2018 -ag_news,,topic_classification,cls,,,TRUE,TRUE,,120000,120000,TRUE,,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,,ag news,cls/topic,,,,Gulli (link) -dbpedia_14,,topic_classification,cls,,,TRUE,TRUE,,560000,500000,TRUE,,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015 -trec,,topic_classification,cls,,,TRUE,TRUE,,5452,5452,TRUE,,5452,,,https://trec.nist.gov/data/qa.html,,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 2001 -super_glue,wic,word_sense_disambiguation,cls,,,TRUE,,TRUE,5428,0,TRUE,TRUE,5428,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019 -Staging Area,,,,,,,,,,,,,,,,,,,,,,,, -Would Include but not in HF or some other practical limitations,,,,,,,,,,,,,,,,,,,,,,,, -definite_pronoun_resolution,,coreference,,todo: download error,,,,,,,,,,,,,,,definite pronoun resolution,other,,,,Rahman and Ng 2012 -jeopardy,,closed-book qa,gen,sporadic download error,CrossFit,,,,,,,,,,,,,promptsource download error,jeopardy,qa/closed-book qa,,,,(link) -blimp,,,cls,no prompts yet; collapse subsets,,,,,,0,,,0,,,,,,,,,,, -Hendrycks et al. 2021,,,,https://arxiv.org/abs/2009.03300v3,,,,,,,,,,,,,,,,,,,, -Multi-Turn Dialogue Reasoning,,,,https://aclanthology.org/2020.acl-main.130.pdf,Vania,,,,7088,,,,,,,,,,,,,,, -Argument Reasoning Comprehension Task,,,,https://aclanthology.org/N18-1175.pdf,Vania,,,,1211,,,,,,,,,,,,,,, -MCScript,,,,https://aclanthology.org/L18-1564.pdf,Vania,,,,14191,,,,,,,,,,,,,,, -narrativeqa,,,,very long input sequence,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,NarQA,Abstractive QA,,,, -newsqa,,,,download error,TaskEmbed,,,,,,,,,,,,,promptsource download error,NewsQA,Extractive QA,,,,Trischler et al. 2017 -eli5,,,,dataset split error,CrossFit,,,,,,,,,,,https://facebookresearch.github.io/ELI5/explore.html,,skip: HF datasets error the split field is used for subsets,eli5-askh,qa/long-form qa,,possibly knowledge-neutral,,Fan et al. 2019 -Maybe Reconsider,,,,,,,,,,,,,,,,,,,,,,,, -zest,,,,its original task is quite complex (need to provide a decision function); should be held-out eval only,self,,,,,,,,,,,,,,,,,,, -swag,,story_completion,cls,revisit whether this should be considered as a variant of NLI,,,,,73546,0,TRUE,,73546,,,,,,swag,qa/multiple-choice qa,,,,Zellers et al. 2018 -codah,codah,story_completion,cls,a variant of swag revisit whether this should be considered as a variant of NLI,,,,,2776,0,TRUE,,2776,,,,,,codah,qa/multiple-choice qa,,,,Chen et al. 2019 -wiki_auto,,,,revisit: lots of duplicate simplified text; novel generative task could be very challenging,CrossFit,,,,,,,,,,,,,no prompt yet,wiki auto,cls/other,,text simplification,,Jiang et al. 2020 -proto_qa,,,gen,"generate prototypical concepts, kinda niche format with multiple correct answers",CrossFit,,,,,,,,,,,,,no prompt yet,proto qa,other,,,,Boratko et al. 2020 -empathetic_dialogues,,,,generation? classification?,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1811.00207.pdf,,no prompt yet,empathetic dialogues,cg/dialogue,,,,Rashkin et al. 2019 -qed,,,,uses held-out Natural Questions,,,,,,,,,,,,,,,,,,,, -kilt_tasks,aidayago2,,,,,,,,,,,,,,,,,no prompt yet,kilt ay2,other/entity linking,,encyclopedia,,Hoffart et al. 2011 -kilt_tasks,wow,,,,,,,,,,,,,,,,,no prompt yet,kilt wow,cg/dialogue,,encyclopedia,,Dinan et al. 2019 -lama,conceptnet,,,,,,,,,,,,,,,,,no prompt yet,lama-conceptnet,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 -lama,google_re,,,,,,,,,,,,,,,,,no prompt yet,lama-google re,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 -lama,squad,,,,,,,,,,,,,,,,,no prompt yet,lama-squad,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 -lama,trex,,,,,,,,,,,,,,,,,no prompt yet,lama-trex,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 -limit,,physical cognition,,,,,,,,,,,,,,https://aclanthology.org/2020.findings-emnlp.88.pdf,,label errors in dataset itself? also no validation set otherwise well motivated by semantic theories,limit,other,,physical semantic repr.,,Manotas et al. 2020 -kilt_tasks,fever,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,temporary skip: prompts available in non-benchmark standalone dataset,kilt fever,cls/fact checking,,encyclopedia,,Thorne et al. 2018 -Skipped,,,,,,,,,,,,,,,,,,,,,,,, -fever,v2.0,closed-book qa/fact checking,,also in KILT,,,,,,,,,,,,,,skip: awkward prompts as closed-book qa,FEVER,,,,, -hotpot_qa,distractor,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,, -hotpot_qa,fullwiki,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,, -emo,,sentiment,cls,skip: offensive and ungrammatical text,,merged,,,30160,0,TRUE,TRUE,30160,precision;recall;F1,,https://aclanthology.org/S19-2005.pdf,,skip: offensive and ungrammatical text,emo,cls/emotion,,,,Chatterjee et al. 2019 -freebase_qa,,QA_closed_book,gen,"need to be held out because web_questions is ""supposed to be answerable by Freebase""",,,,,20358,0,TRUE,,20358,,,,intensive,,freebase qa,qa/closed-book qa,,,,Jiang et al. 2019 -aqua_rat,,,,,,,,,,,,,,,,https://arxiv.org/abs/1705.04146,,skip: nontrivial math,aqua rat,qa/multiple-choice qa,,nontrivial math,,Ling et al. 2017 -math_qa,,,,,,,,,,,,,,,,,,skip: nontrivial math,math qa,qa/multiple-choice qa,,nontrivial math,,Amini et al. 2019 -numer_sense,,,,,,,,,,,,,,,,,,skip: closed-book trivia ,numer sense,qa/closed-book qa,,numerical knowledge,,Lin et al. 2020a -squad_adversarial,,,,,,,,,,,,,,,,,,validation set only,,,,,, -squadshifts,,,,,,,,,,,,,,,,,,test set only,,,,,, -sms_spam,,,,,,,,,,,,,,,,,,skip: unclean corpus and likely harmful content,sms spam,cls/other,,,,Almeida et al. 2011 -search_qa,,,,,,,,,,,,,,,,,,skip: seems like a very unclean corpus,search qa,qa/closed-book qa,,,,Dunn et al. 2017 -kilt_tasks,trex,,,,,,,,,,,,,,,,,skip: non-natural language,kilt trex,qa/closed-book qa,,encyclopedia,,Elsahar et al. 2018 -kilt_tasks,structured_zeroshot,,,,,,,,,,,,,,,,,skip: non-natural language,kilt zsre,qa/closed-book qa,,encyclopedia,,Levy et al. 2017 -spider,,,,,,,,,,,,,,,,,,skip: non-natural language,spider,cg/other,,,,Yu et al. 2018 -wikisql,,,,,,,,,,,,,,,,,,skip: non-natural language,wikisql,cg/other,,,,Zhong et al. 2017 -com_qa,,,,,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1809.09528.pdf,,skip: non-human language: URL,ComQA (Abujabal et al. 2019),factoid QA w/ paraphrases,,,snippets WikiAnswers, -climate_fever,,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,skip: no train set,climate fever,cls/fact checking,,,,Diggelmann et al. 2020 -art,,,,,,,,,,,,,,,,https://arxiv.org/pdf/1908.05739.pdf,,skip: NLI reserved for generalization studies (although this one is not a traditionally defined NLI),art (abductive nli),other,,,,Bhagavatula et al. 2020 -glue,mnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-mnli,cls/nli,,,,Williams et al. 2018 -glue,qnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-qnli,cls/nli,,,,Rajpurkar et al. 2016 -glue,rte,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-rte,cls/nli,,,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009 -glue,wnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-wnli,cls/nli,,,,Levesque et al. 2012 -,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,scitail,cls/nli,,,,Khot et al. 2018 -,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,sick,cls/nli,,,,Marelli et al. 2014 -,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,SNLI (Bowman et al. 2015),NLI,,,misc., -aeslc,,,,summarization by email subject line,,,,,,,,,,,,https://arxiv.org/abs/1906.03497,,skip: niche task,aeslc,cg/summarization,,generation,,Zhang and Tetreault 2019 -onestop_english,,,,,,,,,,,,,,,,https://aclanthology.org/W18-0535.pdf,,skip: niche task: classify curriculum diffculty,onestop english,cls/other,,,,Vajjala and Luˇci´c 2018 -mocha,,,,,,,,,,,,,,,,,,skip: model generated text,mocha,other/regression,,,,Chen et al. 2020a -commonsense_qa,,,,duplicate with cos_e,Vania,,,,9741,,,,,,,https://arxiv.org/pdf/1811.00937.pdf,,,Commonsense QA,qa/multiple-choice qa,,,,Talmor et al. 2019 -,,,,,,,,,,,,,,,,,,skip: maybe harmful content from Twitter,emotion,cls/emotion,,,,Saravia et al. 2018 -,,,,the authors themselves seem to have renounced their own work,,,,,,,,,,,,https://github.com/nyu-mll/crows-pairs,,skip: harmful content,crows pairs,other,,,,Nangia et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-directed vs generalized,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-disability,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-gender,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-national origin,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-race,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-religion,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-sexual orientation,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech offensive,cls/hate speech detection,,,,Davidson et al. 2017 -,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech18,cls/hate speech detection,,,,de Gibert et al. 2018 -,,,,,,,,,,,,,,,,,,skip: harmful content,hatexplain,cls/hate speech detection,,,,Mathew et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-title,cg/summarization,,,,Kim et al. 2019 -,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-tldr,cg/summarization,,,,Kim et al. 2019 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emoji,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emotion,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-hate,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-irony,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-offensive,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-sentiment,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance abortion,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance atheism,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance climate,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance feminist,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance hillary,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet qa,qa/machine reading comprehension,,,,Xiong et al. 2019 -yelp_polarity,,,,,,,,,,,,,,,,,,skip: duplicate with yelp_review_full,yelp polarity,cls/sentiment analysis,,,,Zhang et al. 2015; (link) -quora,,,,,,,,,,,,,,,,https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs,,skip: duplicate under GLUE,QQP,paraphrase identification,,,social QA,Iyer et al. 2017 -squad,,,,,,,,,,,,,,,,,,skip: duplicate under Squad 2.0,SQuAD 1.1,Extractive QA,,,, -yahoo_answers_topics,,,,,,,,,,,,,,,,,,skip for early experiments: unclean corpus,yahoo answers topics,cls/topic,,,,(link) -tab_fact,,,,,,,,,,,,,,,,,,skip for early experiments: tabular data,tab fact,cls/fact checking,,,,Chen et al. 2020b -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor gender agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor number agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-determiner noun agreement with adj irregular 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 2,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-existential there quantifiers 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-irregular past participle adjectives,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi licensor present,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi scope,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-wh questions object gap,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -poem_sentiment,,,,,,,,,,,,,,,,,,skip for early experiments: poetry domain,poem sentiment,cls/sentiment analysis,,creativity,,Sheng and Uthus 2020 -acronym_identification,,,,,,,,,,,,,,,,https://arxiv.org/pdf/2010.14678.pdf,,skip for early experiments: niche/hard task,acronym identification,other,,,,Pouran Ben Veyseh et al. 2020 -google_wellformed_query,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,google wellformed query,cls/other,,,,Faruqui and Das 2018 -liar,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,liar,cls/fact checking,,,,Wang 2017 -,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,break-QDMR-high-level,other,,semantic representation,,Wolfson et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,crawl domain,other,,,,Zhang et al. 2020 -discovery,discovery,,,,,,,,,,,,,,,,,skip for early experiments: niche task no cannonical answer,discovery,cls/other,,generative-ish,,Sileo et al. 2019 -wiki_split,,,,,,,,,,,,,,,,,,skip for early experiments: niche task,wiki split,cg/other,,,,Botha et al. 2018 -,,,,,,,,,,,,,,,,,,skip for early experiments: multilingual,aslg pc12,other,,,,Othman and Jemni 2012 -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,CCG (Hockenmaier and Steedman 2007),CCG supertagging,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Chunk (Tjong Kim Sang and Buchholz 2000),syntactic chunking,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Conj (Ficler and Goldberg 2016),conjunct identification,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GED (Yannakoudakis et al. 2011),grammatical error detection,,syntax,misc., -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GGParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,NER (Tjong Kim Sang and De Meulder 2003),named entity recognition,,,news, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Parent (Liu et al. 2019a),syntactic tagging,,syntax; constituency,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-EWT (Silveira et al. 2014),part-of-speech tagging,,syntax,Web Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-PTB (Marcus et al. 1993),part-of-speech tagging,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,ST (Bjerva et al. 2016),semantic tagging,,,Groningen Meaning Bank, -financial_phrasebank,,,,,,,,,,,,,,,,,,skip for early experiments: financial domain,financial phrasebank,cls/sentiment analysis,,,,Malo et al. 2014 -health_fact,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,health fact,cls/fact checking,,,,Kotonya and Toni 2020 -,,,,,,,,,,,,,,,,http://www.sciencedirect.com/science/article/pii/S1532046412000615,,skip for early experiments: biomedical domain,ade corpus v2-classification,cls/other,,,,Gurulingappa et al. 2012 -,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-dosage,other/slot filling,,,,Gurulingappa et al. 2012 -,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-effect,other/slot filling,,,,Gurulingappa et al. 2012 -,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,biomrc,qa/machine reading comprehension,,,,Pappas et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,medical questions pairs,cls/paraphrase,,,,McCreery et al. 2020 -scicite,,,,,,,,,,,,,,,,,,skip for early experiments: academic domain + niche/hard task,scicite,cls/other,,,,Cohan et al. 2019 -,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,break-QDMR,other,,logical form,,Wolfson et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,e2e nlg cleaned,other,,,,Duˇsek et al. 2020 2019 -glue,sst2,,,,,,,,,,,,,,,,,revisit: very short and often ill-formed movie reviews,glue-sst2,cls/sentiment analysis,,,,Socher et al. 2013 -glue,stsb,fine-grain regression,,,,,,,,,,,,,,,,revisit whether to exclude fine-grain regression tasks,glue-stsb,semantic similarity,,,misc., -,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-no context,qa/closed-book qa,,,,Rajpurkar et al. 2016 -,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-with context,qa/machine reading comprehension,,,,Rajpurkar et al. 2016 -,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,BoolQ-CS,Binary yes/no,,,, -,,,,,,,,,,,,,,,,https://aclanthology.org/C16-1236.pdf,,double check: missing from HF datasets,CQ (Bao et al. 2016),knowledge-based QA,,,snippets web queries/KB, -,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,DROP-CS,Abstractive QA,,,, -,,,,,,,,,,,,,,,,https://aclanthology.org/D13-1020.pdf,,double check: missing from HF datasets,MCTest,Multiple choice,,,, -,,,,,,,,,,,,,,,,,,double check: missing from HF datasets,MRPC (Dolan and Brockett 2005),paraphrase identification,,,news, -,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,double check: missing from HF datasets,NP-BoolQ,Binary yes/no,,,, -,,,,,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,double check: missing from HF datasets,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b -,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,Quoref-CS,Extractive QA,,,, -,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,, diff --git a/promptsource/seqio_tasks/preview_annotated_prompts.py b/promptsource/seqio_tasks/preview_annotated_prompts.py deleted file mode 100644 index 6890d5247..000000000 --- a/promptsource/seqio_tasks/preview_annotated_prompts.py +++ /dev/null @@ -1,111 +0,0 @@ -import csv -from pprint import pprint -from typing import Dict, List - -import pkg_resources -from t5.data.glue_utils import get_glue_metric, get_super_glue_metric -from t5.evaluation.metrics import accuracy, mean_multiclass_f1, rouge - - -SAFE_EXCLUDE_CRETERIA = [ - "template_bug", - "negated_answers", - "counting", - "answer_span_indices", - "non_natural_language", - "generative_non_true_implausible", -] - -AGGRESSIVE_EXCLUDE_CRETERIA = [ - "generative_non_true_task", - "nontrivial_choices_hidden", - "awkward_phrasing", - "ungrammatical", -] + SAFE_EXCLUDE_CRETERIA - - -NON_GLUE_METRICS = { # for those with do_eval = True - "anli": [accuracy], - "hans": [accuracy], - "circa_goldstandard1_judgement": [mean_multiclass_f1(num_classes=8), accuracy], - "circa_goldstandard2_judgement": [mean_multiclass_f1(num_classes=5), accuracy], - "mc_taco": [accuracy], - "nq_open": [accuracy], - "qa_srl": [accuracy], - "openbookqa": [accuracy], - "race": [accuracy], - "social_i_qa": [accuracy], - "emo": [mean_multiclass_f1(num_classes=4)], - "xsum": [rouge], -} - - -def exclude_bad_prompts(prompt: Dict) -> bool: - for criterion in SAFE_EXCLUDE_CRETERIA: # or AGGRESSIVE_EXCLUDE_CRETERIA - if prompt.get(criterion): - return False - return True - - -def load_annotated_prompts() -> List[Dict]: - annotated_csv_path = pkg_resources.resource_filename(__name__, "experiment_D3.csv") - with open(annotated_csv_path) as in_file: - reader = csv.DictReader(in_file) - all_tasks = [row for row in reader] - - clean_tasks = list(filter(exclude_bad_prompts, all_tasks)) - - # Assign metrics - non_glue_eval_sets = list(NON_GLUE_METRICS.keys()) - for task in clean_tasks: - if not task["do_eval"]: - continue - - full_name = task["dataset_subset_template"] - if full_name.startswith("glue"): - subset = full_name.split("_")[1] - task["metrics"] = get_glue_metric(subset) - elif full_name.startswith("super_glue"): - subset = full_name.split("_")[2] - if subset in ("wsc.fixed", "multirc"): - # TODO: WSC and MultiRC need special pre/postprocesing - task["metrics"] = [accuracy] - continue - task["metrics"] = get_super_glue_metric(subset) - - for dataset_name in non_glue_eval_sets: - if full_name.startswith(dataset_name): - task["metrics"] = NON_GLUE_METRICS[dataset_name] - - # Skip rank_classification for now until we actually support it - # if task["nontrivial_choices_hidden"]: - # # Trick of plugging in answer options and rank LM probabilites as predictions. - # # Required for all prompts with non_trivial_choices_hidden, - # # but could be used for other tasks as well where answer choices are given. - # if "metrics" not in task: - # task["metrics"] = [rank_classification] - # elif rank_classification not in task["metrics"]: - # task["metrics"].append(rank_classification) - - # should be already handled by NON_GLUE_METRICS - # if task['generative_true_task'] or task['generative_non_true_task']: - # task['metrics'] = rouge - - return clean_tasks - - -def preview() -> None: - clean_tasks = load_annotated_prompts() - - train_tasks = [t for t in clean_tasks if not t["skip_train"]] - eval_tasks = [t for t in clean_tasks if t["do_eval"]] - - pprint([t["dataset_subset_template"] for t in train_tasks]) - print(len(train_tasks)) - - pprint([f'{t["dataset_subset_template"]} {t["metrics"]}' for t in eval_tasks]) - print(len(eval_tasks)) - - -if __name__ == "__main__": - preview() diff --git a/promptsource/seqio_tasks/preview_promptsource.py b/promptsource/seqio_tasks/preview_promptsource.py deleted file mode 100644 index 4dbbec761..000000000 --- a/promptsource/seqio_tasks/preview_promptsource.py +++ /dev/null @@ -1,105 +0,0 @@ -import csv -from typing import List, Optional, Tuple - -import pkg_resources - -# from rich import inspect -from rich.pretty import pprint - -from promptsource.templates import TemplateCollection - - -def preview() -> None: - experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv") - gsheet = {} - d4_train: List[Tuple[str, Optional[str]]] = [] - d4_eval: List[Tuple[str, Optional[str]]] = [] - d3_train_gpt: List[Tuple[str, Optional[str]]] = [] - d3_train_sglue: List[Tuple[str, Optional[str]]] = [] - experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv") - with open(experiment_path) as exp_file: - reader = csv.DictReader(exp_file) - for row in reader: - if row["skip"]: - continue - if row["subset"] == "": - row["subset"] = None # to match promptsource.Template object - dataset_subset = (row["HF_name"], row["subset"]) - if row["do_train"] == "TRUE": - d4_train.append(dataset_subset) - if row["do_eval"] == "TRUE": - d4_eval.append(dataset_subset) - if row["D3_do_train"] == "TRUE" and "GPT" in row["seed_paper"]: - d3_train_gpt.append(dataset_subset) - if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue": - d3_train_sglue.append(dataset_subset) - gsheet[dataset_subset] = row - all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue - print(f"Number of non-desk-rejected datasets = {len(all_datasets)}") - print(f"Number of training sets = {len(d4_train)}") - print(f"Number of evaluation sets = {len(d4_eval)}") - - template_collection = TemplateCollection() - output = [] - missing_og_flags = [] - missing_metrics = [] - for dataset_name, subset_name in template_collection.keys: - ds_name = (dataset_name, subset_name) - if ds_name not in d4_eval: - template_collection.remove(dataset_name, subset_name) - continue - OG = 0 - non_OG = 0 - dataset = template_collection.get_dataset(dataset_name, subset_name) - for template_name in dataset.all_template_names: - template = dataset[template_name] - # if dataset_name == 'ropes': - # inspect(template.metadata) - if not template.metadata.metrics: - missing_metrics.append(f"{dataset_name}/{subset_name}/{template_name}") - - if template.metadata.original_task is True: - OG += 1 - elif template.metadata.original_task is False: - non_OG += 1 - elif template.metadata.original_task is None: - missing_og_flags.append(dataset_name + "/" + template_name) - continue - - train_size = gsheet[ds_name]["train_size"] - if train_size == "": - train_size = 0 - else: - train_size = int(train_size) - - adjusted_train_size = train_size // len(dataset.all_template_names) - - output.append( - ( - f"{dataset_name} {subset_name if subset_name else ''}", - f"{OG}-{non_OG}", - f"{train_size:,} {adjusted_train_size:,}", - ) - ) - - pprint(output) - print(len(template_collection)) - - print("Missing metrics:") - pprint(missing_metrics) - - print("Missing original task flags:") - pprint(missing_og_flags) - - # # print(d4_train_mixture) - # print(f"Number of training templates = {len(d4_train_mixture)}") - # # print(d4_eval_mixture) - # print(f"Number of evaluation templates = {len(d4_eval_mixture)}") - # # for i in seqio.TaskRegistry.names(): - # # print(i) - # print(f"Number of SeqIO registered templates = {len(seqio.TaskRegistry.names())}") - # print("^ includes non-original task templates which are excluded from the eval mixture") - - -if __name__ == "__main__": - preview() diff --git a/promptsource/seqio_tasks/tasks.py b/promptsource/seqio_tasks/tasks.py deleted file mode 100644 index 5734a9cb5..000000000 --- a/promptsource/seqio_tasks/tasks.py +++ /dev/null @@ -1,421 +0,0 @@ -import csv -import functools -from typing import Dict, List, Optional, Tuple - -import pkg_resources -import seqio -import t5 -import tensorflow as tf -from t5.data.glue_utils import get_glue_metric, get_super_glue_metric -from t5.evaluation import metrics as mt - -import promptsource.templates -from promptsource.seqio_tasks import utils -from promptsource.utils import load_dataset - - -GET_METRICS = { - "BLEU": mt.bleu, - "ROUGE": mt.rouge, - "Span Squad": mt.span_squad, - "Squad": mt.squad, - "Trivia QA": mt.trivia_qa, - "Accuracy": mt.accuracy, - "Sequence Accuracy": mt.sequence_accuracy, - "Pearson Correlation": mt.pearson_corrcoef, - "Spearman Correlation": mt.spearman_corrcoef, - "MultiRC": mt.multirc_f1_over_all_answers, - "AUC": mt.auc, - "COQA F1": mt.coqa_f1, - "Edit Distance": mt.edit_distance, - # "Mean Reciprocal Rank": mt.accuracy, # NOTE not in T5? - "Other": mt.accuracy, - # Missing support for mean_multiclass_f1 etc. which need a num_classes parameter -} - -MAX_EXAMPLES_PER_DATASET = 500_000 - - -def strip_whitespace(output_or_target, example=None, is_target=False): - """Cached tasks from promptsource all have a leading space on the ground-truth targets.""" - return output_or_target.strip() - - -def maybe_get_class_id_postprocessor(template): - if template.get_fixed_answer_choices_list(): - - def postprocess_fn(output_or_target, example=None, is_target=False): - output_or_target = strip_whitespace(output_or_target) - return t5.data.postprocessors.string_label_to_class_id( - output_or_target, label_classes=template.get_fixed_answer_choices_list() - ) - - return postprocess_fn - - else: - return strip_whitespace - - -def get_tf_dataset(split, shuffle_files, seed, dataset_name, subset_name, template, split_mapping): - # HF datasets does not support file-level shuffling - del shuffle_files, seed - dataset = load_dataset(dataset_name, subset_name) - dataset = dataset[split_mapping[split]] - dataset = utils.apply_template(dataset, template) - return utils.hf_dataset_to_tf_dataset(dataset) - - -def add_task(dataset_name, subset_name, template_name, task_name=None, split_mapping=None): - template = all_templates.get_dataset(dataset_name, subset_name)[template_name] - task_name = task_name or utils.get_task_name(dataset_name, subset_name, template_name) - - if dataset_name == "glue": - metrics = get_glue_metric(subset_name) - elif dataset_name == "super_glue": - if subset_name in ("wsc.fixed", "multirc"): - # TODO: WSC and MultiRC need special pre/postprocesing - metrics = [mt.accuracy] - else: - metrics = get_super_glue_metric(subset_name) - else: - # TODO what if metric is null? - metrics = [GET_METRICS[m] for m in template.metadata.metrics] - - dataset_splits = utils.get_dataset_splits(dataset_name, subset_name) - split_mapping = split_mapping or {k: k for k in dataset_splits.keys()} - - dataset_fn = functools.partial( - get_tf_dataset, - seed=None, - dataset_name=dataset_name, - subset_name=subset_name, - template=template, - split_mapping=split_mapping, - ) - data_source = seqio.FunctionDataSource( - dataset_fn, - splits=list(split_mapping.keys()), - num_input_examples={s: dataset_splits[split_mapping[s]].num_examples for s in split_mapping.keys()}, - ) - output_features = { - "inputs": seqio.Feature(t5.data.get_default_vocabulary(), add_eos=False, dtype=tf.int32), - "targets": seqio.Feature(t5.data.get_default_vocabulary(), add_eos=True, dtype=tf.int32), - } - preprocessors = [ - seqio.preprocessors.tokenize, - seqio.preprocessors.append_eos, - seqio.CacheDatasetPlaceholder(required=False), - ] - - # Add train and normal eval tasks - seqio.TaskRegistry.add( - task_name, - data_source, - preprocessors=preprocessors, - output_features=output_features, - metric_fns=metrics, - postprocess_fn=maybe_get_class_id_postprocessor(template), - ) - - # Add rank classification eval task - if template.answer_choices: - rank_classification_preprocessor = functools.partial( - t5.data.preprocessors.rank_classification, - inputs_fn=lambda ex: tf.fill((len(ex["answer_choices"]),), ex["inputs"]), - targets_fn=lambda ex: ex["answer_choices"], - is_correct_fn=lambda ex: tf.equal(ex["answer_choices"], tf.strings.strip(ex["targets"])), - weight_fn=lambda ex: 1.0, - ) - - fixed_choices = template.get_fixed_answer_choices_list() - num_classes = len(fixed_choices) if fixed_choices else None - seqio.TaskRegistry.add( - task_name + "_score_eval", - data_source, - preprocessors=[rank_classification_preprocessor] + preprocessors, - output_features=output_features, - metric_fns=[functools.partial(t5.evaluation.metrics.rank_classification, num_classes=num_classes)], - postprocess_fn=t5.data.postprocessors.rank_classification, - ) - - -datatset_subset_tuple = Tuple[str, Optional[str]] -d4_train: List[datatset_subset_tuple] = [] -d4_eval: List[datatset_subset_tuple] = [] -d3_train_gpt: List[datatset_subset_tuple] = [] -d3_train_sglue: List[datatset_subset_tuple] = [] -bias_fairness_eval: List[datatset_subset_tuple] = [] -gsheet: Dict[datatset_subset_tuple, Dict] = {} -experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv") -with open(experiment_path) as exp_file: - reader = csv.DictReader(exp_file) - for row in reader: - if row["skip"]: - continue - if row["subset"] == "": - row["subset"] = None # to match promptsource.Template object - dataset_subset = (row["HF_name"], row["subset"]) - if row["do_train"] == "TRUE": - d4_train.append(dataset_subset) - if row["do_eval"] == "TRUE": - d4_eval.append(dataset_subset) - if row["D3_do_train"] == "TRUE" and "GPT" in row["seed_paper"]: - d3_train_gpt.append(dataset_subset) - if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue": - d3_train_sglue.append(dataset_subset) - if ( - row["do_eval"] == "TRUE" - and row["task_by_convention"] == "bias_and_fairness" - and row["HF_name"] != "winogender" - ): - bias_fairness_eval.append(dataset_subset) - gsheet[dataset_subset] = row -all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue + bias_fairness_eval - -all_templates = promptsource.templates.TemplateCollection() -all_templates.remove("anli") # Need to special-case ANLI due to weird split conventions - -# 3 stages of training/ablation: D4 -> GPT -> SuperGLUE -d4_train_mixture: List[str] = [] # strings are dataset_subset_template -gpt_train_mixture: List[str] = [] -sglue_train_mixture: List[str] = [] -d4_eval_mixture: List[str] = [] -bias_fairness_eval_mixture: List[str] = [] -mixture_cap: Dict[str, int] = {} -single_original_task: Dict[Tuple[str, str], str] = {} -all_original_tasks: List[str] = [] -for dataset_name, subset_name in all_templates.keys: - if (dataset_name, subset_name) not in all_datasets: - all_templates.remove(dataset_name, subset_name) - continue - - dataset = all_templates.get_dataset(dataset_name, subset_name) - num_templates = len(dataset.all_template_names) - train_size = gsheet[(dataset_name, subset_name)]["train_size"] - if train_size == "": - train_size = 0 - else: - train_size = int(train_size) - if train_size > MAX_EXAMPLES_PER_DATASET: - cap = MAX_EXAMPLES_PER_DATASET // num_templates - else: - cap = train_size - for template_name in dataset.all_template_names: - add_task(dataset_name, subset_name, template_name) - - template = dataset[template_name] - - task_name = utils.get_task_name(dataset_name, subset_name, template_name) - - if (dataset_name, subset_name) not in single_original_task and template.metadata.original_task: - single_original_task[(dataset_name, subset_name)] = task_name - - if template.metadata.original_task: - all_original_tasks.append(task_name) - - if (dataset_name, subset_name) in d4_train: - d4_train_mixture.append(task_name) - mixture_cap[task_name] = cap - if (dataset_name, subset_name) in d3_train_gpt: - gpt_train_mixture.append(task_name) - mixture_cap[task_name] = cap - if (dataset_name, subset_name) in d3_train_sglue: - sglue_train_mixture.append(task_name) - mixture_cap[task_name] = cap - if (dataset_name, subset_name) in d4_eval: - if template.metadata.original_task: - d4_eval_mixture.append(task_name) - # TODO use template.metadata.answer_choices here for rank eval - if (dataset_name, subset_name) in bias_fairness_eval: - bias_fairness_eval_mixture.append(task_name) - -# Special case for ANLI, which has weirdly-named splits and rounds that should be subsets -dataset_name, subset_name = ("anli", None) -dataset = all_templates.get_dataset(dataset_name, subset_name) -for anli_round in ("r1", "r2", "r3"): - for template_name in all_templates.get_dataset(dataset_name, subset_name).all_template_names: - task_name = utils.get_task_name(dataset_name, subset_name, template_name) + f"_{anli_round}" - split_mapping = { - "train": f"train_{anli_round}", - "validation": f"dev_{anli_round}", - "test": f"test_{anli_round}", - } - add_task(dataset_name, subset_name, template_name, task_name, split_mapping) - - template = dataset[template_name] - if template.metadata.original_task: - d4_eval_mixture.append(task_name) # TODO or add to ANLI special mixture - # TODO use template.metadata.answer_choices here for rank eval - - -TASK_BLACKLIST = [ - # Tasks which often tokenize to > 1024 tokens currently - "hotpot_qa_distractor_Generate_Explanations", - "hotpot_qa_fullwiki_Generate_Explanations", - "hotpot_qa_distractor_Generate_Answer_and_Explanations", - "hotpot_qa_fullwiki_Generate_Answer_and_Explanations", - "hotpot_qa_fullwiki_Generate_Answer", - "hotpot_qa_distractor_Generate_Answer", - "hotpot_qa_distractor_Generate_Title_2", - "hotpot_qa_fullwiki_Generate_Title_2", - "hotpot_qa_fullwiki_Generate_Title_1", - "hotpot_qa_distractor_Generate_Title_1", - "hotpot_qa_distractor_Generate_Question", - "hotpot_qa_fullwiki_Generate_Question", - "tab_fact_tab_fact_tab_fact_3", - "tab_fact_tab_fact_tab_fact_2", - "tab_fact_tab_fact_tab_fact_1", - "tab_fact_tab_fact_tab_fact_7", - "tab_fact_tab_fact_tab_fact_4", - "tab_fact_tab_fact_tab_fact_5", - "tab_fact_tab_fact_tab_fact_6", - "wiki_hop_masked_Choose_Best_Object_Candidate", - "wiki_hop_masked_Indirect_Question_about_Birthplace_Citizenship_Place_of_Death", - "narrativeqa_Template_05", - "ecthr_cases_alleged_violation_prediction_silver_rationales", - # Tasks with broken cached files - "gigaword_summarize_", -] - -# Tasks that failed caching (won't try to fix them for now) - remove when we are done -D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST = [ - "amazon_polarity_Is_this_product_review_positive_score_eval", - "amazon_polarity_Is_this_review_negative_score_eval", - "amazon_polarity_Is_this_review_score_eval", - "amazon_polarity_User_recommend_this_product_score_eval", - "amazon_polarity_convey_negative_or_positive_sentiment_score_eval", - "amazon_polarity_flattering_or_not_score_eval", - "amazon_polarity_negative_or_positive_tone_score_eval", - "amazon_polarity_user_satisfied_score_eval", - "amazon_polarity_would_you_buy_score_eval", - "dbpedia_14_given_a_choice_of_categories__score_eval", - "dbpedia_14_given_list_what_category_does_the_paragraph_belong_to_score_eval", - "dbpedia_14_pick_one_category_for_the_following_text_score_eval", - "wiki_hop_original_choose_best_object_affirmative_1_score_eval", - "wiki_hop_original_choose_best_object_affirmative_2_score_eval", - "wiki_hop_original_choose_best_object_affirmative_3_score_eval", - "wiki_hop_original_choose_best_object_interrogative_1_score_eval", - "wiki_hop_original_choose_best_object_interrogative_2_score_eval", -] - -seqio.MixtureRegistry.add( - "d4_train", - [task for task in d4_train_mixture if task not in TASK_BLACKLIST], - default_rate=lambda t: mixture_cap[t.name], -) - -seqio.MixtureRegistry.add( - "gpt_train", - [task for task in gpt_train_mixture if task not in TASK_BLACKLIST], - default_rate=lambda t: mixture_cap[t.name], -) - -seqio.MixtureRegistry.add( - "sglue_train", - [task for task in sglue_train_mixture if task not in TASK_BLACKLIST], - default_rate=lambda t: mixture_cap[t.name], -) - -seqio.MixtureRegistry.add( - "d4_gpt_train", - [task for task in d4_train_mixture + gpt_train_mixture if task not in TASK_BLACKLIST], - default_rate=lambda t: mixture_cap[t.name], -) - -seqio.MixtureRegistry.add( - "d4_gpt_sglue_train", - [task for task in d4_train_mixture + gpt_train_mixture + sglue_train_mixture if task not in TASK_BLACKLIST], - default_rate=lambda t: mixture_cap[t.name], -) - -seqio.MixtureRegistry.add( - "d4_eval", - [task for task in d4_eval_mixture if task not in TASK_BLACKLIST], - default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), -) # eval mixture does not need to be capped - - -seqio.MixtureRegistry.add( - "d4_score_eval", - [ - task - for task in seqio.TaskRegistry.names() - if task.endswith("_score_eval") - and task.split("_score_eval")[0] in d4_eval_mixture - and task.split("_score_eval")[0] not in TASK_BLACKLIST - ], - default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), -) - -# Train tasks we don't care about evaluating on -D4_TRAIN_SKIP_EVAL = [ - "paws_labeled_final", - "adversarial_qa_dbidaf", - "adversarial_qa_dbert", - "duorc_ParaphraseRC", - "dream", - "amazon_polarity", - "app_reviews", - "imdb", - "wiki_bio", - "gigaword", - "multi_news", - "samsum", - "dbpedia_14", - "trec", -] - -seqio.MixtureRegistry.add( - "d4_train_eval", - [ - task - for task in d4_train_mixture - if task not in TASK_BLACKLIST - and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL]) - and task in all_original_tasks - ], - default_rate=lambda t: mixture_cap[t.name], -) - -seqio.MixtureRegistry.add( - "d4_train_score_eval", - [ - task - for task in seqio.TaskRegistry.names() - if task.endswith("_score_eval") - and task.split("_score_eval")[0] in d4_train_mixture - and task.split("_score_eval")[0] not in TASK_BLACKLIST - and task not in D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST - and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL]) - and task.split("_score_eval")[0] in all_original_tasks - ], - default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), -) - -seqio.MixtureRegistry.add( - "d4_train_one_og_prompt", - [task for task in single_original_task.values() if task in d4_train_mixture and task not in TASK_BLACKLIST], - default_rate=lambda t: mixture_cap[t.name], -) - -seqio.MixtureRegistry.add( - "d4_train_all_og_prompts", - [task for task in all_original_tasks if task in d4_train_mixture and task not in TASK_BLACKLIST], - default_rate=lambda t: mixture_cap[t.name], -) - -seqio.MixtureRegistry.add( - "bias_fairness_eval", - bias_fairness_eval_mixture, - default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), -) - -seqio.MixtureRegistry.add( - "bias_fairness_eval_score_eval", - [ - task - for task in seqio.TaskRegistry.names() - if task.endswith("_score_eval") and task.split("_score_eval")[0] in bias_fairness_eval_mixture - ], - default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), -) diff --git a/promptsource/seqio_tasks/utils.py b/promptsource/seqio_tasks/utils.py deleted file mode 100644 index 1b4df95aa..000000000 --- a/promptsource/seqio_tasks/utils.py +++ /dev/null @@ -1,77 +0,0 @@ -import re - -import datasets -import tensorflow as tf - -import promptsource.utils - - -def feature_to_spec(feature, length=False): - if isinstance(feature, datasets.ClassLabel): - return tf.TensorSpec(shape=() if not length else (None if length == -1 else length,), dtype=tf.int64) - elif isinstance(feature, datasets.Value): - return tf.TensorSpec( - shape=() if not length else (None if length == -1 else length,), dtype=getattr(tf.dtypes, feature.dtype) - ) - elif hasattr(feature, "dtype") and hasattr(feature, "shape"): - return tf.TensorSpec(shape=feature.shape, dtype=feature.dtype) - elif isinstance(feature, datasets.Sequence): - return feature_to_spec(feature.feature, length=feature.length) - elif isinstance(feature, list): - return [feature_to_spec(f, length=length) for f in feature] - elif isinstance(feature, dict): - return {k: feature_to_spec(v, length=length) for k, v in feature.items()} - else: - raise ValueError(f"Unparseable feature type {type(feature)}") - - -def hf_dataset_to_tf_dataset(dataset): - return tf.data.Dataset.from_generator( - dataset.__iter__, output_signature={k: feature_to_spec(v) for k, v in dataset.features.items()} - ) - - -def apply_template(dataset, template): - def map_fn(ex): - ex = promptsource.utils.removeHyphen(ex) - inputs_and_targets = template.apply(ex) - answer_choices = template.get_answer_choices_list(ex) - if len(inputs_and_targets) == 2: - inputs, targets = inputs_and_targets - if targets == "": - ex = {"inputs": inputs, "targets": ""} - else: - ex = {"inputs": inputs, "targets": targets} - # When template results in an empty example, template.apply returns [""] - # Also, if the template gets split wrong, len can be > 2 - # We will filter these out later - else: - ex = {"inputs": "", "targets": ""} - - if answer_choices: - ex["answer_choices"] = answer_choices - - return ex - - def filter_fn(ex): - return len(ex["inputs"]) > 0 and len(ex["targets"]) > 0 - - original_columns = dataset.column_names - dataset = dataset.map(map_fn).filter(filter_fn) - # map keeps original columns, remove them - return dataset.remove_columns(set(original_columns) - {"inputs", "targets", "answer_choices"}) - - -def get_dataset_splits(dataset_name, subset_name=None): - info = datasets.get_dataset_infos(dataset_name) - subset_name = subset_name or list(info.keys())[0] - return info[subset_name].splits - - -def task_clean(text): - # Clean the text according to allowed characters for a task name - return re.sub(r"[^\w\d\._]+", "_", text) - - -def get_task_name(dataset_name, subset_name, template_name): - return task_clean(dataset_name + (f"_{subset_name}_" if subset_name is not None else "_") + template_name) diff --git a/setup.py b/setup.py index d2c5b65c7..79577e50b 100644 --- a/setup.py +++ b/setup.py @@ -29,8 +29,5 @@ package_data={"": [ "templates/*/*.yaml", "templates/*/*/*.yaml", - "seqio_tasks/experiment_D3.csv", # Experiment D3 - "seqio_tasks/experiment_D4.csv", - "custom_datasets/*/*" ]} )