diff --git a/promptsource/seqio_tasks/__init__.py b/promptsource/seqio_tasks/__init__.py
deleted file mode 100644
index f3ba72430..000000000
--- a/promptsource/seqio_tasks/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""Tools for loading prompted tasks in seqio."""
-
-from . import tasks, utils
diff --git a/promptsource/seqio_tasks/dataset_subset_template.csv b/promptsource/seqio_tasks/dataset_subset_template.csv
deleted file mode 100644
index 0358d5202..000000000
--- a/promptsource/seqio_tasks/dataset_subset_template.csv
+++ /dev/null
@@ -1,445 +0,0 @@
-comment,do_eval,skip_train,dataset_subset_template,nontrivial_choices_given,nontrivial_choices_hidden,trivial_choices_given,trivial_choices_hidden,generative_non_true_task,generative_non_true_implausible,generative_true_task,negated_answers,counting,non_true_task_other,awkward_phrasing,ungrammatical,template_bug,long_distance,no_sep_2_sentences,verbose,answer_span_indices,non_natural_language
-,,,adversarial_qa_dbert_adversarial_qa_dbert_1,,,,,,,,,,,,,,,,,,
-,,,adversarial_qa_dbert_adversarial_qa_dbert_10,,,,,,,,,,,,,,,,,True,True
-,,,adversarial_qa_dbert_adversarial_qa_dbert_2,,,,,,,,,,,,,,True,,,,
-,,,adversarial_qa_dbert_adversarial_qa_dbert_3,,,,,,,,,,,,,,,,,,
-,,,adversarial_qa_dbert_adversarial_qa_dbert_4,,,,,True,,,,,,,,,,,,,
-,,,adversarial_qa_dbert_adversarial_qa_dbert_5,,,,,True,,,,,,,,,,,,,
-,,,adversarial_qa_dbert_adversarial_qa_dbert_6,,,,,,,,,,,,,,,,True,,
-,,,adversarial_qa_dbert_adversarial_qa_dbert_7,,,,,,,,,,,,,,,,,True,
-,,,adversarial_qa_dbert_adversarial_qa_dbert_8,,,,,,,,,,,,,,,,,True,
-,,,adversarial_qa_dbert_adversarial_qa_dbert_9,,,,,,,,,,,,,,,,,True,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_1,,,,,,,,,,,,,,,,,,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_10,,,,,,,,,,,,,,,,,True,True
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_2,,,,,,,,,,,,,,True,,,,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_3,,,,,,,,,,,,,,,,,,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_4,,,,,True,,,,,,,,,,,,,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_5,,,,,True,,,,,,,,,,,,,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_6,,,,,,,,,,,,,,,,True,,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_7,,,,,,,,,,,,,,,,,True,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_8,,,,,,,,,,,,,,,,,True,
-,,,adversarial_qa_dbidaf_adversarial_qa_dbidaf_9,,,,,,,,,,,,,,,,,True,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_1,,,,,,,,,,,,,,,,,,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_10,,,,,,,,,,,,,,,,,True,True
-,,,adversarial_qa_droberta_adversarial_qa_droberta_2,,,,,,,,,,,,,,True,,,,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_3,,,,,,,,,,,,,,,,,,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_4,,,,,True,,,,,,,,,,,,,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_5,,,,,True,,,,,,,,,,,,,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_6,,,,,,,,,,,,,,,,True,,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_7,,,,,,,,,,,,,,,,,True,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_8,,,,,,,,,,,,,,,,,True,
-,,,adversarial_qa_droberta_adversarial_qa_droberta_9,,,,,,,,,,,,,,,,,True,
-,,,ag_news_classify,,True,,,,,,,,,,,,,,,,
-,,,ag_news_classify_with_choices,True,,,,,,,,,,,,,,,,,
-,,,ag_news_recommend,True,,,,,,,,,,,,,,,,,
-,,,ag_news_which_section,,True,,,,,,,,,,,,,,,,
-,,,ag_news_which_section_choices,True,,,,,,,,,,,,,,,,,
-,,,amazon_polarity_Template_1,,,True,,,,,,,,,,,,,,,
-,,,amazon_polarity_Template_2,,,,True,,,,,,,,,,True,,,,
-,,,amazon_polarity_Template_3,,,,True,,,,,,,,,,,,,,
-,,,amazon_polarity_Template_4,,,,True,,,,,,,,,,True,,,,
-,,,amazon_polarity_Template_5,,,True,,,,,,,,,,,,,,,
-,,,amazon_polarity_Template_6,,,True,,,,,,,,,,,True,,,,
-,True,True,anli_GPT_3_style_r1,True,,,,,,,,,,,,,,,,,
-,True,True,anli_based_on_the_previous_passage_r1,True,,,,,,,,,,,,,,,,,
-,True,True,anli_does_S1_contradict_S2__r1,,,,,,,,True,,True,,,,,,,,
-,True,True,anli_does_S1_entail_S2__r1,True,,,,,,,,,,,,,,,,,
-,True,True,anli_given_does_it_follow_that__r1,True,,,,,,,,,,,,,,,,,
-,True,True,anli_given_it_must_be_true_that__r1,True,,,,,,,,,,,,,,,,,
-,True,True,anli_GPT_3_style_r2,True,,,,,,,,,,,,,,,,,
-,True,True,anli_based_on_the_previous_passage_r2,True,,,,,,,,,,,,,,,,,
-,True,True,anli_does_S1_contradict_S2__r2,,,,,,,,True,,True,,,,,,,,
-,True,True,anli_does_S1_entail_S2__r2,True,,,,,,,,,,,,,,,,,
-,True,True,anli_given_does_it_follow_that__r2,True,,,,,,,,,,,,,,,,,
-,True,True,anli_given_it_must_be_true_that__r2,True,,,,,,,,,,,,,,,,,
-,True,True,anli_GPT_3_style_r3,True,,,,,,,,,,,,,,,,,
-,True,True,anli_based_on_the_previous_passage_r3,True,,,,,,,,,,,,,,,,,
-,True,True,anli_does_S1_contradict_S2__r3,,,,,,,,True,,True,,,,,,,,
-,True,True,anli_does_S1_entail_S2__r3,True,,,,,,,,,,,,,,,,,
-,True,True,anli_given_does_it_follow_that__r3,True,,,,,,,,,,,,,,,,,
-,True,True,anli_given_it_must_be_true_that__r3,True,,,,,,,,,,,,,,,,,
-,,,app_reviews_categorize_rating_using_review,,True,,,,,,,,,,,,,,,,
-,,,app_reviews_convert_to_rating,True,,,,,,,,,,,,,,,,,
-,,,app_reviews_convert_to_star_rating,,,,,,,,,,True,,,,,,,,
-,,,app_reviews_generate_review,,,,,True,True,,,,,,,,,,,,
-,,,ai2_arc_ARC_Challenge_answer_qn,,,,,True,True,,,,,,,,,,,,
-,,,ai2_arc_ARC_Challenge_false,,,,,,,,True,,,,,,,,,,
-,,,ai2_arc_ARC_Challenge_qa_options,True,,,,,,,,,,,,,,,,,
-,,,ai2_arc_ARC_Challenge_test,True,,,,,,,,,,,,,,,,,
-,,,ai2_arc_ARC_Easy_answer_qn,,,,,True,True,,,,,,,,,,,,
-,,,ai2_arc_ARC_Easy_false,,,,,,,,True,,,,,,,,,,
-,,,ai2_arc_ARC_Easy_qa_options,True,,,,,,,,,,,,,,,,,
-,,,ai2_arc_ARC_Easy_test,True,,,,,,,,,,,,,,,,,
-,True,,circa_goldstandard1_judgement,True,,,,,,,,,,True,,,,,,,
-,True,,circa_goldstandard2_judgement,True,,,,,,,,,,True,,,,,,,
-,,,circa_judgement,,True,,,,,,,,True,True,,,,,,,
-,,,circa_possible_qn,,,,,True,,,,,,,,,,,,,
-,,,circa_question_declarative,,,,,,,,,,True,,,,,,,,
-,,,cnn_dailymail_3.0.0_generate_story,,,,,True,,,,,,,,,,,,,
-,,,cnn_dailymail_3.0.0_news_card_view,,,,,,,True,,,,,,,True,,,,
-,,,cnn_dailymail_3.0.0_news_stock,,,,,,,True,,,,,,,True,,,,
-,,,cnn_dailymail_3.0.0_news_summary,,,,,,,True,,,,,,,True,,True,,
-,,,cnn_dailymail_3.0.0_spice_up_story,,,,,True,,,,,,,,,,,,,
-,,,codah_codah_answer_no_option,,True,,,,,,,,,,,,,,,,
-,,,codah_codah_answer_with_option,True,,,,,,,,,,,,,,,,,
-,,,codah_codah_answer_with_option_idx,True,,,,,,,,,,,,,,,,,
-,,,codah_codah_answer_with_option_post,True,,,,,,,,,,,,,,,,,
-,,,codah_codah_choose_from_list,True,,,,,,,,,,,,,,,,,
-,,,codah_codah_finish_from_the_list,True,,,,,,,,,,,,,,,,,
-,,,codah_codah_finish_from_the_list_post,True,,,,,,,,,,,,,,,,,
-,,,codah_codah_finish_pre,,True,,,,,,,,,,,,,,,,
-,,,codah_codah_question_category,,,,,,,,,,True,,,,,,,,
-,,,codah_codah_question_category_bis,,,,,,,,,,True,,,,,,,,
-,,,common_gen_Example_prompt,,,,,,,True,,,,,,,,,,,
-,,,common_gen_Given_concepts,,,,,,,True,,,,,,,,,,,
-,,,common_gen_Put_together,,,,,,,True,,,,,,,,,,,
-,,,common_gen_choice_in_concept_centric_sentence_generation,,,,,,,True,,,,,,,,,,,
-,,,common_gen_sentence_to_concepts,,,,,,,,,,True,,,,,,,,
-,,,cos_e_v1.11_description_question_option_id,True,,,,,,,,,,,,,,,,,
-,,,cos_e_v1.11_description_question_option_text,True,,,,,,,,,,,,,,,,,
-,,,cos_e_v1.11_generate_explanation_given_text,True,,,,,,True,,,,,,True,,,,,
-,,,cos_e_v1.11_generate_explanation_no_given_answer,,True,,,,,True,,,,,,,,,,,
-,,,cos_e_v1.11_question_description_option_id,True,,,,,,,,,,,,,,,,,
-,,,cos_e_v1.11_question_description_option_text,True,,,,,,,,,,,,,,,,,
-,,,cos_e_v1.11_question_option_description_id,True,,,,,,,,,,,,,,,,,
-,,,cos_e_v1.11_question_option_description_text,True,,,,,,,,,,,,,,,,,
-revisit,,,cosmos_qa_context_description_question_answer_id,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_context_description_question_answer_text,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_context_description_question_text,,True,,,,,,,,,,,,,,,,
-,,,cosmos_qa_context_question_answer_description_id,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_context_question_answer_description_text,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_context_question_description_answer_id,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_context_question_description_answer_text,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_context_question_description_text,,True,,,,,,,,,,,,,,,,
-,,,cosmos_qa_description_context_question_answer_id,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_description_context_question_answer_text,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_description_context_question_text,,True,,,,,,,,,,,,,,,,
-,,,cosmos_qa_no_prompt_id,True,,,,,,,,,,,,,,,,,
-,,,cosmos_qa_no_prompt_text,True,,,,,,,,,,,,,,,,,
-,,,dbpedia_14_dbpedia_1,,True,,,,,,,,,,,,,,,,
-,,,dbpedia_14_dbpedia_10,True,,,,,,,,,,,,,,,,,
-,,,dbpedia_14_dbpedia_3,,True,,,,,,,,,,,,,,,,
-,,,dbpedia_14_dbpedia_5,,True,,,,,,,,,,,,,,,,
-,,,dbpedia_14_dbpedia_7,,True,,,,,,,,,,,,,,,,
-,,,dbpedia_14_dbpedia_8,,True,,,,,,,,,,,,,,,,
-,,,dbpedia_14_dbpedia_9,True,,,,,,,,,,,,,,,,,
-,,,dream_answer_to_dialogue,,,,,True,,,,,,,,,,,,,
-,,,dream_baseline,True,,,,,,,,,,,,,,,,,
-,,,dream_conversation,True,,,,,,,,,,,,,,,,,
-,,,dream_generate_first_utterance,,,,,True,,,,,,,,,,,,,
-,,,dream_generate_last_utterance,,,,,True,,,,,,,,,,,,,
-,True,,emo_feeling,True,,,,,,,,,,,,,,,,,
-,True,,emo_final_message,True,,,,,,,,,,,,,,,,,
-,True,,emo_persons_describe,True,,,,,,,,,,,,,,,True,,
-,True,,emo_persons_infer,True,,,,,,,,,,,,,,,,,
-,True,,emo_spoke_last,True,,,,,,,,,,,,,,,,,
-,,,freebase_qa_inference_chain_prompt,,,,,,,,,,True,,,,,,,,
-,,,freebase_qa_inference_chain_prompt_context,,,,,,,,,,True,,,,,,,,
-,,,freebase_qa_qa_context_1,,,,,,,,,,,,,,,,,,
-,,,freebase_qa_qa_context_2,,,,,,,,,,,,,,,,,,
-,,,freebase_qa_qa_template_basic,,,,,,,,,,,,,,,,,,
-,,,gigaword_Document_,,,,,,,True,,,,,,,,,,,
-,,,gigaword_Summarize_this_document_,,,,,,,True,,,,,,,,,,,
-,,,gigaword_TLDR,,,,,,,True,,,,,,,,,,,
-,,,gigaword_generate_summary_for_this,,,,,,,True,,,,,,,,,,,
-,,,gigaword_in_a_nutshell,,,,,,,True,,,,,,,,,,,
-,,,gigaword_reverse_writing,,,,,,,,,,True,,,,,,,,
-,,,gigaword_reverse_writing_2,,,,,,,True,,,,,,,,,,,
-,,,gigaword_summarize_,,,,,,,True,,,,,,,,,,,
-,,,gigaword_write_one_sentence,,,,,,,True,,,,,,,,,,,
-,True,True,glue_cola_Following_sentence_acceptable,True,,,,,,,,,,,,,,,,,
-,True,True,glue_cola_Make_sense_yes_no,,,True,,,,,,,,,,,,,,,
-,True,True,glue_cola_Previous_sentence_acceptable,,,,True,,,,,,,,,,,,,,
-,True,True,glue_cola_editing,,,True,,,,,,,,,,,,,,,
-,True,True,glue_cola_jinja_example,,,,True,,,,,,,,,,,,,,
-,True,,glue_mrpc_equivalent,True,,,,,,,,,,,,,,True,,,
-,True,,glue_mrpc_paraphrase,,,,True,,,,,,,,,,,,,,
-,True,,glue_mrpc_replace,,,,True,,,,,,,,,,,,,,
-,True,,glue_mrpc_same_thing,,,,True,,,,,,,,,,,True,,,
-,True,,glue_mrpc_want_to_know,,,,True,,,,,,,,,,,True,,,
-,,,glue_qqp_answer,,,,True,,,,,,,,,,,,,,
-,,,glue_qqp_duplicate,,,,True,,,,,,,,,,,,,,
-,,,glue_qqp_duplicate_or_not,True,,,,,,,,,,,,,,,,,
-,,,glue_qqp_quora,,,,True,,,,,,,,,,,,True,,
-,,,glue_qqp_same_thing,,,,True,,,,,,,,,,,,,,
-,,,glue_sst2_following_positive_negative,True,,,,,,,,,,,,,,,,,
-,,,glue_sst2_happy_or_mad,True,,,,,,,,,,,,,,,,,
-,,,glue_sst2_positive_negative_after,True,,,,,,,,,,,,,,,,,
-,,,glue_sst2_review,True,,,,,,,,,,,,,,,,,
-,,,glue_sst2_said,True,,,,,,,,,,,,,,,,,
-,,True,glue_stsb_examples,,,,,,,,,,,,,,,,,,
-,,True,glue_stsb_rank,,,,,,,,,,,,,,,,,,
-,,True,glue_stsb_rate,,,,,,,,,,,,,,,,,,
-,,True,glue_stsb_score,,,,,,,,,,,,,,,,,,
-,,True,glue_stsb_similarity,,,,,,,,,,,,,,,,,,
-,True,True,hans_GPT_3_style,True,,,,,,,,,,,,,,,,,
-,True,True,hans_Suppose_Can_we_infer_that_,,,,True,,,,,,,,,,,,,,
-,True,True,hans_based_on_the_previous_passage,,,,True,,,,,,,,,,,,,,
-,True,True,hans_does_S1_entail_S2_,,,True,,,,,,,,,,,,,,,
-,True,True,hans_given_does_it_follow_that_,,,True,,,,,,,,,,,,,,,
-,True,True,hans__does_the_previous_passage_support_the_claim_that,,,,True,,,,,,,,,,,,,,
-,,,hellaswag_YesNo_0,,,True,,,,,,,,,,,,,,,
-,,,hellaswag_YesNo_1,,,True,,,,,,,,,,,,,,,
-,,,hellaswag_YesNo_2,,,True,,,,,,,,,,,,,,,
-,,,hellaswag_YesNo_3,,,True,,,,,,,,,,,,,,,
-,,,hellaswag_YesNo_reversed_0,,,True,,,,,,,,,,,,,,,
-,,,hellaswag_YesNo_reversed_1,,,True,,,,,,,,,,,,,,,
-,,,hellaswag_YesNo_reversed_2,,,True,,,,,,,,,,,,,,,
-,,,hellaswag_YesNo_reversed_3,,,True,,,,,,,,,,,,,,,
-,,,hellaswag_complete_first_then,True,,,,,,,,,,,,,,,,,
-,,,hellaswag_first_then,True,,,,,,,,,,,,,,,,,
-,,,hellaswag_how_ends,True,,,,,,,,,,,,,,,,,
-,,,hellaswag_if_begins_how_continues,True,,,,,,,,,,,,,,,,,
-,,,hellaswag_which_ending,True,,,,,,,,,,,,,,,,,
-,,,imdb_imdb_1,,True,,,,,,,,,,,,,,,,
-,,,imdb_imdb_2,,True,,,,,,True,,,,,,,,,,
-,,,imdb_imdb_3,,True,,,,,,,,,,,,,,,,
-,,,imdb_imdb_4,,True,,,,,,,,,,,,,,,,
-,,,imdb_imdb_5,,True,,,,,,,,,,,,True,,,,
-,,,imdb_imdb_6,,True,,,,,,,,,,,,,,,,
-,,,imdb_imdb_7,,True,,,,,,,,,,,,,,,,
-,,,imdb_imdb_8,,True,,,,,,,,,,,,,,,,
-,,,imdb_imdb_9,,,,True,,,,,,,,,,,,,,
-,True,,mc_taco_mc_taco_1,,,,True,,,,,,,,,,,,,,
-,,,mc_taco_mc_taco_2,,,,,,,,,,True,,,,,,,,
-,True,,mc_taco_mc_taco_3,,,True,,,,,,,,,,,True,,,,
-,,,mc_taco_mc_taco_4,True,,,,,,,,,True,,,,,,,,
-,,,mc_taco_mc_taco_5,,,,,True,,,,,,,,,,,,,
-,,,mc_taco_mc_taco_6,,True,,,,,,,,,,,,,,,,
-,True,True,nq_open_context_self_description,,,,,,,,,,,,,,,,,,
-,,True,nq_open_guess_question,,,,,True,,,,,,,,,,,,,
-,True,True,nq_open_question_answer,,,,,,,,,,,,,,,,,,
-,True,True,nq_open_question_with_instruction,,,,,,,,,,,,,,,,,,
-,,,onestop_english_ara_context,True,,,,,,,,,,,,,,,,,
-,,,onestop_english_assess,True,,,,,,,,,,,,,True,,,,
-,,,onestop_english_ats,True,,,,,,,,,,,,,,,,,
-,,,onestop_english_esl_context,True,,,,,,,,,,,,,True,,,,
-,,,onestop_english_esl_variation,True,,,,,,,,,,,,,True,,,,
-,True,,openbookqa_main_choices,True,,,,,,,,,,,,,,,,,
-,True,,openbookqa_main_choose_an_answer_with_options,True,,,,,,,,,,,,,,,,,
-,True,,openbookqa_main_only_options,True,,,,,,,,,,,,,,,,,
-,True,,openbookqa_main_pick_answer_with_options,True,,,,,,,,,,,,,,,,,
-,True,,openbookqa_main_pick_using_id,True,,,,,,,,,,,,,,,,,
-,True,,openbookqa_main_which_correct,True,,,,,,,,,,,,,,,,,
-,,True,openbookqa_main_which_correct_inverse,True,,,,,,,,,,,,True,,,,,
-,,,paws_labeled_final_Concatenation,,,True,,,,,,,,,,True,,,,,
-,,,paws_labeled_final_Concatenation_no_label,,,,True,,,,,,,,,True,,,,,
-,,,paws_labeled_final_Meaning,,,True,,,,,,,,,,True,,,,,
-,,,paws_labeled_final_Meaning_no_label,,,,True,,,,,,,,,True,,,,,
-,,,paws_labeled_final_PAWS_ANLI_GPT3,True,,,,,,,,,True,,,,,,,,
-,,,paws_labeled_final_PAWS_ANLI_GPT3_no_label,,True,,,,,,,,True,,,,,,,,
-,,,piqa_Correct_the_solution,,,,,True,,,,,,,,,,,,,
-,,,piqa_Correct_the_solution_if_false_from_sol_1,,,,,True,,,,,,,,,,,,,
-,,,piqa_Correct_the_solution_if_false_from_sol_2,,,,,True,,,,,,,,,,,,,
-should use jinja choice,,,piqa_Does_this_solution_make_sense_sol1,,,,True,,,,,,,,,,,,,,
-,,,piqa_Does_this_solution_make_sense_sol2,,,,True,,,,,,,,,,,,,,
-,,,piqa_Generate_a_similar_but_wrong_solution,,,,,True,,,,,,,,,,,,,
-,,,piqa_choose_the_most_appropriate_solution,True,,,,,,,,,,,,,,,,,
-duplicate of above,,True,piqa_choose_the_most_appropriate_solution_reorder_solution,True,,,,,,,,,,,,,,,,,
-,,,piqa_no_prompt_needed,,,,,True,,,,,,,,,,,,,
-,,,qa_srl_aq,,,,,True,True,,,,,,,,,,,,
-,,,qa_srl_context_answer,,,,,True,,,,,,,,,,,,,
-,,,qa_srl_context_qn,,,,,True,,,,,,,,,,,,,
-,,,qa_srl_predicate,,,,,,,,,,True,,,,,,,,
-need non-naive metric,True,,qa_srl_qa,,,,,,,,,,,,,,,,,,
-,,,qasc_is_correct_0,,,,True,,,,,,,,,,,,,,
-,,,qasc_is_correct_1,,,,True,,,,,,,,,,,,,,
-,,,qasc_qu_combined,True,,,,,,,,,,,,,,,,,
-,,,qasc_sep_combined_can_tell,True,,,,,,,,,,,,,,,,,
-,,,qasc_sep_qu,True,,,,,,,,,,,,,,,,,
-,,,quail_context_description_question_answer_id,True,,,,,,,,,,,,,,,,,
-,,,quail_context_description_question_answer_text,True,,,,,,,,,,,,,,,,,
-,,,quail_context_description_question_text,,True,,,,,,,,,,,,,,,,
-,,,quail_context_question_answer_description_id,True,,,,,,,,,,,,,,,,,
-,,,quail_context_question_answer_description_text,True,,,,,,,,,,,,,,,,,
-,,,quail_context_question_description_answer_id,True,,,,,,,,,,,,,,,,,
-,,,quail_context_question_description_answer_text,True,,,,,,,,,,,,,,,,,
-,,,quail_context_question_description_text,True,,,,,,,,,,,,,,,,,
-,,,quail_description_context_question_answer_id,,True,,,,,,,,,,,,,,,,
-,,,quail_description_context_question_answer_text,True,,,,,,,,,,,,,,,,,
-,,,quail_description_context_question_text,,True,,,,,,,,,,,,,,,,
-,,,quail_no_prompt_id,True,,,,,,,,,,,,,,,,,
-,,,quail_no_prompt_text,True,,,,,,,,,,,,,,,,,
-,,,quartz_para_question_1,True,,,,,,,,,,,,,,,,,
-near duplicate of the above,,True,quartz_para_question_1_reverse,True,,,,,,,,,,,,,,,,,
-,,,quartz_para_question_2,True,,,,,,,,,,,,,,,,,
-,,,quartz_para_question_3_choices,True,,,,,,,,,,,,,,,,,
-,,,quartz_para_question_4_choices,True,,,,,,,,,,,,,,,,,
-,,,quartz_para_question_plain,True,,,,,,,,,,,,,,,,,
-near duplicate of the above,,True,quartz_para_question_plain_reverse,True,,,,,,,,,,,,,,,,,
-,,,quartz_question_para_1,True,,,,,,,,,,,,,,,,,
-near duplicate of the above,,True,quartz_question_para_1_reverse,True,,,,,,,,,,,,,,,,,
-,,,quartz_question_para_2,True,,,,,,,,,,,,,,,,,
-,,,quartz_question_para_3,True,,,,,,,,,,,,,,,,,
-near duplicate of the above,,True,quartz_question_para_3_reverse,True,,,,,,,,,,,,,,,,,
-,,,quoref_Template_1,,,,,,,,,,,,,,,,,,
-,,,quoref_Template_2,,,,,,,,,,,,,,True,,,,
-,,,quoref_Template_3,,,,,True,,,,,,True,,,,,,,
-,,,quoref_Template_4,,,,,,,,,,True,,,,,,,True,
-,,,quoref_Template_5,,,,,,,,,,True,,,,,,,,
-,,,race_high_Read_the_article_and_answer_the_question_no_option_,,True,,,,,,,,,,,,,,,,
-,True,,race_high_Read_the_article_and_select_the_best_answer,True,,,,,,,,,,,,,,,,,
-near duplicate of the above,,True,race_high_Read_the_article_and_select_the_best_answer2,True,,,,,,,,,,,,,,,,,
-near duplicate of the above,,True,race_high_Read_the_article_and_select_the_best_answer3,True,,,,,,,,,,,,,,,,,
-,,,race_high_Write_a_multi_choice_question_for_the_following_article,,,,,True,,,,,,,,,,,,,
-,,,race_high_Write_a_multi_choice_question_for_the_following_article_2,,,,,True,,,,,,,,,,,,,
-,,,race_middle_Read_the_article_and_answer_the_question_no_option_,,True,,,,,,,,,,,,,,,,
-,True,,race_middle_Read_the_article_and_select_the_best_answer,True,,,,,,,,,,,,,,,,,
-near duplicate of the above,,True,race_middle_Read_the_article_and_select_the_best_answer2,True,,,,,,,,,,,,,,,,,
-near duplicate of the above,,True,race_middle_Read_the_article_and_select_the_best_answer3,True,,,,,,,,,,,,,,,,,
-,,,race_middle_Write_a_multi_choice_question_for_the_following_article,,,,,True,,,,,,,,,,,,,
-,,,race_middle_Write_a_multi_choice_question_for_the_following_article_2,,,,,True,,,,,,,,,,,,,
-,,,ropes_funky_prompt,True,,,,,,,,,,,,,,,,,
-,,,ropes_plain,True,,,,,,,,,,,,,,,,,
-,,,ropes_plain_bottom_hint,True,,,,,,,,,,,,,True,,,,
-,,,ropes_plain_no_background,True,,,,,,,,,True,,,,,,,,
-,,,ropes_prompt_beginning,True,,,,,,,,,,,,,,,,,
-,,,ropes_prompt_bottom_hint_beginning,True,,,,,,,,,,,,,,,,,
-,,,ropes_prompt_bottom_no_hint,True,,,,,,,,,True,,,,,,,,
-,,,ropes_prompt_mix,True,,,,,,,,,,,,,True,,,,
-,,,rotten_tomatoes_rt_1,,True,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_10,True,,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_2,,True,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_3,,True,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_4,,True,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_5,,True,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_6,,True,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_7,,True,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_8,,True,,,,,,,,,,,,,,,,
-,,,rotten_tomatoes_rt_9,,,,True,,,,,,,,,,,,,,
-,,,sciq_Template_0,,True,,,,,,,,,,,True,,,,,
-,,,sciq_Template_1,,True,,,,,,,,,,,True,,,,,
-,True,,social_i_qa_social_i_qa1,True,,,,,,,,,,,,,,,,,
-,,,social_i_qa_social_i_qa2,,True,,,,,,,,,,,,,,,,
-select answer by ordinal word,True,,social_i_qa_social_i_qa3,True,,,,,,,,,,,,,,,,,
-,,,social_i_qa_social_i_qa4,,,,,True,,,,,,,,,,,,,
-4-way to binary classification,,,social_i_qa_social_i_qa5,,,,True,,,,,,,,,,,,,,
-,,,squad_v2_Jeopardy_with_Context,,,,,True,,,,,,,,,,,,,
-,,,squad_v2_Jeopardy_without_Context,,,,,True,,,,,True,,,,,,,,
-,,,squad_v2_Questions_with_Context,True,,,,,,,,,,,,,,,,,
-nicely randomnized prompt phrasing,,,squad_v2_Questions_with_Context_Without_Prompt_Keywords,True,,,,,,,,,,,,,,,,,
-,,,squad_v2_Topic_Prediction_Context,,,,,,,,,,True,,,,,,,,
-,,,squad_v2_Topic_Prediction_Context_with_randomized_prompt_options,,,,,,,,,,True,,,,,,,,
-,,,squad_v2_Topic_Prediction_Context_with_randomized_prompt_options_placed_in_the_end,,,,,,,,,,True,,,,,,,,
-,,,squad_v2_Topic_Prediction_Question_and_Answer_Pair,,,,,,,,,,True,,,,,,,,
-,,,squad_v2_Trivia,,,,,,,,,,True,,,,,,,,
-,True,,super_glue_boolq_GPT_3_Style,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_boolq_I_wonder_,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_boolq_based_on_the_following_passage,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_boolq_based_on_the_previous_passage,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_boolq_could_you_tell_me_,,,,True,,,,,,,,,,,,,,
-,True,True,super_glue_cb_GPT_3_style,True,,,,,,,,,,,,,,,,,
-,True,True,super_glue_cb_based_on_the_previous_passage,True,,,,,,,,,,,,,,,,,
-contrapositive,True,True,super_glue_cb_does_S1_contradict_S2_,True,,,,,,,,,True,,,,,,,,
-,True,True,super_glue_cb_does_S1_entail_S2_,True,,,,,,,,,,,,,,,,,
-,True,True,super_glue_cb_given_does_it_follow_that_,True,,,,,,,,,,,,,,,,,
-must/might/may be true,True,True,super_glue_cb_given_it_must_be_true_that_,True,,,,,,,,,,,,,,,,,
-,True,,super_glue_copa_C1_or_C2_premise_so_because_,True,,,,,,,,,,,,,,,,,
-effect examples,True,,super_glue_copa__As_a_result_C1_or_C2_,True,,,,,,,,,,,,,,,,,
-effect examples,True,,super_glue_copa__What_could_happen_next_C1_or_C2_,True,,,,,,,,,,,,,,,,,
-cause examples,True,,super_glue_copa__which_may_be_caused_by,True,,,,,,,,,,,,,,,,,
-effect examples,True,,super_glue_copa__which_may_cause_C1_or_C2_,True,,,,,,,,,,,,,,,,,
-cause examples,True,,super_glue_copa__why_C1_or_C2,True,,,,,,,,,,,,,,,,,
-,True,,super_glue_multirc_I_was_going_to_say_,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_multirc_Would_it_be_good_to_answer_,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_multirc_is_a_correct_answer_,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_multirc_is_the_correct_answer_,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_multirc_paragraph_question_is_it_,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_record_Can_you_figure_out_,,True,,,,,,,,,,,,,,,,
-,True,,super_glue_record_In_the_question_above_the_placeholder_stands_for,,True,,,,,,,,,,,,,,,,
-,True,,super_glue_record_What_could_the_placeholder_be_,True,,,,,,,,,,,,,,,,,
-no difference here?,True,,super_glue_record_Which_one_is_the_placeholder_,True,,,,,,,,,,,,,,,,,
-,True,,super_glue_record_the_placeholder_refers_to_,,True,,,,,,,,,,,,,,,,
-,True,True,super_glue_rte_GPT_3_style,True,,,,,,,,,,,,,,,,,
-,True,True,super_glue_rte_Suppose_Can_we_infer_that_,,,,True,,,,,,,,,,,,,,
-,True,True,super_glue_rte_based_on_the_previous_passage,,,,True,,,,,,,,,,,,,,
-,True,True,super_glue_rte_does_S1_entail_S2_,,,True,,,,,,,,,,,,,,,
-,True,True,super_glue_rte_given_does_it_follow_that_,,,,True,,,,,,,,,,,,,,
-,True,True,super_glue_rte__Therefore_we_re_licensed_to_say_that_,,,,True,,,,,,,,,,,,,,
-,True,True,super_glue_rte__does_the_previous_passage_support_the_claim_that,,,,True,,,,,,,,,,,,,,
-,True,,super_glue_wic_GPT_3_prompt,,,,True,,,,,,,,,,,True,,,
-,True,,super_glue_wic_GPT_3_prompt_with_label,,,True,,,,,,,,,,,,True,,,
-,True,,super_glue_wic_question_context,,,,True,,,,,,,,,,,True,,,
-,True,,super_glue_wic_question_context_meaning,,,,True,,,,,,,,,,,True,,,
-,True,,super_glue_wic_question_context_meaning_with_label,,,True,,,,,,,,,,,,True,,,
-,True,,super_glue_wic_similar_sense,,,,True,,,,,,,,,,,True,,,
-,True,,super_glue_wsc.fixed_Here_p_stands_for_,,,,,,,,,,,,,,,,,,
-,True,,super_glue_wsc.fixed_In_the_previous_sentence_the_pronoun_refers_to_,,,,,,,,,,,,,,,,,,
-,True,,super_glue_wsc.fixed_Who_is_are_,,,,,,,,,,,,,,,,,,
-,True,,super_glue_wsc.fixed_in_the_passage_above_the_pronoun_X_refers_to_,,,,,,,,,,,,,,,,,,
-,True,,super_glue_wsc.fixed_passage_what_does_the_pronoun_refer_to_,,,,,,,,,,,,,,,,,,
-cast 4-way classification as binary,,,swag_regular_YesNo_0,,,True,,,,,,,,,,,,,,,
-,,,swag_regular_YesNo_1,,,True,,,,,,,,,,,,,,,
-,,,swag_regular_YesNo_2,,,True,,,,,,,,,,,,,,,
-,,,swag_regular_YesNo_3,,,True,,,,,,,,,,,,,,,
-,,,swag_regular_YesNo_reversed_0,,,True,,,,,,,,,,,,,,,
-,,,swag_regular_YesNo_reversed_1,,,True,,,,,,,,,,,,,,,
-,,,swag_regular_YesNo_reversed_2,,,True,,,,,,,,,,,,,,,
-,,,swag_regular_YesNo_reversed_3,,,True,,,,,,,,,,,,,,,
-,,,swag_regular_complete_first_then,True,,,,,,,,,,,,,,,,,
-,,,swag_regular_first_then,True,,,,,,,,,,,,,,,,,
-,,,swag_regular_how_ends,True,,,,,,,,,,,,,,,,,
-,,,swag_regular_if_begins_how_continues,True,,,,,,,,,,,,,,,,,
-,,,swag_regular_which_ending,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_ABBR,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_ABBR_context_first,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_DESC,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_DESC_context_first,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_ENTY,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_ENTY_context_first,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_HUM,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_HUM_context_first,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_LOC,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_LOC_context_first,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_NUM,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_NUM_context_first,True,,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_open,,True,,,,,,,,,,,,,,,,
-,,,trec_fine_grained_open_context_first,,True,,,,,,,,,,,,,,,,
-answers are not what the questions ask for,,True,trec_gao_et_al_1,,,,,,,,,,,,True,,,,,,
-answers are not what the questions ask for,,True,trec_gao_et_al_2,,,,,,,,,,,,True,,,,,,
-,,,trec_trec1,True,,,,,,,,,,,,,,,,,
-,,,trec_trec2,True,,,,,,,,,,,,,,,,,
-,,,trivia_qa_rc_context_self_description,,,,,,,,,,,,,,,,,,
-,,,trivia_qa_rc_guess_question,,,,,True,True,,,,True,,,,,,,,
-,,,trivia_qa_rc_question_answer,,,,,,,,,,,,,,,,,,
-,,,trivia_qa_rc_question_with_instruction,,,,,,,,,,,,,,,,,,
-,,,trivia_qa_rc_reading_comprehension_1,,,,,,,,,,True,,,,,,,,
-,,,trivia_qa_rc_reading_comprehension_2,,,,,,,,,,True,,,,,,,,
-,,,web_questions_count_answers,,,,,,,,,True,,,,,,,,,
-,,,web_questions_credible_question,,,,,True,,,,,,,,,,,,,
-,,,web_questions_if_answers_what_question,,,,,True,,,,,,,,,,,,,
-,,,web_questions_potential_correct_answer,,,,,,,,,,,True,,,,,,,
-,,,web_questions_question_answer,,,,,,,,,,,,,,,,,,
-,,,web_questions_suggest_question,,,,,True,,,,,,,,,,,,,
-,,,wiki_bio_comprehension,,,,,,,,,,True,,,,,,,,
-,,,wiki_bio_guess_person,,,,,,,,,,True,,,,,,,,
-,,,wiki_bio_key_content,,,,,,,,,,True,,,,,,,,
-,,,wiki_bio_what_content,,,,,,,,,,True,,,,,,,,
-"should rephrase ""summarize""",,,wiki_bio_who,,,,,,,,,,,,,,,,,,
-,,,wiki_hop_original_Choose_Best_Object_Candidate,,,,,,,,,,True,,,,,,,,True
-,,,wiki_hop_original_Explain_Relation,,True,,,,,,,,True,,,,,,,,
-,,,wiki_hop_original_Generate_Fact_Triple,,,,,,,,,,True,,,,,,,,True
-,,,wiki_hop_original_Generate_Object_Answer,,,,,,,,,,True,,,,,,,,True
-,,,wiki_hop_original_Generate_Subject_Answer,,,,,,,,,,True,,,,,,,,True
-,,,wiki_hop_original_Indirect_Question_about_Birthplace_Citizenship_Place_of_Death,,,,,,,,,,,,,True,,,,,
-,,,wiqa_effect_with_label_answer,True,,,,,,,,,,,,,,,,,
-,,,wiqa_effect_with_string_answer,True,,,,,,,,,,,,,,,,,
-,,,wiqa_impacting_the_process,,,,True,,,,,,,,,,,,,,
-,,,wiqa_question_type,,,,,,,,,,True,,,,,,,,
-,,,wiqa_remove_first_step,,,,,,,,,,True,,,,,,,,
-,,,wiqa_remove_first_step_bis,,,,,,,,,,True,,,,,,,,
-,,,wiqa_remove_last_step,,,,,,,,,,True,,,,,,,,
-,,,wiqa_remove_last_step_bis,,,,,,,,,,True,,,,,,,,
-,True,,xsum_Document_,,,,,,,,,,,,,,,,,,
-,True,,xsum_Summarize_this_document_,,,,,,,,,,,,,,,,,,
-,True,,xsum_TLDR,,,,,,,,,,,,,,,,,,
-,True,,xsum_generate_summary_for_this,,,,,,,,,,,,,,,,,,
-,True,,xsum_summarize_,,,,,,,,,,,,,,True,,,,
-,True,,xsum_write_one_sentence,,,,,,,,,,,,,,,,,,
-,,,yelp_review_full_based_on_that,,True,,,,,,,,,,,,,,,,
-,,,yelp_review_full_format_rating,,True,,,,,,,,,,,,,,,,
-,,,yelp_review_full_format_score,,True,,,,,,,,,,,,,,,,
-,,,yelp_review_full_format_star,,True,,,,,,,,,,,,,,,,
-,,,yelp_review_full_on_a_scale,,True,,,,,,,,,,,,,,,,
-,,,yelp_review_full_so_i_would,,True,,,,,,,,,,,,,,,,
-,,,yelp_review_full_this_place,,True,,,,,,,,,,,,,,,,
diff --git a/promptsource/seqio_tasks/experiment_D4.csv b/promptsource/seqio_tasks/experiment_D4.csv
deleted file mode 100644
index 71c8216cc..000000000
--- a/promptsource/seqio_tasks/experiment_D4.csv
+++ /dev/null
@@ -1,242 +0,0 @@
-HF_name,subset,task_by_convention,format,comment,seed_paper,september_check,do_train,do_eval,train_size,adjusted_train_size,D3_do_train,D3_do_eval,D3_adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,skip,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference
-crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-super_glue,wsc.fixed,coreference,cls,,,,,TRUE,554,0,TRUE,TRUE,554,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012
-winograd_wsc,wsc273,coreference,ext,,GPT,,,TRUE,0,0,,,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,,Levesque et al. 2012
-winogrande,winogrande_xl,coreference,ext,,GPT,TRUE,,TRUE,40398,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020
-winogrande,winogrande_debiased,coreference,ext,"""debiased"" = adversarially filtered",GPT,TRUE,,TRUE,9248,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020
-glue,cola,grammatical_acceptability,cls,includes semantic acceptability too; to be replaced by blimp,,,,TRUE,8551,0,,TRUE,0,accuracy;matthews_corrcoef,,https://arxiv.org/pdf/1805.12471.pdf,,,glue-cola,cls/other,single sentence,,,Warstadt et al. 2019
-super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,TRUE,,TRUE,250,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019
-super_glue,rte,NLI,cls,,,TRUE,,TRUE,2490,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009
-anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,,TRUE,162865,0,,TRUE,0,accuracy,,https://arxiv.org/abs/1910.14599,,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020
-hans,,NLI,cls,,,TRUE,,TRUE,0,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1902.01007.pdf,,,,,sentence pair,syntax?,,McCoy et al. 2019
-super_glue,axb,NLI,cls,test set only,,TRUE,,TRUE,0,0,,,,,,,,,,,,,,
-glue,mrpc,paraphrase,cls,,,,TRUE,TRUE,3668,3668,TRUE,TRUE,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005
-glue,qqp,paraphrase,cls,,,,TRUE,TRUE,363846,363846,TRUE,,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,,glue-qqp,cls/paraphrase,,,,(link)
-paws,labeled_final,paraphrase,cls,,,,TRUE,,49401,49401,TRUE,,49401,,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019
-ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
-(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018
-ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
-(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (easy),Multiple choice,,,,
-nq_open,,QA_closed_book,gen,,GPT,TRUE,,TRUE,87925,0,,TRUE,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,,Natural Questions (open domain),,,trivia,,
-kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,,TRUE,,88869,88869,,,,,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018
-trivia_qa,unfiltered,QA_closed_book,gen,,GPT,TRUE,,TRUE,87622,0,TRUE,,87622,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,,Trivia QA,,,,,
-web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,,,TRUE,3778,0,TRUE,,3778,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,,web questions,qa/closed-book qa,,,,Berant et al. 2013
-wiki_qa,,QA_closed_book,cls,,CrossFit,,TRUE,,20360,20360,,,,,,https://aclanthology.org/D15-1237.pdf,,,wiki qa,cls/other,,,,Yang et al. 2015
-adversarial_qa,dbidaf,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,https://aclanthology.org/2020.tacl-1.43/,,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020
-adversarial_qa,dbert,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
-adversarial_qa,droberta,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
-coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared
-against n human answers resulting in n F1 scores,
-the maximum of which is chosen as the prediction’s
-F1.For each question, we average out F1 across
-these n sets, both for humans and models. In our
-final evaluation, we use n = 4 human answers for
-every question (the original answer and 3 additionally collected answers). The articles a, an and the
-and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,,
-duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,60721,60721,,,,,,https://duorc.github.io/,,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018
-duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,69524,69524,,,,,,https://arxiv.org/pdf/1804.07927.pdf,,,DuoRC,paraphrased QA,,,,Saha et al. 2018
-ropes,,QA_extractive,ext,,,TRUE,TRUE,,10924,10924,TRUE,,10924,,,,modest,,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 2019
-squad_v2,,QA_extractive,ext,,GPT,,,TRUE,130319,0,TRUE,,130319,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018
-super_glue,record,QA_extractive,ext,,,TRUE,,TRUE,100730,0,TRUE,TRUE,100730,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018
-qa_srl,,QA_extractive,ext,"need non-naive metric (""If the predicted word is contained inside the annotated answer span it is considered a correct prediction.""); v2 not in HF https://aclanthology.org/P18-1191.pdf",Eval WG,,,TRUE,6414,0,TRUE,TRUE,6414,accuracy,TRUE,https://dada.cs.washington.edu/qasrl/#page-top,neutral,,qa srl,other,,semantic role,,He et al. 2015
-quac,,QA_extractive,ext,,GPT,,,TRUE,11567,,,,,"average_maximum_f1;HEQ-Q;HEQ-D:  To make oracle human and system performance comparable,
-given n references, we report the average of the
-maximum F1 computed from each n − 1 subset
-with respect to the heldout reference.",TRUE,https://arxiv.org/pdf/1808.07036.pdf,,,,,,dialogue,,
-quoref,,QA_extractive,ext,,,TRUE,TRUE,,19399,19399,TRUE,,19399,,,https://aclanthology.org/D19-1606.pdf,,,Quoref,Extractive QA,,,,Dasigi et al. 2019
-tydiqa,,QA_extractive,ext,,Eval WG,,TRUE,,9211,9211,,,,,,,,,,,,,,
-drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even when I have domain knowledge",GPT,TRUE,,TRUE,,,,,,exact_match; macro_average_f1,TRUE,https://aclanthology.org/N19-1246.pdf,,,DROP ,multi-hop quantitative reasoning; Abstractive QA,,numerical,Wikipedia crowd,Dua et al. 2019
-cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,TRUE,TRUE,,9741,9741,TRUE,,9741,,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019
-cosmos_qa,,QA_multiple_choice,cls,,,TRUE,TRUE,,25262,25262,TRUE,,25262,,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019
-dream,,QA_multiple_choice,cls,,,TRUE,TRUE,,6116,6116,TRUE,,6116,,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019
-openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
-(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018
-qasc,,QA_multiple_choice,cls,,,TRUE,TRUE,,8134,8134,TRUE,,8134,,,,given?,,qasc,qa/multiple-choice qa,,,,Khot et al. 2020
-quail,,QA_multiple_choice,cls,,,TRUE,TRUE,,10246,10246,TRUE,,10246,,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020
-quarel,,QA_multiple_choice,cls,,CrossFit,,TRUE,,1941,1941,,,,,,,,,quarel,qa/multiple-choice qa,,logical form,,Tafjord et al. 2019a
-quartz,,QA_multiple_choice,cls,,,TRUE,TRUE,,2696,2696,TRUE,,2696,,,https://aclanthology.org/D19-1608.pdf,given?,,quartz-with knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b
-race,high,QA_multiple_choice,cls,GPT-hard,GPT,,,TRUE,62445,0,TRUE,TRUE,62445,accuracy,,https://arxiv.org/pdf/1704.04683.pdff,neutral,,race-high,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017
-race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph level?",GPT,,,TRUE,25421,0,TRUE,TRUE,25421,accuracy,,https://arxiv.org/pdf/1704.04683.pdf,neutral,,race-middle,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017
-sciq,,QA_multiple_choice,cls,,,TRUE,TRUE,,11679,11679,TRUE,,11679,,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017
-social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,TRUE,TRUE,TRUE,33410,33410,TRUE,TRUE,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 2019
-super_glue,boolq,QA_multiple_choice,cls,,,TRUE,,TRUE,9427,0,TRUE,TRUE,9427,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,,superglue-boolq,,,knowledge-? reading comprehension,,
-super_glue,copa,QA_multiple_choice,cls,,,TRUE,,TRUE,400,0,TRUE,TRUE,400,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012
-super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for defintion,,TRUE,,TRUE,27243,0,TRUE,TRUE,27243,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 2018
-wiki_hop,original,QA_multiple_choice,cls,,,TRUE,TRUE,,43738,43738,TRUE,,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,,WikiHop (Welbl et al. 2018),multi-hop QA,,,Wikipedia KB,
-wiqa,,QA_multiple_choice,cls,,,TRUE,TRUE,,29808,29808,TRUE,,29808,,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019
-circa,,QA_multiple_choice,cls,revisit: problematic prompts,,,,TRUE,34268,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://arxiv.org/pdf/2010.03450.pdf,,,circa,cls/other,,pragmatics,,Louis et al. 2020
-mc_taco,,QA_multiple_choice,cls,no train set; variable number of answer_chocies; eval in paper is over set of possible candidates;,,,,TRUE,0,0,,TRUE,0,exact_match; f1_score,,https://arxiv.org/pdf/1909.03065.pdf,,,mc taco,qa/binary,,temporal cognition,,Zhou et al. 2019
-piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,,,TRUE,16113,0,TRUE,,16113,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020
-amazon_polarity,,sentiment,cls,,,TRUE,TRUE,,3600000,500000,TRUE,,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013
-app_reviews,,sentiment,cls,,,TRUE,TRUE,,288065,288065,TRUE,,288065,,,,,,app reviews,other/regression,,,,Missing
-imdb,,sentiment,cls,,,TRUE,TRUE,,25000,25000,TRUE,,25000,,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011
-rotten_tomatoes,,sentiment,cls,,,TRUE,TRUE,,8530,8530,TRUE,,8530,,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005
-yelp_review_full,,sentiment,cls,no dev set,,TRUE,TRUE,,650000,500000,TRUE,,500000,,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link)
-lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,,
-craffel/openai_lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,,
-story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,,TRUE,,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,,
-hellaswag,,story_completion,cls,,GPT,,,TRUE,39905,0,TRUE,,39905,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019
-common_gen,,structure_to_text,gen,,,TRUE,TRUE,,67389,67389,TRUE,,67389,,,,,,common gen,other,,,,Lin et al. 2020b
-wiki_bio,,structure_to_text,gen,,,TRUE,TRUE,,582659,500000,TRUE,,500000,,,,,,wiki bio,cg/other,,,,Lebret et al. 2016
-cnn_dailymail,3.0.0,summarization,gen,,,TRUE,TRUE,,287113,287113,TRUE,,287113,,,,,,,,,,,
-gigaword,,summarization,gen,,,TRUE,TRUE,,3803957,500000,TRUE,,500000,,,,,,gigaword,cg/summarization,,,,Napoles et al. 2012
-multi_news,,summarization,gen,,CrossFit,,TRUE,,44972,44972,,,,,,,,,multi news,cg/summarization,,,,Fabbri et al. 2019
-samsum,,summarization,gen,,CrossFit,,TRUE,,14732,14732,,,,,,,,,samsum,cg/summarization,,,,Gliwa et al. 2019
-xsum,,summarization,gen,,,TRUE,TRUE,TRUE,204045,204045,TRUE,TRUE,204045,rouge,,https://arxiv.org/pdf/1808.08745.pdf,,,xsum,cg/summarization,,,,Narayan et al. 2018
-ag_news,,topic_classification,cls,,,TRUE,TRUE,,120000,120000,TRUE,,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,,ag news,cls/topic,,,,Gulli (link)
-dbpedia_14,,topic_classification,cls,,,TRUE,TRUE,,560000,500000,TRUE,,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015
-trec,,topic_classification,cls,,,TRUE,TRUE,,5452,5452,TRUE,,5452,,,https://trec.nist.gov/data/qa.html,,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 2001
-super_glue,wic,word_sense_disambiguation,cls,,,TRUE,,TRUE,5428,0,TRUE,TRUE,5428,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019
-Staging Area,,,,,,,,,,,,,,,,,,,,,,,,
-Would Include but not in HF or some other practical limitations,,,,,,,,,,,,,,,,,,,,,,,,
-definite_pronoun_resolution,,coreference,,todo: download error,,,,,,,,,,,,,,,deﬁnite pronoun resolution,other,,,,Rahman and Ng 2012
-jeopardy,,closed-book qa,gen,sporadic download error,CrossFit,,,,,,,,,,,,,promptsource download error,jeopardy,qa/closed-book qa,,,,(link)
-blimp,,,cls,no prompts yet; collapse subsets,,,,,,0,,,0,,,,,,,,,,,
-Hendrycks et al. 2021,,,,https://arxiv.org/abs/2009.03300v3,,,,,,,,,,,,,,,,,,,,
-Multi-Turn Dialogue Reasoning,,,,https://aclanthology.org/2020.acl-main.130.pdf,Vania,,,,7088,,,,,,,,,,,,,,,
-Argument Reasoning Comprehension Task,,,,https://aclanthology.org/N18-1175.pdf,Vania,,,,1211,,,,,,,,,,,,,,,
-MCScript,,,,https://aclanthology.org/L18-1564.pdf,Vania,,,,14191,,,,,,,,,,,,,,,
-narrativeqa,,,,very long input sequence,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,NarQA,Abstractive QA,,,,
-newsqa,,,,download error,TaskEmbed,,,,,,,,,,,,,promptsource download error,NewsQA,Extractive QA,,,,Trischler et al. 2017
-eli5,,,,dataset split error,CrossFit,,,,,,,,,,,https://facebookresearch.github.io/ELI5/explore.html,,skip: HF datasets error the split field is used for subsets,eli5-askh,qa/long-form qa,,possibly knowledge-neutral,,Fan et al. 2019
-Maybe Reconsider,,,,,,,,,,,,,,,,,,,,,,,,
-zest,,,,its original task is quite complex (need to provide a decision function); should be held-out eval only,self,,,,,,,,,,,,,,,,,,,
-swag,,story_completion,cls,revisit whether this should be considered as a variant of NLI,,,,,73546,0,TRUE,,73546,,,,,,swag,qa/multiple-choice qa,,,,Zellers et al. 2018
-codah,codah,story_completion,cls,a variant of swag revisit whether this should be considered as a variant of NLI,,,,,2776,0,TRUE,,2776,,,,,,codah,qa/multiple-choice qa,,,,Chen et al. 2019
-wiki_auto,,,,revisit: lots of duplicate simplified text; novel generative task could be very challenging,CrossFit,,,,,,,,,,,,,no prompt yet,wiki auto,cls/other,,text simplification,,Jiang et al. 2020
-proto_qa,,,gen,"generate prototypical concepts, kinda niche format with multiple correct answers",CrossFit,,,,,,,,,,,,,no prompt yet,proto qa,other,,,,Boratko et al. 2020
-empathetic_dialogues,,,,generation? classification?,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1811.00207.pdf,,no prompt yet,empathetic dialogues,cg/dialogue,,,,Rashkin et al. 2019
-qed,,,,uses held-out Natural Questions,,,,,,,,,,,,,,,,,,,,
-kilt_tasks,aidayago2,,,,,,,,,,,,,,,,,no prompt yet,kilt ay2,other/entity linking,,encyclopedia,,Hoffart et al. 2011
-kilt_tasks,wow,,,,,,,,,,,,,,,,,no prompt yet,kilt wow,cg/dialogue,,encyclopedia,,Dinan et al. 2019
-lama,conceptnet,,,,,,,,,,,,,,,,,no prompt yet,lama-conceptnet,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
-lama,google_re,,,,,,,,,,,,,,,,,no prompt yet,lama-google re,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
-lama,squad,,,,,,,,,,,,,,,,,no prompt yet,lama-squad,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
-lama,trex,,,,,,,,,,,,,,,,,no prompt yet,lama-trex,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
-limit,,physical cognition,,,,,,,,,,,,,,https://aclanthology.org/2020.findings-emnlp.88.pdf,,label errors in dataset itself? also no validation set otherwise well motivated by semantic theories,limit,other,,physical semantic repr.,,Manotas et al. 2020
-kilt_tasks,fever,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,temporary skip: prompts available in non-benchmark standalone dataset,kilt fever,cls/fact checking,,encyclopedia,,Thorne et al. 2018
-Skipped,,,,,,,,,,,,,,,,,,,,,,,,
-fever,v2.0,closed-book qa/fact checking,,also in KILT,,,,,,,,,,,,,,skip: awkward prompts as closed-book qa,FEVER,,,,,
-hotpot_qa,distractor,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,,
-hotpot_qa,fullwiki,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,,
-emo,,sentiment,cls,skip: offensive and ungrammatical text,,merged,,,30160,0,TRUE,TRUE,30160,precision;recall;F1,,https://aclanthology.org/S19-2005.pdf,,skip: offensive and ungrammatical text,emo,cls/emotion,,,,Chatterjee et al. 2019
-freebase_qa,,QA_closed_book,gen,"need to be held out because web_questions is ""supposed to be answerable by Freebase""",,,,,20358,0,TRUE,,20358,,,,intensive,,freebase qa,qa/closed-book qa,,,,Jiang et al. 2019
-aqua_rat,,,,,,,,,,,,,,,,https://arxiv.org/abs/1705.04146,,skip: nontrivial math,aqua rat,qa/multiple-choice qa,,nontrivial math,,Ling et al. 2017
-math_qa,,,,,,,,,,,,,,,,,,skip: nontrivial math,math qa,qa/multiple-choice qa,,nontrivial math,,Amini et al. 2019
-numer_sense,,,,,,,,,,,,,,,,,,skip: closed-book trivia ,numer sense,qa/closed-book qa,,numerical knowledge,,Lin et al. 2020a
-squad_adversarial,,,,,,,,,,,,,,,,,,validation set only,,,,,,
-squadshifts,,,,,,,,,,,,,,,,,,test set only,,,,,,
-sms_spam,,,,,,,,,,,,,,,,,,skip: unclean corpus and likely harmful content,sms spam,cls/other,,,,Almeida et al. 2011
-search_qa,,,,,,,,,,,,,,,,,,skip: seems like a very unclean corpus,search qa,qa/closed-book qa,,,,Dunn et al. 2017
-kilt_tasks,trex,,,,,,,,,,,,,,,,,skip: non-natural language,kilt trex,qa/closed-book qa,,encyclopedia,,Elsahar et al. 2018
-kilt_tasks,structured_zeroshot,,,,,,,,,,,,,,,,,skip: non-natural language,kilt zsre,qa/closed-book qa,,encyclopedia,,Levy et al. 2017
-spider,,,,,,,,,,,,,,,,,,skip: non-natural language,spider,cg/other,,,,Yu et al. 2018
-wikisql,,,,,,,,,,,,,,,,,,skip: non-natural language,wikisql,cg/other,,,,Zhong et al. 2017
-com_qa,,,,,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1809.09528.pdf,,skip: non-human language: URL,ComQA (Abujabal et al. 2019),factoid QA w/ paraphrases,,,snippets WikiAnswers,
-climate_fever,,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,skip: no train set,climate fever,cls/fact checking,,,,Diggelmann et al. 2020
-art,,,,,,,,,,,,,,,,https://arxiv.org/pdf/1908.05739.pdf,,skip: NLI reserved for generalization studies (although this one is not a traditionally defined NLI),art (abductive nli),other,,,,Bhagavatula et al. 2020
-glue,mnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-mnli,cls/nli,,,,Williams et al. 2018
-glue,qnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-qnli,cls/nli,,,,Rajpurkar et al. 2016
-glue,rte,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-rte,cls/nli,,,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009
-glue,wnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-wnli,cls/nli,,,,Levesque et al. 2012
-,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,scitail,cls/nli,,,,Khot et al. 2018
-,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,sick,cls/nli,,,,Marelli et al. 2014
-,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,SNLI (Bowman et al. 2015),NLI,,,misc.,
-aeslc,,,,summarization by email subject line,,,,,,,,,,,,https://arxiv.org/abs/1906.03497,,skip: niche task,aeslc,cg/summarization,,generation,,Zhang and Tetreault 2019
-onestop_english,,,,,,,,,,,,,,,,https://aclanthology.org/W18-0535.pdf,,skip: niche task: classify curriculum diffculty,onestop english,cls/other,,,,Vajjala and Luˇci´c 2018
-mocha,,,,,,,,,,,,,,,,,,skip: model generated text,mocha,other/regression,,,,Chen et al. 2020a
-commonsense_qa,,,,duplicate with cos_e,Vania,,,,9741,,,,,,,https://arxiv.org/pdf/1811.00937.pdf,,,Commonsense QA,qa/multiple-choice qa,,,,Talmor et al. 2019
-,,,,,,,,,,,,,,,,,,skip: maybe harmful content from Twitter,emotion,cls/emotion,,,,Saravia et al. 2018
-,,,,the authors themselves seem to have renounced their own work,,,,,,,,,,,,https://github.com/nyu-mll/crows-pairs,,skip: harmful content,crows pairs,other,,,,Nangia et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-directed vs generalized,cls/hate speech detection,,,,Mollas et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-disability,cls/hate speech detection,,,,Mollas et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-gender,cls/hate speech detection,,,,Mollas et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-national origin,cls/hate speech detection,,,,Mollas et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-race,cls/hate speech detection,,,,Mollas et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-religion,cls/hate speech detection,,,,Mollas et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-sexual orientation,cls/hate speech detection,,,,Mollas et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech offensive,cls/hate speech detection,,,,Davidson et al. 2017
-,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech18,cls/hate speech detection,,,,de Gibert et al. 2018
-,,,,,,,,,,,,,,,,,,skip: harmful content,hatexplain,cls/hate speech detection,,,,Mathew et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-title,cg/summarization,,,,Kim et al. 2019
-,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-tldr,cg/summarization,,,,Kim et al. 2019
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emoji,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emotion,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-hate,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-irony,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-offensive,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-sentiment,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance abortion,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance atheism,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance climate,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance feminist,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance hillary,cls/emotion,,,,Barbieri et al. 2020
-,,,,,,,,,,,,,,,,,,skip: harmful content,tweet qa,qa/machine reading comprehension,,,,Xiong et al. 2019
-yelp_polarity,,,,,,,,,,,,,,,,,,skip: duplicate with yelp_review_full,yelp polarity,cls/sentiment analysis,,,,Zhang et al. 2015; (link)
-quora,,,,,,,,,,,,,,,,https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs,,skip: duplicate under GLUE,QQP,paraphrase identiﬁcation,,,social QA,Iyer et al. 2017
-squad,,,,,,,,,,,,,,,,,,skip: duplicate under Squad 2.0,SQuAD 1.1,Extractive QA,,,,
-yahoo_answers_topics,,,,,,,,,,,,,,,,,,skip for early experiments: unclean corpus,yahoo answers topics,cls/topic,,,,(link)
-tab_fact,,,,,,,,,,,,,,,,,,skip for early experiments: tabular data,tab fact,cls/fact checking,,,,Chen et al. 2020b
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor gender agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor number agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-determiner noun agreement with adj irregular 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 2,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-existential there quantiﬁers 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-irregular past participle adjectives,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi licensor present,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi scope,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-wh questions object gap,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
-poem_sentiment,,,,,,,,,,,,,,,,,,skip for early experiments: poetry domain,poem sentiment,cls/sentiment analysis,,creativity,,Sheng and Uthus 2020
-acronym_identification,,,,,,,,,,,,,,,,https://arxiv.org/pdf/2010.14678.pdf,,skip for early experiments: niche/hard task,acronym identiﬁcation,other,,,,Pouran Ben Veyseh et al. 2020
-google_wellformed_query,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,google wellformed query,cls/other,,,,Faruqui and Das 2018
-liar,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,liar,cls/fact checking,,,,Wang 2017
-,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,break-QDMR-high-level,other,,semantic representation,,Wolfson et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,crawl domain,other,,,,Zhang et al. 2020
-discovery,discovery,,,,,,,,,,,,,,,,,skip for early experiments: niche task no cannonical answer,discovery,cls/other,,generative-ish,,Sileo et al. 2019
-wiki_split,,,,,,,,,,,,,,,,,,skip for early experiments: niche task,wiki split,cg/other,,,,Botha et al. 2018
-,,,,,,,,,,,,,,,,,,skip for early experiments: multilingual,aslg pc12,other,,,,Othman and Jemni 2012
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,CCG (Hockenmaier and Steedman 2007),CCG supertagging,,syntax,Penn Treebank,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Chunk (Tjong Kim Sang and Buchholz 2000),syntactic chunking,,syntax,Penn Treebank,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Conj (Ficler and Goldberg 2016),conjunct identiﬁcation,,syntax,Penn Treebank,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GED (Yannakoudakis et al. 2011),grammatical error detection,,syntax,misc.,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GGParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,NER (Tjong Kim Sang and De Meulder 2003),named entity recognition,,,news,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Parent (Liu et al. 2019a),syntactic tagging,,syntax; constituency,Penn Treebank,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-EWT (Silveira et al. 2014),part-of-speech tagging,,syntax,Web Treebank,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-PTB (Marcus et al. 1993),part-of-speech tagging,,syntax,Penn Treebank,
-,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,ST (Bjerva et al. 2016),semantic tagging,,,Groningen Meaning Bank,
-financial_phrasebank,,,,,,,,,,,,,,,,,,skip for early experiments: financial domain,ﬁnancial phrasebank,cls/sentiment analysis,,,,Malo et al. 2014
-health_fact,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,health fact,cls/fact checking,,,,Kotonya and Toni 2020
-,,,,,,,,,,,,,,,,http://www.sciencedirect.com/science/article/pii/S1532046412000615,,skip for early experiments: biomedical domain,ade corpus v2-classiﬁcation,cls/other,,,,Gurulingappa et al. 2012
-,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-dosage,other/slot ﬁlling,,,,Gurulingappa et al. 2012
-,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-effect,other/slot ﬁlling,,,,Gurulingappa et al. 2012
-,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,biomrc,qa/machine reading comprehension,,,,Pappas et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,medical questions pairs,cls/paraphrase,,,,McCreery et al. 2020
-scicite,,,,,,,,,,,,,,,,,,skip for early experiments: academic domain + niche/hard task,scicite,cls/other,,,,Cohan et al. 2019
-,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,break-QDMR,other,,logical form,,Wolfson et al. 2020
-,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,e2e nlg cleaned,other,,,,Duˇsek et al. 2020 2019
-glue,sst2,,,,,,,,,,,,,,,,,revisit: very short and often ill-formed movie reviews,glue-sst2,cls/sentiment analysis,,,,Socher et al. 2013
-glue,stsb,fine-grain regression,,,,,,,,,,,,,,,,revisit whether to exclude fine-grain regression tasks,glue-stsb,semantic similarity,,,misc.,
-,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-no context,qa/closed-book qa,,,,Rajpurkar et al. 2016
-,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-with context,qa/machine reading comprehension,,,,Rajpurkar et al. 2016
-,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,BoolQ-CS,Binary yes/no,,,,
-,,,,,,,,,,,,,,,,https://aclanthology.org/C16-1236.pdf,,double check: missing from HF datasets,CQ (Bao et al. 2016),knowledge-based QA,,,snippets web queries/KB,
-,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,DROP-CS,Abstractive QA,,,,
-,,,,,,,,,,,,,,,,https://aclanthology.org/D13-1020.pdf,,double check: missing from HF datasets,MCTest,Multiple choice,,,,
-,,,,,,,,,,,,,,,,,,double check: missing from HF datasets,MRPC (Dolan and Brockett 2005),paraphrase identiﬁcation,,,news,
-,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,double check: missing from HF datasets,NP-BoolQ,Binary yes/no,,,,
-,,,,,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,double check: missing from HF datasets,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b
-,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,Quoref-CS,Extractive QA,,,,
-,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,,
diff --git a/promptsource/seqio_tasks/preview_annotated_prompts.py b/promptsource/seqio_tasks/preview_annotated_prompts.py
deleted file mode 100644
index 6890d5247..000000000
--- a/promptsource/seqio_tasks/preview_annotated_prompts.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import csv
-from pprint import pprint
-from typing import Dict, List
-
-import pkg_resources
-from t5.data.glue_utils import get_glue_metric, get_super_glue_metric
-from t5.evaluation.metrics import accuracy, mean_multiclass_f1, rouge
-
-
-SAFE_EXCLUDE_CRETERIA = [
-    "template_bug",
-    "negated_answers",
-    "counting",
-    "answer_span_indices",
-    "non_natural_language",
-    "generative_non_true_implausible",
-]
-
-AGGRESSIVE_EXCLUDE_CRETERIA = [
-    "generative_non_true_task",
-    "nontrivial_choices_hidden",
-    "awkward_phrasing",
-    "ungrammatical",
-] + SAFE_EXCLUDE_CRETERIA
-
-
-NON_GLUE_METRICS = {  # for those with do_eval = True
-    "anli": [accuracy],
-    "hans": [accuracy],
-    "circa_goldstandard1_judgement": [mean_multiclass_f1(num_classes=8), accuracy],
-    "circa_goldstandard2_judgement": [mean_multiclass_f1(num_classes=5), accuracy],
-    "mc_taco": [accuracy],
-    "nq_open": [accuracy],
-    "qa_srl": [accuracy],
-    "openbookqa": [accuracy],
-    "race": [accuracy],
-    "social_i_qa": [accuracy],
-    "emo": [mean_multiclass_f1(num_classes=4)],
-    "xsum": [rouge],
-}
-
-
-def exclude_bad_prompts(prompt: Dict) -> bool:
-    for criterion in SAFE_EXCLUDE_CRETERIA:  # or AGGRESSIVE_EXCLUDE_CRETERIA
-        if prompt.get(criterion):
-            return False
-    return True
-
-
-def load_annotated_prompts() -> List[Dict]:
-    annotated_csv_path = pkg_resources.resource_filename(__name__, "experiment_D3.csv")
-    with open(annotated_csv_path) as in_file:
-        reader = csv.DictReader(in_file)
-        all_tasks = [row for row in reader]
-
-    clean_tasks = list(filter(exclude_bad_prompts, all_tasks))
-
-    # Assign metrics
-    non_glue_eval_sets = list(NON_GLUE_METRICS.keys())
-    for task in clean_tasks:
-        if not task["do_eval"]:
-            continue
-
-        full_name = task["dataset_subset_template"]
-        if full_name.startswith("glue"):
-            subset = full_name.split("_")[1]
-            task["metrics"] = get_glue_metric(subset)
-        elif full_name.startswith("super_glue"):
-            subset = full_name.split("_")[2]
-            if subset in ("wsc.fixed", "multirc"):
-                # TODO: WSC and MultiRC need special pre/postprocesing
-                task["metrics"] = [accuracy]
-                continue
-            task["metrics"] = get_super_glue_metric(subset)
-
-        for dataset_name in non_glue_eval_sets:
-            if full_name.startswith(dataset_name):
-                task["metrics"] = NON_GLUE_METRICS[dataset_name]
-
-        # Skip rank_classification for now until we actually support it
-        # if task["nontrivial_choices_hidden"]:
-        #     # Trick of plugging in answer options and rank LM probabilites as predictions.
-        #     # Required for all prompts with non_trivial_choices_hidden,
-        #     # but could be used for other tasks as well where answer choices are given.
-        #     if "metrics" not in task:
-        #         task["metrics"] = [rank_classification]
-        #     elif rank_classification not in task["metrics"]:
-        #         task["metrics"].append(rank_classification)
-
-        # should be already handled by NON_GLUE_METRICS
-        # if task['generative_true_task'] or task['generative_non_true_task']:
-        #     task['metrics'] = rouge
-
-    return clean_tasks
-
-
-def preview() -> None:
-    clean_tasks = load_annotated_prompts()
-
-    train_tasks = [t for t in clean_tasks if not t["skip_train"]]
-    eval_tasks = [t for t in clean_tasks if t["do_eval"]]
-
-    pprint([t["dataset_subset_template"] for t in train_tasks])
-    print(len(train_tasks))
-
-    pprint([f'{t["dataset_subset_template"]} {t["metrics"]}' for t in eval_tasks])
-    print(len(eval_tasks))
-
-
-if __name__ == "__main__":
-    preview()
diff --git a/promptsource/seqio_tasks/preview_promptsource.py b/promptsource/seqio_tasks/preview_promptsource.py
deleted file mode 100644
index 4dbbec761..000000000
--- a/promptsource/seqio_tasks/preview_promptsource.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import csv
-from typing import List, Optional, Tuple
-
-import pkg_resources
-
-# from rich import inspect
-from rich.pretty import pprint
-
-from promptsource.templates import TemplateCollection
-
-
-def preview() -> None:
-    experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
-    gsheet = {}
-    d4_train: List[Tuple[str, Optional[str]]] = []
-    d4_eval: List[Tuple[str, Optional[str]]] = []
-    d3_train_gpt: List[Tuple[str, Optional[str]]] = []
-    d3_train_sglue: List[Tuple[str, Optional[str]]] = []
-    experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
-    with open(experiment_path) as exp_file:
-        reader = csv.DictReader(exp_file)
-        for row in reader:
-            if row["skip"]:
-                continue
-            if row["subset"] == "":
-                row["subset"] = None  # to match promptsource.Template object
-            dataset_subset = (row["HF_name"], row["subset"])
-            if row["do_train"] == "TRUE":
-                d4_train.append(dataset_subset)
-            if row["do_eval"] == "TRUE":
-                d4_eval.append(dataset_subset)
-            if row["D3_do_train"] == "TRUE" and "GPT" in row["seed_paper"]:
-                d3_train_gpt.append(dataset_subset)
-            if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue":
-                d3_train_sglue.append(dataset_subset)
-            gsheet[dataset_subset] = row
-    all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue
-    print(f"Number of non-desk-rejected datasets = {len(all_datasets)}")
-    print(f"Number of training sets = {len(d4_train)}")
-    print(f"Number of evaluation sets = {len(d4_eval)}")
-
-    template_collection = TemplateCollection()
-    output = []
-    missing_og_flags = []
-    missing_metrics = []
-    for dataset_name, subset_name in template_collection.keys:
-        ds_name = (dataset_name, subset_name)
-        if ds_name not in d4_eval:
-            template_collection.remove(dataset_name, subset_name)
-            continue
-        OG = 0
-        non_OG = 0
-        dataset = template_collection.get_dataset(dataset_name, subset_name)
-        for template_name in dataset.all_template_names:
-            template = dataset[template_name]
-            # if dataset_name == 'ropes':
-            #     inspect(template.metadata)
-            if not template.metadata.metrics:
-                missing_metrics.append(f"{dataset_name}/{subset_name}/{template_name}")
-
-            if template.metadata.original_task is True:
-                OG += 1
-            elif template.metadata.original_task is False:
-                non_OG += 1
-            elif template.metadata.original_task is None:
-                missing_og_flags.append(dataset_name + "/" + template_name)
-                continue
-
-        train_size = gsheet[ds_name]["train_size"]
-        if train_size == "":
-            train_size = 0
-        else:
-            train_size = int(train_size)
-
-        adjusted_train_size = train_size // len(dataset.all_template_names)
-
-        output.append(
-            (
-                f"{dataset_name} {subset_name if subset_name else ''}",
-                f"{OG}-{non_OG}",
-                f"{train_size:,}    {adjusted_train_size:,}",
-            )
-        )
-
-    pprint(output)
-    print(len(template_collection))
-
-    print("Missing metrics:")
-    pprint(missing_metrics)
-
-    print("Missing original task flags:")
-    pprint(missing_og_flags)
-
-    # # print(d4_train_mixture)
-    # print(f"Number of training templates = {len(d4_train_mixture)}")
-    # # print(d4_eval_mixture)
-    # print(f"Number of evaluation templates = {len(d4_eval_mixture)}")
-    # # for i in seqio.TaskRegistry.names():
-    # #     print(i)
-    # print(f"Number of SeqIO registered templates = {len(seqio.TaskRegistry.names())}")
-    # print("^ includes non-original task templates which are excluded from the eval mixture")
-
-
-if __name__ == "__main__":
-    preview()
diff --git a/promptsource/seqio_tasks/tasks.py b/promptsource/seqio_tasks/tasks.py
deleted file mode 100644
index 5734a9cb5..000000000
--- a/promptsource/seqio_tasks/tasks.py
+++ /dev/null
@@ -1,421 +0,0 @@
-import csv
-import functools
-from typing import Dict, List, Optional, Tuple
-
-import pkg_resources
-import seqio
-import t5
-import tensorflow as tf
-from t5.data.glue_utils import get_glue_metric, get_super_glue_metric
-from t5.evaluation import metrics as mt
-
-import promptsource.templates
-from promptsource.seqio_tasks import utils
-from promptsource.utils import load_dataset
-
-
-GET_METRICS = {
-    "BLEU": mt.bleu,
-    "ROUGE": mt.rouge,
-    "Span Squad": mt.span_squad,
-    "Squad": mt.squad,
-    "Trivia QA": mt.trivia_qa,
-    "Accuracy": mt.accuracy,
-    "Sequence Accuracy": mt.sequence_accuracy,
-    "Pearson Correlation": mt.pearson_corrcoef,
-    "Spearman Correlation": mt.spearman_corrcoef,
-    "MultiRC": mt.multirc_f1_over_all_answers,
-    "AUC": mt.auc,
-    "COQA F1": mt.coqa_f1,
-    "Edit Distance": mt.edit_distance,
-    # "Mean Reciprocal Rank": mt.accuracy,  # NOTE not in T5?
-    "Other": mt.accuracy,
-    # Missing support for mean_multiclass_f1 etc. which need a num_classes parameter
-}
-
-MAX_EXAMPLES_PER_DATASET = 500_000
-
-
-def strip_whitespace(output_or_target, example=None, is_target=False):
-    """Cached tasks from promptsource all have a leading space on the ground-truth targets."""
-    return output_or_target.strip()
-
-
-def maybe_get_class_id_postprocessor(template):
-    if template.get_fixed_answer_choices_list():
-
-        def postprocess_fn(output_or_target, example=None, is_target=False):
-            output_or_target = strip_whitespace(output_or_target)
-            return t5.data.postprocessors.string_label_to_class_id(
-                output_or_target, label_classes=template.get_fixed_answer_choices_list()
-            )
-
-        return postprocess_fn
-
-    else:
-        return strip_whitespace
-
-
-def get_tf_dataset(split, shuffle_files, seed, dataset_name, subset_name, template, split_mapping):
-    # HF datasets does not support file-level shuffling
-    del shuffle_files, seed
-    dataset = load_dataset(dataset_name, subset_name)
-    dataset = dataset[split_mapping[split]]
-    dataset = utils.apply_template(dataset, template)
-    return utils.hf_dataset_to_tf_dataset(dataset)
-
-
-def add_task(dataset_name, subset_name, template_name, task_name=None, split_mapping=None):
-    template = all_templates.get_dataset(dataset_name, subset_name)[template_name]
-    task_name = task_name or utils.get_task_name(dataset_name, subset_name, template_name)
-
-    if dataset_name == "glue":
-        metrics = get_glue_metric(subset_name)
-    elif dataset_name == "super_glue":
-        if subset_name in ("wsc.fixed", "multirc"):
-            # TODO: WSC and MultiRC need special pre/postprocesing
-            metrics = [mt.accuracy]
-        else:
-            metrics = get_super_glue_metric(subset_name)
-    else:
-        # TODO what if metric is null?
-        metrics = [GET_METRICS[m] for m in template.metadata.metrics]
-
-    dataset_splits = utils.get_dataset_splits(dataset_name, subset_name)
-    split_mapping = split_mapping or {k: k for k in dataset_splits.keys()}
-
-    dataset_fn = functools.partial(
-        get_tf_dataset,
-        seed=None,
-        dataset_name=dataset_name,
-        subset_name=subset_name,
-        template=template,
-        split_mapping=split_mapping,
-    )
-    data_source = seqio.FunctionDataSource(
-        dataset_fn,
-        splits=list(split_mapping.keys()),
-        num_input_examples={s: dataset_splits[split_mapping[s]].num_examples for s in split_mapping.keys()},
-    )
-    output_features = {
-        "inputs": seqio.Feature(t5.data.get_default_vocabulary(), add_eos=False, dtype=tf.int32),
-        "targets": seqio.Feature(t5.data.get_default_vocabulary(), add_eos=True, dtype=tf.int32),
-    }
-    preprocessors = [
-        seqio.preprocessors.tokenize,
-        seqio.preprocessors.append_eos,
-        seqio.CacheDatasetPlaceholder(required=False),
-    ]
-
-    # Add train and normal eval tasks
-    seqio.TaskRegistry.add(
-        task_name,
-        data_source,
-        preprocessors=preprocessors,
-        output_features=output_features,
-        metric_fns=metrics,
-        postprocess_fn=maybe_get_class_id_postprocessor(template),
-    )
-
-    # Add rank classification eval task
-    if template.answer_choices:
-        rank_classification_preprocessor = functools.partial(
-            t5.data.preprocessors.rank_classification,
-            inputs_fn=lambda ex: tf.fill((len(ex["answer_choices"]),), ex["inputs"]),
-            targets_fn=lambda ex: ex["answer_choices"],
-            is_correct_fn=lambda ex: tf.equal(ex["answer_choices"], tf.strings.strip(ex["targets"])),
-            weight_fn=lambda ex: 1.0,
-        )
-
-        fixed_choices = template.get_fixed_answer_choices_list()
-        num_classes = len(fixed_choices) if fixed_choices else None
-        seqio.TaskRegistry.add(
-            task_name + "_score_eval",
-            data_source,
-            preprocessors=[rank_classification_preprocessor] + preprocessors,
-            output_features=output_features,
-            metric_fns=[functools.partial(t5.evaluation.metrics.rank_classification, num_classes=num_classes)],
-            postprocess_fn=t5.data.postprocessors.rank_classification,
-        )
-
-
-datatset_subset_tuple = Tuple[str, Optional[str]]
-d4_train: List[datatset_subset_tuple] = []
-d4_eval: List[datatset_subset_tuple] = []
-d3_train_gpt: List[datatset_subset_tuple] = []
-d3_train_sglue: List[datatset_subset_tuple] = []
-bias_fairness_eval: List[datatset_subset_tuple] = []
-gsheet: Dict[datatset_subset_tuple, Dict] = {}
-experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
-with open(experiment_path) as exp_file:
-    reader = csv.DictReader(exp_file)
-    for row in reader:
-        if row["skip"]:
-            continue
-        if row["subset"] == "":
-            row["subset"] = None  # to match promptsource.Template object
-        dataset_subset = (row["HF_name"], row["subset"])
-        if row["do_train"] == "TRUE":
-            d4_train.append(dataset_subset)
-        if row["do_eval"] == "TRUE":
-            d4_eval.append(dataset_subset)
-        if row["D3_do_train"] == "TRUE" and "GPT" in row["seed_paper"]:
-            d3_train_gpt.append(dataset_subset)
-        if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue":
-            d3_train_sglue.append(dataset_subset)
-        if (
-            row["do_eval"] == "TRUE"
-            and row["task_by_convention"] == "bias_and_fairness"
-            and row["HF_name"] != "winogender"
-        ):
-            bias_fairness_eval.append(dataset_subset)
-        gsheet[dataset_subset] = row
-all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue + bias_fairness_eval
-
-all_templates = promptsource.templates.TemplateCollection()
-all_templates.remove("anli")  # Need to special-case ANLI due to weird split conventions
-
-# 3 stages of training/ablation: D4 -> GPT -> SuperGLUE
-d4_train_mixture: List[str] = []  # strings are dataset_subset_template
-gpt_train_mixture: List[str] = []
-sglue_train_mixture: List[str] = []
-d4_eval_mixture: List[str] = []
-bias_fairness_eval_mixture: List[str] = []
-mixture_cap: Dict[str, int] = {}
-single_original_task: Dict[Tuple[str, str], str] = {}
-all_original_tasks: List[str] = []
-for dataset_name, subset_name in all_templates.keys:
-    if (dataset_name, subset_name) not in all_datasets:
-        all_templates.remove(dataset_name, subset_name)
-        continue
-
-    dataset = all_templates.get_dataset(dataset_name, subset_name)
-    num_templates = len(dataset.all_template_names)
-    train_size = gsheet[(dataset_name, subset_name)]["train_size"]
-    if train_size == "":
-        train_size = 0
-    else:
-        train_size = int(train_size)
-    if train_size > MAX_EXAMPLES_PER_DATASET:
-        cap = MAX_EXAMPLES_PER_DATASET // num_templates
-    else:
-        cap = train_size
-    for template_name in dataset.all_template_names:
-        add_task(dataset_name, subset_name, template_name)
-
-        template = dataset[template_name]
-
-        task_name = utils.get_task_name(dataset_name, subset_name, template_name)
-
-        if (dataset_name, subset_name) not in single_original_task and template.metadata.original_task:
-            single_original_task[(dataset_name, subset_name)] = task_name
-
-        if template.metadata.original_task:
-            all_original_tasks.append(task_name)
-
-        if (dataset_name, subset_name) in d4_train:
-            d4_train_mixture.append(task_name)
-            mixture_cap[task_name] = cap
-        if (dataset_name, subset_name) in d3_train_gpt:
-            gpt_train_mixture.append(task_name)
-            mixture_cap[task_name] = cap
-        if (dataset_name, subset_name) in d3_train_sglue:
-            sglue_train_mixture.append(task_name)
-            mixture_cap[task_name] = cap
-        if (dataset_name, subset_name) in d4_eval:
-            if template.metadata.original_task:
-                d4_eval_mixture.append(task_name)
-            # TODO use template.metadata.answer_choices here for rank eval
-        if (dataset_name, subset_name) in bias_fairness_eval:
-            bias_fairness_eval_mixture.append(task_name)
-
-# Special case for ANLI, which has weirdly-named splits and rounds that should be subsets
-dataset_name, subset_name = ("anli", None)
-dataset = all_templates.get_dataset(dataset_name, subset_name)
-for anli_round in ("r1", "r2", "r3"):
-    for template_name in all_templates.get_dataset(dataset_name, subset_name).all_template_names:
-        task_name = utils.get_task_name(dataset_name, subset_name, template_name) + f"_{anli_round}"
-        split_mapping = {
-            "train": f"train_{anli_round}",
-            "validation": f"dev_{anli_round}",
-            "test": f"test_{anli_round}",
-        }
-        add_task(dataset_name, subset_name, template_name, task_name, split_mapping)
-
-        template = dataset[template_name]
-        if template.metadata.original_task:
-            d4_eval_mixture.append(task_name)  # TODO or add to ANLI special mixture
-        # TODO use template.metadata.answer_choices here for rank eval
-
-
-TASK_BLACKLIST = [
-    # Tasks which often tokenize to > 1024 tokens currently
-    "hotpot_qa_distractor_Generate_Explanations",
-    "hotpot_qa_fullwiki_Generate_Explanations",
-    "hotpot_qa_distractor_Generate_Answer_and_Explanations",
-    "hotpot_qa_fullwiki_Generate_Answer_and_Explanations",
-    "hotpot_qa_fullwiki_Generate_Answer",
-    "hotpot_qa_distractor_Generate_Answer",
-    "hotpot_qa_distractor_Generate_Title_2",
-    "hotpot_qa_fullwiki_Generate_Title_2",
-    "hotpot_qa_fullwiki_Generate_Title_1",
-    "hotpot_qa_distractor_Generate_Title_1",
-    "hotpot_qa_distractor_Generate_Question",
-    "hotpot_qa_fullwiki_Generate_Question",
-    "tab_fact_tab_fact_tab_fact_3",
-    "tab_fact_tab_fact_tab_fact_2",
-    "tab_fact_tab_fact_tab_fact_1",
-    "tab_fact_tab_fact_tab_fact_7",
-    "tab_fact_tab_fact_tab_fact_4",
-    "tab_fact_tab_fact_tab_fact_5",
-    "tab_fact_tab_fact_tab_fact_6",
-    "wiki_hop_masked_Choose_Best_Object_Candidate",
-    "wiki_hop_masked_Indirect_Question_about_Birthplace_Citizenship_Place_of_Death",
-    "narrativeqa_Template_05",
-    "ecthr_cases_alleged_violation_prediction_silver_rationales",
-    # Tasks with broken cached files
-    "gigaword_summarize_",
-]
-
-# Tasks that failed caching (won't try to fix them for now) - remove when we are done
-D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST = [
-    "amazon_polarity_Is_this_product_review_positive_score_eval",
-    "amazon_polarity_Is_this_review_negative_score_eval",
-    "amazon_polarity_Is_this_review_score_eval",
-    "amazon_polarity_User_recommend_this_product_score_eval",
-    "amazon_polarity_convey_negative_or_positive_sentiment_score_eval",
-    "amazon_polarity_flattering_or_not_score_eval",
-    "amazon_polarity_negative_or_positive_tone_score_eval",
-    "amazon_polarity_user_satisfied_score_eval",
-    "amazon_polarity_would_you_buy_score_eval",
-    "dbpedia_14_given_a_choice_of_categories__score_eval",
-    "dbpedia_14_given_list_what_category_does_the_paragraph_belong_to_score_eval",
-    "dbpedia_14_pick_one_category_for_the_following_text_score_eval",
-    "wiki_hop_original_choose_best_object_affirmative_1_score_eval",
-    "wiki_hop_original_choose_best_object_affirmative_2_score_eval",
-    "wiki_hop_original_choose_best_object_affirmative_3_score_eval",
-    "wiki_hop_original_choose_best_object_interrogative_1_score_eval",
-    "wiki_hop_original_choose_best_object_interrogative_2_score_eval",
-]
-
-seqio.MixtureRegistry.add(
-    "d4_train",
-    [task for task in d4_train_mixture if task not in TASK_BLACKLIST],
-    default_rate=lambda t: mixture_cap[t.name],
-)
-
-seqio.MixtureRegistry.add(
-    "gpt_train",
-    [task for task in gpt_train_mixture if task not in TASK_BLACKLIST],
-    default_rate=lambda t: mixture_cap[t.name],
-)
-
-seqio.MixtureRegistry.add(
-    "sglue_train",
-    [task for task in sglue_train_mixture if task not in TASK_BLACKLIST],
-    default_rate=lambda t: mixture_cap[t.name],
-)
-
-seqio.MixtureRegistry.add(
-    "d4_gpt_train",
-    [task for task in d4_train_mixture + gpt_train_mixture if task not in TASK_BLACKLIST],
-    default_rate=lambda t: mixture_cap[t.name],
-)
-
-seqio.MixtureRegistry.add(
-    "d4_gpt_sglue_train",
-    [task for task in d4_train_mixture + gpt_train_mixture + sglue_train_mixture if task not in TASK_BLACKLIST],
-    default_rate=lambda t: mixture_cap[t.name],
-)
-
-seqio.MixtureRegistry.add(
-    "d4_eval",
-    [task for task in d4_eval_mixture if task not in TASK_BLACKLIST],
-    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
-)  # eval mixture does not need to be capped
-
-
-seqio.MixtureRegistry.add(
-    "d4_score_eval",
-    [
-        task
-        for task in seqio.TaskRegistry.names()
-        if task.endswith("_score_eval")
-        and task.split("_score_eval")[0] in d4_eval_mixture
-        and task.split("_score_eval")[0] not in TASK_BLACKLIST
-    ],
-    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
-)
-
-# Train tasks we don't care about evaluating on
-D4_TRAIN_SKIP_EVAL = [
-    "paws_labeled_final",
-    "adversarial_qa_dbidaf",
-    "adversarial_qa_dbert",
-    "duorc_ParaphraseRC",
-    "dream",
-    "amazon_polarity",
-    "app_reviews",
-    "imdb",
-    "wiki_bio",
-    "gigaword",
-    "multi_news",
-    "samsum",
-    "dbpedia_14",
-    "trec",
-]
-
-seqio.MixtureRegistry.add(
-    "d4_train_eval",
-    [
-        task
-        for task in d4_train_mixture
-        if task not in TASK_BLACKLIST
-        and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL])
-        and task in all_original_tasks
-    ],
-    default_rate=lambda t: mixture_cap[t.name],
-)
-
-seqio.MixtureRegistry.add(
-    "d4_train_score_eval",
-    [
-        task
-        for task in seqio.TaskRegistry.names()
-        if task.endswith("_score_eval")
-        and task.split("_score_eval")[0] in d4_train_mixture
-        and task.split("_score_eval")[0] not in TASK_BLACKLIST
-        and task not in D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST
-        and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL])
-        and task.split("_score_eval")[0] in all_original_tasks
-    ],
-    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
-)
-
-seqio.MixtureRegistry.add(
-    "d4_train_one_og_prompt",
-    [task for task in single_original_task.values() if task in d4_train_mixture and task not in TASK_BLACKLIST],
-    default_rate=lambda t: mixture_cap[t.name],
-)
-
-seqio.MixtureRegistry.add(
-    "d4_train_all_og_prompts",
-    [task for task in all_original_tasks if task in d4_train_mixture and task not in TASK_BLACKLIST],
-    default_rate=lambda t: mixture_cap[t.name],
-)
-
-seqio.MixtureRegistry.add(
-    "bias_fairness_eval",
-    bias_fairness_eval_mixture,
-    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
-)
-
-seqio.MixtureRegistry.add(
-    "bias_fairness_eval_score_eval",
-    [
-        task
-        for task in seqio.TaskRegistry.names()
-        if task.endswith("_score_eval") and task.split("_score_eval")[0] in bias_fairness_eval_mixture
-    ],
-    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
-)
diff --git a/promptsource/seqio_tasks/utils.py b/promptsource/seqio_tasks/utils.py
deleted file mode 100644
index 1b4df95aa..000000000
--- a/promptsource/seqio_tasks/utils.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import re
-
-import datasets
-import tensorflow as tf
-
-import promptsource.utils
-
-
-def feature_to_spec(feature, length=False):
-    if isinstance(feature, datasets.ClassLabel):
-        return tf.TensorSpec(shape=() if not length else (None if length == -1 else length,), dtype=tf.int64)
-    elif isinstance(feature, datasets.Value):
-        return tf.TensorSpec(
-            shape=() if not length else (None if length == -1 else length,), dtype=getattr(tf.dtypes, feature.dtype)
-        )
-    elif hasattr(feature, "dtype") and hasattr(feature, "shape"):
-        return tf.TensorSpec(shape=feature.shape, dtype=feature.dtype)
-    elif isinstance(feature, datasets.Sequence):
-        return feature_to_spec(feature.feature, length=feature.length)
-    elif isinstance(feature, list):
-        return [feature_to_spec(f, length=length) for f in feature]
-    elif isinstance(feature, dict):
-        return {k: feature_to_spec(v, length=length) for k, v in feature.items()}
-    else:
-        raise ValueError(f"Unparseable feature type {type(feature)}")
-
-
-def hf_dataset_to_tf_dataset(dataset):
-    return tf.data.Dataset.from_generator(
-        dataset.__iter__, output_signature={k: feature_to_spec(v) for k, v in dataset.features.items()}
-    )
-
-
-def apply_template(dataset, template):
-    def map_fn(ex):
-        ex = promptsource.utils.removeHyphen(ex)
-        inputs_and_targets = template.apply(ex)
-        answer_choices = template.get_answer_choices_list(ex)
-        if len(inputs_and_targets) == 2:
-            inputs, targets = inputs_and_targets
-            if targets == "":
-                ex = {"inputs": inputs, "targets": "<NO LABEL>"}
-            else:
-                ex = {"inputs": inputs, "targets": targets}
-        # When template results in an empty example, template.apply returns [""]
-        # Also, if the template gets split wrong, len can be > 2
-        # We will filter these out later
-        else:
-            ex = {"inputs": "", "targets": ""}
-
-        if answer_choices:
-            ex["answer_choices"] = answer_choices
-
-        return ex
-
-    def filter_fn(ex):
-        return len(ex["inputs"]) > 0 and len(ex["targets"]) > 0
-
-    original_columns = dataset.column_names
-    dataset = dataset.map(map_fn).filter(filter_fn)
-    # map keeps original columns, remove them
-    return dataset.remove_columns(set(original_columns) - {"inputs", "targets", "answer_choices"})
-
-
-def get_dataset_splits(dataset_name, subset_name=None):
-    info = datasets.get_dataset_infos(dataset_name)
-    subset_name = subset_name or list(info.keys())[0]
-    return info[subset_name].splits
-
-
-def task_clean(text):
-    # Clean the text according to allowed characters for a task name
-    return re.sub(r"[^\w\d\._]+", "_", text)
-
-
-def get_task_name(dataset_name, subset_name, template_name):
-    return task_clean(dataset_name + (f"_{subset_name}_" if subset_name is not None else "_") + template_name)
diff --git a/setup.py b/setup.py
index d2c5b65c7..79577e50b 100644
--- a/setup.py
+++ b/setup.py
@@ -29,8 +29,5 @@
     package_data={"": [
         "templates/*/*.yaml",
         "templates/*/*/*.yaml",
-        "seqio_tasks/experiment_D3.csv",  # Experiment D3
-        "seqio_tasks/experiment_D4.csv",
-        "custom_datasets/*/*"
     ]}
 )