# Price estimation

In [1]:
import csv
import os
import spacy
import pandas as pd
import sys

# Add the few_shot dir to the module search path: the entry is the directory
# two levels above sys.path[0], inserted at position 1 so that sys.path[0]
# itself keeps precedence.
# NOTE(review): presumably this is what lets the later `from tasks import ...`
# resolve -- confirm against the repository layout.
sys.path.insert(1, os.path.join(sys.path[0], '../../'))

In [2]:
from spacy.lang.en import English

# English language object; only its tokenizer (default English settings,
# including punctuation rules and exceptions) is used in this notebook.
nlp_en = English()

# Tokenizers keyed by language code.
lang_tokenizers = dict(en=nlp_en.tokenizer)

# Quick smoke test of the English tokenizer on a sample sentence.
tokens = lang_tokenizers["en"]("This is a sentence")

In [3]:
from tasks import FEW_SHOT_TASKS_REGISTRY

def get_sets_by_lang(sets, lang):
    """Select the data-set entries that are available for one language.

    Args:
        sets: mapping of set name to either a per-language dict
            ({lang: value}) or a single language-agnostic value.
        lang: language code to select.

    Returns:
        A new dict (same key order as ``sets``) holding, for every set that
        supports ``lang``, the value for that language. Entries that are not
        dicts are only included when ``lang`` is "en".
    """
    wants_english = lang == "en"
    selected = {}
    for set_name, location in sets.items():
        if not isinstance(location, dict):
            # A plain (non-dict) entry is treated as English-only.
            if wants_english:
                selected[set_name] = location
        elif lang in location:
            selected[set_name] = location[lang]
    return selected
                
        

In [4]:
# One summary record per (task, lang, set) is appended here by the cell's
# main loop; re-running the cell starts from an empty list again.
values = []

# Tasks to measure. Reference list of benchmark names:
# XCOPA, XNLI, PAWS-X, mLAMA, StoryCloze, Hellaswag, ReCoRD, PIQA
# ("record" is currently commented out and has no stringifier below).
tasks = [
    "copa", "xnli", "pawsx", "storycloze",
    "hellaswag",
    # "record",
    "piqa",
    "openbookqa",
    "winograd",
]

# Language used per task; "any" is the fallback for tasks without an entry.
langs_mapping = {"any": "en"}

# Per-task function that flattens one data item into the single string that
# gets tokenized. Every task listed in `tasks` needs an entry here.
# Template for new tasks:
#   task: lambda item: " ".join([item[c] for c in ['', '', '']]),
# NOTE: the dict used to define "winograd" twice; the first definition (a copy
# of the PIQA fields) was silently overwritten by the later one and has been
# removed, so each key now appears exactly once.
task_instance_to_str = {
    "copa": lambda item: " ".join([item[c] for c in ['premise', 'choice1', 'choice2']]),
    "xnli": lambda item: " ".join([item[c] for c in ['sentence1', 'sentence2']]),
    "pawsx": lambda item: " ".join([item[c] for c in ['sentence1', 'sentence2']]),
    "storycloze": lambda item: " ".join([item[c] for c in ['InputSentence1', 'InputSentence2', 'InputSentence3', 'InputSentence4', 'RandomFifthSentenceQuiz1', 'RandomFifthSentenceQuiz2']]),
    "hellaswag": lambda item: " ".join([item["ctx"]] + item["endings"]),
    # One full sentence per candidate: candidate followed by the shared suffix.
    "winograd": lambda item: " ".join([item["txt1"]] + [x + " " + item["txt2"] for x in item["candidates"]]),
    "piqa": lambda item: " ".join([item[c] for c in ['goal', 'sol1', 'sol2']]),
    # Question stem followed by one "LABEL) text" line per answer choice.
    "openbookqa": lambda item: "\n".join([item["question"]["stem"]] + [ch["label"] + ") " + ch["text"] for ch in item["question"]["choices"]]),
}

def get_token_info(items, task, lang):
    """Tokenize every item of a data set and summarize the token counts.

    Args:
        items: iterable of task instances, each accepted by the task's
            stringifier in ``task_instance_to_str``.
        task: task name; key into ``task_instance_to_str``.
        lang: language code; key into ``lang_tokenizers``.

    Returns:
        dict with
            "items": number of items processed,
            "total_tokens": sum of the token counts of all items,
            "tokens_per_item": mean token count per item (0.0 when there are
                no items).

    Raises:
        KeyError: if ``task`` or ``lang`` has no registered stringifier or
            tokenizer.
    """
    stringify = task_instance_to_str[task]
    tokenizer = lang_tokenizers[lang]

    # Each item is flattened to one string, tokenized, and only the length kept.
    items_tokens_cnt = [len(tokenizer(stringify(item))) for item in items]

    n_items = len(items_tokens_cnt)
    total_tokens = sum(items_tokens_cnt)
    return {
        "items": n_items,
        "total_tokens": total_tokens,
        # An empty set has no meaningful average; report 0.0 instead of
        # raising ZeroDivisionError.
        "tokens_per_item": total_tokens / n_items if n_items else 0.0,
    }
    
    
# Walk the task registry (in registry order) and append one token-statistics
# record to `values` for every available data set of each selected task.
for task_name, task_cls in FEW_SHOT_TASKS_REGISTRY.items():
    # Only the tasks selected in `tasks` are measured.
    if task_name not in tasks:
        continue
    print(f"task:{task_name}")

    # Tasks without their own entry fall back to the "any" language.
    languages = langs_mapping.get(task_name, [langs_mapping["any"]])
    path_mappings = task_cls.get_sets_and_lang_to_path_mappings()

    for lang in languages:
        sets_for_lang = get_sets_by_lang(path_mappings, lang)
        for set_name, set_file in sets_for_lang.items():
            # Sets without a file for this language are skipped.
            if set_file is None:
                continue

            task_instance = task_cls.from_kwargs(language=lang)
            task_items = task_instance.read_data(str(set_file))

            # Show the first item of each set as a sanity check.
            print(f"{task_name} {set_name} ")
            print(task_items[0])

            # Identification fields first, then the token statistics.
            values.append({
                "task": task_name,
                "lang": lang,
                "set": set_name,
                **get_token_info(task_items, task_name, lang),
            })
            
            
            
    # break # debug
            
        
        
    

task:copa
copa train 
{'premise': 'My body cast a shadow over the grass.', 'choice1': 'The sun was rising.', 'choice2': 'The grass was cut.', 'question': 'cause', 'label': 0, 'idx': 0}
copa val 
{'premise': 'The man turned on the faucet.', 'choice1': 'The toilet filled with water.', 'choice2': 'Water flowed from the spout.', 'question': 'effect', 'label': 1, 'idx': 0}
copa test 
{'premise': 'The item was packaged in bubble wrap.', 'choice1': 'It was fragile.', 'choice2': 'It was small.', 'question': 'cause', 'idx': 0}
task:pawsx
pawsx dev 
{'id': '4', 'sentence1': 'From the merger of the Four Rivers Council and the Audubon Council , the Shawnee Trails Council was born .', 'sentence2': 'Shawnee Trails Council was formed from the merger of the Four Rivers Council and the Audubon Council .', 'label': '1'}
pawsx test 
{'id': '10', 'sentence1': 'The exception was between late 2005 and 2009 when he played in Sweden with Carlstad United BK , Serbia with FK Borac Čačak and Russian FC Terek Gro

In [5]:
# Build the summary table from the collected records (one row per data set),
# save it as a tab-separated file, and display it as the cell output.
df = pd.DataFrame(values)
df.to_csv("tokens_info.tsv", sep="\t")
df

Unnamed: 0,task,lang,set,items,total_tokens,tokens_per_item
0,copa,en,train,400,7849,19.6225
1,copa,en,val,100,2005,20.05
2,copa,en,test,500,9620,19.24
3,pawsx,en,dev,2000,86219,43.1095
4,pawsx,en,test,2000,86982,43.491
5,hellaswag,en,train,39905,6498519,162.849743
6,hellaswag,en,val,10042,1695358,168.826728
7,hellaswag,en,test,10003,1646345,164.585124
8,storycloze,en,val2016,1871,107514,57.463389
9,storycloze,en,test2016,1871,107569,57.492785
