# Correlations between tasks and distances

In [40]:
import pandas as pd
import lang2vec.lang2vec as l2v

# Lift pandas' display limits (columns, rows, column width) so frames are
# rendered without truncation.
for display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

## Read scores for all tasks

In [41]:
# POS and NER keep only the language pair and its accuracy; the shared "acc"
# name is disambiguated later by the merge suffixes.
score_columns = ["train_lang", "test_lang", "acc"]
udpos_scores = pd.read_csv("udpos/lang_pair_scores.csv")[score_columns]
ner_scores = pd.read_csv("ner/ner_lang_pair_scores.csv")[score_columns]

# XNLI keeps every column of its file; its accuracy is renamed up front so it
# does not clash with the other tasks when merged.
xnli_scores = pd.read_csv("xnli/xnli_lang_pair_scores.csv")
xnli_scores = xnli_scores.rename(columns={"acc": "acc_xnli"})

In [42]:
# Outer joins keep every (train_lang, test_lang) pair seen in any task; a task
# that lacks a pair gets NaN in its accuracy column.
pair_keys = ["train_lang", "test_lang"]
pos_ner = ner_scores.merge(
    udpos_scores,
    on=pair_keys,
    suffixes=("_ner", "_pos"),
    how="outer",
)
all_tasks = pos_ner.merge(xnli_scores, on=pair_keys, how="outer")

In [43]:
all_tasks.head()

Unnamed: 0,train_lang,test_lang,acc_ner,acc_pos,acc_xnli
0,af,af,0.978337,0.98439,
1,af,ar,0.793688,0.587932,
2,af,bg,0.903963,0.847598,
3,af,bn,0.709012,,
4,af,de,0.921272,0.837953,


## Read files with language codes

In [44]:
# Language table, indexed by the first column of the spreadsheet.
lang_feats = pd.read_excel("../xtreme_langs_wals.xlsx", index_col=0)

# Two parallel code lists: the 'ISO 693-2' column (passed to lang2vec below)
# and the frame's index (used to label the distance matrices).
lang_codes_2 = lang_feats['ISO 693-2'].tolist()
lang_codes_1 = lang_feats.index.tolist()

# Drop one entry from each list.
# NOTE(review): "may" and "ms" are presumably the same language/row, so the two
# lists stay aligned position by position -- confirm against the spreadsheet.
lang_codes_2.remove("may")
lang_codes_1.remove("ms")

Compute matrices with syntactic, geographic and genetic distances

In [45]:
# One square frame per distance type. lang2vec is queried with lang_codes_2,
# while rows and columns are labelled with lang_codes_1 so the matrices can be
# indexed with the codes used in the score tables.
synt_dist, geo_dist, gen_dist = (
    pd.DataFrame(data=distance_fn(lang_codes_2), index=lang_codes_1, columns=lang_codes_1)
    for distance_fn in (
        l2v.syntactic_distance,
        l2v.geographic_distance,
        l2v.genetic_distance,
    )
)

Transform them into a single dataframe

In [46]:
# Attach the syntactic, geographic and genetic distance of every
# (train_lang, test_lang) pair to its row in all_tasks.
for i in all_tasks.index:
    train_lang = all_tasks.at[i, "train_lang"]
    test_lang = all_tasks.at[i, "test_lang"]
    try:
        # Look up all three values before writing anything, so a row never
        # ends up with a mix of real distances and sentinels.
        syn_value = synt_dist.at[train_lang, test_lang]
        geo_value = geo_dist.at[train_lang, test_lang]
        gen_value = gen_dist.at[train_lang, test_lang]
    except KeyError:
        # Only a missing row/column label means "no distance available".
        # Any other error (or a keyboard interrupt) now propagates instead of
        # being reported as a missing language.
        print("Language not in URIEL",train_lang, test_lang)
        # -1 is the sentinel that a later cell filters out.
        syn_value = geo_value = gen_value = -1
    all_tasks.at[i, "syn_dist"] = syn_value
    all_tasks.at[i, "geo_dist"] = geo_value
    all_tasks.at[i, "gen_dist"] = gen_value

Language not in URIEL af ms
Language not in URIEL ar ms
Language not in URIEL bg ms
Language not in URIEL bn ms
Language not in URIEL de ms
Language not in URIEL el ms
Language not in URIEL en ms
Language not in URIEL es ms
Language not in URIEL et ms
Language not in URIEL eu ms
Language not in URIEL fa ms
Language not in URIEL fi ms
Language not in URIEL fr ms
Language not in URIEL he ms
Language not in URIEL hi ms
Language not in URIEL hu ms
Language not in URIEL id ms
Language not in URIEL it ms
Language not in URIEL ja ms
Language not in URIEL ka ms
Language not in URIEL kk ms
Language not in URIEL ko ms
Language not in URIEL ml ms
Language not in URIEL mr ms
Language not in URIEL ms af
Language not in URIEL ms ar
Language not in URIEL ms bg
Language not in URIEL ms bn
Language not in URIEL ms de
Language not in URIEL ms el
Language not in URIEL ms en
Language not in URIEL ms es
Language not in URIEL ms et
Language not in URIEL ms eu
Language not in URIEL ms fa
Language not in URIE

In [47]:
all_tasks.head()

Unnamed: 0,train_lang,test_lang,acc_ner,acc_pos,acc_xnli,syn_dist,geo_dist,gen_dist
0,af,af,0.978337,0.98439,,0.0,0.0,0.0
1,af,ar,0.793688,0.587932,,0.71,1.0,1.0
2,af,bg,0.903963,0.847598,,0.59,0.4,0.8571
3,af,bn,0.709012,,,0.58,0.4,0.875
4,af,de,0.921272,0.837953,,0.52,0.4,0.2857


In [48]:
# Keep only cross-lingual pairs (train != test) whose three distances are all
# available, i.e. none of them carries the -1 sentinel.
is_cross_lingual = all_tasks.train_lang != all_tasks.test_lang
has_all_distances = (all_tasks[["syn_dist", "geo_dist", "gen_dist"]] != -1).all(axis=1)
all_tasks = all_tasks[is_cross_lingual & has_all_distances]

In [49]:
from scipy import stats

# Pearson r between each task's accuracy and each distance measure, computed
# only on the language pairs where both values are present.
corrs = {"task": [], "syn": [], "geo": [], "gen": []}
for task in ("ner", "pos", "xnli"):
    print("Task",task)
    corrs["task"].append(task)
    acc_col = 'acc_' + task
    for dist in ("syn", "geo", "gen"):
        dist_col = dist + '_dist'
        complete_pairs = all_tasks.dropna(subset=[acc_col, dist_col])
        # pearsonr returns (r, p-value); only the coefficient is kept.
        pearson_r = stats.pearsonr(complete_pairs[acc_col], complete_pairs[dist_col])[0]
        corrs[dist].append(pearson_r)

Task ner
Task pos
Task xnli


In [50]:
# One row per task, one column per distance measure.
corrs_table = pd.DataFrame(corrs)

In [52]:
corrs_table

Unnamed: 0,task,syn,geo,gen
0,ner,-0.183454,-0.197949,-0.198537
1,pos,-0.292522,-0.378323,-0.31914
2,xnli,-0.157961,-0.274467,-0.100264


In [53]:
# print(corrs_table.to_latex())

In [54]:
from scipy import stats

# Pairwise Pearson r between the accuracies of the three tasks, computed on
# the language pairs for which both tasks have a score.
corrs = {"task":[],"ner":[],"pos":[],"xnli":[]}
for task1 in ["ner", "pos", "xnli"]:
    # Report the task of the current row. The previous code printed `task`,
    # a leftover variable from an earlier cell, so it always showed its last
    # value instead of the task being processed.
    print("Task",task1)
    corrs["task"].append(task1)
    for task2 in ["ner", "pos", "xnli"]:
        drop_na = all_tasks.dropna(subset=['acc_'+task1,'acc_'+task2])
        # pearsonr returns (r, p-value); only the coefficient is kept.
        corr = stats.pearsonr(drop_na['acc_'+task1], drop_na['acc_'+task2])
        corrs[task2].append(corr[0])

Task xnli
Task xnli
Task xnli


In [55]:
# Square table: one row and one column per task.
corrs_table = pd.DataFrame(corrs)

In [56]:
corrs_table

Unnamed: 0,task,ner,pos,xnli
0,ner,1.0,0.495913,0.179361
1,pos,0.495913,1.0,0.065942
2,xnli,0.179361,0.065942,1.0


In [57]:
# print(corrs_table.to_latex())

## Show best source language for each target language and task

In [58]:
def _best_train_lang(x, score_col):
    """Return the ``train_lang`` of the row of ``x`` with the highest ``score_col``.

    ``x`` is a DataFrame with a ``train_lang`` column and the given score
    column. The winning row is the one ``idxmax`` selects on that column.
    """
    best_row = x[score_col].idxmax()
    return x.at[best_row, "train_lang"]


def get_best_lang_pos(x):
    """Return the training language with the highest ``acc_pos`` in ``x``."""
    return _best_train_lang(x, "acc_pos")


def get_best_lang_ner(x):
    """Return the training language with the highest ``acc_ner`` in ``x``."""
    return _best_train_lang(x, "acc_ner")


def get_best_lang_xnli(x):
    """Return the training language with the highest ``acc_xnli`` in ``x``."""
    return _best_train_lang(x, "acc_xnli")

# Best source (training) language per target language, one column per task.
#
# All three columns are built first and combined in a single step. Assigning
# Series one at a time into an empty DataFrame fixes the index to the first
# task's target languages, and targets that only occur in a later task are
# then dropped silently. Building from a dict of Series uses the union of
# their indexes instead, leaving NaN where a task has no entry.
best_lang_getters = {
    "ner": get_best_lang_ner,
    "pos": get_best_lang_pos,
    "xnli": get_best_lang_xnli,
}
# Never pick the target language itself as its own best source.
cross_lingual = all_tasks[all_tasks.train_lang != all_tasks.test_lang]
best_train_langs = pd.DataFrame(
    {
        task_name: cross_lingual.dropna(subset=["acc_" + task_name])
        .groupby(by="test_lang")
        .apply(getter)
        for task_name, getter in best_lang_getters.items()
    }
).rename_axis("test_lang")

In [59]:
best_train_langs

Unnamed: 0_level_0,ner,pos,xnli
test_lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
af,nl,en,
ar,fa,he,bg
bg,ru,fr,th
bn,ta,,
de,nl,nl,fr
el,it,de,de
en,id,it,de
es,fr,it,ru
et,fi,fi,
eu,hu,ur,


In [68]:
# True for target languages that have a best source language in exactly one task.
single_task = best_train_langs.notna().sum(axis=1) == 1

In [70]:
# Among the remaining targets, count how many distinct best source languages
# each one has across its tasks (missing tasks are ignored by nunique), then
# tally those counts.
best_train_langs.loc[~single_task].nunique(axis=1).value_counts()

2    18
3     9
1     4
dtype: int64

We can see that, out of the 31 target languages that appear in more than one task, only 4 have the same best source language in every task in which they appear: Estonian, Portuguese, Japanese and Thai.