# Analysing UD POS predictions for different languages

In [1]:
import pandas as pd
import utils
from collections import defaultdict

# Display settings: allow very long tables and never truncate cell text, so
# that whole sentences stay readable in the mistake tables shown further down.
pd.options.display.max_rows = 100000
# `None` is the supported way to say "no width limit"; the former `-1`
# spelling has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# Root folder; the analysis loop reads `dev_sets/` and `results/` below it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
PATH="C:/Users/bdolicki/Documents/Git/multilingual-analysis/code/analysing_predictions/udpos"

  


In [2]:
# Every language code that can be analysed. Only a sample of three is used
# below as an example; any code listed in all_langs can be substituted.
all_langs = [
    'af', 'ar', 'bg', 'el', 'en',
    'es', 'et', 'eu', 'fa', 'fi',
    'fr', 'he', 'hi', 'id', 'it',
    'ja', 'ko', 'nl', 'pt', 'ru',
]
# Languages used as `train_lang` when locating prediction files.
train_langs = ['af', 'ar', 'en']
# Languages used as `test_lang` (dev set and prediction file names).
test_langs = ['af', 'ar', 'en']

## Aggregated analysis

### Helper functions
The functions below can be used to observe different aspects of the predictions. We call all of them in our script to show example usage but, depending on what you want to focus on, they can be used fairly independently - for example, one can use only the first function.

In [3]:
def most_confused_pos(conf):
    """Show and return the most frequently confused POS tags for a language pair.

    Args:
        conf: confusion matrix as a DataFrame. Counts are summed down each
            column, so every column label is treated as one POS tag.

    Returns:
        DataFrame indexed by tag, ordered by descending count, with columns
        "count" (column total) and "percent" (that total divided by the sum
        of all totals - a fraction, not multiplied by 100).
    """
    column_totals = conf.sum().sort_values(ascending=False)
    conf_count = pd.DataFrame({"count": column_totals})
    grand_total = conf_count["count"].sum()
    conf_count["percent"] = conf_count["count"] / grand_total
    print("Most confused POS tags")
    # Only the five most confused tags are displayed; the full table is returned.
    display(conf_count.head(5))
    return conf_count
    
def most_confused_pos_pairs(conf):
    """Display the five most frequently confused POS pairs for a language pair.

    The confusion matrix is reshaped to long format, one row per cell: the
    index label becomes the "pred" column, the column label becomes "actual",
    and the cell value becomes "count". "percent" is the cell count divided by
    the sum of all cells (a fraction, not multiplied by 100).

    Args:
        conf: confusion matrix as a DataFrame.

    Returns:
        None. The top rows are shown with `display`.
    """
    pairs = (
        # Name the index explicitly so the reshape works whether or not the
        # incoming index already carries a name (a bare `reset_index()` only
        # yields an "index" column when the index is unnamed).
        conf.rename_axis(index="pred")
        .reset_index()
        .melt(id_vars="pred", var_name="actual", value_name="count")
    )
    pairs["percent"] = pairs["count"] / pairs["count"].sum()
    pairs = pairs.sort_values(by="count", ascending=False)
    print("Most confused POS pairs")
    display(pairs.head())

def most_mistaken_word_with_tags(mistakes, analyzer=None):
    """Print, for the 5 most frequently mistaken words, how often each tag occurs.

    Useful to see whether a word is ambiguous, i.e. carries more than one POS
    tag in the data.

    Args:
        mistakes: DataFrame of mistaken words; only its `token` column is read.
        analyzer: object exposing `all_sentences` and `all_tags`, two
            position-aligned sequences of whitespace-separated strings
            (words and their tags). When omitted, the notebook-level `pa`
            is used, so existing one-argument calls keep working.

    Returns:
        None. For each word a header line and a `{tag: count}` dict are printed.
    """
    if analyzer is None:
        # Backward-compatible fallback to the analyzer created in the notebook loop.
        analyzer = pa

    top_tokens = list(mistakes.token.value_counts().head(5).index)
    tag_counts = {token: defaultdict(int) for token in top_tokens}

    # Single pass over the corpus for all tokens, instead of one full scan per token.
    # NOTE(review): zip stops at the shorter side, so a sentence with fewer
    # tags than words is truncated silently rather than raising.
    for sentence, tags in zip(analyzer.all_sentences, analyzer.all_tags):
        for word, tag in zip(sentence.split(), tags.split()):
            if word in tag_counts:
                tag_counts[word][tag] += 1

    for token in top_tokens:
        print("Possible tags for the word", token)
        print(dict(tag_counts[token]))
        
def mistaken_words_with_word_freq(pa, mistakes=None):
    """Display how often each word was mistaken relative to how often it occurs.

    Args:
        pa: analyzer whose `transform_data(predicate, conf_matrix=True)` call
            returns a pair; the first element must have a `token` column.
            It is called with a predicate that accepts every token, so the
            result is used as the table of all words.
        mistakes: DataFrame of mistaken words (only `token` is read). When
            omitted, the notebook-level `mistakes_df` is used, so existing
            one-argument calls keep working.

    Returns:
        None. The five rows with the highest mistaken ratio are shown.
    """
    if mistakes is None:
        # Backward-compatible fallback to the table produced in the notebook loop.
        mistakes = mistakes_df

    def is_all(actual, pred=None):
        return True

    all_df, _ = pa.transform_data(is_all,conf_matrix=True)

    # Count each table once and reuse the results.
    all_counts = all_df.token.value_counts()
    mistaken_counts = mistakes.token.value_counts()

    relative = pd.DataFrame()
    # "% mistaken" holds a ratio in [0, 1]; the label is kept as-is because it is displayed.
    relative["% mistaken"] = mistaken_counts.divide(all_counts).sort_values(ascending=False)
    relative["all_count"] = all_counts
    relative["mistaken_count"] = mistaken_counts
    # Words that were never mistaken have NaN in the mistake columns; drop them.
    relative = relative.dropna()
    print("Relative mistaken word frequency")
    display(relative.sort_values(by="% mistaken",ascending=False).head())

def most_mistaken_words_per_target_func(most_mistaken_words_list):
    """Display the 5 most frequent entries of a list of mistaken words.

    Args:
        most_mistaken_words_list: words collected for one target language
            across all source languages; a word may appear several times.

    Returns:
        None. The occurrence counts of the five most frequent words are shown.
    """
    print("Most mistaken words per target language from all source languages")
    # Count the list that was passed in, not a same-purpose notebook global.
    display(pd.Series(most_mistaken_words_list).value_counts().head(5))

def most_mistaken_pos_per_target_func(most_mistaken_pos_list):
    """Display the 5 most frequent entries of a list of mistaken POS tags.

    Args:
        most_mistaken_pos_list: POS tags collected for one target language
            across all source languages; a tag may appear several times.

    Returns:
        None. The occurrence counts of the five most frequent tags are shown.
    """
    print("Most mistaken POS tags per target language from all source languages")
    # Count the list that was passed in, not a same-purpose notebook global.
    display(pd.Series(most_mistaken_pos_list).value_counts().head(5))

def top1_mistaken_pos_per_target(most_mistaken_pos_lang):
    """Display the top mistaken POS tag of every target language as a table.

    Args:
        most_mistaken_pos_lang: dict of equally long lists; each key becomes
            a column of the displayed DataFrame.

    Returns:
        None. The table is shown with `display`.
    """
    print("Top 1 mistaken POS tag for all target languages")
    summary_table = pd.DataFrame.from_dict(most_mistaken_pos_lang)
    display(summary_table)

In [5]:
# One entry per target language: the POS tag that most often tops the
# per-source-language lists built in the loop below.
most_mistaken_pos_lang = {"lang":[],"tag":[]}
# mistakes[train_lang][test_lang] -> table of mistaken tokens for that language pair
mistakes = defaultdict(dict)


def is_pred_wrong(actual,pred=None):
    """Selection predicate handed to `transform_data`: keep tokens whose prediction differs."""
    return actual!=pred


for test_lang in test_langs:
    # Aggregated over all source languages for the current target language.
    # NOTE: the helper functions read several of the names below as notebook
    # globals (`pa`, `mistakes_df`, the two lists), so they must keep these names.
    most_mistaken_words_per_target = []
    most_mistaken_pos_per_target = []
    print("test",test_lang)

    for train_lang in train_langs:
        print('train',train_lang)
        pa = utils.PredictionsAnalyzer()
        ud_pos = pa.read_tag_data(
            f"{PATH}/dev_sets/dev-{test_lang}.tsv",
            f"{PATH}/results/dev_{train_lang}_{test_lang}_predictions.txt",
        )

        mistakes_df, conf_df = pa.transform_data(is_pred_wrong,conf_matrix=True)
        # Keep the mistakes table of every language pair for further analysis.
        # With a big subset of languages this might cause memory issues.
        mistakes[train_lang][test_lang] = mistakes_df

        conf_count = most_confused_pos(conf_df)
        most_confused_pos_pairs(conf_df)
        most_mistaken_word_with_tags(mistakes_df)

        # Same source and target language means the model saw this language in training.
        print("Supervised" if train_lang==test_lang else "Zero-shot")

        # Remember the two most mistaken words and the single most confused tag of this pair.
        most_mistaken_words_per_target.extend(mistakes_df.token.value_counts().head(2).index)
        most_mistaken_pos_per_target.append(conf_count.index[0])

        mistaken_words_with_word_freq(pa)

    most_mistaken_words_per_target_func(most_mistaken_words_per_target)
    most_mistaken_pos_per_target_func(most_mistaken_pos_per_target)

    # The tag that was most often ranked first across the source languages.
    top_tag_counts = pd.Series(most_mistaken_pos_per_target).value_counts()
    most_mistaken_pos_lang["lang"].append(test_lang)
    most_mistaken_pos_lang["tag"].append(top_tag_counts.index[0])


test af
train af
Most confused POS tags


Unnamed: 0,count,percent
ADV,13.0,0.156627
DET,13.0,0.156627
X,11.0,0.13253
PROPN,11.0,0.13253
ADJ,8.0,0.096386


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
85,ADJ,DET,11.0,0.13253
203,X,PROPN,8.0,0.096386
44,PRON,ADV,6.0,0.072289
279,NOUN,X,4.0,0.048193
34,ADJ,ADV,4.0,0.048193


Possible tags for the word sodanige
{'DET': 11}
Possible tags for the word Crime
{'PROPN': 7}
Possible tags for the word hier
{'ADV': 9}
Possible tags for the word aanspreeklik
{'ADV': 2, 'ADJ': 2}
Possible tags for the word gebruik
{'VERB': 20, 'NOUN': 15}
Supervised
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
ya,1.0,1,1.0
sodanige,1.0,11,11.0
FW,1.0,1,1.0
gelisensieer,1.0,1,1.0
geleë,1.0,1,1.0


train ar
Most confused POS tags


Unnamed: 0,count,percent
DET,398.0,0.260813
AUX,236.0,0.154653
PRON,223.0,0.146134
ADV,142.0,0.093054
VERB,134.0,0.087811


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
66,VERB,AUX,178.0,0.116645
175,DET,PRON,142.0,0.093054
92,NOUN,DET,140.0,0.091743
94,PART,DET,101.0,0.066186
97,PUNCT,DET,100.0,0.065531


Possible tags for the word die
{'DET': 318}
Possible tags for the word wat
{'PRON': 91}
Possible tags for the word 'n
{'DET': 80}
Possible tags for the word te
{'PART': 104, 'ADP': 1}
Possible tags for the word sal
{'AUX': 50}
Zero-shot
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
©,1.0,1,1.0
onmiddellik,1.0,1,1.0
versoenbaar,1.0,1,1.0
oorlede,1.0,1,1.0
oorgesend,1.0,1,1.0


train en
Most confused POS tags


Unnamed: 0,count,percent
PRON,99.0,0.159935
ADP,95.0,0.153473
X,82.0,0.132472
ADV,73.0,0.117932
NOUN,52.0,0.084006


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
30,SCONJ,ADP,87.0,0.140549
283,PROPN,X,59.0,0.095315
175,DET,PRON,48.0,0.077544
130,PROPN,NOUN,45.0,0.072698
43,PART,ADV,32.0,0.051696


Possible tags for the word nie
{'ADV': 36, 'PART': 47}
Possible tags for the word om
{'ADP': 83}
Possible tags for the word GCIS
{'X': 47}
Possible tags for the word enige
{'PRON': 39}
Possible tags for the word as
{'SCONJ': 17}
Zero-shot
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
ya,1.0,1,1.0
GCIS,1.0,47,47.0
regs-,1.0,1,1.0
probeer,1.0,5,5.0
pre-Staatsrede-debat,1.0,1,1.0


Most mistaken words per target language from all source languages


om          1
nie         1
die         1
sodanige    1
wat         1
dtype: int64

Most mistaken POS tags per target language from all source languages


DET     1
ADV     1
PRON    1
dtype: int64

test ar
train af
Most confused POS tags


Unnamed: 0,count,percent
NOUN,3194.0,0.298282
X,1994.0,0.186216
ADJ,1478.0,0.138028
CCONJ,1131.0,0.105622
NUM,861.0,0.080407


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
283,PROPN,X,1405.0,0.13121
120,ADP,NOUN,996.0,0.093015
130,PROPN,NOUN,868.0,0.081061
150,SYM,NUM,848.0,0.079193
258,AUX,VERB,666.0,0.062196


Possible tags for the word ان
{'CCONJ': 239, 'X': 1, 'PART': 3}
Possible tags for the word التي
{'DET': 163, 'X': 1}
Possible tags for the word أن
{'CCONJ': 148}
Possible tags for the word الذي
{'DET': 85, 'X': 13}
Possible tags for the word -
{'PUNCT': 224}
Zero-shot
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
ييري,1.0,1,1.0
يتنازل,1.0,2,2.0
يتمتع,1.0,1,1.0
يتكلم,1.0,1,1.0
يتفقون,1.0,1,1.0


train ar
Most confused POS tags


Unnamed: 0,count,percent
X,272.0,0.311569
NOUN,229.0,0.262314
ADJ,117.0,0.134021
VERB,85.0,0.097365
CCONJ,53.0,0.06071


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
279,NOUN,X,130.0,0.148912
119,ADJ,NOUN,93.0,0.106529
135,X,NOUN,89.0,0.101947
7,NOUN,ADJ,83.0,0.095074
258,AUX,VERB,48.0,0.054983


Possible tags for the word الذي
{'DET': 85, 'X': 13}
Possible tags for the word كان
{'VERB': 31, 'AUX': 9}
Possible tags for the word مليون
{'NUM': 34, 'X': 8}
Possible tags for the word النهائي
{'NOUN': 12, 'ADJ': 1}
Possible tags for the word وفي
{'CCONJ': 17, 'ADP': 8}
Supervised
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
يوغوسلافي,1.0,1,1.0
وديبورتيفو,1.0,1,1.0
ورازندارا,1.0,1,1.0
ورافتر,1.0,1,1.0
مساس,1.0,1,1.0


train en
Most confused POS tags


Unnamed: 0,count,percent
NOUN,4195.0,0.403172
X,1884.0,0.181067
ADJ,1520.0,0.146084
CCONJ,1131.0,0.108698
VERB,502.0,0.048246


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
130,PROPN,NOUN,1888.0,0.181451
283,PROPN,X,1576.0,0.151466
11,PROPN,ADJ,819.0,0.078712
120,ADP,NOUN,808.0,0.077655
134,VERB,NOUN,524.0,0.05036


Possible tags for the word ان
{'CCONJ': 239, 'X': 1, 'PART': 3}
Possible tags for the word التي
{'DET': 163, 'X': 1}
Possible tags for the word أن
{'CCONJ': 148}
Possible tags for the word الذي
{'DET': 85, 'X': 13}
Possible tags for the word ما
{'DET': 68, 'AUX': 1, 'PART': 1}
Zero-shot
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
ييري,1.0,1,1.0
وتشهد,1.0,2,2.0
وتركيا,1.0,3,3.0
وغيره,1.0,1,1.0
وغيرها,1.0,1,1.0


Most mistaken words per target language from all source languages


ان      2
التي    2
كان     1
الذي    1
dtype: int64

Most mistaken POS tags per target language from all source languages


NOUN    2
X       1
dtype: int64

test en
train af
Most confused POS tags


Unnamed: 0,count,percent
VERB,1574.0,0.179496
NOUN,1120.0,0.127723
PART,997.0,0.113696
ADJ,886.0,0.101038
ADV,844.0,0.096248


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
258,AUX,VERB,725.0,0.082678
150,SYM,NUM,619.0,0.07059
255,ADJ,VERB,433.0,0.049378
155,ADV,PART,388.0,0.044247
135,X,NOUN,384.0,0.043791


Possible tags for the word to
{'PART': 849, 'ADP': 516, 'SCONJ': 8, 'ADV': 2}
Possible tags for the word 's
{'AUX': 85, 'PART': 195, 'VERB': 32, 'PRON': 3}
Possible tags for the word not
{'PART': 224, 'ADV': 11, 'CCONJ': 1}
Possible tags for the word as
{'ADP': 126, 'SCONJ': 96, 'ADV': 49, 'CCONJ': 2}
Possible tags for the word n't
{'PART': 152, 'ADV': 4}
Zero-shot
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
“um”s,1.0,1,1.0
waiting,1.0,4,4.0
08/08/2000,1.0,1,1.0
washed,1.0,1,1.0
08/10/2000,1.0,1,1.0


train ar
Most confused POS tags


Unnamed: 0,count,percent
DET,4356.0,0.20135
PROPN,3555.0,0.164325
PRON,2544.0,0.117593
VERB,2432.0,0.112416
ADV,2119.0,0.097948


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
92,NOUN,DET,3513.0,0.162383
203,X,PROPN,2589.0,0.119673
66,VERB,AUX,1236.0,0.057132
262,NOUN,VERB,1050.0,0.048535
194,NOUN,PROPN,893.0,0.041278


Possible tags for the word the
{'DET': 2708, 'PRON': 2}
Possible tags for the word a
{'DET': 1241, 'PRON': 1, 'X': 3}
Possible tags for the word to
{'PART': 849, 'ADP': 516, 'SCONJ': 8, 'ADV': 2}
Possible tags for the word I
{'PRON': 655, 'NUM': 3, 'ADJ': 2}
Possible tags for the word that
{'PRON': 225, 'SCONJ': 260, 'DET': 54, 'ADP': 2, 'ADV': 3}
Zero-shot
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
“um”s,1.0,1,1.0
samantha,1.0,1,1.0
immediately,1.0,6,6.0
pup,1.0,1,1.0
imam,1.0,1,1.0


train en
Most confused POS tags


Unnamed: 0,count,percent
NOUN,378.0,0.180086
ADJ,301.0,0.143402
ADV,296.0,0.14102
PROPN,245.0,0.116722
VERB,213.0,0.101477


Most confused POS pairs


Unnamed: 0,pred,actual,count,percent
194,NOUN,PROPN,188.0,0.089566
130,PROPN,NOUN,185.0,0.088137
35,ADP,ADV,112.0,0.053359
258,AUX,VERB,97.0,0.046212
15,VERB,ADJ,87.0,0.041448


Possible tags for the word all
{'PRON': 44, 'DET': 91, 'ADV': 29, 'NOUN': 1}
Possible tags for the word is
{'AUX': 567, 'VERB': 77}
Possible tags for the word one
{'NUM': 89, 'PRON': 36, 'NOUN': 3, 'DET': 2}
Possible tags for the word out
{'ADJ': 1, 'ADP': 65, 'ADV': 68, 'X': 1, 'NOUN': 1}
Possible tags for the word that
{'PRON': 225, 'SCONJ': 260, 'DET': 54, 'ADP': 2, 'ADV': 3}
Supervised
Relative mistaken word frequency


Unnamed: 0,% mistaken,all_count,mistaken_count
clanking,1.0,1,1.0
MCT,1.0,1,1.0
Mad,1.0,1,1.0
Least,1.0,1,1.0
Investment,1.0,1,1.0


Most mistaken words per target language from all source languages


is     1
the    1
all    1
's     1
to     1
dtype: int64

Most mistaken POS tags per target language from all source languages


DET     1
VERB    1
NOUN    1
dtype: int64

In [57]:
# Render the per-target-language summary collected by the aggregation loop:
# one row per target language with its top mistaken POS tag.
top1_mistaken_pos_per_target(most_mistaken_pos_lang)

Top 1 mistaken POS tag for all target languages


Unnamed: 0,lang,tag
0,af,ADV
1,ar,NOUN
2,en,NOUN


## Further analysis

We can use the stored tables to zoom in on particular language pairs. Here we look at the predictions when Afrikaans was the source language and English was the target language.

In [53]:
# Stored mistakes table for source language 'af' and target language 'en'.
mistakes_df = mistakes['af']['en']
# Show the examples where DET was the actual tag.
is_det = mistakes_df['actual'].eq('DET')
mistakes_df.loc[is_det]

Unnamed: 0,predicted,actual,token,sentence,actual_tokens
20,PRON,DET,their,The Cunha report on multiannual guidance programmes comes before Parliament on Thursday and contains a proposal in paragraph 6 that a form of quota penalties should be introduced for countries which fail to meet their fleet reduction targets annually .,DET PROPN NOUN ADP ADJ NOUN NOUN VERB ADP NOUN ADP NOUN CCONJ VERB DET NOUN ADP NOUN NUM PRON DET NOUN ADP NOUN NOUN AUX AUX VERB ADP NOUN PRON VERB PART VERB DET ADJ NOUN NOUN ADV PUNCT
31,PRON,DET,another,"In many ways , the prerequisites differ from one Member State to another .",ADP ADJ NOUN PUNCT DET NOUN VERB ADP NUM NOUN NOUN ADP DET PUNCT
33,PRON,DET,his,The reason Mr Koch produced his sound report was because the work in the CEN and within the United Nations Economic Commission was proceeding none too expeditiously .,DET NOUN NOUN PROPN VERB DET ADJ NOUN VERB SCONJ DET NOUN ADP DET PROPN CCONJ ADP DET PROPN PROPN ADJ NOUN AUX VERB ADV ADV ADV PUNCT
42,PRON,DET,his,"Mr President , I would once again like to congratulate Mr Koch on his magnificent work on this other report , which in a way supplements the debate which we held in October on rail transport .",NOUN NOUN PUNCT PRON AUX ADV ADV VERB PART VERB NOUN PROPN ADP DET ADJ NOUN ADP DET ADJ NOUN PUNCT PRON ADP DET NOUN VERB DET NOUN PRON PRON VERB ADP PROPN ADP NOUN NOUN PUNCT
47,PRON,DET,its,We oppose the excessive control the central administration of the Union and its Member States exercises and we are calling for a reduction in the bureaucracy that has taken root in the drafting and implementation of programmes .,PRON VERB DET ADJ NOUN DET ADJ NOUN ADP DET PROPN CCONJ DET NOUN NOUN VERB CCONJ PRON AUX VERB ADP DET NOUN ADP DET NOUN PRON AUX VERB NOUN ADP DET NOUN CCONJ NOUN ADP NOUN PUNCT
57,PRON,DET,each,"as far as French planning experts are concerned , for example , the most probable scenario today is that of the entrenchment of regional disparities within each country .",ADP ADV ADP ADJ NOUN NOUN AUX ADJ PUNCT ADP NOUN PUNCT DET ADV ADJ NOUN ADV AUX PRON ADP DET NOUN ADP ADJ NOUN ADP DET NOUN PUNCT
59,PRON,DET,your,"In addition , if you choose to use Facebook from your mobile device , please note that you will be responsible for any fees associated with internet usage and / or text messaging as determined by your mobile service provider .",ADP NOUN PUNCT SCONJ PRON VERB PART VERB PROPN ADP DET ADJ NOUN PUNCT ADV VERB SCONJ PRON AUX AUX ADJ ADP DET NOUN VERB ADP NOUN NOUN CCONJ PUNCT CCONJ NOUN NOUN SCONJ VERB ADP DET NOUN NOUN NOUN PUNCT
62,PRON,DET,any,"In addition , if you choose to use Facebook from your mobile device , please note that you will be responsible for any fees associated with internet usage and / or text messaging as determined by your mobile service provider .",ADP NOUN PUNCT SCONJ PRON VERB PART VERB PROPN ADP DET ADJ NOUN PUNCT ADV VERB SCONJ PRON AUX AUX ADJ ADP DET NOUN VERB ADP NOUN NOUN CCONJ PUNCT CCONJ NOUN NOUN SCONJ VERB ADP DET NOUN NOUN NOUN PUNCT
65,PRON,DET,your,"In addition , if you choose to use Facebook from your mobile device , please note that you will be responsible for any fees associated with internet usage and / or text messaging as determined by your mobile service provider .",ADP NOUN PUNCT SCONJ PRON VERB PART VERB PROPN ADP DET ADJ NOUN PUNCT ADV VERB SCONJ PRON AUX AUX ADJ ADP DET NOUN VERB ADP NOUN NOUN CCONJ PUNCT CCONJ NOUN NOUN SCONJ VERB ADP DET NOUN NOUN NOUN PUNCT
69,PRON,DET,your,"To sign up for a brand new account , enter your name , birthday , gender , and email address into the form on www.facebook.com .",ADP VERB ADP ADP DET ADV ADJ NOUN PUNCT VERB DET NOUN PUNCT NOUN PUNCT NOUN PUNCT CCONJ NOUN NOUN ADP DET NOUN ADP PROPN PUNCT
