In [2]:
import pandas as pd
import utils
# Show every row and never truncate cell text, so whole sentences and tag
# sequences stay readable in the displayed frames.
pd.options.display.max_rows = 100000
# None means "no limit". The old -1 sentinel was deprecated in pandas 1.0 and
# is rejected with a ValueError by pandas 2.x.
pd.set_option("display.max_colwidth", None)

In [3]:
# Directory containing the two input files read below.
PATH = "../analysing_features/udpos/results/predictions/"

# NOTE(review): read_tag_data is defined in utils; judging by the frame shown
# below, the first file supplies sentences + ground truth and the second the
# predictions — confirm against the helper.
pa = utils.PredictionsAnalyzer()
ud_pos = pa.read_tag_data(PATH + "dev-en.tsv", PATH + "dev_en_predictions.txt")

In [4]:
# Preview the first rows of the loaded frame.
ud_pos.head()

Unnamed: 0,sentences,ground_truth,predictions
0,Creative Commons Corporation is not a law firm and does not provide legal services .,PROPN PROPN NOUN AUX PART DET NOUN NOUN CCONJ AUX PART VERB ADJ NOUN PUNCT,X X X X X X X X X X X X X X PUNCT
1,"The work ( as defined below ) is provided under the terms of this Creative Commons Public License ( "" CCPL "" or "" License "" ) .",DET NOUN PUNCT ADP VERB ADV PUNCT AUX VERB ADP DET NOUN ADP DET PROPN PROPN ADJ NOUN PUNCT PUNCT PROPN PUNCT CCONJ PUNCT NOUN PUNCT PUNCT PUNCT,DET X PUNCT X X ADV PUNCT AUX VERB X X X X X X X X X PUNCT PUNCT PROPN PUNCT CCONJ PUNCT X PUNCT PUNCT PUNCT
2,1 . Definitions .,NUM PUNCT NOUN PUNCT,NUM PUNCT X PUNCT
3,4 . Restrictions .,NUM PUNCT NOUN PUNCT,NUM PUNCT X PUNCT
4,7 . Termination .,NUM PUNCT NOUN PUNCT,NUM PUNCT NOUN PUNCT


In [5]:
def is_pred_wrong(actual, pred=None):
    """Return ``actual != pred``: true when the prediction differs from the gold tag."""
    mismatch = actual != pred
    return mismatch

# NOTE(review): transform_data lives in utils; presumably it keeps the tokens
# for which the predicate is true and, with conf_matrix=True, also returns a
# confusion matrix as the second value — confirm against the helper.
mistakes_df, conf_df = pa.transform_data(is_pred_wrong,conf_matrix=True)

#### How many times each tag is misclassified

In [6]:
# Column sums of conf_df, largest first, as a one-column frame named "count".
conf_count = conf_df.sum().sort_values(ascending=False).to_frame("count")
# Series / scalar, so the result is an ordinary column. The previous form
# divided the whole frame by its column sums and assigned the resulting
# DataFrame to a single column, which only works while the frame has exactly
# one column. Values are fractions of the total (0-1), not percentages.
conf_count["percent"] = conf_count["count"] / conf_count["count"].sum()
conf_count[:10]

Unnamed: 0,count,percent
NOUN,10375.0,0.244763
VERB,5700.0,0.134472
ADP,4961.0,0.117038
DET,4623.0,0.109064
PRON,3750.0,0.088468
ADJ,3580.0,0.084458
AUX,2562.0,0.060442
ADV,2011.0,0.047443
PART,1391.0,0.032816
PROPN,1225.0,0.0289


In [7]:
# Long ("sparse") view of conf_df: one row per (pred, actual) cell, with each
# cell's share of the grand total, ordered from the largest count down.
sparse_conf_df = (
    conf_df
    .reset_index()
    .melt(id_vars="index", var_name="actual", value_name="count")
    .rename(columns={"index": "pred"})
    .assign(percent=lambda cells: cells["count"] / cells["count"].sum())
    .sort_values(by="count", ascending=False)
)
sparse_conf_df.head()

Unnamed: 0,pred,actual,count,percent
135,X,NOUN,10157.0,0.23962
271,X,VERB,5511.0,0.130013
33,X,ADP,4830.0,0.113947
101,X,DET,4576.0,0.107955
186,X,PRON,3740.0,0.088233


In [8]:
# Rows of mistakes_df whose predicted tag is "X".
predicted_x = mistakes_df["predicted"] == "X"
mistakes_df[predicted_x].head()

Unnamed: 0,predicted,actual,token,sentence,actual_tokens
0,X,PROPN,Creative,Creative Commons Corporation is not a law firm and does not provide legal services .,PROPN PROPN NOUN AUX PART DET NOUN NOUN CCONJ AUX PART VERB ADJ NOUN PUNCT
1,X,PROPN,Commons,Creative Commons Corporation is not a law firm and does not provide legal services .,PROPN PROPN NOUN AUX PART DET NOUN NOUN CCONJ AUX PART VERB ADJ NOUN PUNCT
2,X,NOUN,Corporation,Creative Commons Corporation is not a law firm and does not provide legal services .,PROPN PROPN NOUN AUX PART DET NOUN NOUN CCONJ AUX PART VERB ADJ NOUN PUNCT
3,X,AUX,is,Creative Commons Corporation is not a law firm and does not provide legal services .,PROPN PROPN NOUN AUX PART DET NOUN NOUN CCONJ AUX PART VERB ADJ NOUN PUNCT
4,X,PART,not,Creative Commons Corporation is not a law firm and does not provide legal services .,PROPN PROPN NOUN AUX PART DET NOUN NOUN CCONJ AUX PART VERB ADJ NOUN PUNCT


For some reason the model predicts the X tag in the vast majority of cases, sometimes even for every word in a given example.

### All examples

In [9]:
def select_all(actual, pred=None):
    """Predicate that accepts every (actual, pred) pair.

    Named ``select_all`` instead of ``all`` so the Python builtin ``all`` is
    not shadowed for the rest of the notebook session.
    """
    return True

# Same call as for the mistakes above, but with a predicate that is always true.
all_df, _ = pa.transform_data(select_all,conf_matrix=True)

In [10]:
# Per-tag counts of the "predicted" column divided by the number of rows in all_df.
all_df["predicted"].value_counts() / len(all_df)

X        0.651312
PUNCT    0.126613
PROPN    0.041688
PRON     0.028196
CCONJ    0.021012
VERB     0.019814
ADV      0.017132
ADJ      0.016381
ADP      0.016094
NOUN     0.016094
AUX      0.015328
NUM      0.011959
DET      0.011176
SCONJ    0.004646
PART     0.002555
Name: predicted, dtype: float64

In [11]:
# Per-tag counts of the "actual" column divided by the number of rows in all_df.
all_df["actual"].value_counts() / len(all_df)

NOUN     0.177689
PUNCT    0.123611
VERB     0.110646
ADP      0.093738
PRON     0.086968
DET      0.084446
ADJ      0.069198
PROPN    0.056760
AUX      0.054541
ADV      0.047324
CCONJ    0.031917
PART     0.024588
SCONJ    0.016988
NUM      0.014082
X        0.003417
INTJ     0.002666
SYM      0.001421
Name: actual, dtype: float64

In [14]:
# Separate analyzer instance for the German training file. Only one path is
# passed here, and the frame displayed below has no predictions column.
de_pa = utils.PredictionsAnalyzer()
train_de = de_pa.read_tag_data("train-de.tsv")

In [15]:
# Preview the first rows of the German frame.
train_de.head()

Unnamed: 0,sentences,ground_truth
0,"Hinter der neuen Firma steht unter anderem Lucent Technologies , einer der größten Anbieter von Equipment für Netzwerke und Telekommunikation .",ADP DET ADJ NOUN VERB ADP PRON PROPN X PUNCT PRON DET ADJ NOUN ADP NOUN ADP NOUN CCONJ NOUN PUNCT
1,Wirtschaftsministerium :,NOUN PUNCT
2,20 Millionen DSL-Anschlüsse bis 2010,NUM NOUN NOUN ADP NUM
3,"Im Auftrag des Bundeswirtschaftsministeriums erstellte das Wissenschaftliche Institut für Kommunikationsdienste ( WIK ) die Studie "" Entwicklungstrends im Telekommunikationssektor bis 2010 "" .",ADP NOUN DET NOUN VERB DET ADJ NOUN ADP NOUN PUNCT PROPN PUNCT DET NOUN PUNCT NOUN ADP NOUN ADP NUM PUNCT PUNCT
4,"Hierin prognostiziert das Institut , dass es im Jahr 2010 rund 20 Millionen DSL-Anschlüsse geben wird .",ADV VERB DET NOUN PUNCT SCONJ PRON ADP NOUN NUM ADV NUM NOUN NOUN VERB AUX PUNCT


Get all tokens tagged X in the German training set.

In [47]:
def is_x_tag(actual, pred=None):
    """Return ``actual == "X"``; ``pred`` is accepted but ignored."""
    gold_is_x = actual == "X"
    return gold_is_x

# NOTE(review): presumably returns the tokens for which is_x_tag is true; the
# second return value is discarded — confirm against utils.
x_tags_df, _ = de_pa.transform_data(is_x_tag)



In [48]:
import sys

import numpy as np

# Raise numpy's summarisation threshold to the largest int so arrays are
# printed in full rather than abbreviated.
np.set_printoptions(threshold=sys.maxsize)

In [54]:
# Concatenate every token in x_tags_df into one string, three spaces apart.
x_tags = "   ".join(x_tags_df["token"].values)

# Display only the first 500 characters of the joined string.
x_tags[:500]

'Technologies   Interactive   Media   Focus   Digital   Tomorrow   Wireless   Local   Loop   Preselection   A4   §§   ff.   Electronic   Arts   President   worldwide   studios   Electronic   Arts   Codes   Secure   Music   Transfer   Agent   SD   Memory   Cards   Secure   Music   Manager   Instant   Messenger   Instant   Messenger   1.2.3   Messenger   flat   surftime   Surftime   by   day   by   night   by   day   Eco   by   Call   Liberty   Media   Communications   Star   Instant   Messenger   '

We can see in this sample (and also in the whole file) that almost all words tagged X in the German training set are English words. We therefore hypothesize that during training the model learns that English words should be tagged X, and so during validation on English it assigns X to almost all words.

In [None]:
# Write the joined token string to a UTF-8 text file in the working directory.
with open("german_x_tags.txt", "w", encoding="utf-8") as out_file:
    out_file.write(x_tags)