In [149]:
import pandas as pd
import yaml
import os
import plotly.express as px
import plotly
from plotly import graph_objects as go

In [150]:
max_dict = {
    'pl': 17722,
    'be': 25231, # TODO
    'fr': 14449,
    'en': 16621, # TODO
    'lv': 12520,
    'lt': 3642, # TODO
    'cs': 68494,
    'es': 17662, # TODO
    'zh': 3996,
    'slk': 8482,
    'slv': 10000, # TODO
    'uk': 5495,
    'ru': 10000, # TODO
    'de': 10000, # TODO
    'fi': 12215,
    'id': 4481,


}

In [151]:
slavic_langs = ("pl", "be", "cs", "slk", "slv", "uk", "ru")

In [152]:
# yaml_args = yaml.load(open(config))
results_dir = '/home/amysiak/thesis/multilingual-probing-visualization/probingOutputs'

In [153]:
df_data = {'dev': [], 'train': [], 'uuas': [], 'limit': []}

In [154]:
for subdir, dirs, files in os.walk(results_dir):
    # print(files)
    if len(files) < 5:
        continue

    for file in files:
        if file.endswith(".yaml"):
            config_args = yaml.load(open(os.path.join(subdir, file)))
            dset = config_args["dataset"]
            df_data['dev'].append(dset['keys']['dev'][0])
            df_data['train'].append(dset['keys']['train'][0])
            max_ = max_dict[dset['keys']['train'][0]]
            if 'limit' in dset.keys():
                df_data['limit'].append(min(max_, dset['limit']))
            else:
                df_data['limit'].append(max_)
            uuas_file = open(os.path.join(subdir, "dev.uuas"), 'r') 
            uuas_score = float(uuas_file.readlines() [0])
            uuas_file.close() 
            df_data['uuas'].append(uuas_score)


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



In [155]:
df = pd.DataFrame(df_data)

In [156]:
df["slavic"] = df.train.isin(slavic_langs)

In [157]:
df.tail()

Unnamed: 0,dev,train,uuas,limit,slavic
457,pl,uk,0.777828,5495,True
458,pl,zh,0.604625,3996,False
459,pl,zh,0.608517,3996,False
460,pl,zh,0.600846,3996,False
461,pl,zh,0.601338,3996,False


In [158]:
df = df[df.dev == 'pl']

In [159]:
df.to_csv(f"/home/amysiak/thesis/multilingual-probing-visualization/results.csv")

In [160]:
pl_df = df[(df.dev == 'pl') & (df.train == 'pl')]

In [161]:
pl_df

Unnamed: 0,dev,train,uuas,limit,slavic
13,pl,pl,0.845122,17722,True
40,pl,pl,0.799403,5000,True
41,pl,pl,0.787199,5000,True
42,pl,pl,0.793849,5000,True
54,pl,pl,0.845689,17722,True
55,pl,pl,0.388763,100,True
56,pl,pl,0.836054,10000,True
57,pl,pl,0.799176,5000,True
90,pl,pl,0.575418,1000,True
91,pl,pl,0.758634,2500,True


In [162]:
fig = px.scatter(x=pl_df.limit, y=pl_df.uuas)
fig.show()

In [163]:
df_avg = df.groupby(["train", "limit"]).mean()
df_avg = df_avg.reset_index()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [164]:
df_avg.head()

Unnamed: 0,train,limit,uuas,slavic
0,be,100,0.376049,1.0
1,be,1000,0.609452,1.0
2,be,2500,0.687618,1.0
3,be,3000,0.700314,1.0
4,be,5000,0.726457,1.0


In [165]:
df_avg["slavic"] = (df_avg["slavic"] == 1.0)

In [182]:
means = df.groupby("train").mean()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [185]:
sum(1 - means.slavic)

8.0

In [191]:
fig = px.scatter(data_frame=df_avg[df_avg.limit <= 10000], x="limit", y="uuas", color="slavic", hover_data=["train"], labels={"limit": "# train sentences", "uuas": "UUAS", "slavic": "Is Slavic"})
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    yaxis_range=[0, 1],
    font={"size":20}
)
fig.show()

In [167]:
df = df.sort_values(by="limit")
df_avg = df_avg.sort_values(by="limit")
df_limit = df_avg[df_avg.limit <= 10000]

In [168]:
fig = px.line(data_frame=df_limit, x="limit", y="uuas", color="train", markers=True)
fig.show()

In [178]:
full_name = {
    "pl": "Polish",
    "ru": "Russian",
    "uk": "Ukrainian",
    "slv": "Slovenian",
    "cs": "Czech",
    "be": "Belarussian",
    "de": "German",
    "slk": "Slovak",
    "fr": "French",
    "ar": "Arabic",
    "id": "Indonesian",
    "en": "English",
    "fi": "Finnish",
    "zh": "Chinese",
    "lv": "Latvian",
    "es": "Spanish"
}

In [194]:
fig = go.Figure()
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    font={"size":20},
    xaxis_title = "# train sentences",
    yaxis_title = "UUAS"
)
for lang in ["pl", "ru", "cs", "be", "de", "fr", "en", "fi", "zh"]:
    df_ = df_limit[df_limit.train == lang]
    fig.add_trace(go.Scatter(x=df_.limit, y=df_.uuas,
                        mode='lines+markers',
                        name=full_name[lang]))


fig.show()


In [170]:
df_avg[df_avg.limit==10000].sort_values(by="uuas", ascending=False)

Unnamed: 0,train,limit,uuas,slavic
70,pl,10000,0.834119,True
78,ru,10000,0.787055,True
92,slv,10000,0.783655,True
13,cs,10000,0.781735,True
6,be,10000,0.762934,True
21,de,10000,0.761347,False
50,fr,10000,0.75116,False
35,es,10000,0.740716,False
28,en,10000,0.736092,False
62,lv,10000,0.723343,False
