In [99]:
import pandas as pd
import yaml
import os
import plotly.express as px
import plotly
from plotly import graph_objects as go
import plotly.io as pio

pio.renderers.default = "vscode"
from numpy import dot
from numpy.linalg import norm

from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN

import numpy as np

In [100]:
full_name = {
    "pl": "Polish",
    "ru": "Russian",
    "uk": "Ukrainian",
    "slv": "Slovene",
    "cs": "Czech",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "de": "German",
    "slk": "Slovak",
    "fr": "French",
    "ar": "Arabic",
    "id": "Indonesian",
    "en": "English",
    "fi": "Finnish",
    "zh": "Chinese",
    "lv": "Latvian",
    "es": "Spanish",
    "lt": "Lithuanian",
    "hr": "Croatian",
    "tr": "Turkish",
    "ko": "Korean",
    "hy": "Armenian",
    "fa": "Persian",
    "ja": "Japanese",
    "ga": "Irish",
    "cy": "Welsh",
    "nl": "Dutch",
}

In [101]:
lang_families = {'indo-european': ("pl", "be", "cs", "slk", "slv", "uk", "ru", "bg", "hr", "de", "fr", "en", "lv", "es", "lt", "hy", "fa", 'ga', 'cy', 'nl'), "afro-asiatic": ('ar', ), 'austronesian': ('id', ), 'uralic': ('fi',), 'sino-tibetan': ('zh', ), 'turkic': ('tr', ), 'koreanic': ('ko', ), 'japonic': ('ja', )}
lang_groups = {'slavic': ("pl", "be", "cs", "slk", "slv", "uk", "ru", "bg", "hr"), "germanic": ('de', "en", 'nl'), "italic/romance": ('fr', 'es'), 'semitic': ('ar', ), 'malay': ('id', ), 'finno-ugric': ('fi', ), 'sinitic': ('zh', ), 'baltic': ('lv', 'lt'), 'turkic': ('tr', ), 'koreanic': ('ko', ), 'armenian': ('hy', ), 'iranian': ('fa', ), 'japonic': ('ja', ), 'celtic': ('ga', 'cy')}

In [102]:
lang_to_size = {
    "pl": 1_301_917,
    "ru": 1_500_041,
    "uk": 825_591,
    "slv": 162_385,
    "cs": 413_512,
    "be": 157_651,
    "bg": 246_040,
    "de": 2_224_357,
    "slk": 227_255,
    "fr":  2_044_308,
    "ar": 618_436,
    "id": 440_545,
    "en": 5_725_914,
    "fi": 445_104,
    "zh": 1_024_062,
    "lv": 87_981,
    "es": 1_478_365,
    "lt": 189_673,
    "hr": 199_153,
    "tr": 316_491,
    "ko": 428_406,
    "hy": 246_231,
    "fa": 643_006,
    "ja": 1_122_821,
    "ga": 48_183,
    "cy": 101_401,
    "nl": 1_943_296,
}

In [103]:
max_dict = {
    'ar': 6074,
    'pl': 17721,
    'bg': 8906,
    'be': 22852,
    'fr': 14448,
    'en': 12542,
    'lv': 12520,
    'fa': 4797,
    'lt': 2340,
    'cs': 68494,
    'es': 14286, 
    'zh': 3996,
    'slk': 8482,
    'slv': 10902,
    'uk': 5495,
    'ru': 69629,
    'de': 13813,
    'fi': 12216,
    'id': 4481,
    'hr': 6913,
    'tr': 14850,
    'ko': 23010,
    'hy': 1974,
    'fa': 26196,
    'ja': 7050,
    'ga': 4005,
    'cy': 1111,
    'nl': 12289,
}

In [104]:
lang_types = {'slavic': ("pl", "be", "cs", "slk", "slv", "uk", "ru", "bg", "hr"), 'svo': ("fr", "en", "lv", "lt", "es", "zh", "fi", "id", "de", "nl"), 'sov': ('tr', 'ko', 'hy', 'fa', 'ja'), 'vso': ('ar', 'ga', 'cy')}

In [105]:
lang_types_dict = {}
for lang_type, lang_list in lang_types.items():
    type_dict = {lan: lang_type for lan in lang_list}
    lang_types_dict.update(type_dict)

In [106]:
families_dict = {}
for lang_type, lang_list in lang_families.items():
    type_dict = {lan: lang_type for lan in lang_list}
    families_dict.update(type_dict)

In [107]:
group_dict = {}
for lang_type, lang_list in lang_groups.items():
    type_dict = {lan: lang_type for lan in lang_list}
    group_dict.update(type_dict)

In [108]:
RESULTS_CSV = 'multilingual-probing-visualization/probingOutputs/all_slavic/all_results.csv'

In [109]:
df = pd.read_csv(RESULTS_CSV)
df['train'] = df.train.str[2:-2]
df['test'] = df.test.str[2:-2]

In [110]:
df["wiki_size_train"] = df.train.map(lang_to_size)
df["wiki_size_test"] = df.test.map(lang_to_size)

In [111]:
df = df[df.seed == 0]
# df = df[df.wiki_size_train >= 200e3]
# df = df[df.wiki_size_test >= 200e3]

In [112]:
limit = 10_000
df = df[df.limit == limit]
df = df[df.model == 'mBERT']

In [113]:
df.head()

Unnamed: 0,train,test,model,layer,rank,limit,seed,directory,dspear,uuas,wiki_size_train,wiki_size_test
2,be,be,mBERT,6,128,10000.0,0,/home/amysiak/thesis/multilingual-probing-visu...,0.779117,0.787562,157651,157651
9,cs,be,mBERT,6,128,10000.0,0,/home/amysiak/thesis/multilingual-probing-visu...,0.760374,0.748032,413512,157651
22,en,be,mBERT,6,128,10000.0,0,/home/amysiak/thesis/multilingual-probing-visu...,0.738201,0.704005,5725914,157651
29,es,be,mBERT,6,128,10000.0,0,/home/amysiak/thesis/multilingual-probing-visu...,0.751952,0.69942,1478365,157651
36,fi,be,mBERT,6,128,10000.0,0,/home/amysiak/thesis/multilingual-probing-visu...,0.739131,0.678402,445104,157651


## t-SNE using vectors

In [114]:
lang_to_vec = {}
for lang, data in df.groupby('test'):
    data = data.sort_values(by="train")
    lang_to_vec[lang] = data.uuas.values

In [115]:
lang_to_vec['pl']

array([0.76717298, 0.78168216, 0.76275221, 0.7389103 , 0.73758785,
       0.69209552, 0.68786367, 0.75247487, 0.61588453, 0.7273483 ,
       0.74722285, 0.83102849, 0.78980579, 0.78629185, 0.62737097])

In [116]:
len(lang_to_vec) # num samples

27

In [117]:
len(lang_to_vec['pl']) # num features

15

In [118]:
vecs = []
langs = []
for k, v in lang_to_vec.items():
    vecs.append(v)
    langs.append(k)

In [119]:
X = np.stack(vecs) # dim
X.shape

(27, 15)

In [140]:
for p in range(3, 11):    
    X_embedded = TSNE(perplexity=p, metric='cosine').fit_transform(X)
    fig = px.scatter(x=X_embedded[:,0], y=X_embedded[:,1], color=langs, title=f"perplexity {p}", color_discrete_sequence=px.colors.qualitative.Light24)
    fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [148]:
# dbscan looked better for cherry-picked languages
dbscan = DBSCAN(min_samples=2, metric='cosine', eps=0.001)
# the first cluster I get is the slavics, unsurprisingly
clusters = dbscan.fit_predict(X)
for val in np.unique(clusters):
    print(val)
    for cl, lan in zip(clusters, langs):
        if cl == val:
            print(lan)

    print("\n")


-1
en
fa
ja
ko
tr


0
ar
cy
ga
id


1
be
bg
cs
fi
hr
hy
lt
lv
pl
ru
slk
slv
uk
zh


2
de
nl


3
es
fr




## t-SNE using distances
wayyyy worse

In [129]:
langs_train = df.train.unique()

In [131]:
X_dist = np.zeros((len(langs_train), len(langs_train)))

In [138]:
for i, lang in enumerate(langs_train):
    for j, lang_ in enumerate(langs_train):
        if i == j:
            continue

        r1 = df[(df.train == lang) & (df.test == lang_)].uuas.mean()
        r2 = df[(df.test == lang) & (df.train == lang_)].uuas.mean()

        X_dist[i, j] = 1 - (r1 + r2)/2


In [139]:
for p in range(3, 11):    
    X_embedded = TSNE(perplexity=p, metric='precomputed').fit_transform(X_dist)
    fig = px.scatter(x=X_embedded[:,0], y=X_embedded[:,1], color=langs_train, title=f"perplexity {p}", color_discrete_sequence=px.colors.qualitative.Light24)
    fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

