In [1]:
import os
import pickle
import pandas as pd
from typing import List
import math

In [2]:
RESULT_DIR = "../results"

# Functions used for all datasets


In [3]:
ALL_LANGUAGES = ['en', 'es', 'de', 'el', 'ja', 'ko', 'hi', 'ur', 'bg', 'ru', 'fr', 'ar', 'tr', 'zh', 'vi', 'sw']

In [4]:
def calculate_acc(scores):
	"""Return the accuracy, in percent, of arg-min predictions.

	``scores`` is an iterable of ``(label, probs)`` pairs. For each pair the
	prediction is the position of the smallest entry of ``probs`` (the first
	one on ties); it counts as correct when that position equals ``int(label)``.

	NOTE(review): the lowest score wins even though the values are called
	``probs`` -- presumably they are losses/negative log-likelihoods; confirm
	against the code that writes the pickles.

	Raises:
		ZeroDivisionError: if ``scores`` is empty.
	"""
	hits = 0
	total = 0
	for label, candidate_scores in scores:
		gold = int(label)
		predicted = candidate_scores.index(min(candidate_scores))
		if gold == predicted:
			hits += 1
		total += 1
	return (hits / total) * 100

In [5]:
def row_results(model="10.4B", experiment="dense"):
	"""Build one results-table row of per-language accuracies.

	Besides its arguments, this function reads the module-level names
	``RESULT_DIR``, ``LANGUGAES``, ``task``, ``demonstrations``, ``split`` and
	``runs``; they must be set (by the configuration cells) before calling.

	Args:
		model: sub-directory of ``RESULT_DIR`` holding the model's results.
		experiment: sub-directory of ``model`` holding this experiment.

	Returns:
		A dict with key ``'index'`` set to ``"{model}_{experiment}"`` plus one
		key per language in ``LANGUGAES`` mapping to the value of
		``calculate_acc`` for that language's pickle. A language whose file
		does not exist is recorded as ``0``, which cannot be told apart from
		a real 0% accuracy.
	"""
	results_dir = os.path.join(RESULT_DIR, model, experiment)
	# Result files are prefixed with the model name for xglm-1.7B and with
	# 'hf' for every other model.
	prefix = model if model == 'xglm-1.7B' else 'hf'

	row = {'index': f"{model}_{experiment}"}
	for language in LANGUGAES:
		path = os.path.join(
			results_dir,
			f"{prefix}.{task}.k{demonstrations}.{split}_{language}.run{runs}.pkl",
		)
		try:
			# NOTE: pickle.load can execute arbitrary code; only point
			# RESULT_DIR at trusted result files.
			with open(path, "rb") as handle:
				row[language] = calculate_acc(pickle.load(handle))
		except FileNotFoundError:
			# Placeholder for a missing evaluation.
			row[language] = 0
	return row


# XNLI Task Zero-Shot

In [6]:
task = "xnli"
demonstrations = 0 # zero-shot
runs = 0

split="eval"
LANGUGAES=['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']

Collect data from a split

In [7]:
evals = []

In [8]:
model = "xglm-1.7B"
experiment = "dense"
evals.append(row_results(model, experiment))

In [9]:
model = "10.4B"
experiment = "dense"
evals.append(row_results(model, experiment))

In [10]:
model = "10.4B"
experiment = "C8_lang"
evals.append(row_results(model, experiment))

In [11]:
model = "20.9B"
experiment = "dense"
evals.append(row_results(model, experiment))

In [12]:
model = "20.9B"
experiment = "C8_lang"
evals.append(row_results(model, experiment))

In [13]:
model = "20.9B"
experiment = "C8_hmr"
evals.append(row_results(model, experiment))

Load the rows into a pandas DataFrame and save it as CSV

In [14]:
df = pd.DataFrame(evals)
df.set_index('index', inplace=True)
df.to_csv(f"{task}_{split}_k{demonstrations}_results.csv")

In [15]:
df

Unnamed: 0_level_0,ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
xglm-1.7B_dense,46.8,45.7,44.1,42.5,51.5,36.5,47.2,45.9,47.3,43.6,44.9,42.5,43.5,43.9,46.9
10.4B_dense,47.2,43.5,44.3,42.3,51.7,37.0,46.6,45.3,44.3,40.0,44.9,43.4,43.6,41.1,44.4
10.4B_C8_lang,46.4,46.1,43.5,45.8,52.2,36.0,47.1,42.6,42.5,41.3,0.0,43.5,43.7,42.6,45.3
20.9B_dense,47.9,45.0,45.3,45.2,51.1,37.2,45.9,44.5,44.5,39.6,44.3,44.8,43.1,41.6,44.6
20.9B_C8_lang,46.2,44.9,43.9,45.4,52.0,36.0,47.2,43.5,41.9,40.6,0.0,44.2,41.9,44.4,46.3
20.9B_C8_hmr,46.0,46.0,43.2,46.4,52.3,36.4,47.0,42.4,41.8,41.7,0.0,44.3,41.4,43.5,46.1


# Xstorycloze Task Zero-Shot

In [16]:
task = "xstorycloze"
demonstrations = 0 # zero-shot
runs = 0

split="eval"
LANGUGAES=['ar', 'en', 'es', 'eu', 'hi', 'id', 'my', 'ru', 'sw', 'te', 'zh']

evals = []

In [17]:
model = "xglm-1.7B"
experiment = "dense"
evals.append(row_results(model, experiment))

In [18]:
model = "10.4B"
experiment = "dense"
evals.append(row_results(model, experiment))

In [19]:
model = "10.4B"
experiment = "C8_lang"
evals.append(row_results(model, experiment))

In [20]:
model = "20.9B"
experiment = "dense"
evals.append(row_results(model, experiment))

In [21]:
model = "20.9B"
experiment = "C8_lang"
evals.append(row_results(model, experiment))

In [22]:
model = "20.9B"
experiment = "C8_hmr"
evals.append(row_results(model, experiment))

In [23]:
df = pd.DataFrame(evals)
df.set_index('index', inplace=True)
df.to_csv(f"{task}_{split}_k{demonstrations}_results.csv")

In [24]:
df

Unnamed: 0_level_0,ar,en,es,eu,hi,id,my,ru,sw,te,zh
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
xglm-1.7B_dense,53.3,63.1,57.3,56.4,55.0,59.3,54.0,60.0,60.1,57.0,55.5
10.4B_dense,52.0,61.7,56.8,53.8,53.8,57.6,53.7,60.1,60.1,56.4,56.1
10.4B_C8_lang,52.4,62.4,58.1,0.0,55.2,0.0,0.0,59.6,60.2,0.0,57.7
20.9B_dense,50.5,60.7,56.1,52.1,52.0,55.4,53.4,58.6,58.6,55.5,56.2
20.9B_C8_lang,52.3,62.7,57.5,0.0,52.7,0.0,0.0,60.2,60.3,0.0,58.8
20.9B_C8_hmr,52.3,62.6,57.6,0.0,52.9,0.0,0.0,59.6,60.1,0.0,58.0


# Cross-lingual + Few shot

In [25]:
NUM_RUNS = 5

In [26]:
def compute_mean_std(evals: List):
	"""Return the mean of ``evals`` and the standard error of that mean.

	Despite the function name, the second value is a standard error: the
	population standard deviation (variance divided by ``len(evals)``, not
	``len(evals) - 1``) divided by ``sqrt(len(evals))``.

	Args:
		evals: non-empty list of numeric scores.

	Returns:
		``(mean, std_err)``.

	Raises:
		ZeroDivisionError: if ``evals`` is empty.
	"""
	count = len(evals)
	mean = sum(evals) / count

	squared_deviation = 0
	for score in evals:
		squared_deviation += (score - mean) ** 2

	std_dev = math.sqrt(squared_deviation / count)
	return mean, std_dev / math.sqrt(count)

In [27]:
def row_results_with_std(model="10.4B", experiment="C8_lang_from_en_TRG", std=0):
	"""Build two table rows: per-language accuracy averaged over runs, and its spread.

	For every language in ``LANGUGAES`` the result files of runs
	``0 .. NUM_RUNS - 1`` are loaded; runs whose file does not exist are
	skipped. Besides its arguments, this function reads the module-level names
	``RESULT_DIR``, ``LANGUGAES``, ``NUM_RUNS``, ``task``, ``demonstrations``
	and ``split``.

	Args:
		model: sub-directory of ``RESULT_DIR`` holding the model's results.
		experiment: sub-directory of ``model`` holding this experiment.
		std: unused; kept so existing callers keep working.

	Returns:
		``(row_avg, row_std)``: two dicts, both with ``'index'`` set to
		``"{model}_{experiment}"`` and one key per language. They hold the first
		and second value returned by ``compute_mean_std`` over the available
		runs. A language with no available run is recorded as ``0`` in both.
	"""
	results_dir = os.path.join(RESULT_DIR, model, experiment)
	# Result files are prefixed with the model name for xglm-1.7B and with
	# 'hf' for every other model.
	prefix = model if model == 'xglm-1.7B' else 'hf'

	label = f"{model}_{experiment}"
	row_avg = {'index': label}
	row_std = {'index': label}

	for language in LANGUGAES:
		accuracies = []
		for run_idx in range(NUM_RUNS):
			path = os.path.join(
				results_dir,
				f"{prefix}.{task}.k{demonstrations}.{split}_{language}.run{run_idx}.pkl",
			)
			try:
				# NOTE: pickle.load can execute arbitrary code; only point
				# RESULT_DIR at trusted result files.
				with open(path, "rb") as handle:
					run_scores = pickle.load(handle)
			except FileNotFoundError:
				# This run was not evaluated; aggregate over the others.
				continue

			accuracies.append(calculate_acc(run_scores))

		if accuracies:
			row_avg[language], row_std[language] = compute_mean_std(accuracies)
		else:
			# Placeholder when no run exists for this language.
			row_avg[language] = 0
			row_std[language] = 0
	return row_avg, row_std

## XNLI

In [28]:
task = "xnli"
demonstrations = 8 # few-shot

split="eval"
LANGUGAES=['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']

evals = []
evals_std = []

In [29]:
model = "10.4B"
experiment = "C8_lang_from_en_SRC"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [30]:
model = "10.4B"
experiment = "C8_lang_from_en_TRG"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [31]:
model = "10.4B"
experiment = "C8_lang_from_en_ENSEMBLE"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [32]:
model = "20.9B"
experiment = "C8_hmr_from_en_SRC"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [33]:
model = "20.9B"
experiment = "C8_hmr_from_en_TRG"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [34]:
model = "20.9B"
experiment = "C8_hmr_from_en_ENSEMBLE"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [35]:
# One row per (model, experiment): mean accuracy per language.
df_avg = pd.DataFrame(evals)
df_avg.set_index('index', inplace=True)

# Same layout, holding the second value returned by compute_mean_std
# (a standard error, despite the "std" in the name).
df_std = pd.DataFrame(evals_std)
df_std.set_index('index', inplace=True)

# Format every cell as the string "mean ± standard error" with two decimals.
# df_avg itself stays numeric so it can still be averaged afterwards.

df = df_avg.copy()
for col in df_avg.columns:
	df[col] = df_avg[col].map('{:,.2f}'.format) + " ± " + df_std[col].map('{:,.2f}'.format)

# The CSV is written to the current working directory.
df.to_csv(f"{task}_{split}_k{demonstrations}_from_en_results.csv")

In [36]:
df

Unnamed: 0_level_0,ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10.4B_C8_lang_from_en_SRC,38.00 ± 0.43,41.98 ± 0.89,45.36 ± 0.37,42.60 ± 0.90,48.58 ± 0.47,38.44 ± 0.57,42.96 ± 0.58,43.42 ± 0.41,42.38 ± 0.91,40.40 ± 0.17,0.00 ± 0.00,40.80 ± 0.40,38.74 ± 0.37,40.88 ± 0.68,46.08 ± 0.58
10.4B_C8_lang_from_en_TRG,43.48 ± 1.11,41.64 ± 0.76,44.46 ± 0.42,42.78 ± 1.00,48.58 ± 0.47,38.44 ± 0.57,46.92 ± 0.74,41.46 ± 0.76,40.96 ± 0.48,40.32 ± 0.65,0.00 ± 0.00,40.66 ± 0.30,40.50 ± 0.51,39.92 ± 0.89,44.38 ± 0.87
10.4B_C8_lang_from_en_ENSEMBLE,39.36 ± 0.67,41.10 ± 0.99,45.26 ± 0.32,42.98 ± 0.95,48.58 ± 0.47,38.44 ± 0.57,44.64 ± 0.84,43.50 ± 0.78,40.24 ± 0.53,40.54 ± 0.47,0.00 ± 0.00,40.18 ± 0.43,39.46 ± 0.44,39.72 ± 0.73,46.10 ± 0.71
20.9B_C8_hmr_from_en_SRC,38.66 ± 0.39,40.24 ± 0.59,45.48 ± 0.39,43.36 ± 0.93,48.90 ± 0.16,38.06 ± 0.56,43.08 ± 0.55,43.66 ± 0.17,41.16 ± 0.62,40.22 ± 0.45,0.00 ± 0.00,40.62 ± 0.55,39.28 ± 0.30,40.40 ± 0.51,46.20 ± 0.58
20.9B_C8_hmr_from_en_TRG,44.76 ± 0.73,42.16 ± 0.77,43.86 ± 0.46,42.76 ± 0.96,48.88 ± 0.14,38.04 ± 0.55,46.84 ± 0.85,41.42 ± 0.89,40.54 ± 0.61,40.18 ± 0.61,0.00 ± 0.00,39.12 ± 0.58,41.88 ± 0.43,40.08 ± 1.11,43.50 ± 0.63
20.9B_C8_hmr_from_en_ENSEMBLE,40.90 ± 0.77,40.70 ± 0.72,45.44 ± 0.25,42.90 ± 0.79,48.88 ± 0.14,38.04 ± 0.55,45.10 ± 0.67,43.38 ± 0.61,40.12 ± 0.52,40.50 ± 0.49,0.00 ± 0.00,39.76 ± 0.49,40.92 ± 0.62,39.56 ± 0.81,45.74 ± 0.60


In [37]:
# Average accuracy per row over the supported languages only: keep just the
# columns that also appear in ALL_LANGUAGES (note: this overwrites df_avg).
df_avg = df_avg[df_avg.columns.intersection(ALL_LANGUAGES)]
df_avg.mean(axis=1)


index
10.4B_C8_lang_from_en_SRC         42.187143
10.4B_C8_lang_from_en_TRG         42.464286
10.4B_C8_lang_from_en_ENSEMBLE    42.150000
20.9B_C8_hmr_from_en_SRC          42.094286
20.9B_C8_hmr_from_en_TRG          42.430000
20.9B_C8_hmr_from_en_ENSEMBLE     42.281429
dtype: float64

## Xstorycloze

In [38]:
task = "xstorycloze"
demonstrations = 8 # few-shot


split="eval"
LANGUGAES=['ar', 'en', 'es', 'eu', 'hi', 'id', 'my', 'ru', 'sw', 'te', 'zh']

# Fresh accumulators for this task: mean rows and their matching spread rows.
evals = []
evals_std = []

In [39]:
model = "xglm-1.7B"
experiment = "dense_from_en"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [40]:
model = "10.4B"
experiment = "dense_from_en"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [41]:
model = "10.4B"
experiment = "C8_lang_from_en_SRC"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [42]:
model = "10.4B"
experiment = "C8_lang_from_en_TRG"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [43]:
model = "10.4B"
experiment = "C8_lang_from_en_ENSEMBLE"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [44]:
model = "20.9B"
experiment = "C8_hmr_from_en_SRC"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [45]:
model = "20.9B"
experiment = "C8_hmr_from_en_TRG"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [46]:
model = "20.9B"
experiment = "C8_hmr_from_en_ENSEMBLE"
row_avg, row_std = row_results_with_std(model, experiment)
evals.append(row_avg)
evals_std.append(row_std)

In [47]:
# One row per (model, experiment): mean accuracy per language.
df_avg = pd.DataFrame(evals)
df_avg.set_index('index', inplace=True)

# Same layout, holding the second value returned by compute_mean_std
# (a standard error, despite the "std" in the name).
df_std = pd.DataFrame(evals_std)
df_std.set_index('index', inplace=True)

# Format every cell as the string "mean ± standard error" with two decimals.
# df_avg itself stays numeric so it can still be averaged afterwards.

df = df_avg.copy()
for col in df_avg.columns:
	df[col] = df_avg[col].map('{:,.2f}'.format) + " ± " + df_std[col].map('{:,.2f}'.format)

# The CSV is written to the current working directory.
df.to_csv(f"{task}_{split}_k{demonstrations}_from_en_results.csv")

In [48]:
df

Unnamed: 0_level_0,ar,en,es,eu,hi,id,my,ru,sw,te,zh
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
xglm-1.7B_dense_from_en,48.64 ± 0.10,58.16 ± 0.18,53.16 ± 0.26,51.66 ± 0.34,50.36 ± 0.09,52.14 ± 0.16,51.52 ± 0.12,52.52 ± 0.31,56.00 ± 0.16,56.48 ± 0.13,53.66 ± 0.16
10.4B_dense_from_en,50.20 ± 0.10,58.90 ± 0.44,54.54 ± 0.12,51.38 ± 0.16,53.28 ± 0.17,54.42 ± 0.26,52.62 ± 0.18,56.56 ± 0.36,58.20 ± 0.28,55.52 ± 0.21,55.02 ± 0.10
10.4B_C8_lang_from_en_SRC,49.78 ± 0.12,60.60 ± 0.27,55.38 ± 0.18,0.00 ± 0.00,52.48 ± 0.14,0.00 ± 0.00,0.00 ± 0.00,54.80 ± 0.20,57.84 ± 0.32,0.00 ± 0.00,54.84 ± 0.15
10.4B_C8_lang_from_en_TRG,50.28 ± 0.15,60.60 ± 0.27,55.38 ± 0.18,0.00 ± 0.00,52.78 ± 0.10,0.00 ± 0.00,0.00 ± 0.00,56.90 ± 0.38,58.24 ± 0.21,0.00 ± 0.00,55.74 ± 0.15
10.4B_C8_lang_from_en_ENSEMBLE,50.20 ± 0.04,60.60 ± 0.27,55.38 ± 0.18,0.00 ± 0.00,53.00 ± 0.22,0.00 ± 0.00,0.00 ± 0.00,55.70 ± 0.26,58.54 ± 0.14,0.00 ± 0.00,55.86 ± 0.09
20.9B_C8_hmr_from_en_SRC,49.44 ± 0.09,60.04 ± 0.40,54.70 ± 0.12,0.00 ± 0.00,52.42 ± 0.20,0.00 ± 0.00,0.00 ± 0.00,55.66 ± 0.40,57.64 ± 0.24,0.00 ± 0.00,54.86 ± 0.20
20.9B_C8_hmr_from_en_TRG,49.40 ± 0.22,60.04 ± 0.40,54.70 ± 0.12,0.00 ± 0.00,52.46 ± 0.19,0.00 ± 0.00,0.00 ± 0.00,56.66 ± 0.38,58.54 ± 0.28,0.00 ± 0.00,55.90 ± 0.22
20.9B_C8_hmr_from_en_ENSEMBLE,49.64 ± 0.17,60.04 ± 0.40,54.70 ± 0.12,0.00 ± 0.00,52.54 ± 0.08,0.00 ± 0.00,0.00 ± 0.00,55.98 ± 0.36,58.58 ± 0.33,0.00 ± 0.00,55.76 ± 0.15


In [49]:
# Average accuracy per row over the supported languages only: keep just the
# columns that also appear in ALL_LANGUAGES (note: this overwrites df_avg).
df_avg = df_avg[df_avg.columns.intersection(ALL_LANGUAGES)]
df_avg.mean(axis=1)

index
xglm-1.7B_dense_from_en           53.214286
10.4B_dense_from_en               55.242857
10.4B_C8_lang_from_en_SRC         55.102857
10.4B_C8_lang_from_en_TRG         55.702857
10.4B_C8_lang_from_en_ENSEMBLE    55.611429
20.9B_C8_hmr_from_en_SRC          54.965714
20.9B_C8_hmr_from_en_TRG          55.385714
20.9B_C8_hmr_from_en_ENSEMBLE     55.320000
dtype: float64

In [50]:
df_avg

Unnamed: 0_level_0,ar,en,es,hi,ru,sw,zh
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
xglm-1.7B_dense_from_en,48.64,58.16,53.16,50.36,52.52,56.0,53.66
10.4B_dense_from_en,50.2,58.9,54.54,53.28,56.56,58.2,55.02
10.4B_C8_lang_from_en_SRC,49.78,60.6,55.38,52.48,54.8,57.84,54.84
10.4B_C8_lang_from_en_TRG,50.28,60.6,55.38,52.78,56.9,58.24,55.74
10.4B_C8_lang_from_en_ENSEMBLE,50.2,60.6,55.38,53.0,55.7,58.54,55.86
20.9B_C8_hmr_from_en_SRC,49.44,60.04,54.7,52.42,55.66,57.64,54.86
20.9B_C8_hmr_from_en_TRG,49.4,60.04,54.7,52.46,56.66,58.54,55.9
20.9B_C8_hmr_from_en_ENSEMBLE,49.64,60.04,54.7,52.54,55.98,58.58,55.76


#### 