## get all stats

In [1]:
import os, glob
import pandas as pd

# Glob pattern for the per-dataset EDA summaries: ./data/<dataset>/eda/<name>.csv
gzs = './data/*/eda//*.csv'
# Materialise the matches once so they can be scanned repeatedly.
paths = list(glob.iglob(gzs))

In [2]:
# generate word level
# Gather the word-level EDA summary CSV of every dataset into one table and
# write it to ./word_level_info.csv.
ds_names = []
dfs = []
for p in paths:
    if 'infor_word' in os.path.basename(p):
        # Paths look like ./data/<dataset>/eda/<file>.csv, so the dataset is
        # the directory two levels above the file.  os.path copes with both
        # "/" and "\" separators, whereas splitting on a literal backslash
        # only works with Windows-style glob results.
        name = os.path.basename(os.path.dirname(os.path.dirname(os.path.normpath(p))))
        ds_names.append(name)
        # Label the rows while loading so the dataset name stays aligned even
        # if a summary file holds more than one row.
        dfs.append(pd.read_csv(p).assign(dataset=name))

if not dfs:
    # pd.concat([]) raises ValueError too, but with a much less useful message.
    raise ValueError("no 'infor_word' CSV files found among the globbed paths")

df = pd.concat(dfs, ignore_index=True)

# Keep only the reported statistics, dataset name first.
df = df[['dataset', 'avg_fa', 'min_fa', 'max_fa', '92%_fa', 'all_fa', 'unique_fa', 'avg_en',
       'min_en', 'max_en', '92%_en', 'all_en', 'unique_en']]

df.to_csv('./word_level_info.csv', index=False)

In [3]:
# generate character level
# Gather the character-level EDA summary CSV of every dataset into one table
# and write it to ./char_level_info.csv.
ds_names = []
dfs = []
for p in paths:
    if 'infor_char' in os.path.basename(p):
        # Paths look like ./data/<dataset>/eda/<file>.csv, so the dataset is
        # the directory two levels above the file.  os.path copes with both
        # "/" and "\" separators, whereas splitting on a literal backslash
        # only works with Windows-style glob results.
        name = os.path.basename(os.path.dirname(os.path.dirname(os.path.normpath(p))))
        ds_names.append(name)
        # Label the rows while loading so the dataset name stays aligned even
        # if a summary file holds more than one row.
        dfs.append(pd.read_csv(p).assign(dataset=name))

if not dfs:
    # pd.concat([]) raises ValueError too, but with a much less useful message.
    raise ValueError("no 'infor_char' CSV files found among the globbed paths")

df = pd.concat(dfs, ignore_index=True)
# Keep only the reported statistics, dataset name first.
df = df[['dataset', 'avgc_fa', 'minc_fa', 'maxc_fa', 'avgc_en', 'minc_en', 'maxc_en']]
df.to_csv('./char_level_info.csv', index=False)



## split datasets


In [2]:
import os, glob
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Every corpus that can be split ...
datasets = ['Mizan', 'PEPC_Bidirectional', 'PEPC_Onedirectional', 'TEP', 'TEP++', 'Quran', 'Bible']
# ... and the subset that is actually (re)split when this cell runs.
datasets = ['Bible',]

root = './data/'

# Number of shuffled rows set aside as the dev split.
dev_size = 5000
# One seed for both the shuffle and the train/test split keeps the written
# files reproducible across runs.
seed = 42

for d in datasets:
    paths = glob.glob('{}/{}/*'.format(root, d))

    train_path = '{}/{}/train.csv'.format(root, d)
    test_path = '{}/{}/test.csv'.format(root, d)
    dev_path = '{}/{}/dev.csv'.format(root, d)

    # Reset per dataset: without this a missing en-fa.csv would silently
    # re-split whatever frame was loaded before and overwrite this dataset's
    # train/test/dev files with it.
    df = None
    for p in paths:
        if 'en-fa.csv' in p:
            df = pd.read_csv(p)
    if df is None:
        raise FileNotFoundError(
            "no en-fa.csv found for dataset {!r} under {}".format(d, root))
    if len(df) <= dev_size:
        raise ValueError(
            "dataset {!r} has {} rows; more than {} are needed to build "
            "dev, train and test splits".format(d, len(df), dev_size))

    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df_dev = df[:dev_size]
    # Corpora above one million rows only give up 1% to the test split.
    test_size = 0.01 if len(df) > 1e+6 else 0.1
    df_train, df_test = train_test_split(df[dev_size:], test_size=test_size, random_state=seed)

    # Sanity report: the three splits must add up to the full corpus.
    print("{}".format(d))
    print("{} ||| {}".format(len(df), len(df_train)+len(df_test)+ len(df_dev)))
    print("df_train:", len(df_train))
    print("df_test:", len(df_test))
    print("df_dev:", len(df_dev))
    print("_____________________")
    df_train.to_csv(train_path, index=False)
    df_test.to_csv(test_path, index=False)
    df_dev.to_csv(dev_path, index=False)


Bible
62033 ||| 62033
df_train: 51329
df_test: 5704
df_dev: 5000
_____________________


## dataset each file count


In [4]:
import os, glob
import pandas as pd

In [5]:
datasets = ['Mizan', 'PEPC_Bidirectional', 'PEPC_Onedirectional', 'TEP', 'TEP++', 'OpenSubtitles', 'Bible', 'Quran', 'ParsiNLU']
root = './data/'

# Count the rows of every split of every dataset and write the table to
# dataset_info.csv.  The records are collected in a list and turned into a
# frame once: DataFrame.append is gone in pandas 2.0, and growing a frame row
# by row copies it on every iteration.
rows = []
for d in datasets:
    train_path = '{}/{}/train.csv'.format(root, d)
    test_path = '{}/{}/test.csv'.format(root, d)
    dev_path = '{}/{}/dev.csv'.format(root, d)

    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)
    df_dev = pd.read_csv(dev_path)

    n_train, n_dev, n_test = len(df_train), len(df_dev), len(df_test)
    rows.append({'datasets': d,
                 'train': n_train,
                 'dev': n_dev,
                 'test': n_test,
                 'all': n_train + n_dev + n_test})

# Explicit columns keep the header stable even when `datasets` is empty.
df = pd.DataFrame(rows, columns=['datasets', 'train', 'dev', 'test', 'all'])

df.to_csv('dataset_info.csv', index=False)


## parsiNLU

In [9]:
import pandas as pd

In [10]:
# Folders of the two ParsiNLU translation directions.
dir_en_fa, dir_fa_en = './data/ParsiNLU/en-fa/', './data/ParsiNLU/fa-en/'

In [15]:
# Load the three fa-en splits.  No column names are given, so pandas takes
# the header from the first line of each file; malformed lines are skipped.
df_fa_en_train, df_fa_en_dev, df_fa_en_test = (
    pd.read_csv(dir_fa_en + fname, sep='\t', on_bad_lines='skip')
    for fname in ('train.tsv', 'dev.tsv', 'test.tsv')
)

In [37]:
# Load the three en-fa splits as headerless TSVs with explicit column names;
# malformed lines are skipped.
df_en_fa_train, df_en_fa_dev, df_en_fa_test = (
    pd.read_csv(dir_en_fa + fname, sep='\t', on_bad_lines='skip', names=['en', 'fa', 'src'])
    for fname in ('train.tsv', 'dev.tsv', 'test.tsv')
)

In [39]:
# Write each en-fa split next to the other datasets' CSVs, keeping only the
# sentence pair columns.
for frame, out_path in (
    (df_en_fa_train, './data/ParsiNLU/train.csv'),
    (df_en_fa_dev, './data/ParsiNLU/dev.csv'),
    (df_en_fa_test, './data/ParsiNLU/test.csv'),
):
    frame[['en', 'fa']].to_csv(out_path, index=False)

In [41]:
# Stack train, dev and test (in that order) into the full parallel corpus.
pd.concat(
    [part[['en', 'fa']] for part in (df_en_fa_train, df_en_fa_dev, df_en_fa_test)]
).to_csv("./data/ParsiNLU/en-fa.csv", index=False)

## get experiment results

In [1]:
import os, glob
import pandas as pd

# "*" does not match names starting with a dot, so dot-prefixed entries under
# ./outs are globbed with their own pattern.
gzs_1 = './outs/.*'
gzs_2 = './outs/*'
paths = [*glob.glob(gzs_1), *glob.glob(gzs_2)]


cols = ['data', 'mt5-small', 'mt5-base', 'mt5-large',  'distilled-600M','distilled-1.3B']

datasets = ['Mizan', 'Bidirectional', 'Onedirectional', 'TEP', 'TEP++', 'Quran', 'Bible',  "ParsiNLU", "OpenSubtitles"]

# One table per translation direction, shaped model -> dataset -> score.
# Every cell starts at -1; the first entry of `cols` is a label, not a model.
info_en_fa = {c: {d: -1 for d in datasets} for c in cols[1:]}
info_fa_en = {c: {d: -1 for d in datasets} for c in cols[1:]}

In [None]:
# Fill the two score tables from the run directories and export them.
for p in paths:
    # A run directory is named "<model>_<dataset>_<src>_<tgt>_<suffix>", e.g.
    # ".-google-mt5-small_Bible_fa_en_7".  os.path.basename works with both
    # "/" and "\" separators, whereas splitting on a literal backslash only
    # works with Windows-style glob results.
    parts = os.path.basename(os.path.normpath(p)).split('_')
    # Keep the last two hyphen-separated pieces of the model id ("mt5-small").
    model = '-'.join(parts[0].split('-')[-2:])
    dataset = parts[-4]
    way = '{}_{}'.format(parts[-3], parts[-2])
    df = pd.read_csv(os.path.join(p, 'bl_score_{}.csv'.format(way)))
    # Scores are stored as fractions; report them as percentages.
    score = df['blue_score'].values[0] * 100

    if way == 'fa_en':
        info_fa_en[model][dataset] = score
    else:
        info_en_fa[model][dataset] = score



df_en_fa = pd.DataFrame(info_en_fa)
df_fa_en = pd.DataFrame(info_fa_en)

# The frames are indexed by dataset name.  Label the rows from each frame's
# own index rather than from one hard-coded model's key list, so the labels
# cannot drift from the rows they describe.
df_en_fa.insert(0, 'dataset', df_en_fa.index)
df_fa_en.insert(0, 'dataset', df_fa_en.index)

# Same column order (dataset first, then models) in both direction tables.
cols = df_en_fa.columns.tolist()
df_fa_en = df_fa_en[cols]

# Side-by-side table: en->fa columns followed by fa->en columns.
df_f = pd.concat([df_en_fa, df_fa_en.reindex(df_en_fa.index)], axis=1)

df_en_fa.to_csv("./en_fa.csv", index=False)
df_fa_en.to_csv("./fa_en.csv", index=False)

# final.csv is written with its index (the dataset names), so the duplicated
# 'dataset' columns are redundant there.
df_f.drop(columns=['dataset'], inplace=True)
df_f.to_csv("./final.csv")

## generate different bleu scores

In [3]:
import os, glob
import pandas as pd
from torchmetrics.functional import sacre_bleu_score
 
# "*" does not match names starting with a dot, so dot-prefixed run
# directories are globbed with their own pattern.
gzs_1 = './outs_d1/.*'
gzs_2 = './outs_d1/*'
paths = [*glob.glob(gzs_1), *glob.glob(gzs_2)]

def get_blue_score(df_test, predicted_target, max_n, tar):
    """Score predictions against the reference column of a test frame.

    Args:
        df_test: frame whose column ``tar`` holds the reference sentences.
        predicted_target: predicted sentences, in the same order as the rows
            of ``df_test``.
        max_n: largest n-gram order, passed to ``sacre_bleu_score`` as
            ``n_gram``.
        tar: name of the reference column in ``df_test``.

    Returns:
        The value of ``sacre_bleu_score`` converted to a Python number.
    """
    # Only as many references as there are predictions are used.
    n_predicted = len(predicted_target)
    # Each prediction gets a one-element list of references.
    references = [[ref] for ref in df_test[tar].values[:n_predicted]]
    return sacre_bleu_score(predicted_target, references, n_gram=max_n).item()


In [25]:

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(color_codes=True)


datasets = ['Mizan', 'PEPC_Bidirectional', 'PEPC_Onedirectional', 'TEP', 'TEP++', 'Quran', 'Bible',  "ParsiNLU", "OpenSubtitles"]

gzs_1 = './outs_d1/*'
gzs_2 = './outs_d1/.*'

# Alternative experiment folder:
# gzs_1 = './outs/*'
# gzs_2 = './outs/.*'

ngrams = [3, 4, 5, 6, 7]
data_dir = './data_d1/'
out_dir = './ngrams'

paths = glob.glob(gzs_1) + glob.glob(gzs_2)

# For every dataset: score each run's predictions at several n-gram orders,
# then write one CSV and one bar chart per translation direction.
for d in datasets:
    os.makedirs("{}/{}/".format(out_dir, d), exist_ok=True)

    # Run directories are named "<model>_<dataset>_<src>_<tgt>_<suffix>".
    # Match the dataset as a whole underscore-delimited token: a plain
    # substring test would also hand the "TEP++" runs to "TEP".
    token = '_{}_'.format(d)
    each_bl_pathes = [p for p in paths
                      if token in os.path.basename(os.path.normpath(p))]

    rows = {'en_fa': [], 'fa_en': []}
    data = None
    for p in each_bl_pathes:
        # os.path.basename works with both "/" and "\" separators, whereas
        # splitting on a literal backslash only works with Windows-style
        # glob results.
        parts = os.path.basename(os.path.normpath(p)).split('_')
        # Keep the last two hyphen-separated pieces of the model id.
        model = '-'.join(parts[0].split('-')[-2:])
        way = '{}_{}'.format(parts[-3], parts[-2])
        # The target language doubles as the reference column in test.csv.
        target = parts[-2]
        if way not in rows:
            # Only the two known directions are reported.
            continue

        if data is None:
            # The test set is the same for every run of this dataset, so it
            # is loaded once, and only when there is something to score.
            data = pd.read_csv(data_dir + d + '/test.csv')
        predicted = pd.read_csv(os.path.join(p, 'predicted_{}.csv'.format(way)))

        for max_n in ngrams:
            bl_score = get_blue_score(data, predicted['predicted'].values, max_n, target)
            rows[way].append({"model": model, "score": bl_score * 100, 'n_gram': max_n})

    df_en_fa = pd.DataFrame(rows['en_fa'])
    df_fa_en = pd.DataFrame(rows['fa_en'])

    for frame, tag in ((df_en_fa, 'en-fa'), (df_fa_en, 'fa-en')):
        if len(frame) == 0:
            continue
        frame.to_csv('{}/{}/{}-ngrams.csv'.format(out_dir, d, tag), index=False)
        # catplot (the current name of factorplot) is figure-level: it builds
        # its own figure, so the plot must be saved through the returned grid.
        # Saving a figure made beforehand with plt.figure() writes a blank image.
        grid = sns.catplot(x='n_gram', y='score', hue='model', data=frame, kind='bar')
        grid.savefig('{}/{}/{}-ngrams.png'.format(out_dir, d, tag))
        # Release the figure; otherwise every dataset leaves one open.
        plt.close(grid.fig)

fa_en
./outs\.-google-mt5-small_Bible_fa_en_7/predicted_fa_en.csv


## generate each bleu score


In [None]:
import os, glob
import pandas as pd
from torchmetrics.functional import sacre_bleu_score

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(color_codes=True)




datasets = ['Mizan', 'Bidirectional', 'Onedirectional', 'TEP', 'TEP++', 'Quran', 'Bible',  "ParsiNLU", "OpenSubtitles"]

# Alternative experiment folder:
# gzs_1 = './outs_d1/*'
# gzs_2 = './outs_d1/.*'

gzs_1 = './outs/*'
gzs_2 = './outs/.*'

out_dir = './each_bl'

paths = glob.glob(gzs_1) + glob.glob(gzs_2)

# For every dataset: collect each run's recorded series of BLEU scores, then
# write one CSV and one line chart (one line per model) per direction.
for d in datasets:
    os.makedirs("{}/{}/".format(out_dir, d), exist_ok=True)

    # Run directories are named "<model>_<dataset>_<src>_<tgt>_<suffix>".
    # Match the dataset as a whole underscore-delimited token: a plain
    # substring test would also hand the "TEP++" runs to "TEP", letting them
    # overwrite the TEP scores of the same model.
    token = '_{}_'.format(d)
    each_bl_pathes = [p for p in paths
                      if token in os.path.basename(os.path.normpath(p))]

    scores = {'en-fa': {}, 'fa-en': {}}
    for p in each_bl_pathes:
        # os.path.basename works with both "/" and "\" separators, whereas
        # splitting on a literal backslash only works with Windows-style
        # glob results.
        parts = os.path.basename(os.path.normpath(p)).split('_')
        # Keep the last two hyphen-separated pieces of the model id.
        model = '-'.join(parts[0].split('-')[-2:])
        way = '{}-{}'.format(parts[-3], parts[-2])
        if way not in scores:
            # Only the two known directions are reported.
            continue
        # Scores are stored as fractions; report them as percentages.
        blue_scores = pd.read_csv(os.path.join(p, 'each_bl_score.csv'))['blue_score'].values * 100
        scores[way][model] = blue_scores

    df_en_fa = pd.DataFrame(scores['en-fa'])
    df_fa_en = pd.DataFrame(scores['fa-en'])

    for frame, tag in ((df_en_fa, 'en-fa'), (df_fa_en, 'fa-en')):
        if len(frame) == 0:
            continue
        frame.to_csv('{}/{}/{}-bls.csv'.format(out_dir, d, tag), index=False)
        # Number the rows from 1 for the x-axis.  The length comes from the
        # data, so a series that is not exactly 7 entries long still plots.
        frame = frame.set_index(np.arange(1, len(frame) + 1))
        fig = plt.figure()
        # lineplot is axes-level and draws on the current figure, i.e. `fig`.
        sns.lineplot(data=frame, markers=True, dashes=False)
        fig.savefig('{}/{}/{}-bls.png'.format(out_dir, d, tag))
        # Release the figure; otherwise every dataset leaves one open.
        plt.close(fig)