In [1]:
from sklearn.preprocessing import LabelEncoder
# Two-letter language codes handled by this notebook, grouped by first letter.
languages = [
    'af', 'ar',
    'bg', 'bn', 'bs',
    'ca', 'cs',
    'da', 'de',
    'el', 'en', 'es', 'et',
    'fa', 'fi', 'fr',
    'he', 'hi', 'hr', 'hu',
    'id', 'it',
    'lt', 'lv',
    'mk', 'ms',
    'nl', 'no',
    'pl', 'pt',
    'ro', 'ru',
    'sk', 'sl', 'sq', 'sv',
    'ta', 'tl', 'tr',
    'uk',
    'vi',
]

# Names of the data splits available for each language.
partitions = [
    'train',
    'dev',
    'test',
]

# Encoder mapping the B-/I- tags for LOC, ORG and PER (plus 'O') to integer ids.
label_le = LabelEncoder()
label_le.fit([
    'B-LOC', 'B-ORG', 'B-PER',
    'I-LOC', 'I-ORG', 'I-PER',
    'O',
])

In [2]:
import tarfile
import pandas as pd

def get_df_dict(tarfile_path):
    """Parse every readable member of a tar archive into a DataFrame.

    Each member is read as a header-less, tab-separated table whose columns
    are named ``sent_idx``, ``word_idx``, ``word_id``, ``label_id``,
    ``label`` and ``word``.

    Parameters
    ----------
    tarfile_path : str
        Path of the archive, opened with ``tarfile.open`` in its default
        read mode.

    Returns
    -------
    dict
        Maps the name of each member inside the archive to its DataFrame.
        Members that have no file content (e.g. directories) are skipped.
    """
    column_names = ['sent_idx', 'word_idx', 'word_id', 'label_id', 'label', 'word']
    df_dict = {}
    # The context manager closes the archive even when parsing a member fails.
    with tarfile.open(tarfile_path) as tar:
        for member in tar.getmembers():
            f = tar.extractfile(member)
            # extractfile() yields None for directories and other non-file
            # members; handing that to read_csv would raise.
            if f is None:
                continue
            with f:
                df_dict[member.name] = pd.read_csv(f, sep='\t', header=None, names=column_names)
    return df_dict

In [3]:
import os

# Directory holding the '<target>.tar.gz' input archives, and the root
# directory that receives one sub-folder per target/partition pair.
data_path = './raw_data_wiki/'
output_path = './data_wiki/'
# exist_ok=True replaces the racy exists()/mkdir() pair and also creates any
# missing parent directories.
os.makedirs(output_path, exist_ok=True)

for target in ['af']:  # we only create data for 'af'; iterate over 'languages' to build every target
    df_dict = get_df_dict(os.path.join(data_path, target + '.tar.gz'))

    for partition in partitions:
        # The target's own table for this partition; its 'label' column is
        # written out under the name 'truth'.
        df_truth = df_dict[target + '_' + partition][['sent_idx', 'word_idx', 'word', 'word_id', 'label']]
        df_truth.columns = ['sent_idx', 'word_idx', 'word', 'word_id', 'truth']

        # One column per other language: the integer-encoded 'label' column of
        # the '<target>_<source>_<partition>' table.
        df_label = pd.DataFrame({source: label_le.transform(df_dict[target + '_' + source + '_' + partition]['label'])
                                 for source in languages if source != target})

        output_path_local = os.path.join(output_path, target + '_' + partition)
        os.makedirs(output_path_local, exist_ok=True)

        # index=False is the documented way to omit the row index; the former
        # index=None only worked because None is falsy.
        df_truth.to_csv(os.path.join(output_path_local, 'truth.csv'), index=False)
        df_label.to_csv(os.path.join(output_path_local, 'label.csv'), index=False)