In [22]:
# The dataset is available upon request from:
# https://github.com/Elbria/xformal-FoST

import os
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore FutureWarning for pandas

# Number of training sentence pairs taken per language; the evaluation rows
# added afterwards are numbered starting from this value.
DATA_SIZE = 6000

In [23]:
# Load data

def readlines(data_path) -> list:
    """Read a text file and return its lines.

    Args:
        data_path: Path of the file to read.

    Returns:
        list: One string per line of the file. Each string keeps its trailing
        newline, exactly as produced by ``file.readlines()``.
    """
    # Decode explicitly instead of relying on open()'s platform-dependent
    # default encoding; the corpus files are presumably UTF-8 (TODO confirm).
    with open(data_path, 'r', encoding='utf-8') as f:
        return f.readlines()

# Raw corpus lines, keyed first by language code and then by file role.
# The inner mapping below is only a manifest of paths (role -> file); each file
# is read through readlines() in the same order in which it is listed.
Raw_data = {
    language: {role: readlines(path) for role, path in sources.items()}
    for language, sources in {
        'eng': {
            'em_formal': 'GYAFC_Corpus/Entertainment_Music/train/formal',
            'em_informal': 'GYAFC_Corpus/Entertainment_Music/train/informal',
            'fr_formal': 'GYAFC_Corpus/Family_Relationships/train/formal',
            'fr_informal': 'GYAFC_Corpus/Family_Relationships/train/informal',
        },
        'fra': {
            'em_formal': 'XFORMAL/gyafc_translated/fr/Entertainment_Music/train/formal',
            'em_informal': 'XFORMAL/gyafc_translated/fr/Entertainment_Music/train/informal',
            'fr_formal': 'XFORMAL/gyafc_translated/fr/Family_Relationships/train/formal',
            'fr_informal': 'XFORMAL/gyafc_translated/fr/Family_Relationships/train/informal',
            'eval_informal': 'XFORMAL/xformal_eval/french/informal',
            'eval_formal0': 'XFORMAL/xformal_eval/french/formal0',
            'eval_formal1': 'XFORMAL/xformal_eval/french/formal1',
            'eval_formal2': 'XFORMAL/xformal_eval/french/formal2',
            'eval_formal3': 'XFORMAL/xformal_eval/french/formal3',
        },
        'por': {
            'em_formal': 'XFORMAL/gyafc_translated/pt/Entertainment_Music/train/formal',
            'em_informal': 'XFORMAL/gyafc_translated/pt/Entertainment_Music/train/informal',
            'fr_formal': 'XFORMAL/gyafc_translated/pt/Family_Relationships/train/formal',
            'fr_informal': 'XFORMAL/gyafc_translated/pt/Family_Relationships/train/informal',
            'eval_informal': 'XFORMAL/xformal_eval/brazilian_portuguese/informal',
            'eval_formal0': 'XFORMAL/xformal_eval/brazilian_portuguese/formal0',
            'eval_formal1': 'XFORMAL/xformal_eval/brazilian_portuguese/formal1',
            'eval_formal2': 'XFORMAL/xformal_eval/brazilian_portuguese/formal2',
            'eval_formal3': 'XFORMAL/xformal_eval/brazilian_portuguese/formal3',
        },
        'ita': {
            'em_formal': 'XFORMAL/gyafc_translated/it/Entertainment_Music/train/formal',
            'em_informal': 'XFORMAL/gyafc_translated/it/Entertainment_Music/train/informal',
            'fr_formal': 'XFORMAL/gyafc_translated/it/Family_Relationships/train/formal',
            'fr_informal': 'XFORMAL/gyafc_translated/it/Family_Relationships/train/informal',
            'eval_informal': 'XFORMAL/xformal_eval/italian/informal',
            'eval_formal0': 'XFORMAL/xformal_eval/italian/formal0',
            'eval_formal1': 'XFORMAL/xformal_eval/italian/formal1',
            'eval_formal2': 'XFORMAL/xformal_eval/italian/formal2',
            'eval_formal3': 'XFORMAL/xformal_eval/italian/formal3',
        },
    }.items()
}

# One empty frame per language, all sharing the same column layout.
# The generator yields a fresh DataFrame for each name, so the four frames
# are independent objects.
eng_df, fra_df, por_df, ita_df = (
    pd.DataFrame(columns=['idx', 'text_number', 'formal_text', 'informal_text', 'category', 'split'])
    for _ in range(4)
)

In [24]:
# Create dataframe for English
# All rows are collected first and the frame is constructed once:
# DataFrame.append was removed in pandas 2.0, and rebuilding the frame here
# (instead of appending to the existing one) keeps the cell re-runnable.
eng_df = pd.DataFrame(
    [
        {
            'idx': i,
            'text_number': 1,
            'formal_text': Raw_data['eng']['fr_formal'][i],
            'informal_text': Raw_data['eng']['fr_informal'][i],
            'category': 'fr',
            'split': 'train',
        }
        for i in range(DATA_SIZE)  # first DATA_SIZE Family_Relationships pairs
    ],
    columns=['idx', 'text_number', 'formal_text', 'informal_text', 'category', 'split'],
)

assert len(eng_df) == DATA_SIZE, f'English data size is not {DATA_SIZE}, but {len(eng_df)}'

In [25]:
# Create dataframe for French
# Rows are collected in a plain list and converted to a frame once at the end:
# DataFrame.append was removed in pandas 2.0, and rebuilding from scratch keeps
# the cell re-runnable.
fra_raw = Raw_data['fra']
fra_formal_keys = ['eval_formal0', 'eval_formal1', 'eval_formal2', 'eval_formal3']

# Training rows: the first DATA_SIZE Family_Relationships pairs.
fra_records = [
    {
        'idx': i,
        'text_number': 1,
        'formal_text': fra_raw['fr_formal'][i],
        'informal_text': fra_raw['fr_informal'][i],
        'category': 'fr',
        'split': 'train',
    }
    for i in range(DATA_SIZE)
]

assert len(fra_records) == DATA_SIZE

# The evaluation files must be line-aligned: 1000 lines each.
assert len(fra_raw['eval_informal']) == 1000
assert all(len(fra_raw[key]) == len(fra_raw['eval_informal']) for key in fra_formal_keys)

# Evaluation rows: lines 0-99 form the validation set, lines 100-999 the test
# set. Every informal line is paired with the same line of each of the four
# formal files (text_number 1-4), and idx continues after the training rows.
for split_name, offsets, expected_total in (
    ('valid', range(0, 100), DATA_SIZE + 400),
    ('test', range(100, 1000), DATA_SIZE + 4000),
):
    for offset in offsets:
        for text_number, formal_key in enumerate(fra_formal_keys, start=1):
            fra_records.append({
                'idx': DATA_SIZE + offset,
                'text_number': text_number,
                'formal_text': fra_raw[formal_key][offset],
                'informal_text': fra_raw['eval_informal'][offset],
                'category': 'eval',
                'split': split_name,
            })
    assert len(fra_records) == expected_total

fra_df = pd.DataFrame(
    fra_records,
    columns=['idx', 'text_number', 'formal_text', 'informal_text', 'category', 'split'],
)

assert len(fra_df) == DATA_SIZE + 4000

In [26]:
# Create dataframe for Brazilian Portuguese
# Rows are collected in a plain list and converted to a frame once at the end:
# DataFrame.append was removed in pandas 2.0, and rebuilding from scratch keeps
# the cell re-runnable.
por_raw = Raw_data['por']
por_formal_keys = ['eval_formal0', 'eval_formal1', 'eval_formal2', 'eval_formal3']

# Training rows: the first DATA_SIZE Family_Relationships pairs.
por_records = [
    {
        'idx': i,
        'text_number': 1,
        'formal_text': por_raw['fr_formal'][i],
        'informal_text': por_raw['fr_informal'][i],
        'category': 'fr',
        'split': 'train',
    }
    for i in range(DATA_SIZE)
]

assert len(por_records) == DATA_SIZE

# The evaluation files must be line-aligned: 1000 lines each.
assert len(por_raw['eval_informal']) == 1000
assert all(len(por_raw[key]) == len(por_raw['eval_informal']) for key in por_formal_keys)

# Evaluation rows: lines 0-99 form the validation set, lines 100-999 the test
# set. Every informal line is paired with the same line of each of the four
# formal files (text_number 1-4), and idx continues after the training rows.
for split_name, offsets, expected_total in (
    ('valid', range(0, 100), DATA_SIZE + 400),
    ('test', range(100, 1000), DATA_SIZE + 4000),
):
    for offset in offsets:
        for text_number, formal_key in enumerate(por_formal_keys, start=1):
            por_records.append({
                'idx': DATA_SIZE + offset,
                'text_number': text_number,
                'formal_text': por_raw[formal_key][offset],
                'informal_text': por_raw['eval_informal'][offset],
                'category': 'eval',
                'split': split_name,
            })
    assert len(por_records) == expected_total

por_df = pd.DataFrame(
    por_records,
    columns=['idx', 'text_number', 'formal_text', 'informal_text', 'category', 'split'],
)

assert len(por_df) == DATA_SIZE + 4000

In [27]:
# Create dataframe for Italian
# Rows are collected in a plain list and converted to a frame once at the end:
# DataFrame.append was removed in pandas 2.0, and rebuilding from scratch keeps
# the cell re-runnable.
ita_raw = Raw_data['ita']
ita_formal_keys = ['eval_formal0', 'eval_formal1', 'eval_formal2', 'eval_formal3']

# Training rows: the first DATA_SIZE Family_Relationships pairs.
ita_records = [
    {
        'idx': i,
        'text_number': 1,
        'formal_text': ita_raw['fr_formal'][i],
        'informal_text': ita_raw['fr_informal'][i],
        'category': 'fr',
        'split': 'train',
    }
    for i in range(DATA_SIZE)
]

assert len(ita_records) == DATA_SIZE

# The evaluation files must be line-aligned: 1000 lines each.
assert len(ita_raw['eval_informal']) == 1000
assert all(len(ita_raw[key]) == len(ita_raw['eval_informal']) for key in ita_formal_keys)

# Evaluation rows: lines 0-99 form the validation set, lines 100-999 the test
# set. Every informal line is paired with the same line of each of the four
# formal files (text_number 1-4), and idx continues after the training rows.
for split_name, offsets, expected_total in (
    ('valid', range(0, 100), DATA_SIZE + 400),
    ('test', range(100, 1000), DATA_SIZE + 4000),
):
    for offset in offsets:
        for text_number, formal_key in enumerate(ita_formal_keys, start=1):
            ita_records.append({
                'idx': DATA_SIZE + offset,
                'text_number': text_number,
                'formal_text': ita_raw[formal_key][offset],
                'informal_text': ita_raw['eval_informal'][offset],
                'category': 'eval',
                'split': split_name,
            })
    assert len(ita_records) == expected_total

ita_df = pd.DataFrame(
    ita_records,
    columns=['idx', 'text_number', 'formal_text', 'informal_text', 'category', 'split'],
)

assert len(ita_df) == DATA_SIZE + 4000

In [28]:
# Save dataframes into json files

# exist_ok=True replaces the separate existence check, so there is no gap
# between checking for the directory and creating it.
os.makedirs('XFORMAL/processed', exist_ok=True)

# Each frame is written as a JSON array with one object per row;
# force_ascii=False writes non-ASCII characters as-is instead of \u escapes.
for frame, json_path in (
    (eng_df, 'XFORMAL/processed/gyafc_english.json'),
    (fra_df, 'XFORMAL/processed/xformal_french.json'),
    (por_df, 'XFORMAL/processed/xformal_bra_portuguese.json'),
    (ita_df, 'XFORMAL/processed/xformal_italian.json'),
):
    frame.to_json(json_path, orient='records', force_ascii=False, indent=4)