In [1]:
# The dataset is available upon request from:
# https://github.com/Elbria/xformal-FoST

import os
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore FutureWarning for pandas

In [2]:
# Load data

def readlines(data_path, encoding='utf-8') -> list:
    """Read a text file and return its lines.

    Args:
        data_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that accented characters are decoded the same way on every
            platform instead of depending on the locale's default encoding.

    Returns:
        A list with one string per line; trailing newline characters are kept.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid for `encoding`.
    """
    with open(data_path, 'r', encoding=encoding) as f:
        lines = f.readlines()
    return lines

# Raw_data maps a language key to a dict of line lists, one list per source file.
# Each entry below is: (language key, directory under gyafc_translated/,
# directory under xformal_eval/).
language_dirs = [
    ('fra', 'fr', 'french'),
    ('por', 'pt', 'brazilian_portuguese'),
    ('ita', 'it', 'italian'),
]
# (key prefix, directory name) for the two translated training domains.
domain_dirs = [
    ('em', 'Entertainment_Music'),
    ('fr', 'Family_Relationships'),
]
# File names read from each language's evaluation directory.
eval_files = ['informal', 'formal0', 'formal1', 'formal2', 'formal3']

Raw_data = {}
for lang_key, train_dir, eval_dir in language_dirs:
    lang_sources = {}
    # Training files: <prefix>_formal then <prefix>_informal for each domain.
    for prefix, domain in domain_dirs:
        for register in ('formal', 'informal'):
            lang_sources[f'{prefix}_{register}'] = readlines(
                f'XFORMAL/gyafc_translated/{train_dir}/{domain}/train/{register}'
            )
    # Evaluation files: stored under the same name prefixed with 'eval_'.
    for eval_name in eval_files:
        lang_sources[f'eval_{eval_name}'] = readlines(
            f'XFORMAL/xformal_eval/{eval_dir}/{eval_name}'
        )
    Raw_data[lang_key] = lang_sources

# One empty frame per language, all with the same column layout.
# The generator builds three separate DataFrame objects (not one shared object).
fra_df, por_df, ita_df = (
    pd.DataFrame(columns=['idx', 'text', 'label', 'category', 'split'])
    for _ in range(3)
)

In [3]:
# Create dataframe for French
# Label: 0 for informal, 1 for formal
#
# Rows are collected in plain lists and turned into DataFrames in one step:
# DataFrame.append was removed in pandas 2.0, and growing a frame one row at a
# time copies the whole frame on every iteration.

fra_columns = ['idx', 'text', 'label', 'category', 'split']
fra_raw = Raw_data['fra']

# Training pool: formal lines first, then informal lines. `idx` is the running
# position across both files (formal: 0..n-1, informal: n..).
fra_train_records = []
for fra_source, fra_label in [('fr_formal', 1), ('fr_informal', 0)]:
    for line in fra_raw[fra_source]:
        fra_train_records.append({
            'idx': len(fra_train_records),
            'text': line.strip(),
            'label': fra_label,
            'category': 'fr',
            'split': 'train',
        })
fra_train_df = pd.DataFrame(fra_train_records, columns=fra_columns)

# train-valid split: 95% - 5%
fra_train_shuffled = fra_train_df.sample(frac=1, random_state=42).reset_index(drop=True) # shuffle
# Select 5% as validation set - the first 5% of the shuffled rows get split 'valid'
fra_valid_size = int(len(fra_train_shuffled) * 0.05)
fra_train_shuffled.loc[fra_train_shuffled.index[:fra_valid_size], 'split'] = 'valid'

# eval data: `idx` keeps counting from the end of the training pool.
# NOTE(review): Raw_data['fra']['eval_formal3'] is loaded but not added to the
# test split here - confirm that this is intended.
fra_test_offset = len(fra_train_records)
fra_test_records = []
for fra_source, fra_label in [('eval_informal', 0), ('eval_formal0', 1), ('eval_formal1', 1), ('eval_formal2', 1)]:
    for line in fra_raw[fra_source]:
        fra_test_records.append({
            'idx': fra_test_offset + len(fra_test_records),
            'text': line.strip(),
            'label': fra_label,
            'category': 'fr',
            'split': 'test',
        })
fra_test_df = pd.DataFrame(fra_test_records, columns=fra_columns)

# Shuffled train/valid rows first, then the test rows in file order.
fra_df = pd.concat([fra_train_shuffled, fra_test_df], ignore_index=True)

In [4]:
# Create dataframe for Portuguese
# Label: 0 for informal, 1 for formal
#
# Rows are collected in plain lists and turned into DataFrames in one step:
# DataFrame.append was removed in pandas 2.0, and growing a frame one row at a
# time copies the whole frame on every iteration.
# NOTE(review): 'category' is 'por' here while the French cell uses 'fr' -
# confirm whether the column is meant to hold the language or the domain.

por_columns = ['idx', 'text', 'label', 'category', 'split']
por_raw = Raw_data['por']

# Training pool: formal lines first, then informal lines. `idx` is the running
# position across both files (formal: 0..n-1, informal: n..).
por_train_records = []
for por_source, por_label in [('fr_formal', 1), ('fr_informal', 0)]:
    for line in por_raw[por_source]:
        por_train_records.append({
            'idx': len(por_train_records),
            'text': line.strip(),
            'label': por_label,
            'category': 'por',
            'split': 'train',
        })
por_train_df = pd.DataFrame(por_train_records, columns=por_columns)

# train-valid split: 95% - 5%
por_train_shuffled = por_train_df.sample(frac=1, random_state=42).reset_index(drop=True) # shuffle
# Select 5% as validation set - the first 5% of the shuffled rows get split 'valid'
por_valid_size = int(len(por_train_shuffled) * 0.05)
por_train_shuffled.loc[por_train_shuffled.index[:por_valid_size], 'split'] = 'valid'

# eval data: `idx` keeps counting from the end of the training pool.
# NOTE(review): Raw_data['por']['eval_formal3'] is loaded but not added to the
# test split here - confirm that this is intended.
por_test_offset = len(por_train_records)
por_test_records = []
for por_source, por_label in [('eval_informal', 0), ('eval_formal0', 1), ('eval_formal1', 1), ('eval_formal2', 1)]:
    for line in por_raw[por_source]:
        por_test_records.append({
            'idx': por_test_offset + len(por_test_records),
            'text': line.strip(),
            'label': por_label,
            'category': 'por',
            'split': 'test',
        })
por_test_df = pd.DataFrame(por_test_records, columns=por_columns)

# Shuffled train/valid rows first, then the test rows in file order.
por_df = pd.concat([por_train_shuffled, por_test_df], ignore_index=True)

In [5]:
# Create dataframe for Italian
# Label: 0 for informal, 1 for formal
#
# Rows are collected in plain lists and turned into DataFrames in one step:
# DataFrame.append was removed in pandas 2.0, and growing a frame one row at a
# time copies the whole frame on every iteration.
# NOTE(review): 'category' is 'ita' here while the French cell uses 'fr' -
# confirm whether the column is meant to hold the language or the domain.

ita_columns = ['idx', 'text', 'label', 'category', 'split']
ita_raw = Raw_data['ita']

# Training pool: formal lines first, then informal lines. `idx` is the running
# position across both files (formal: 0..n-1, informal: n..).
ita_train_records = []
for ita_source, ita_label in [('fr_formal', 1), ('fr_informal', 0)]:
    for line in ita_raw[ita_source]:
        ita_train_records.append({
            'idx': len(ita_train_records),
            'text': line.strip(),
            'label': ita_label,
            'category': 'ita',
            'split': 'train',
        })
ita_train_df = pd.DataFrame(ita_train_records, columns=ita_columns)

# train-valid split: 95% - 5%
ita_train_shuffled = ita_train_df.sample(frac=1, random_state=42).reset_index(drop=True) # shuffle
# Select 5% as validation set - the first 5% of the shuffled rows get split 'valid'
ita_valid_size = int(len(ita_train_shuffled) * 0.05)
ita_train_shuffled.loc[ita_train_shuffled.index[:ita_valid_size], 'split'] = 'valid'

# eval data: `idx` keeps counting from the end of the training pool.
# NOTE(review): Raw_data['ita']['eval_formal3'] is loaded but not added to the
# test split here - confirm that this is intended.
ita_test_offset = len(ita_train_records)
ita_test_records = []
for ita_source, ita_label in [('eval_informal', 0), ('eval_formal0', 1), ('eval_formal1', 1), ('eval_formal2', 1)]:
    for line in ita_raw[ita_source]:
        ita_test_records.append({
            'idx': ita_test_offset + len(ita_test_records),
            'text': line.strip(),
            'label': ita_label,
            'category': 'ita',
            'split': 'test',
        })
ita_test_df = pd.DataFrame(ita_test_records, columns=ita_columns)

# Shuffled train/valid rows first, then the test rows in file order.
ita_df = pd.concat([ita_train_shuffled, ita_test_df], ignore_index=True)

In [6]:
# Save dataframes into json files

# exist_ok=True makes directory creation a single call that is safe to re-run,
# with no gap between checking for the directory and creating it.
os.makedirs('XFORMAL/processed', exist_ok=True)

# orient='records' writes each frame as a JSON array with one object per row;
# force_ascii=False keeps accented characters as-is instead of \u escapes.
fra_df.to_json('XFORMAL/processed/xformal_french.json', orient='records', force_ascii=False, indent=4)
por_df.to_json('XFORMAL/processed/xformal_bra_portuguese.json', orient='records', force_ascii=False, indent=4)
ita_df.to_json('XFORMAL/processed/xformal_italian.json', orient='records', force_ascii=False, indent=4)