## Preprocess OpenSubtitles
This notebook downloads, cleans, splits and tokenises the OpenSubtitles2018 dataset.

### Installs, Arguments and Imports

In [1]:
# Some arguments
# Maximum sequence length handed to the tokenizer (`max_length`); longer inputs are truncated
MAX_SEQ = 32
# Fraction of all samples assigned to the train split
TRAIN_FRAC = 0.95
# Fraction of all samples assigned to the dev split; the test split receives whatever remains
DEV_FRAC = 0.025

In [2]:
# Installs
# !pip install transformers
# !pip install wget

In [2]:
# Imports
import wget
import gzip
import pandas as pd
from io import BytesIO
import torch
from sklearn.utils import shuffle
from transformers import AutoTokenizer
from tqdm import tqdm
import pickle

In [6]:
# languages to use: the monolingual OpenSubtitles2018 corpora hosted on OPUS.
# NOTE: every entry needs its own comma -- Python silently concatenates adjacent
# string literals, so a missing comma ('br' 'af') yields one bogus code ('braf').
# Each code appears exactly once so no corpus is downloaded twice. 'zh_cn' and
# 'zh_tw' are listed because the dataset-creation cell further down reads
# zh_cn.txt and zh_tw.txt.
languages = ['af', 'ar', 'bg', 'bn', 'br', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi',
             'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'ka', 'kk', 'ko', 'lt', 'lv', 'mk', 'ml',
             'ms', 'nl', 'no', 'pl', 'pt', 'pt_br', 'ro', 'ru', 'si', 'sk', 'sl', 'sq', 'sr', 'sv', 'ta', 'te', 'th',
             'tl', 'tr', 'uk', 'ur', 'vi', 'ze_en', 'ze_zh', 'zh_cn', 'zh_tw']

# base download url; the language code and '.gz' are appended to it per corpus
base_url = 'https://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.'

# downloads dataset and saves to txt file for each language
def download_opus_dataset(languages, base_url, data_dir='/home/ec2-user/SageMaker/data/'):
    """Download one gzipped corpus per language and re-save it as a tab-separated .txt file.

    Args:
        languages: iterable of language codes; each code is appended to ``base_url``.
        base_url: URL prefix; the file fetched is ``base_url + language + '.gz'``.
        data_dir: directory prefix (including the trailing slash) that receives both the
            ``raw.<language>.gz`` archive and the ``<language>.txt`` output.

    Returns:
        None. The results are the files written under ``data_dir``.
    """
    for language in tqdm(languages):
        url = base_url + language + '.gz'
        # Use the path reported by wget rather than the requested one, so the archive
        # that is read is always the one that was just downloaded.
        # NOTE(review): wget.download appears to pick a new name when `out` already
        # exists (e.g. on a re-run) -- confirm against the wget package.
        archive_path = wget.download(url=url, out='{}raw.{}.gz'.format(data_dir, language))
        # The context manager closes the gzip handle even if parsing fails, and pandas
        # reads straight from the handle instead of a second in-memory copy of the file.
        with gzip.open(archive_path, 'rb') as f_in:
            df = pd.read_csv(f_in, sep='\t', header=None)
        df.to_csv('{}{}.txt'.format(data_dir, language), header=None, index=None, sep='\t')

# DOWNLOAD OPUS DATASET
# Fetches the corpus for every code in `languages` and writes one .txt file per language.
download_opus_dataset(languages=languages, base_url=base_url)
    

100%|██████████| 2/2 [00:47<00:00, 23.73s/it]


In [3]:
# gets file length
def file_len(fname):
  """Return the number of lines in the text file ``fname``.

  An empty file yields 0 (the previous loop-based version raised because its
  counter was never bound when the file had no lines).
  """
  with open(fname) as f:
    # Iterating the handle streams the file line by line, so memory use stays flat.
    return sum(1 for _ in f)

In [4]:
# creates a single dataset from the different language datasets, and caps each language at samples_per_lang
def create_dataset(languages,base_path, samples_per_lang):
    """Concatenate the per-language .txt files into one DataFrame.

    Args:
        languages: iterable of language codes; the file read is
            ``base_path + language + '.txt'``.
        base_path: directory prefix (including the trailing slash) of the .txt files.
        samples_per_lang: maximum number of rows taken from each language file.

    Returns:
        A DataFrame with the rows of all languages stacked in the order given.
        The per-file row indices are kept, so index values repeat across languages.
    """
    list_of_dataframes = []
    for language in tqdm(languages):
        path = base_path + language + '.txt'
        # `nrows` is only an upper bound: a file with fewer rows is read in full.
        # Counting the lines first (a complete extra pass over every file) is
        # therefore unnecessary.
        df = pd.read_csv(path, header=None, sep='\t', nrows=samples_per_lang)
        list_of_dataframes.append(df)

    merged_df = pd.concat(list_of_dataframes)
    return merged_df

# CREATE DATASET

# Each language code appears exactly once: a repeated code would make
# create_dataset load that language's file twice and over-represent it.
languages = [ 'af','ar','bn','br', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fi',
              'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'ka', 'ko', 'lt', 'lv', 'mk', 'ml',
              'ms', 'nl', 'no', 'pl', 'pt', 'pt_br', 'ro', 'ru', 'si', 'sk', 'sl', 'sq', 'sr', 'sv', 'ta', 'te', 'th',
              'tl', 'tr', 'uk', 'ur', 'vi', 'ze_en', 'ze_zh','zh_cn', 'zh_tw']

# at most 200000 rows are taken from each language file
data = create_dataset(languages=languages, base_path='/home/ec2-user/SageMaker/data/',samples_per_lang=200000)
    

100%|██████████| 19/19 [1:11:14<00:00, 224.96s/it]


In [8]:
# writes 20 randomly drawn rows (fixed seed, so the draw is repeatable) to a CSV for inspection;
# the result of sample() is not assigned back, so `data` itself is left unshuffled
data.sample(frac=1, random_state=1)[:20].to_csv('subtitles_samples.csv')

In [15]:
# performs basic data cleaning
def basic_cleaning(x):
    """Strip a fixed set of punctuation characters from a text column.

    Args:
        x: a pandas Series (one column when used via ``DataFrame.apply``).

    Returns:
        A new Series with every occurrence of ``" - ' ! # @ * %`` removed.
        Input that offers no ``.str`` accessor (e.g. a numeric column) is
        returned unchanged.
    """
    # Only text columns expose `.str`; anything else is passed through untouched.
    # This replaces a blanket `except: pass` that also hid genuine errors.
    if not hasattr(x, 'str'):
        return x
    for char in ('"', '-', '\'', '!', '#', '@', '*', '%'):
        # regex=False: treat each character literally ('*' is a regex metacharacter)
        # regardless of the pandas version's default for `regex`.
        x = x.str.replace(char, '', regex=False)
    return x

# CLEAN DATA
# DataFrame.apply returns a new frame; it must be assigned back, otherwise the
# cleaning is computed and then thrown away.
data = data.apply(basic_cleaning)
# cells that are empty strings after cleaning become NaN so that dropna() removes those rows
nan_value = float('NaN')
data = data.replace("", nan_value).dropna()

In [16]:
# splits data based on given train/dev fractions
def train_dev_test_split(data, train_fraction, dev_fraction, random_state=None):
    """Shuffle column 0 of ``data`` and cut it into train/dev/test lists.

    Args:
        data: DataFrame whose column labelled ``0`` holds the samples.
        train_fraction: share of the samples placed in the train list.
        dev_fraction: share of the samples placed in the dev list.
        random_state: optional seed for the shuffle; pass an int for a
            reproducible split. ``None`` (default) gives a different order each call.

    Returns:
        Tuple ``(train_data, dev_data, test_data)`` of Python lists; the test
        list receives every sample left over after train and dev.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to more than 1.
    """
    # small tolerance so float sums such as 0.7 + 0.3 are not rejected
    if train_fraction < 0 or dev_fraction < 0 or train_fraction + dev_fraction > 1 + 1e-9:
        raise ValueError('train_fraction and dev_fraction must be non-negative and sum to at most 1')

    total_samples = len(data.index)
    # shuffle once and convert to a list once (the column can be very large)
    samples = data[0].sample(frac=1, random_state=random_state).to_list()

    train_end = round(train_fraction*total_samples)
    dev_end = round((train_fraction+dev_fraction)*total_samples)

    train_data = samples[:train_end]
    dev_data = samples[train_end:dev_end]
    test_data = samples[dev_end:]

    return train_data,dev_data,test_data

# split data
train_data, dev_data, test_data = train_dev_test_split(
    data=data,
    train_fraction=TRAIN_FRAC,
    dev_fraction=DEV_FRAC,
)

import pickle

# destination pkl file of each split
train_file = '/home/ec2-user/SageMaker/data/train_lst.pkl'
dev_file = '/home/ec2-user/SageMaker/data/dev_lst.pkl'
test_file = '/home/ec2-user/SageMaker/data/test_lst.pkl'

# pickle the splits in train/dev/test order; only train and dev announce completion
for out_path, split, done_msg in (
    (train_file, train_data, 'Train done!'),
    (dev_file, dev_data, 'Dev done!'),
    (test_file, test_data, None),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(split, handle)
    if done_msg is not None:
        print(done_msg)


Train done!
Dev done!


In [3]:
# load data files
from utils import load_pickle

train_file = '/home/ec2-user/SageMaker/data/train_lst.pkl'
dev_file = '/home/ec2-user/SageMaker/data/dev_lst.pkl'
test_file = '/home/ec2-user/SageMaker/data/test_lst.pkl'

# read the three pickled splits back in train/dev/test order
train_data, dev_data, test_data = [
    load_pickle(pkl_path) for pkl_path in (train_file, dev_file, test_file)
]

In [5]:
# tokenises data in fractions of n_splits, since train dataset is too large to handle all at once
def split_tokenization(train_data, n_splits, n_done):
    """Tokenise ``train_data`` chunk by chunk and pickle every chunk to its own file.

    Args:
        train_data: list of sentences to tokenise.
        n_splits: number of chunks the data is cut into.
        n_done: number of leading chunks to skip (already saved by an earlier run);
            0 processes every chunk.

    Returns:
        None. Chunk ``k`` (1-based) is written to
        ``/home/ec2-user/SageMaker/data/train_<k>.pkl``.
    """
    # for memory - cpu can't do all at once
    num_samples = len(train_data)
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
    chunk_start = 0
    for i in range(n_splits):
        # Boundary formula kept unchanged so a resumed run (n_done > 0) cuts the data
        # at the same positions as the run that produced the existing files.
        chunk_end = round((i+1)*(1/n_splits)*num_samples)
        if i+1>n_done:
            train_subset = train_data[chunk_start:chunk_end]
            print('Tokenizing...')
            # padding='max_length' pads every chunk to exactly MAX_SEQ tokens.
            # padding=True pads only to the longest sequence inside the chunk, so
            # chunks could end up with different widths and could no longer be
            # concatenated along dim 0 (as load_split_data does).
            encoded_X_train = tokenizer(train_subset, padding='max_length', truncation=True,
                                        max_length=MAX_SEQ, return_tensors='pt')
            train_file = '/home/ec2-user/SageMaker/data/train_{}.pkl'.format(i+1)
            print('Saving...')
            with open(train_file,'wb') as handle:
                pickle.dump(encoded_X_train, handle)
            print(i+1,' Complete!')

        # advance even for skipped chunks so later chunks start at the right offset
        chunk_start = chunk_end
        

In [6]:
# n_done=0: no chunk is skipped, so all 10 chunks are tokenised and saved
split_tokenization(train_data=train_data, n_splits=10, n_done=0)

Tokenizing...
Saving...
1  Complete!
Tokenizing...
Saving...
2  Complete!
Tokenizing...
Saving...
3  Complete!
Tokenizing...
Saving...
4  Complete!
Tokenizing...
Saving...
5  Complete!
Tokenizing...
Saving...
6  Complete!
Tokenizing...
Saving...
7  Complete!
Tokenizing...
Saving...
8  Complete!
Tokenizing...
Saving...
9  Complete!
Tokenizing...
Saving...
10  Complete!


In [3]:
# load the split data and return entire tokenised dataset
def load_split_data(n_splits=10):
    """Load the per-chunk tokenised files and merge them into one encoding.

    Args:
        n_splits: number of chunk files to read
            (``/home/ec2-user/SageMaker/data/train_1.pkl`` .. ``train_<n_splits>.pkl``).

    Returns:
        The object loaded from the first chunk file, with its ``input_ids``,
        ``token_type_ids`` and ``attention_mask`` entries replaced by the
        concatenation (along dim 0) of the corresponding entries of all chunks.

    Raises:
        ValueError: if ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        # without this guard the function fails later with an opaque IndexError
        raise ValueError('n_splits must be at least 1')

    path = '/home/ec2-user/SageMaker/data/train_{}.pkl'
    lst = []
    print('Data loading...')
    for i in range(n_splits):
        # NOTE(review): load_pickle comes from the project's utils module and presumably
        # unpickles the file -- only use it on files this notebook produced itself.
        subset = load_pickle(path.format(i+1))
        print(i+1,' Complete!')
        lst.append(subset)

    merged = lst[0]
    for key in ('input_ids', 'token_type_ids', 'attention_mask'):
        print('Concatenating {}...'.format(key))
        # torch.cat along dim 0 requires all chunks to agree on the remaining dimensions
        merged[key] = torch.cat([chunk[key] for chunk in lst], dim=0)
    return merged

In [4]:
# load_split_data calls load_pickle, so it has to be imported before the call below
from utils import load_pickle
train_data_final = load_split_data(n_splits=10)

Data loading...
1  Complete!
2  Complete!
3  Complete!
4  Complete!
5  Complete!
6  Complete!
7  Complete!
8  Complete!
9  Complete!
10  Complete!
Concatenating input_ids...
Concatenating token_type_ids...
Concatenating attention_mask...


In [8]:
# save tokenised train dataset
# (the merged result of all chunks, returned by load_split_data) as a single pkl file
train_file = '/home/ec2-user/SageMaker/data/train.pkl'
with open(train_file,'wb') as handle:
    pickle.dump(train_data_final, handle)

In [None]:
# tokenise dev and test set
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
# both sets are encoded with the same tokenizer settings
tokenize_kwargs = dict(padding=True, truncation=True, max_length=MAX_SEQ, return_tensors='pt')
encoded_X_dev = tokenizer(dev_data, **tokenize_kwargs)
encoded_X_test = tokenizer(test_data, **tokenize_kwargs)

In [None]:
# save dev and test data
import pickle

dev_file = '/home/ec2-user/SageMaker/data/dev.pkl'
test_file = '/home/ec2-user/SageMaker/data/test.pkl'

# pickle dev first, then test; only the dev write announces completion
for out_path, encoded, done_msg in (
    (dev_file, encoded_X_dev, 'Dev done!'),
    (test_file, encoded_X_test, None),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(encoded, handle)
    if done_msg is not None:
        print(done_msg)