In [118]:
# Load and open the English sentences in the tar file

# Load the Romanian/English sentence pairs
import tarfile
filename = '.\\data\\ro-en.tgz'

# Extract the sentences into the current working directory.
# The context manager guarantees the archive handle is closed, even if extraction fails.
# NOTE(review): extraction trusts the member paths stored in the archive -- only use
# this on an archive from a trusted source.
with tarfile.open(filename, 'r:gz') as tar:
    tar.extractall()

# Open the English sentences: one sentence per line, read into a single unnamed column (0).
# NOTE(review): with the default CSV quoting, a line containing an unbalanced '"' may be
# merged with the following lines -- confirm whether quoting=csv.QUOTE_NONE is wanted here.
import pandas as pd
df = pd.read_csv('europarl-v7.ro-en.en',sep='\t',header=None)

In [121]:
# Tidy up the raw sentence frame in a single pipeline.

# Minimum sentence length (in characters) to keep
n = 40

df = (
    df.iloc[50:-50]                                   # discard the first and last 50 lines
    .loc[lambda frame: frame[0].map(len) >= n]        # keep sentences of at least n characters
    .reset_index(drop=True)
    .sample(frac=1, random_state=355)                 # shuffle reproducibly (fixed seed)
    .reset_index(drop=True)
)

In [125]:
# Save file, in case we need to create another version of the CSV file
# This cell is commented because it is only here as a utility; uncomment if needed

# filename = '.\\data\\europarl_english_sentence_samples.csv'
# df.to_csv(filename,header=False,index=True)

## Continue data generation

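This section generates the data a few samples at a time: each run processes the next batch of sentences, appends the results to the previously generated data, and saves the remaining sentences for the next run.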

In [None]:
# Select the kind of data to generate by leaving exactly one assignment active.
# mutation_type = "del"
# mutation_type = "sub"
mutation_type = "rep"

# mutation_type decides which mutation is applied to the sentences. To change what a
# type means, or to add a new type, see text_mutation_generation.py:mutate_selectively.

# The value is also used in the names of the files this notebook works with:

# data\sentences_to_be_processed_{mutation_type}.csv
#   sentences still waiting to be processed for this mutation type,
#   e.g. sentences_to_be_processed_rep.csv for repetition data

# data\generated_data_{mutation_type}.csv
#   data generated so far for this mutation type,
#   e.g. generated_data_del.csv for deletion data

In [2]:
# Load and clean the sentences

# Load the list of sentences still to be processed for the selected mutation type.
# NOTE(review): the file appears to hold an index column followed by the sentence (it is
# saved with index=True at the end of this notebook); with a single entry in `names`,
# pandas is presumably using the first column as the index -- confirm.
import pandas as pd
filename_to_be_processed = f'.\\data\\sentences_to_be_processed_{mutation_type}.csv'
df = pd.read_csv(filename_to_be_processed, header=None, names = ['sentence'])

# Some of the sentences have nation abbreviations in them, like "(DE)". Remove these.
# regex=True is passed explicitly: since pandas 2.0 the default for Series.str.replace is
# regex=False, under which the pattern is matched as literal text and nothing is removed.
df['sentence'] = df['sentence'].str.replace(r'\([A-Z]{2}\)', '', regex=True)

In [54]:
# Split off this run's batch: the first n rows are processed now, and the remainder is
# held back so it can be saved for the next run once generation has finished.
# Caution: this cell overwrites df, so running it a second time without reloading df
# would leave the remainder empty.
n = 100
df, df_still_to_be_processed = df.iloc[:n], df.iloc[n:]

In [55]:
# Generate data using the current portion.
# create_dataset comes from the project-local module mutation_dataset_generator.
from mutation_dataset_generator import create_dataset

# Index label of the first sentence in the current batch; it is passed to create_dataset
# as start_idx. Raises IndexError if df is empty (i.e. no sentences are left).
start_idx = df.index[0] # Used for labelling the audio files

In [None]:
# Run the generator over the current batch of sentences for the selected mutation type.
df_new_samples = create_dataset(
    df['sentence'].values,
    'europarl_sentence',
    mutation_type,
    start_idx=start_idx,
)

In [57]:
# Verify the new samples look okay before they are merged with the existing data and saved
# (the bare expression below renders the frame as this cell's output).
df_new_samples

Unnamed: 0,true_text,tags,filepath,asr
0,My group therefore has its own resolution.,"[O, O, RB2, RI2, RB2, RI2, O]",.\data\audio\europarl_sentence486.wav,virut therefore has therefore has its own its ...
1,I should like to take advantage of the Commiss...,"[O, O, O, O, RB3, O, O, O, O, RB2, RI2, RI2, R...",.\data\audio\europarl_sentence487.wav,ishould like to tae take advantage of the com...
2,Vice-President of the Commission. - Mr Preside...,"[RB3, RI3, RI3, RI3, RI3, RI3, RI3, RI3, O, O,...",.\data\audio\europarl_sentence488.wav,last president of the commission mister presid...
3,"Member of the Commission. - Madam President, i...","[O, O, O, O, RB2, RI2, RI2, RI2, RI2, RI2, RI2...",.\data\audio\europarl_sentence489.wav,member of the commission madam president it is...
4,The various discounts that are granted to the ...,"[O, O, O, O, RB3, RI3, RI3, RB3, RI3, RI3, RI3...",.\data\audio\europarl_sentence490.wav,the various disfounts that are granted to our ...
...,...,...,...,...
95,This issue is not new and raises regular quest...,"[O, O, O, RB3, RI3, RI3, RI3, RI3, O, O, RB3, ...",.\data\audio\europarl_sentence581.wav,this issue is not new and rais is regular not ...
96,"Ultimately, however, Parliament gave way, with...","[O, O, O, O, O, O, O, O, O, O, O, O, RB2, RI2,...",.\data\audio\europarl_sentence582.wav,ultimately however parliament gave way with th...
97,The joint declaration of 16 April issued by al...,"[RB3, RI3, RI3, RI3, RI3, O, O, O, O, O, O, O,...",.\data\audio\europarl_sentence583.wav,the joint declaration of suxceeing the joint d...
98,I appeal to the European Commission to ensure ...,"[O, RB3, RI3, RI3, RI3, RI3, RI3, RI3, RI3, RI...",.\data\audio\europarl_sentence584.wav,appealed to the european commission to ensure...


In [58]:
# Read back the data generated in earlier runs (true text and asr transcripts).
# The file has no header row; its first column is used as the index.
filename_existing = f'.\\data\\generated_data_{mutation_type}.csv'
df_existing_data = pd.read_csv(
    filename_existing,
    names=['true_text','tags','filepath','asr'],
    index_col=0,
    header=None,
)

In [59]:
# Append the new samples to the previously generated data, renumbering rows from 0.
df_all_data = pd.concat(
    [df_existing_data, df_new_samples],
    ignore_index=True,
)

In [60]:
# Sanity check: shapes of the combined, existing, new and remaining frames, one per line.
print(
    df_all_data.shape,
    df_existing_data.shape,
    df_new_samples.shape,
    df_still_to_be_processed.shape,
    sep='\n',
)

(586, 4)
(486, 4)
(100, 4)
(365741, 1)


In [61]:
# Write the combined (previous + new) data back to disk, replacing the old file.
# The index is written as the first column and no header row is emitted.
df_all_data.to_csv(
    filename_existing,
    index=True,
    header=False,
)

In [62]:
# Write the sentences that were not processed in this run back to disk, replacing the
# old list. The index is written as the first column and no header row is emitted.
df_still_to_be_processed.to_csv(
    filename_to_be_processed,
    index=True,
    header=False,
)