# Raw Data Processing

In [None]:
import pandas as pd
import numpy as np
import re
import random

## Data Import

In [2]:
# Read all of the files of the raw data.
#
# These files are ELAN files that have been converted to tab-delimited text
# files. Due to the processing capabilities of the computer, the extraction to
# text files was done in 9 parts, named Gusso_1.txt ... Gusso_9.txt.
raw_frames = [
    pd.read_csv(
        f"../data/raw_data/Gusso_{part}.txt",
        header=0,
        delimiter="\t",
        low_memory=False,
    )
    for part in range(1, 10)
]

# Keep the per-part names bound so anything that refers to them still works
df1, df2, df3, df4, df5, df6, df7, df8, df9 = raw_frames

# Stack the parts on top of each other; ignore_index=True renumbers the rows
# so the combined dataframe has one continuous index
full_df = pd.concat(raw_frames, ignore_index=True)

In [3]:
# Example of the data extracted from the ELAN files: display the first rows
# of the combined dataframe as a quick sanity check of the import
full_df.head()

Unnamed: 0,Begin Time - msec,End Time - msec,Duration - msec,B_phrase-gls-en,B_morph-hn-hru,B_word-txt-hru,A_morph-msa-en,A_word-txt-hru,interlinear-text-title-hru,B_phrase-segnum-en,...,C_word-txt-en,A_word-txt-en,E_morph-type,E_morph-cf-hru,E_phrase-gls-en,E_morph-msa-en,E_morph-hn-hru,E_morph-gls-en,E_phrase-segnum-en,E_word-txt-hru
0,0,603715,603715,,,,,,hru_1085_genesis,,...,,,,,,,,,,
1,20971,21982,1011,Father Vijay.,,Fadar Vijay,,,hru_1085_genesis,6.0,...,,,,,,,,,,
2,515888,516454,566,Yes.,,ẽ,,,hru_1085_genesis,174.0,...,,,,,,,,,,
3,603092,603715,623,Indeed.,2.0,ã,,,hru_1085_genesis,193.0,...,,,,,,,,,,
4,12319,12934,615,,,,adv,yow,hru_1085_genesis,,...,,,,,,,,,,


## Data Cleanup

In [4]:
# Get the names of the columns that are speaker-dependent, i.e. the ones that
# start with one of the speaker prefixes "A_" ... "E_"
speaker_cols = full_df.filter(regex='^(A_|B_|C_|D_|E_)').columns
# Get the names of the previously mentioned columns without the speaker prefix.
# The pattern is anchored with "^" so only the leading prefix is stripped; an
# unanchored pattern would also remove a "<letter>_" found in the middle of a
# column name, and the lookup below would then find no source columns.
unique_speaker_cols = speaker_cols.str.replace('^[A-E]_', '', regex=True).unique()

# Get the names of the columns that are not speaker-dependent
general_cols = full_df.filter(regex='^(?!A_|B_|C_|D_|E_)').columns

# Create a new dataframe, which will store the processed cleaned up data.
# Listing the columns up front fixes the column order: merged speaker columns
# first, then the general columns.
result_df = pd.DataFrame(columns=list(unique_speaker_cols) + list(general_cols))

# The speaker-dependent columns are replaced with a general column, filled with
# the non-NA value found in the relevant unprocessed columns
for col in unique_speaker_cols:
    # Speaker-specific variants of this column that actually exist, in A..E order
    original_cols = [
        f'{letter}_{col}'
        for letter in ('A', 'B', 'C', 'D', 'E')
        if f'{letter}_{col}' in speaker_cols
    ]
    # Back-filling along the row moves the first non-NA value (in A..E order)
    # into the first column, which is then taken as the merged value
    result_df[col] = full_df[original_cols].bfill(axis=1).iloc[:, 0]

# Copy the columns that are not speaker-dependent to the new dataframe
for col in general_cols:
    result_df[col] = full_df[col]

In [5]:
# The "File Path" column is removed from the processed data
result_df = result_df.drop(columns=["File Path"])

# Generalisation of the "interlinear-text-title": it was called differently in
# different files, so both variants are merged into one column. Missing values
# are treated as empty strings so the concatenation keeps whichever is present.
title_hru = result_df['interlinear-text-title-hru'].fillna('')
title_en = result_df['interlinear-text-title-en'].fillna('')
result_df['interlinear-text-title'] = title_hru + title_en

# The two original title columns are no longer needed once merged
result_df = result_df.drop(
    columns=["interlinear-text-title-hru", "interlinear-text-title-en"]
)

# Save the cleaned dataframe to a csv file
result_df.to_csv("../data/data.csv")

## Save Data as Split Sentences

In [None]:
# Reload the cleaned data written by the cleanup step; the first csv column
# holds the saved dataframe index, so it is used as the index again
df = pd.read_csv("../data/data.csv", index_col=0, low_memory=False)

In [None]:
# Annotation tokens that mark fillers / non-speech rather than real words
FILLER_TOKENS = {"FILLER", "filler", "LAUGH", "INDISTINCT"}
# Matches a word that is immediately repeated one or more times; the pattern
# is case-insensitive and matches whole sequences of duplicates
REPEATED_WORD_RE = re.compile(r'\b(\w+)(\s+\1)+\b', flags=re.IGNORECASE)


def _clean_sentence(sentence_raw):
    """Return the words of ``sentence_raw`` without repeated or filler words."""
    deduplicated = REPEATED_WORD_RE.sub(r'\1', sentence_raw)
    return [word for word in deduplicated.split() if word not in FILLER_TOKENS]


# Different sentences can be identified based on the phrase number and the
# text title. Grouping on exactly these two keys yields every sentence once
# (de-duplicating on the gloss column as well would emit a sentence again for
# every distinct gloss value among its rows). sort=False keeps the sentences in
# order of first appearance; rows with a missing key belong to no sentence and
# are left out by groupby.
sentence_groups = df.groupby(
    ['phrase-segnum-en', 'interlinear-text-title'], sort=False
)['word-txt-hru']

# Create empty list of sentences
sentences = []

for _, hru_word_df in sentence_groups:
    # Concatenate the words of the sentence, then remove duplicates and fillers
    sentence = _clean_sentence(hru_word_df.str.cat(sep=' '))
    # Keep only sentences that still contain words after the cleanup
    if sentence:
        sentences.append(sentence)

# The sentences have different lengths, so they are stored in an explicit 1-D
# object array; passing the ragged list directly makes recent NumPy versions
# raise an error. Loading the file back requires allow_pickle=True.
sentence_array = np.empty(len(sentences), dtype=object)
for position, sentence in enumerate(sentences):
    sentence_array[position] = sentence
np.save("../data/split_sentences.npy", sentence_array)

# Also write one space-separated sentence per line: the train/test split step
# reads the sentences from "../data/split_sentences.txt"
with open("../data/split_sentences.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(" ".join(sentence) for sentence in sentences))

## Create Train and Test Splits

In [None]:
# Read the sentences, one per line. The encoding is given explicitly so the
# file is decoded the same way on every platform and matches the utf-8
# encoding used for the output files below.
with open('../data/split_sentences.txt', 'r', encoding="utf-8") as file:
    # Strip newline characters from the end of each line and skip blank lines,
    # which would otherwise end up in the splits as empty sentences
    sentences = [line.strip() for line in file if line.strip()]

# Shuffle in place so the splits are a random partition of the sentences
random.shuffle(sentences)

# Create data splits: 80% train, 10% validation, and the remainder as test
train_end = int(0.8 * len(sentences))
val_end = train_end + int(0.1 * len(sentences))
train = sentences[:train_end]
val = sentences[train_end:val_end]
test = sentences[val_end:]

# Write train, validation and test data to files, one sentence per line
for path, split in (
    ("../data/train_data.txt", train),
    ("../data/val_data.txt", val),
    ("../data/test_data.txt", test),
):
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(split))