## Data preprocessing

In [1]:
import os
import gzip
import pandas as pd
from pyndl import count, preprocess
#os.chdir('/home/ansost/ModMo-Implementation/scripts')

In [121]:
# Load the tab-separated corpus file. The raw file has no header row, so the
# column names are supplied through the 'names' argument. The suffix column is
# called 'Outcomes' right away: it needs no preprocessing, and naming it here
# saves a later rename.
column_names = [
    "Outcomes",
    "onset1", "nucleus1", "coda1",
    "onset2", "nucleus2", "coda2",
    "syntact_info", "some_number", "token",
]
df = pd.read_csv('../data/20thcent.txt', sep='\t', names=column_names)
df

In [122]:
# Strip every comma from the suffix ('Outcomes') and token columns.
cols_to_replace = ["Outcomes", "token"]
comma_free = df[cols_to_replace].replace(to_replace=',', value='', regex=True)
df[cols_to_replace] = comma_free
df

In [123]:
# Discard the column that none of the models use, to keep the frame smaller.
df = df.drop(columns=["some_number"])
df

In [124]:
# Model 1.
# Cues: second-to-last syllable (syll1), last syllable (syll2), syntactical info, token.
# Outcomes: suffix.

df_m1 = df.copy()

# Build each syllable by concatenating its onset, nucleus and coda.
df_m1["syll1"] = df_m1["onset1"] + df_m1["nucleus1"] + df_m1["coda1"]
df_m1["syll2"] = df_m1["onset2"] + df_m1["nucleus2"] + df_m1["coda2"]

# Join the individual cues with underscores into a single 'Cues' column.
# Fix: the second cue is syll2 (the last syllable). The previous version used
# syll1 twice, so the last syllable never ended up in the event file.
df_m1["Cues"] = (
    df_m1["syll1"] + "_" + df_m1["syll2"] + "_"
    + df_m1["syntact_info"] + "_" + df_m1["token"]
)

# Keep only the two event-file columns, cues first, and write them as TSV.
df_m1 = df_m1[["Cues", "Outcomes"]]
df_m1.to_csv("../data/m1_syllable.tsv", sep='\t', index=False)

# Free the temporary frame; only the file on disk is needed from here on.
del df_m1

In [125]:
# Model 2.
# Cues: onset 1, nucleus 1, coda 1, onset 2, nucleus 2, coda 2, syntactical info, token.
# Outcomes: suffix.

# Add the underscore-joined 'Cues' column on a copy of df (assign does not
# modify df itself) and keep just the two columns that get written out.
df_m2 = df.assign(
    Cues=lambda frame: (
        frame["onset1"] + "_" + frame["nucleus1"] + "_" + frame["coda1"]
        + "_" + frame["onset2"] + "_" + frame["nucleus2"] + "_" + frame["coda2"]
        + "_" + frame["syntact_info"] + "_" + frame["token"]
    )
)[["Cues", "Outcomes"]]

df_m2.to_csv("../data/m2_separate.tsv", sep='\t', index=False)
del df_m2

In [126]:
# Compress both event files with gzip, since pyndl takes gzipped input.
event_file_pairs = (
    ("../data/m1_syllable.tsv", "../data/m1_syllable.gz"),
    ("../data/m2_separate.tsv", "../data/m2_separate.gz"),
)
for tsv_path, gz_path in event_file_pairs:
    # Open the source first so a missing .tsv fails before any .gz is created.
    with open(tsv_path, "rb") as plain, gzip.open(gz_path, "wb") as packed:
        packed.writelines(plain)

In [129]:
# Model 3.
# Filter the Model 2 event file to get a model without '=' as a cue.
# The pyndl functions only work with gzipped files.
# Fix: remove_cues is a collection of cue strings. ('=') is just the string
# '=', not a tuple; the trailing comma makes it a real one-element collection,
# so cues are matched by equality rather than by substring.
preprocess.filter_event_file(input_event_file="../data/m2_separate.gz",
                             output_event_file="../data/m3_separate_filtered.gz",
                             remove_cues=('=',))