## Data-preprocessing
This script preprocesses the data used in Arndt-Lappe 2014. The dataset is given a header, and unnecessary information is removed from it:
- The number column
- The word/phrase column
- Commas in the -ity/-ness (outcome) column and in the word column

Current plan for the models:
- Model 1
    - Cues: Syll1, Syll2, token, syntactical info --> Outcome: suffix
- Model 2
    - Cues: o1, n1, c1, o2, n2, c2, token, syntactical info --> Outcome: suffix
- Model 3
    - Cues: o1, n1, c1, o2, n2, c2 (excluding '='), token, syntactical info --> Outcome: suffix

In [128]:
import os
import gzip
import pandas as pd
from pyndl import count, preprocess
#os.chdir('/home/ansost/ModMo-Implementation/scripts')

In [121]:
# Column labels are supplied through the `names` parameter of read_csv.
# They exist only in the DataFrame; the file on disk is not modified.
column_names = [
    "Outcomes",
    "onset1", "nucleus1", "coda1",
    "onset2", "nucleus2", "coda2",
    "syntact_info", "some_number", "token",
]
df = pd.read_csv("../data/20thcent.txt", sep="\t", names=column_names)
#df

In [122]:
# Strip every comma from the outcome and token columns.
cols_to_replace = ["Outcomes", "token"]
df[cols_to_replace] = df[cols_to_replace].replace(to_replace=",", value="", regex=True)
#df

In [123]:
# "some_number" is not used by any model, so discard it before building the event files.
df = df.drop(columns="some_number")
#df

In [124]:
## Make the event file for Model1:
#     Cues: Syll1, Syll2, token, syntactical info --> Outcome: suffix
# Work on a copy so the shared `df` keeps its segment columns for the other models.
df_m1 = df.copy()

# Merge onset + nucleus + coda into a single string per syllable.
df_m1["syll1"] = df_m1["onset1"] + df_m1["nucleus1"] + df_m1["coda1"]
df_m1["syll2"] = df_m1["onset2"] + df_m1["nucleus2"] + df_m1["coda2"]

# Join all cues into one underscore-separated 'Cues' column.
# Both syllables must be included: this line previously used syll1 twice,
# so the second syllable never reached the event file.
df_m1["Cues"] = (
    df_m1["syll1"] + "_" + df_m1["syll2"] + "_"
    + df_m1["syntact_info"] + "_" + df_m1["token"]
)

# Keep only the two event-file columns (cues first, outcomes second); this
# also discards all intermediate columns.
df_m1 = df_m1[["Cues", "Outcomes"]]
df_m1.to_csv("../data/m1_syllable.tsv", sep="\t", index=False)
del df_m1

In [125]:
## Make the event file for Model2:
#     Cues: o1, n1, c1, o2, n2, c2, token, syntactical info --> Outcome: suffix
# Columns that make up the cue string, in the order they are joined.
m2_cue_columns = ["onset1", "nucleus1", "coda1", "onset2", "nucleus2", "coda2",
                  "syntact_info", "token"]

# Build the underscore-separated cue string one column at a time.
m2_cues = df[m2_cue_columns[0]]
for cue_column in m2_cue_columns[1:]:
    m2_cues = m2_cues + "_" + df[cue_column]

# Two-column event table: cues first, outcomes second.
df_m2 = pd.DataFrame({"Cues": m2_cues, "Outcomes": df["Outcomes"]})
df_m2.to_csv("../data/m2_separate.tsv", sep="\t", index=False)
del df_m2

In [126]:
# Write a gzip-compressed copy of each event file next to the plain .tsv.
gzip_jobs = [
    ("../data/m1_syllable.tsv", "../data/m1_syllable.gz"),
    ("../data/m2_separate.tsv", "../data/m2_separate.gz"),
]
for tsv_path, gz_path in gzip_jobs:
    with open(tsv_path, "rb") as plain, gzip.open(gz_path, "wb") as packed:
        # Copy the file line by line into the compressed stream.
        for line in plain:
            packed.write(line)

In [129]:
# Filter the Model2 event file to make the Model3 event file without the '=' cue.
# `remove_cues` must be a collection of cue strings. ('=') is just the string
# '=', which turns the membership test into a substring test, so a one-element
# tuple is passed explicitly.
preprocess.filter_event_file(input_event_file="../data/m2_separate.gz",
                             output_event_file="../data/m3_separate_filtered.gz",
                             remove_cues=("=",))

In [None]:
# TODO: Write this to a file/table and plot it.
# Count the cues and outcomes in the filtered Model3 event file.
# NOTE(review): per the pyndl docs the first returned value is the number of
# events and the other two are counters for cues and outcomes - confirm.
m3_event_file = "../data/m3_separate_filtered.gz"
freq, cue_freq_map, outcome_freq_map = count.cues_outcomes(event_file_name=m3_event_file)