In [None]:
from google.colab import drive
import pandas as pd
from tqdm import tqdm
import os
import math
import re
import numpy as np

In [None]:
# Directory that holds every input file. The empty string makes all paths
# below resolve relative to the current working directory.
base_path = ""

# Base names of the per-word surprisal files for the filler and garden-path
# (GP) items; a numeric suffix is appended to each when the files are loaded.
filler_base  = "items_filler.ambig.csv.m"
GP_base      = "items_ClassicGP.ambig.csv.m"

# Word-frequency table (later joined on its "word" column, using "count").
freqs = pd.read_csv(os.path.join(base_path, "freqs_coca.csv"))

# Reading-time (RT) tables for the garden-path and filler items.
# The first row of each CSV is taken as the header (pandas' default).
GP_RT_df     = pd.read_csv(os.path.join(base_path, "ClassicGardenPathSet.csv"))
filler_RT_df = pd.read_csv(os.path.join(base_path, "Fillers.csv"))

In [None]:
# To save df to file_path in chunks of size sz
def save_progbar(df, file_path, sz=100000):
    """Write ``df`` to ``file_path`` as CSV in row chunks, with a tqdm progress bar.

    The header row is written first, then the data rows in slices of at most
    ``sz`` rows (one progress-bar tick per slice). The index is not written.

    Args:
        df: DataFrame to serialise.
        file_path: Destination path; an existing file is overwritten.
        sz: Maximum number of rows per chunk. Must be positive.

    Raises:
        ValueError: If ``sz`` is zero or negative.
    """
    if sz <= 0:
        raise ValueError(f"sz must be positive, got {sz!r}")

    # newline='' is what pandas asks for when to_csv receives an open handle:
    # the CSV writer emits its own line terminators, and leaving newline
    # translation on produces blank rows on Windows. UTF-8 matches the encoding
    # to_csv/read_csv use by default when given a path.
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        # Zero-row slice: emits only the header line.
        df.iloc[:0].to_csv(file, index=False)

        # Stepping by sz gives exactly ceil(len(df) / sz) chunks, so the bar
        # has no spurious empty final step when len(df) is a multiple of sz.
        for start in tqdm(range(0, len(df), sz), desc=f"Saving {file_path}"):
            df.iloc[start:start + sz].to_csv(file, index=False, header=False)


In [None]:
# Load the surprisal tables. Files are named <base><n> for n = 0..3.

GP_paths     = [os.path.join(base_path, f"{GP_base}{idx}") for idx in range(4)]
filler_paths = [os.path.join(base_path, f"{filler_base}{idx}") for idx in range(4)]

GP_surp_dfs     = [pd.read_csv(path) for path in GP_paths]
filler_surp_dfs = [pd.read_csv(path) for path in filler_paths]

# Tag every table with a 'model' label ("m0" .. "m3") derived from its file
# suffix, so the source stays identifiable after the tables are concatenated.
for n, (gp_df, filler_df) in enumerate(zip(GP_surp_dfs, filler_surp_dfs)):
    filler_df['model'] = f"m{n}"
    gp_df['model'] = f"m{n}"

In [None]:
def _clean_rts(rt_df, max_rt=3000):
    """Return a cleaned, independent copy of a reading-time table.

    Keeps only rows with 0 < RT < ``max_rt`` and replaces the percent-encoded
    comma token '%2C' with ',' in the 'Sentence' and 'EachWord' columns.

    Args:
        rt_df: DataFrame with 'RT', 'Sentence' and 'EachWord' columns.
        max_rt: Exclusive upper bound on RT (same unit as the 'RT' column,
            presumably milliseconds -- confirm against the data source).

    Returns:
        A new DataFrame; ``rt_df`` itself is not modified.
    """
    # .copy() detaches the filtered rows from rt_df. Assigning columns on a
    # bare boolean-mask slice raises SettingWithCopyWarning and is not
    # guaranteed to behave the same across pandas versions.
    cleaned = rt_df[(rt_df['RT'] < max_rt) & (rt_df['RT'] > 0)].copy()
    for col in ('Sentence', 'EachWord'):
        # regex=False: '%2C' is a literal token, so the result does not depend
        # on the pandas-version-specific default of the `regex` argument.
        cleaned[col] = cleaned[col].str.replace('%2C', ',', regex=False)
    return cleaned


# Garden-path and filler reading times, cleaned identically.
spr = _clean_rts(GP_RT_df)
fspr = _clean_rts(filler_RT_df)


In [None]:
# Stack the per-model filler surprisal tables into one frame.
fbound = pd.concat(filler_surp_dfs)

# Normalise each word for the frequency lookup: lower-case it and strip
# punctuation. The character class is the same one used for the garden-path
# items, so both item sets are cleaned identically. (A trailing apostrophe
# after the class would restrict the match to punctuation followed by "'",
# leaving ordinary sentence punctuation in place.)
fbound['word_clean'] = fbound['word'].str.lower().str.replace('[.,!?:;]', '', regex=True)

# Attach frequency counts. Both frames have a 'word' column, so the merge
# produces 'word_x' (surprisal table) and 'word_y' (frequency table).
fbound = fbound.merge(freqs, left_on="word_clean", right_on="word", how="left")

# Shift word_pos by one so it lines up with the RT table's WordPosition
# (presumably 0-based vs. 1-based indexing -- confirm against the data).
fbound['word_pos'] = fbound['word_pos'] + 1

# Join reading times to surprisal rows on sentence and word position, then
# drop RT rows that found no matching surprisal row.
fmerged = pd.merge(fspr, fbound, left_on=['Sentence', 'WordPosition'], right_on=['Sentence', 'word_pos'], how='left')
fmerged = fmerged.dropna(subset=['word_x'])

# Words missing from the frequency table get count 1, i.e. logfreq 0.
fmerged['count'] = fmerged['count'].fillna(1)
fmerged['logfreq'] = np.log(fmerged['count'])
fmerged['length'] = fmerged['word_clean'].str.len()

# Sanity check: the RT table's word and the surprisal table's word should agree.
print(list(fmerged['EachWord'])[0:20])
print(list(fmerged['word_x'])[0:20])

In [None]:
# Stack the per-model garden-path surprisal tables into one frame.
bound = pd.concat(GP_surp_dfs)

# Long-format view: one row per (original row, surprisal measure). Every
# column that is not one of the four measures is carried along as an id.
surprisal_cols = ["sum_lex_surprisal", "sum_syn_surprisal", "mean_lex_surprisal", "mean_syn_surprisal"]
bound_long = bound.melt(
    id_vars=[c for c in bound.columns if c not in surprisal_cols],
    value_vars=surprisal_cols,
    var_name="surprisal_type",
    value_name="surprisal",
)
# Word position expressed relative to the disambPosition_0idx column.
bound_long['word_pos_ralign'] = bound_long['word_pos'] - bound_long['disambPosition_0idx']


# Lower-cased, punctuation-free form of each word, used as the frequency key.
bound['word_clean'] = bound['word'].str.lower().str.replace('[.,!?:;]', '', regex=True)

# Attach frequency counts ('word' exists on both sides, giving word_x/word_y)
# and shift word_pos by one to line up with the RT table's WordPosition.
gp_words = (
    bound.merge(freqs, left_on="word_clean", right_on="word", how="left")
    .assign(word_pos=lambda d: d['word_pos'] + 1)
)

# Join reading times to surprisal rows, drop RT rows without a match, give
# words missing from the frequency table a count of 1, then derive the
# log-frequency and word-length predictors.
merged = (
    spr.merge(gp_words, left_on=['Sentence', 'WordPosition'], right_on=['Sentence', 'word_pos'], how='left')
    .dropna(subset=['word_x'])
    .assign(count=lambda d: d['count'].fillna(1))
    .assign(
        logfreq=lambda d: np.log(d['count']),
        length=lambda d: d['word_clean'].str.len(),
    )
)

# Sanity check: the two word columns should show the same tokens.
print(merged['EachWord'].head(20).tolist())
print(merged['word_x'].head(20).tolist())

In [None]:
# Pool each predictor over the filler and garden-path items (fillers first)
# so both data sets can be standardised against the same mean and spread.
#
# Note: make sure lfreq_mean and np.std(lfreqs) are not NaN; otherwise the
# {f}merged['logfreq_s'] columns end up non-numeric (NaN).
# If these values cause errors, their precision may need adjusting, or they
# can be replaced manually with decimal constants in the code.
syn_surps = [*fmerged['sum_syn_surprisal'], *merged['sum_syn_surprisal']]
lex_surps = [*fmerged['sum_lex_surprisal'], *merged['sum_lex_surprisal']]
lengths = [*fmerged['length'], *merged['length']]
lfreqs = [*fmerged['logfreq'], *merged['logfreq']]

In [None]:
# Pooled mean and (population, ddof=0) standard deviation of each predictor.
syn_surp_mean, syn_surp_sd = np.mean(syn_surps), np.std(syn_surps)
lex_surp_mean, lex_surp_sd = np.mean(lex_surps), np.std(lex_surps)
len_mean, len_sd = np.mean(lengths), np.std(lengths)
lfreq_mean, lfreq_sd = np.mean(lfreqs), np.std(lfreqs)

# z-score every predictor in both frames against the pooled statistics, so
# garden-path and filler items share one scale.
for frame in (merged, fmerged):
    frame['syn_surprisal_s'] = (frame['sum_syn_surprisal'] - syn_surp_mean) / syn_surp_sd
    frame['lex_surprisal_s'] = (frame['sum_lex_surprisal'] - lex_surp_mean) / lex_surp_sd
    frame['length_s'] = (frame['length'] - len_mean) / len_sd
    frame['logfreq_s'] = (frame['logfreq'] - lfreq_mean) / lfreq_sd

In [None]:
# Build the output path with os.path.join, like the input paths, so an empty
# base_path yields a file in the working directory rather than
# "/fmerged_mod.csv" at the filesystem root.
fmerged_file = os.path.join(base_path, "fmerged_mod.csv")
save_progbar(fmerged, fmerged_file)

In [None]:
# Build the output path with os.path.join, like the input paths, so an empty
# base_path yields a file in the working directory rather than
# "/merged_mod.csv" at the filesystem root.
merged_file = os.path.join(base_path, "merged_mod.csv")
save_progbar(merged, merged_file)