In [1]:
import json
from tqdm import tqdm
import pandas as pd

from variables import *

#### Read in the event file, weights and parts of the regression dataset

In [3]:
# Event file: tab-separated, parsed with the C engine.
event_file = pd.read_csv('../data/batches/file.tsv',
                         engine = 'c',
                         low_memory = True,
                         sep = '\t')

# Weights: opened as a DataArray, converted to pandas and transposed in one
# chain, then summarised (deep memory usage, no per-column listing).
df = xr.open_dataarray('../output/weights/file.nc')
weight_matrix = df.to_pandas().transpose()
weight_matrix.info(memory_usage="deep", verbose=False)

# From the regression data only the 'wordID' column is read here, stored as a
# categorical; the full file is loaded later when the priors are attached.
speaker_word = pd.read_csv('../data/regression_data.csv',
                           low_memory = True,
                           dtype={"wordID": "category"},
                           usecols=['wordID'])

<class 'pandas.core.frame.DataFrame'>
Index: 4091 entries, c.okay to c.content
Columns: 2327 entries, mm-hmm to philip
dtypes: float64(2327)
memory usage: 72.9 MB


#### Add prior to the dataframe

In [17]:
# Column order of the prior dataframe; also the keys of each prior_dict entry.
prior_columns = ['prior_all', 'prior_segments', 'prior_syllables', 'prior_context']

words = speaker_word['wordID'].tolist()
prior_dict = {}

# Make a prior dictionary: get_prior is called once per distinct word (in order
# of first occurrence); repeated tokens reuse the stored entry.
# NOTE(review): the recorded output of this cell stops after 6 iterations with
# "TypeError: string indices must be integers", which may mean that
# get_prior(..., domain_specific = True) does not always return a mapping with
# 'Segment'/'Syllable'/'Context' keys -- confirm against get_prior.
for word in tqdm(words):
    if word in prior_dict:
        continue

    prior_all = get_prior(weight_matrix = weight_matrix, word_outcome = word, domain_specific = False)
    priors = get_prior(weight_matrix = weight_matrix, word_outcome = word, domain_specific = True)

    prior_dict[word] = {'prior_all': prior_all,
                        'prior_segments': priors['Segment'],
                        'prior_syllables': priors['Syllable'],
                        'prior_context' : priors['Context']}

# Save to json. The context manager closes the file even if json.dump raises.
with open("../data/prior_dictionary.json", "w") as out_file:
    json.dump(prior_dict, out_file, indent = 6)

# Make new dataframe with the priors: one row per token, in the same order as
# `words`. Built in a single call from a list of records instead of being grown
# row by row (the previous loop iterated over (index, word) tuples and never
# actually added a row).
df = pd.DataFrame([prior_dict[word] for word in words], columns = prior_columns)

# Loading whole regression dataset.
regression_data = pd.read_csv('../data/regression_data.csv',
                              dtype={"speakerID": "category","speakerAge": "category", "speakerGender": "category",
                                     "interviewerGender": "category", "wordID": "category", "wordDur": "float",
                                     "wordPOS": "category", "n_segments": "category", "n_syllables": "category"},
                              engine = 'c',
                              low_memory = True)

# concat(axis = 1) aligns on the index; both frames carry a default RangeIndex,
# so equal length guarantees a row-for-row match instead of silent NaN padding.
assert len(df) == len(regression_data), "prior rows do not line up with regression_data rows"

result = pd.concat(objs = [regression_data, df], axis = 1)
result.to_csv('../data/regression_data_prior.csv', index = False)

6it [00:00, 46.27it/s]


TypeError: string indices must be integers