In [2]:
import json
import pandas as pd
from tqdm.notebook import tqdm

from variables import *

#### Read in the event file, weights and parts of the regression dataset

In [3]:
# Event file: tab-separated batch; pandas infers the compression from the
# '.gz' suffix.
event_file = pd.read_csv(
    '../data/batchfiles/batch1.gz',
    sep='\t',
    low_memory=True,
    engine='c',
)

# Only the 'wordID' column of the regression dataframe, stored as a
# categorical to keep the frame small.
speaker_word = pd.read_csv(
    '../data/regression_data.csv',
    usecols=['wordID'],
    dtype={"wordID": "category"},
    low_memory=True,
    engine='c',
)

# Weights: open the stored data array, convert it to pandas and flip it.
# Per the recorded output below, the transposed frame has row labels such as
# "c.when's" and column labels such as "please".
df = xr.open_dataarray('../output/weights/weights_buckeye.nc')
weight_matrix = df.to_pandas().transpose()
weight_matrix.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 1293 entries, c.when's to y.gihn
Columns: 610 entries, please to begin
dtypes: float64(610)
memory usage: 6.1 MB


#### Add prior to the dataframe

In [None]:
# One entry per event, in event order.
#words = speaker_word['wordID'].tolist()
words = event_file['outcomes'].tolist()
prior_dict = {}

# Make a prior dictionary: each distinct word is looked up only once, on its
# first occurrence, so repeated words cost nothing.
for word in tqdm(words):
    if word in prior_dict:
        continue

    # Overall prior (domain_specific=False) and the per-domain priors
    # (domain_specific=True), the latter indexed by 'Segment', 'Syllable'
    # and 'Context'.
    prior_all = get_prior(weight_matrix = weight_matrix, word_outcome = word, domain_specific = False)
    priors = get_prior(weight_matrix = weight_matrix, word_outcome = word, domain_specific = True)

    prior_dict[word] = {'prior_all': prior_all,
                        'prior_segments': priors['Segment'],
                        'prior_syllables': priors['Syllable'],
                        'prior_context' : priors['Context']}

# Save to json. The context manager closes the file even if json.dump raises.
with open("../data/prior_dictionary.json", "w") as out_file:
    json.dump(prior_dict, out_file, indent = 6)

# Build the per-event prior frame in one go (row i belongs to words[i])
# instead of enlarging an empty frame one cell at a time.
prior_columns = ['prior_all', 'prior_segments', 'prior_syllables', 'prior_context']
df = pd.DataFrame([prior_dict[word] for word in words], columns = prior_columns)

# Loading whole regression dataset.
regression_data = pd.read_csv('../data/regression_data.csv',
                              dtype={"speakerID": "category","speakerAge": "category", "speakerGender": "category",
                                     "interviewerGender": "category", "wordID": "category", "wordDur": "float",
                                     "wordPOS": "category", "n_segments": "category", "n_syllables": "category"},
                              engine = 'c',
                              low_memory = True)

# NOTE(review): the column-wise concat below pairs rows by position, so it
# assumes the event file and the regression data list the same tokens in the
# same order -- confirm before re-enabling.
#result = pd.concat(objects = [regression_data, df], axis = 1)
#result.to_csv('../data/regression_data_prior.csv', index = False)

#### Add activation to the dataframe

In [12]:
#words = speaker_word['wordID'].tolist()
words = event_file['outcomes'].tolist()

# Number of leading events to process; set to None to process every event.
preview_limit = 100
activation_columns = ['activation_all', 'activation_segments',
                      'activation_syllables', 'activation_context']


def neighbour_cues(words, index):
    """Return the (c1, c2) context cues for the word at position `index`.

    A cue is the neighbouring word prefixed with 'c.'. For an interior word
    c1 is the previous word and c2 the next one. At either end of the list
    only one neighbour exists; it is returned as c1 and c2 is None. A
    one-word list has no neighbours at all and yields (None, None).
    """
    previous_cue = 'c.' + words[index-1] if index > 0 else None
    next_cue = 'c.' + words[index+1] if index < len(words)-1 else None
    if previous_cue is None:
        return next_cue, None
    return previous_cue, next_cue


preview_words = words[:preview_limit]
activation_rows = []

for index, word in tqdm(enumerate(preview_words), total = len(preview_words)):
    # Neighbours are looked up in the full list, so the last previewed word
    # still sees its successor.
    c1, c2 = neighbour_cues(words, index)

    # Overall activation, then the per-domain breakdown indexed by
    # 'Segment', 'Syllable' and 'Context'.
    act = activation(word_outcome = word, c1 = c1, c2=c2,
           event_files = [event_file], weight_matrix = weight_matrix,
           domain_specific = False)
    act_domain = activation(word_outcome = word, c1 = c1, c2=c2,
           event_files = [event_file], weight_matrix = weight_matrix,
           domain_specific = True)

    activation_rows.append({'activation_all': act,
                            'activation_segments': act_domain['Segment'],
                            'activation_syllables': act_domain['Syllable'],
                            'activation_context': act_domain['Context']})

# Build the frame once from the collected rows instead of enlarging an empty
# frame one cell at a time.
df = pd.DataFrame(activation_rows, columns = activation_columns)

#ende = pd.concat(objects = [regression_data, df], axis = 1)
#ende.to_csv('../data/regression_data_activation.csv', index = False)

100it [00:00, 780.45it/s]

c.when's None
c.please c.the
c.when's c.last
c.the c.time
c.last c.you
c.time c.thought
c.you c.a
c.thought c.game
c.a c.was
c.game c.fun
c.was c.but
c.fun c.you're
c.but c.going
c.you're c.to
c.going c.lose
c.to c.blank
c.lose c.doesn't
c.blank c.lose
c.doesn't c.this
c.lose c.is
c.this c.fun
c.is c.what
c.fun c.a
c.what c.double
c.a c.jump
c.double c.you've
c.jump c.gotta
c.you've c.be
c.gotta c.kidding
c.be c.cheater
c.kidding c.i
c.cheater c.is
c.i c.that
c.is c.the
c.that c.ability
c.the c.that
c.ability c.only
c.that c.a
c.only c.few
c.a c.warbeasts
c.few c.possess
c.warbeasts c.to
c.possess c.overcome
c.to c.their
c.overcome c.own
c.their c.physical
c.own c.limits
c.physical c.blood
c.limits c.destruction
c.blood c.i
c.destruction c.won't
c.i c.lose
c.won't c.then
c.lose c.one
c.then c.hundred
c.one c.rule
c.hundred c.number
c.rule c.ten
c.number c.that's
c.ten c.not
c.that's c.just
c.not c.a
c.just c.double
c.a c.jump
c.double c.that's
c.jump c.a
c.that's c.bug
c.a c.i
c.bug c.




In [10]:
# Display the current contents of `df` as this cell's output.
df

Unnamed: 0,activation_all,activation_segments,activation_syllables,activation_context
0,0.028172,0.008272,0.010000,0.0
1,0.029869,0.008207,0.010000,0.0
2,0.772025,0.350647,0.401478,0.0
3,0.025256,0.004247,0.010000,0.0
4,0.113009,0.053482,0.047616,0.0
...,...,...,...,...
3599,0.130923,0.056247,0.056235,0.0
3600,0.287915,0.194566,0.078634,0.0
3601,0.079600,0.039800,0.020000,0.0
3602,0.764194,0.350647,0.401478,0.0


In [13]:
# Manual spot-check of the per-domain activation for "when's" with its
# neighbours "please" and "the". The cues carry the 'c.' prefix, matching how
# the activation loop builds them and how the weight matrix rows are labelled
# (e.g. "c.when's"); un-prefixed cue names would not match those labels.
activation(word_outcome = "when's", c1 = 'c.please', c2 = 'c.the',
           event_files = [event_file], weight_matrix = weight_matrix,
           domain_specific = True)

{'Segment': 0.00820677189592, 'Syllable': 0.010000000000000002, 'Context': 0}

In [16]:
# Probe whether the bare label 'please' is one of the weight matrix's row
# labels; 'yes' is printed only on a hit, nothing otherwise.
i = weight_matrix.index
has_please = 'please' in i
if has_please:
    print('yes')