In [40]:
import os
import re
import spacy
import math
import argparse

In [26]:
# Prepare tokenization
def tokenization(input_str):
    # Split on any non-alphanumeric character
    tokenizer = re.compile(r"\W+")
    
    # Tokenize 
    token_list = tokenizer.split(input_str)

    return token_list

In [29]:
# Prep: Number of times search term is present with given collocates
# st means search term
def calc_st_w_coll(concordances):

    st_w_coll = {} 

    for conc in concordances:
        for collocate in conc:
            if collocate in st_w_coll:
                st_w_coll[collocate] += 1
            else:
                st_w_coll[collocate] = 1

    return st_w_coll

In [30]:
# Prep: Calculate collocates w/o search term and total of collocates
def calc_coll_wo_st_total_coll(concordances, all_tokens, st_w_coll_dict):
    coll_wo_st = {}
    total_coll = {}

    for conc in concordances:
        for collocate in conc:
            total_count = all_tokens.count(collocate)
            total_coll[collocate] = total_count
            wo_st = total_count - st_w_coll_dict[collocate]
            coll_wo_st[collocate] = wo_st
    
    return (coll_wo_st, total_coll)

In [31]:
# Prep: Number of times search term is present without any given collocate
def calc_stcount_st_wo_coll(all_tokens, st, st_w_coll_dict):
    st_wo_coll = {}

    # Number of times the search term occurs across the corpus
    st_count = all_tokens.count(st)
    for collocate, count in st_w_coll_dict.items():
        st_wo_coll[collocate] = st_count - count

    return (st_count, st_wo_coll)

In [43]:
# Prep: The expected frequency of a given collocate occurring with the search term
def calc_exp_freq(total_coll, stcount, corpus_size):
    
    exp_freq = {}

    for collocate in total_coll: 
        exp_freq[collocate] = (stcount * total_coll[collocate]) / N

    return exp_freq

In [33]:
# Prep: The mutual information for any given collocate to the search term
def calc_mut_inf(st_w_coll, exp_freq):
    
    mut_inf = {}

    for collocate in st_w_coll:
        mut_inf[collocate] = math.log(st_w_coll[collocate] / exp_freq[collocate])

    return mut_inf

 # Write headers to csv - separately from appending, since this also overwrites the old file
def write_csv(outfile, st_w_coll, mut_inf):
   
    with open(outfile, 'w', encoding='utf-8') as fh:
        fh.write('collocate,raw_frequency,MI\n')

    with open(outfile, 'a', encoding='utf-8') as fh:
        for collocate in O11:
            fh.write(f'{collocate},{st_w_coll[collocate]},{mut_inf[collocate]}\n')

In [38]:
def main(data_dir, st, window_size, sample_num):
    # Output file is named after the st and window size
    outpath = 'output'
    outfile = os.path.join(outpath, f'{st}_{window_size}.csv')

    # Make everything lowercase 
    st = st.lower()

    # The list of all tokens in the corpus
    tokens = [] 

    for i, novel_path in enumerate(data_dir.glob('*.txt')): # enumerate() method adds a counter to an iterable and returns it in a form of enumerating object
        # Only use a subset of files if running as demo
        if i == sample_num: 
            break

        with open(novel_path, 'r', encoding='utf-8') as fh:
            content = fh.read()
            # Splits the whole novel-content string into tokens on non-word characters
            tokens += tokenization(content)

    # Clowercase tokens yes please
    tokens = [token.lower() for token in tokens]
    
    # Total number of tokens in corpus
    corpus_size = len(tokens)

    # Locations of search term in the token list
    st_indices = [i for i, token in enumerate(tokens) if token == st.lower()]

    # A list of tokens slices +- window_size around search term
    concordances = [tokens[max(0, i - window_size) : i + window_size + 1] for i in st_indices]

    # Filters out the exact search term that we are checking against, but not other occurences of the search term
    for conc in concordances:
        # Removes the window_size'th element (exact middle) of concordance line, which is the search term
        conc.pop(window_size)

    # Number of times search term is present with given collocates
    st_w_col = calc_st_w_col(concordances)

    # coll_wo_st: Number of times given collocate occurs without search term
    # total_coll: Total number of times a given collocate occurs
    coll_wo_st, total_coll = calc_coll_wo_st_total_coll(concordances, tokens, st_w_coll)
    
    # R1 / stcount: Number of times the search term occurs across the corpus
    # st_wo_coll: Number of times search term is present without any given collocate
    R1, st_wo_coll = calc_stcount_st_wo_coll(tokens, st, st_w_coll)

    # The expected frequency of a given collocate occurring with the search term
    exp_freq = calc_exp_freq(total_coll, R1, corpus_size)

    # The MI for any given collocate to the search term
    mut_inf = calc_mut_inf(st_w_col, exp_freq)

    # Write 3 columns: collocate, raw_frequency, MI
    write_csv(outfile, st_w_coll, mut_inf)
    
    print('Data written to: ' + outfile)

In [41]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='calculate collocates for a specific keyword')
    parser.add_argument('keyword', help='the keyword to look for')
    parser.add_argument('-w', '--window_size', type=int, default=5, help='the number of words on both sides of the keyword to look for collocates in')
    parser.add_argument('-s', '--sample_num', type=int, help='whether to only use a subset of files and how many to use')
    parser.add_argument('-d', '--data_dir', type=Path, default = Path('./data/'), help='the directory containing all of your text files to analyze')
    args = parser.parse_args()	

    main(keyword = args.keyword, window_size = args.window_size, sample_num = args.sample_num, data_dir = args.data_dir)

NameError: name 'Path' is not defined