## Thank ye, good Sir  🎄

In [None]:
import json
import os
import re

import pandas as pd
import xarray as xr
from dp.phonemizer import Phonemizer
from pyndl import preprocess, count, ndl
from tqdm.notebook import tqdm

# Load the phonemizer from the local checkpoint; get_segments calls it to
# transcribe individual words.
checkpoint_path = '../data/en_us_cmudict_forward.pt'
phonemizer = Phonemizer.from_checkpoint(checkpoint_path)

#### Make first version of event file with pyndl

In [None]:
# Build one gzipped pyndl event file per subtitle file, for the first 10,000
# subtitle files only (the full corpus contains 446,283 files).
subtitle_dir = '../data/subtitles/'
eventfile_dir = '../data/individual_eventfiles/'

# os.listdir() returns entries in arbitrary order, so the listing is sorted
# before slicing; otherwise the selected subset differs between machines/runs.
for file in tqdm(sorted(os.listdir(subtitle_dir))[:10000]):
    preprocess.create_event_file(corpus_file=subtitle_dir + file,
                                 event_file=eventfile_dir + file + '.gz',
                                 symbols="a-zA-Z'",
                                 context_structure='document',
                                 event_structure='word_to_word',
                                 event_options=(1, 0),
                                 lower_case=True,
                                 cue_structure='word_to_word')

#### Add syllable, context and segment cues with cue tags

In [None]:
## Syllabifier script.
# Phoneme inventory handed to syllabify() as its `language` argument:
#   consonants / vowels -- the accepted phoneme symbols (stress digits removed)
#   onsets              -- consonant sequences accepted as a syllable onset,
#                          written space-separated; '' is the empty onset.
English = dict(
    consonants=[
        'B', 'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L',
        'M', 'N', 'NG', 'P', 'R', 'S', 'SH', 'T', 'TH', 'V', 'W',
        'Y', 'Z', 'ZH',
    ],
    vowels=[
        'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER', 'EY', 'IH',
        'IY', 'OW', 'OY', 'UH', 'UW',
    ],
    onsets=[
        'P', 'T', 'K', 'B', 'D', 'G', 'F', 'V', 'TH', 'DH', 'S', 'Z',
        'SH', 'CH', 'JH', 'M', 'N', 'R', 'L', 'HH', 'W', 'Y',
        'P R', 'T R', 'K R', 'B R', 'D R', 'G R', 'F R', 'TH R', 'SH R',
        'P L', 'K L', 'B L', 'G L', 'F L', 'S L',
        'T W', 'K W', 'D W', 'S W',
        'S P', 'S T', 'S K', 'S F', 'S M', 'S N',
        'G W', 'SH W',
        'S P R', 'S P L', 'S T R', 'S K R', 'S K W', 'S K L',
        'TH W', 'ZH',
        'P Y', 'K Y', 'B Y', 'F Y', 'HH Y', 'V Y', 'TH Y', 'M Y',
        'S P Y', 'S K Y', 'G Y', 'HH W', '',
    ],
)
     
def syllabify(language, word):
    '''Split a phoneme sequence into syllables.

    language is a dict with the keys "consonants", "vowels" and "onsets"
    (legal onsets written as space-separated phonemes, "" for the empty
    onset). word is either a string of phonemes from the CMU pronouncing
    dictionary set (with optional stress numbers after vowels), or a
    Python list of phonemes, e.g. "B AE1 T" or ["B", "AE1", "T"]. A "."
    among the consonants preceding a vowel forces the syllable break at
    that position.

    Returns a list with one (stress, onset, nucleus, coda) tuple per
    syllable. stress is an int or None; onset, nucleus and coda are lists
    of phonemes with the stress digit removed. A word without any vowel
    yields a single syllable holding all consonants in its onset and an
    empty nucleus.

    Raises ValueError for a phoneme that is neither a vowel, a consonant
    nor ".".
    '''
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are split into phonemes as well.
    if isinstance(word, str):
        word = word.split()

    # This is the returned data structure.
    syllables = []

    # Consonants (and "." markers) seen since the last nucleus.
    internuclei = []

    for phoneme in word:
        phoneme = phoneme.strip()
        if not phoneme:
            continue

        # A trailing digit is the stress marker; strip it from the symbol.
        stress = None
        if phoneme[-1].isdigit():
            stress = int(phoneme[-1])
            phoneme = phoneme[:-1]

        if phoneme in language["vowels"]:
            # Split the consonants seen since the last nucleus into the coda
            # of the previous syllable and the onset of this one.
            if "." in internuclei:
                # An explicit period in the input decides the split.
                period = internuclei.index(".")
                coda = internuclei[:period]
                onset = internuclei[period + 1:]
            else:
                # Make the largest onset we can: move the split point to the
                # right until the onset is valid. At the start of the word
                # everything becomes onset (an invalid onset is better than
                # a coda that does not follow a nucleus). The loop always
                # terminates with a break, at the latest on the empty onset.
                for split in range(len(internuclei) + 1):
                    coda = internuclei[:split]
                    onset = internuclei[split:]
                    if (" ".join(onset) in language["onsets"]
                            or not syllables
                            or not onset):
                        break

            # Tack the coda onto the previous syllable, if there is one.
            if syllables:
                syllables[-1][3].extend(coda)

            # Make a new syllable out of the onset and nucleus.
            syllables.append((stress, onset, [phoneme], []))
            internuclei = []

        elif phoneme not in language["consonants"] and phoneme != ".":
            raise ValueError("Invalid phoneme: " + phoneme)

        else:  # a consonant or a period
            internuclei.append(phoneme)

    # Consonants left over after the last nucleus become its coda; if no
    # nucleus was found at all they form a nucleus-less syllable.
    if internuclei:
        if not syllables:
            syllables.append((None, internuclei, [], []))
        else:
            syllables[-1][3].extend(internuclei)

    return syllables

def stringify(syllables):
    '''Turn a syllabification returned by syllabify into a string.

    The phonemes of each syllable are concatenated without a separator,
    the stress digit (if any) is appended to the nucleus, and syllables
    are separated by single spaces, e.g. "IH0N STRAH1KT".

    The input is left unmodified, so the same syllabification can be
    stringified more than once with the same result.
    '''
    ret = []
    for stress, onset, nucleus, coda in syllables:
        if stress is not None and nucleus:
            # Build a new list instead of appending the digit to the
            # caller's nucleus in place; in-place mutation made a second
            # call produce a doubled stress digit.
            nucleus = [nucleus[0] + str(stress)] + nucleus[1:]
        ret.append("".join(onset + nucleus + coda))
    return " ".join(ret)

# Module-level alias for the English configuration. The cells visible here
# pass `English` to syllabify directly rather than using this name.
language = English

In [None]:
# Functions for getting/formatting segments/syllables
def get_segments(word, upper = False):
    """Phonemize `word` and return its segments as a list of strings.

    Square brackets and hyphens in the phonemizer output are treated as
    separators. The segments keep the phonemizer's casing when `upper` is
    true and are lower-cased otherwise.
    """
    transcription = phonemizer(word, lang='en_us')
    if not upper:
        transcription = transcription.lower()
    return re.sub(r'[\[\]-]', ' ', transcription).split()

def join_segments(word):
    """Return the lower-cased segments of `word` as one cue string.

    Every segment is tagged with the prefix 's.' and the tagged segments
    are joined with '_'.
    """
    return '_'.join('s.' + segment for segment in get_segments(word))

def join_syllables(syllables):
    """Return whitespace-separated syllables as one cue string.

    Every syllable is tagged with the prefix 'y.', lower-cased, and the
    tagged syllables are joined with '_'.
    """
    return '_'.join(('y.' + syllable).lower() for syllable in syllables.split())

In [None]:
# Rewrite every individual event file so that the cues of each event are
# context cues ('c.'), syllable cues ('y.') and segment cues ('s.') joined by
# '_'. The files are overwritten in place.

# Per-word cache shared across all files, so that each word is phonemized and
# syllabified only once. Entries hold the cue-formatted segments, the raw
# (upper-case) segments used by syllabify, and the cue-formatted syllables.
transcriptions = {}


def _context_cue(words, index):
    """Return the 'c.'-tagged neighbours of words[index], joined by '_'.

    The first/last word only has one neighbour; a word that is alone in its
    file has none, in which case the empty string is returned.
    """
    neighbours = []
    if index > 0:
        neighbours.append('c.' + words[index - 1])
    if index < len(words) - 1:
        neighbours.append('c.' + words[index + 1])
    return '_'.join(neighbours)


def _word_cues(word):
    """Return the syllable cues and segment cues of `word`, joined by '_'."""
    entry = transcriptions.get(word)
    if entry is None:
        entry = {'cue_segments': join_segments(word),
                 'segments': get_segments(word, upper=True)}
        transcriptions[word] = entry
    if 'cue_syllables' not in entry:
        raw_syllables = stringify(syllabify(English, entry['segments']))
        entry['cue_syllables'] = join_syllables(raw_syllables)
    return entry['cue_syllables'] + '_' + entry['cue_segments']


for file in tqdm(os.listdir('../data/individual_eventfiles/')):
    # keep_default_na=False: by default pandas parses the words "nan" and
    # "null" as missing values, which would turn those outcomes into float
    # NaN and break the string concatenation below.
    df = pd.read_csv('../data/individual_eventfiles/' + file,
                     sep='\t',
                     low_memory=True,
                     dtype={'outcomes': 'category'},
                     engine='c',
                     keep_default_na=False)
    words = df['outcomes'].tolist()

    cues = []
    for index, word in enumerate(words):
        context = _context_cue(words, index)
        word_cues = _word_cues(word)
        # A single-word file has no context; emit only the word's own cues.
        cues.append(context + '_' + word_cues if context else word_cues)

    # Assign the whole column at once instead of one cell per iteration.
    df['cues'] = cues

    # Save eventfile.
    df.to_csv('../data/individual_eventfiles/' + file, sep='\t', index=False, compression='gzip')

#### Make batches, 2k files each. 

In [None]:
# Concatenate the individual event files into batch files of BATCH_SIZE files
# each. For every batch the names of the contributing files are logged to
# ../data/logs/batch<N>.json and the events are written to
# ../data/batchfiles/batch<N>.gz.
BATCH_SIZE = 2000


def _write_batch(batch_num, files, dataframes):
    """Write the file-name log and the concatenated event file of one batch."""
    with open('../data/logs/batch' + str(batch_num) + '.json', 'w', encoding='utf-8') as f:
        json.dump(files, f, ensure_ascii=False, indent=4)

    batch = pd.concat(dataframes, ignore_index=True)
    batch.to_csv('../data/batchfiles/batch' + str(batch_num) + '.gz', sep='\t', index=False, compression='gzip')


files = []
dataframes = []
batch_num = 1

for file in tqdm(os.listdir('../data/individual_eventfiles/')):
    files.append(file.split(".")[0])
    # keep_default_na=False keeps the words "nan"/"null" as strings instead of
    # letting pandas turn them into missing values (written back as '').
    df = pd.read_csv('../data/individual_eventfiles/' + file,
                     sep='\t',
                     low_memory=True,
                     dtype={'outcomes': 'category'},
                     engine='c',
                     keep_default_na=False)
    dataframes.append(df)

    # Flush as soon as the batch is full, then start a fresh batch. Every file
    # is read before the size check, so no file is skipped.
    if len(dataframes) == BATCH_SIZE:
        _write_batch(batch_num, files, dataframes)
        batch_num += 1
        files = []
        dataframes = []

# Write the remaining files as a final, smaller batch.
if dataframes:
    _write_batch(batch_num, files, dataframes)

#### Replace the context cues below the frequency cutoff with unknown and remove them as outcomes

In [None]:
# For every batch file: mask the context cue of each below-cutoff word in the
# neighbouring events with '<unknown>' and drop the events that have such a
# word as outcome. The batch files are overwritten in place.

# NOTE(review): `below_cutoff` is empty in this cell; it presumably has to be
# filled with the words below the frequency cutoff before running the loop.
below_cutoff = []
drop_outcomes = []


def _mask_context_cue(cues, word):
    """Replace the cue 'c.<word>' in the '_'-joined cue string with '<unknown>'.

    Cues are compared as whole tokens, so masking 'c.the' leaves a cue such
    as 'c.then' untouched.
    """
    target = 'c.' + str(word)
    return '_'.join('<unknown>' if cue == target else cue for cue in cues.split('_'))


# Set for O(1) membership tests inside the per-token loop.
rare_words = set(below_cutoff)

for file in tqdm(os.listdir('../data/batchfiles/')):
    # keep_default_na=False keeps the words "nan"/"null" as strings instead of
    # letting pandas parse them as missing values.
    df = pd.read_csv('../data/batchfiles/' + file,
                     sep='\t',
                     low_memory=True,
                     dtype={'outcomes': 'category'},
                     engine='c',
                     keep_default_na=False)
    words = df['outcomes'].tolist()

    # Row indices are only valid for the current file, so start from scratch.
    drop_outcomes = []

    # Look through outcomes for below-cutoff words. If found, mask the word's
    # context cue in the event before and after.
    for index, word in enumerate(words):
        if word not in rare_words:
            continue
        if index != 0:
            df.at[index - 1, 'cues'] = _mask_context_cue(df.at[index - 1, 'cues'], word)
        if index != len(df) - 1:
            df.at[index + 1, 'cues'] = _mask_context_cue(df.at[index + 1, 'cues'], word)

        # Remember the row and drop it after the loop; dropping right away
        # would shift the positions of the remaining rows.
        drop_outcomes.append(index)

    df.drop(index=drop_outcomes, inplace=True)
    df.to_csv('../data/batchfiles/' + file, sep='\t', index=False, compression='gzip')

### Train the model

In [None]:
# Train the initial weights on the Buckeye event file and store them as netCDF.
buckeye_events = '../data/final_eventfile_buckeye.gz'

weights = ndl.ndl(
    events=buckeye_events,
    alpha=0.1,
    betas=(0.1, 0.1),
    lambda_=1.0,
    method='openmp',
    remove_duplicates=True,
    verbose=True,
)

weights.to_netcdf('../output/weights/' + 'weights_buckeye.nc')

In [None]:
# Continue training on batch 1, starting from the stored Buckeye weights.
previous_weights = xr.open_dataarray('../output/weights/weights_buckeye.nc')

weights = ndl.ndl(
    events='../data/batchfiles/batch1.gz',
    alpha=0.1,
    betas=(0.1, 0.1),
    method='openmp',
    weights=previous_weights,
    remove_duplicates=False,
    verbose=True,
)

weights.to_netcdf('../output/weights/weights_buckeyeNb1.nc')

In [None]:
# Continue training on batch 2, starting from the weights stored after batch 1.
previous_weights = xr.open_dataarray('../output/weights/weights_buckeyeNb1.nc')

weights = ndl.ndl(
    events='../data/batchfiles/batch2.gz',
    alpha=0.1,
    betas=(0.1, 0.1),
    method='openmp',
    weights=previous_weights,
    remove_duplicates=False,
    verbose=True,
)

weights.to_netcdf('../output/weights/weights_buckeyeNb1Nb2.nc')

In [None]:
# Continue training on batch 3, starting from the weights stored after batch 2.
previous_weights = xr.open_dataarray('../output/weights/weights_buckeyeNb1Nb2.nc')

weights = ndl.ndl(
    events='../data/batchfiles/batch3.gz',
    alpha=0.1,
    betas=(0.1, 0.1),
    method='openmp',
    weights=previous_weights,
    remove_duplicates=False,
    verbose=True,
)

weights.to_netcdf('../output/weights/weights_buckeyeNb1Nb2Nb3.nc')

In [None]:
# Continue training on batch 4, starting from the weights stored after batch 3.
previous_weights = xr.open_dataarray('../output/weights/weights_buckeyeNb1Nb2Nb3.nc')

weights = ndl.ndl(
    events='../data/batchfiles/batch4.gz',
    alpha=0.1,
    betas=(0.1, 0.1),
    method='openmp',
    weights=previous_weights,
    remove_duplicates=False,
    verbose=True,
)

weights.to_netcdf('../output/weights/weights_buckeyeNb1Nb2Nb3Nb4.nc')