<a href="https://colab.research.google.com/github/abhilasha-kumar/fluency-cogsci2022/blob/main/fluency_cogsci2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phonological Intrusions in Semantic Memory Retrieval

# Importing drive, GPU, and packages

In [None]:
# Mount Google Drive under /content/drive; every data path used later in this
# notebook lives below that mount point.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
# Verify that the Colab runtime exposes a GPU before running the rest of the notebook.
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
# Abort early when TensorFlow does not report the first GPU device.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
# Capture the output lines of the `nvidia-smi` shell command (IPython `!` syntax).
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# NOTE(review): this branch is only reached when a GPU device was found above,
# because the SystemError is raised first otherwise.
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import heapq
import itertools
import scipy.spatial.distance

import pandas as pd
import numpy as np

from numpy.random import randint
from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler, normalize
from numpy.linalg import matrix_power
from functools import lru_cache
import glob
from scipy.special import expit

import matplotlib.pyplot as plt
import nltk
from functools import lru_cache
from itertools import product as iterprod
import itertools
from nltk.metrics import *



# Phoneme Function

In [None]:
# algorithm to obtain phonemes for any given string
# obtained from: https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
# Load the CMU pronouncing dictionary; if the corpus is not installed yet
# (LookupError), download it once and retry.
try:
    arpabet = nltk.corpus.cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    arpabet = nltk.corpus.cmudict.dict()

@lru_cache()
def wordbreak(s):
    """Return the candidate pronunciations of ``s`` from ``arpabet``.

    The lower-cased string is looked up directly first. If it is not a
    dictionary entry, it is split into a prefix that *is* an entry and a
    suffix that is resolved recursively, trying split points closest to the
    middle of the string first. The result is every prefix/suffix
    pronunciation pair concatenated, or ``None`` when no split works.
    """
    word = s.lower()
    if word in arpabet:
        return arpabet[word]

    half = len(word) / 2

    def split_rank(pos):
        # Smaller rank = closer to the middle (ties favour the later position).
        return (pos - half) ** 2 - pos

    for cut in sorted(range(len(word)), key=split_rank):
        head = word[:cut]
        if head not in arpabet:
            continue
        tail_pronunciations = wordbreak(word[cut:])
        if tail_pronunciations is None:
            continue
        return [a + b for a, b in iterprod(arpabet[head], tail_pronunciations)]
    return None

def normalized_sim(w1, w2):
  """Return a similarity score: 1 - edit_distance / length of the longer input.

  ``w1`` and ``w2`` are sequences accepted by ``edit_distance`` (strings or
  phoneme lists). Two empty sequences are treated as identical (1.0) rather
  than dividing by zero.
  """
  longest = max(len(w1), len(w2))
  if longest == 0:
    # Nothing to compare: both inputs are empty, hence identical.
    return 1.0
  return 1-edit_distance(w1,w2)/longest


## example

In [None]:
# Worked example: raw and normalised orthographic / phonemic distances for one word pair.
w1 = "birds"
w2 = "pigs"
# First listed pronunciation of each word (wordbreak is cached, so this matches repeated calls).
phonemes_w1 = wordbreak(w1)[0]
phonemes_w2 = wordbreak(w2)[0]
print("wordbreak(w1)[0]:",phonemes_w1)
print("wordbreak(w2)[0]:",phonemes_w2)

print("orig phon:", edit_distance(phonemes_w1,phonemes_w2))
print("orig orth:", edit_distance(w1, w2))

print("norm orth:", normalized_sim(w1, w2))
print("norm phon:", normalized_sim(phonemes_w1,phonemes_w2))

# reading data

In [None]:
# Collect every Excel workbook in the fluency data folder.
parentfolder = "/content/drive/My Drive/IU-Abhilasha-Mike/Fluency/sem-phon/verbal_fluency/cochlear_alldomains"
xlsx_pattern = parentfolder + '/*.xlsx'
with tf.device('/device:GPU:0'):
  semantic_files = glob.glob(xlsx_pattern)
print(f"This folder has {len(semantic_files)} files")

# reading embeddings

In [None]:
# import glove embeddings
# The CSV has one column per vocabulary word; it is parsed once and reused for
# both the embedding matrix (one row per word after transposing) and the vocab.
parentfolder = "/content/drive/My Drive/IU-Abhilasha-Mike/Fluency/sem-phon/verbal_fluency/cochlear"
with tf.device('/device:GPU:0'):
  # Alternative source files used previously:
  # glove = pd.read_csv(parentfolder +"/fluency_glove.csv").transpose().values
  # vocab = pd.read_csv(parentfolder +"/fluency_vocab.csv")
  glove_parents = pd.read_csv(parentfolder +"/glove_parents.csv", encoding="unicode-escape")
  glove = glove_parents.transpose().values
  vocab = pd.DataFrame(glove_parents.columns, columns=["vocab_word"])
  print("embeddings are shaped:", glove.shape)
  print(f"vocab is {len(vocab)} words")

# obtaining phonemic & semantic similarity

In [None]:
## Compute phonemic, orthographic and semantic similarity between consecutive responses
import warnings
import re
warnings.filterwarnings("ignore")


def _clean_response(value):
  """Strip non-letters from a response; map 'False'/'false' to 'FALSE'."""
  cleaned = re.sub('[^a-zA-Z]+', '', str(value))
  return "FALSE" if cleaned in ["False", "false"] else cleaned


# read in the data as a single pandas dataframe.
# `semantic_files` is a list of .xlsx paths, so each workbook is read with
# read_excel and the frames are stacked (sorted for a reproducible row order).
if not semantic_files:
  raise FileNotFoundError("no .xlsx data files were found; check `parentfolder`")
category_file = pd.concat([pd.read_excel(path) for path in sorted(semantic_files)],
                          ignore_index=True)

# phonemes of every response; "wordnotfound" marks responses without a usable
# pronunciation (no letters left, or wordbreak could not resolve the word)
phon_list = []
for word in category_file["value"]:
  mod_word = re.sub('[^a-zA-Z]+', '', str(word))
  pronunciations = wordbreak(mod_word) if len(mod_word) > 0 else None
  phon_list.append(pronunciations[0] if pronunciations else "wordnotfound")

category_file["phonemes"] = phon_list
category_file["response_number"] = category_file.groupby(['participantID', 'cue']).cumcount()+1

# exclude rows that do not have a valid phoneme
has_phonemes = [phonemes != "wordnotfound" for phonemes in phon_list]
category_file = category_file[has_phonemes]
category_file = category_file.reset_index(drop= True)

#now we compute the levenshtein edit distance as a measure of orthographic/phonemic similarity

phon_similarity = []
orth_similarity = []
glove_similarity = []

norm_phon = []
norm_orth = []

# word -> row of `glove`, keeping the first occurrence of a repeated word
vocab_index = {}
for position, vocab_word in enumerate(vocab["vocab_word"]):
  vocab_index.setdefault(vocab_word, position)

previous_row = None
for index, row in category_file.iterrows():
  current_word = _clean_response(row["value"])
  current_phoneme = row["phonemes"]
  # A row starts a new list when it is the first response, or when the row
  # before it (after the filtering above) belongs to another participant/cue.
  starts_new_list = (
      previous_row is None
      or row["response_number"] == 1
      or row["participantID"] != previous_row["participantID"]
      or row["cue"] != previous_row["cue"]
  )
  if starts_new_list:
    # -999 marks "no preceding response"
    sem_val = -999
    phon_val = -999
    orth_val = -999
    norm_phon_val = -999
    norm_orth_val = -999
  else:
    previous_word = _clean_response(previous_row["value"])
    previous_phoneme = previous_row["phonemes"]

    #calculate orthographic similarity as Levenshtein (edit) distance
    orth_val = edit_distance(previous_word, current_word)
    norm_orth_val = normalized_sim(previous_word, current_word)

    # can also get edit distance for the phonemes themselves (as in Siew et al. Hoosier network)
    phon_val = edit_distance(previous_phoneme, current_phoneme)
    norm_phon_val = normalized_sim(previous_phoneme, current_phoneme)

    # cosine similarity of the two word embeddings, "NA" if either word is out of vocabulary
    current_word_index = vocab_index.get(current_word)
    previous_word_index = vocab_index.get(previous_word)
    if current_word_index is None or previous_word_index is None:
      sem_val = "NA"
    else:
      current_word_vec = glove[current_word_index].reshape((1,glove.shape[1]))
      previous_word_vec = glove[previous_word_index].reshape((1,glove.shape[1]))
      cosine_distance = scipy.spatial.distance.cdist(previous_word_vec, current_word_vec, 'cosine')
      sem_val = float(1 - cosine_distance[0, 0])

  phon_similarity.append(phon_val)
  orth_similarity.append(orth_val)
  norm_phon.append(norm_phon_val)
  norm_orth.append(norm_orth_val)
  glove_similarity.append(sem_val)
  previous_row = row

category_file["phon_similarity"] = phon_similarity
category_file["orth_similarity"] = orth_similarity
category_file["norm_phon"] = norm_phon
category_file["norm_orth"] = norm_orth
category_file["glove_childes"] = glove_similarity

# computational search models

We only consider the "animals" domain for the computational models. We start with a predefined list of animals (the labels in `similaritylabels.csv`), for which we create semantic and phonological similarity matrices and obtain word frequency estimates.

## create semantic similarity matrix

In [None]:
## create similarity matrix and similarity labels file from whichever corpus you're using

def create_sim_matrix(vectorpath, corpus_dir):
  """Report which similarity labels are present in the embedding vocabulary.

  vectorpath: CSV whose column names are the vocabulary words.
  corpus_dir: folder path (with trailing separator) containing
    'similaritylabels.csv', one label per row and no header row.

  Prints the labels, their positions in the vocabulary and the restricted
  vocabulary. Returns None.
  NOTE: computing and exporting the similarity matrix is currently disabled
  (see the commented-out section at the end).
  """
  # Only the header row is needed here: the column names are the vocabulary.
  vocab = pd.DataFrame(pd.read_csv(vectorpath, encoding="unicode-escape", nrows=0).columns, columns=["vocab_word"])
  ## the vocab consists of ALL possible words in corpus, but we need only the "animals" subset here
  ## could use the similarity_labels file from psyrev to constrain?
  # header=None: the file has no header row, so the first label must not be consumed as one
  simlabels = pd.read_csv(corpus_dir+'similaritylabels.csv', header=None).values.reshape(-1,).tolist()
  print("simlabels:", simlabels)

  # word -> position in vocab, keeping the first occurrence of a repeated word
  vocab_position = {}
  for position, word in enumerate(vocab.vocab_word):
    vocab_position.setdefault(word, position)

  # -999 marks labels that are missing from the vocabulary; they are dropped below
  animals_index = [vocab_position.get(lab, -999) for lab in simlabels]
  print("animals_index:", len(animals_index))
  animals_index = [position for position in animals_index if position != -999]
  print("animals_index:", len(animals_index))
  print("animals_index = ", animals_index)

  ## now we restrict our vocab and embeddings to ONLY these animals
  #glove_small = glove[animals_index, :]
  #print(f"embeddings are shaped:", glove_small.shape)
  vocab_small = vocab.iloc[animals_index]
  print("vocab is now:", list(vocab_small.vocab_word))
  N = len(vocab_small)
  print(f"vocab is {N} words")

  # create semantic similarity matrix
  # matrix = 1-scipy.spatial.distance.cdist(glove_small, glove_small, 'cosine').reshape(-1)
  # matrix = matrix.reshape((N,N))
  # print("sim matrix has been created:", matrix.shape)

  # w1_index = list(vocab_small.vocab_word).index("dolphin")
  # w2_index = list(vocab_small.vocab_word).index("kitten")
  # w3_index = list(vocab_small.vocab_word).index("whale")

  # print("dolphin-kitten:", matrix[w1_index, w2_index])
  # print("dolphin-whale:", matrix[w1_index, w3_index])

  # pd.DataFrame(matrix).to_csv(corpus_dir + 'corpus_sim_matrix.csv', index=False, header=False)
  # vocab_small.to_csv(corpus_dir + 'corpus_sim_labels.csv', index=False, header=False)


vectorpath = "/content/drive/My Drive/IU-Abhilasha-Mike/Fluency/sem-phon/verbal_fluency/cochlear/glove_parents.csv"
# corpus_dir is otherwise only assigned in the foraging-model cell further down;
# it is defined here (same value) so this cell also runs on a fresh kernel.
corpus_dir = '/content/drive/My Drive/IU-Abhilasha-Mike/Fluency/optimal-foraging-model/data/corpus/'

create_sim_matrix(vectorpath, corpus_dir)


## create phonological similarity matrix

In [None]:

def create_phon_matrix(vocab):
  """Compute the pairwise phonemic similarity matrix for a list of labels.

  vocab: iterable of labels; each is converted to str and stripped of every
    non-letter character before its phonemes are looked up with wordbreak.

  Writes the N x N matrix of normalized_sim scores (first pronunciation of
  each word) to corpus_dir + 'simlabels_phon_matrix.csv' without header or
  index. Returns None.

  Raises ValueError if any label has no pronunciation.
  """
  import re  # local import so the function does not depend on another cell having imported it

  # keep letters only (drops underscores, spaces, digits and punctuation)
  vocabulary = [re.sub('[^a-zA-Z]+', '', str(v)) for v in vocab]
  N = len(vocabulary)
  print(f"vocab is {N} words")
  print("vocab now looks like:", vocabulary[:5])

  # look every word up once instead of once per matrix cell
  pronunciations = [wordbreak(word) for word in vocabulary]
  missing = [word for word, found in zip(vocabulary, pronunciations) if not found]
  if missing:
    raise ValueError(f"no pronunciation found for: {missing}")
  phonemes = [found[0] for found in pronunciations]

  # create phonemic similarity matrix for the small vocab
  pmatrix = np.array([normalized_sim(p1, p2) for p1 in phonemes for p2 in phonemes]).reshape((N,N))
  print("pmatrix has been created:", pmatrix.shape)
  print(pmatrix)
  pd.DataFrame(pmatrix).to_csv(corpus_dir + 'simlabels_phon_matrix.csv', index=False, header=False)
  print("phon matrix csv created!")

# Read the label list (one label per row, no header) and build its phonological matrix.
labels_frame = pd.read_csv(corpus_dir+'similaritylabels.csv', header=None)
simlabels = labels_frame.values.reshape(-1,).tolist()
print(f"simlabels is {len(simlabels)} items:", simlabels[:5])
create_phon_matrix(simlabels)


## define foraging models

In [None]:
import os
# Folder where modelFits writes its CSV outputs (created on demand).
results_dir = '/content/drive/My Drive/IU-Abhilasha-Mike/Fluency/optimal-foraging-model/data/results/'
# Folder holding the corpus inputs read by modelFits and forage.prepareData
# (similarity labels/matrices, frequencies, corrections).
corpus_dir = '/content/drive/My Drive/IU-Abhilasha-Mike/Fluency/optimal-foraging-model/data/corpus/'

def modelFits(path, delimiter):
    """Fit the foraging models to every subject's fluency list and save the results.

    path: delimited text file without header; column 1 is the subject id,
        column 2 the entry.
    delimiter: field delimiter of that file.

    Reads the similarity labels, semantic and phonemic similarity matrices and
    word frequencies from ``corpus_dir``, computes per-entry metrics for each
    subject, calls ``forage.getfits`` per subject, and writes
    'nancy-fullmetrics.csv' and 'nancy-fullfits.csv' to ``results_dir``.
    Returns None.
    """
    import numpy as np
    import pandas as pd

    ### LOAD BEHAVIORAL DATA ###
    df = pd.read_csv(path, header=None, names=['SID', 'entry'], delimiter=delimiter)
    # correct / drop entries that are not in the label list
    df = forage.prepareData(df)

    ### LOAD CUES ###
    # local cues: semantic and phonemic similarity between labels
    simlab = _read_similarity_labels(corpus_dir + 'similaritylabels.csv')
    simval = _read_clamped_matrix(corpus_dir + 'similaritymatrix.csv', len(simlab))
    phonval = _read_clamped_matrix(corpus_dir + 'simlabels_phon_matrix.csv', len(simlab))
    # global cue: log word frequency
    freqlab, freqval = _read_log_frequencies(corpus_dir + 'frequencies.csv')

    # subjects in order of first appearance, so output order is reproducible
    sidlist = df['SID'].unique().tolist()
    subject_frames = []
    full_fitlist = []

    ## COMPUTE CONSECUTIVE SIMILARITY AND FREQUENCY AT SUBJECT LEVEL ##
    for ct, sid in enumerate(sidlist, start=1):
        print( "SUBJECT " + str(ct) + '/' + str(len(sidlist)) + " " + str(sid))

        subject_rows = df[df['SID']==sid]
        myentries = np.array(subject_rows['entry'])
        myenttimes = np.array(subject_rows.index)
        myused = []
        mytime = []

        # For each metric:
        #   *_list:    value of the metric for each observed entry
        #   *_current: full metric values over all labels
        #   *_history: snapshot of the candidate values at each entry
        freq_current = np.array(freqval)
        freq_list = []
        freq_history = []

        sim_current = simval.copy()
        sim_list = []
        sim_history = []

        phon_current = phonval.copy()
        phon_list = []
        phon_history = []

        previous_idx = None
        for i, word in enumerate(myentries):
            # repeated words are kept on purpose

            # Frequency of this word plus the frequencies of all candidates
            freq_list.append( float(freq_current[freqlab.index(word)]) )
            freq_history.append( np.array(freq_current) )

            word_idx = simlab.index(word)
            if previous_idx is None:
                # first entry: no preceding word, so similarity is recorded as 0
                # and the candidate row is the word's own row
                sim_list.append(0)
                sim_history.append( np.array(sim_current[word_idx,:]) )
                phon_list.append(0)
                phon_history.append( np.array(phon_current[word_idx,:]) )
            else:
                # similarity between this word and the preceding word
                sim_list.append( float(sim_current[previous_idx, word_idx]) )
                sim_history.append( np.array(sim_current[previous_idx,:]) )
                phon_list.append( float(phon_current[previous_idx, word_idx]) )
                phon_history.append( np.array(phon_current[previous_idx,:]) )
            previous_idx = word_idx

            myused.append(word)
            mytime.append(myenttimes[i])

        # Category switches: an entry whose similarity is lower than both neighbours
        myswitch = np.zeros(len(myused)).astype(int)
        for i in range(1,len(myused)-1):
            if (sim_list[i+1] > sim_list[i]) and (sim_list[i-1] > sim_list[i]):
                myswitch[i] = 1

        # Save my entries with corresponding metrics
        mydf = pd.DataFrame({'sid':[sid]*len(myused) , 'ent':myused, 'freq':freq_list, 'sim':sim_list, 'phon': phon_list,
                             'switch':myswitch, 'time':mytime},
                            columns=['sid','time','ent','freq','sim', 'phon', 'switch'])
        subject_frames.append(mydf)

        # Parameter fits for the different models
        myfitlist = [sid, len(myused)]
        myfitlist.extend( forage.getfits(freq_list, freq_history, sim_list, sim_history, phon_list, phon_history) )
        full_fitlist.append(myfitlist)

    print("Fits Complete.")

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    full_entdf = pd.concat(subject_frames) if subject_frames else pd.DataFrame()

    # create results directory if it doesn't exist yet
    os.makedirs(results_dir, exist_ok=True)

    # Output data entries with corresponding metrics for visualization in R
    print(full_entdf)
    full_entdf = full_entdf.reset_index(drop=True)
    full_entdf.to_csv(results_dir  + 'nancy-fullmetrics.csv', index=False, header=True)

    # Output parameter & model fits
    full_fitlist = pd.DataFrame(full_fitlist)
    full_fitlist.columns = ['subject', 'number_of_items',
                            'beta_static_frequency', 'beta_static_semantic', 'errors_static_optimal', 'errors_static_random',
                            'beta_dynamic_frequency', 'beta_dynamic_semantic', 'errors_dynamic_optimal', 'errors_dynamic_random',
                            'beta_dynamicjack_frequency', 'beta_dynamicjack_semantic', 'errors_dynamicjack_optimal', 'errors_dynamicjack_random',

                            'beta_plocalstatic_frequency', 'beta_plocalstatic_semantic', 'beta_plocalstatic_phonemic','errors_plocalstatic_optimal', 'errors_plocalstatic_random',

                            'beta_plocaldynamicorig_frequency', 'beta_plocaldynamicorig_semantic', 'beta_plocaldynamicorig_phonemic','errors_plocaldynamicorig_optimal', 'errors_plocaldynamicorig_random',
                            'beta_pglobaldynamicorig_frequency', 'beta_pglobaldynamicorig_semantic', 'beta_pglobaldynamicorig_phonemic','errors_pglobaldynamicorig_optimal', 'errors_pglobaldynamicorig_random',

                            'beta_plocaldynamicjack_frequency', 'beta_plocaldynamicjack_semantic', 'beta_plocaldynamicjack_phonemic','errors_plocaldynamicjack_optimal', 'errors_plocaldynamicjack_random',
                            'beta_pglobaldynamicjack_frequency', 'beta_pglobaldynamicjack_semantic', 'beta_pglobaldynamicjack_phonemic','errors_pglobaldynamicjack_optimal', 'errors_pglobaldynamicjack_random',

                            'beta_pswitchonlydynamicjack_frequency', 'beta_pswitchonlydynamicjack_semantic', 'beta_pswitchonlydynamicjack_phonemic','errors_pswitchonlydynamicjack_optimal', 'errors_pswitchonlydynamicjack_random'
                            ]

    full_fitlist.to_csv(results_dir  + 'nancy-fullfits.csv', index=False, header=True)

    print(full_fitlist.head())
    print("Results saved to '" + results_dir + "'.")


def _read_similarity_labels(label_path):
    """Return every whitespace-separated label found in ``label_path``."""
    with open(label_path, 'r') as label_file:
        return [lab for line in label_file for lab in line.split()]


def _read_clamped_matrix(matrix_path, size, floor=0.0001):
    """Load a comma-separated ``size`` x ``size`` matrix and replace values <= 0 by ``floor``.

    File row j is stored in column j of the result (same layout the models
    have always been fitted with).
    """
    import numpy as np
    import re

    values = np.zeros((size, size))
    with open(matrix_path, 'r') as matrix_file:
        for j, line in enumerate(matrix_file):
            # drop a trailing ",\n" so it does not produce an empty last field
            line = re.sub(',\n', '', line)
            for i, sim in enumerate(line.split(',')):
                values[i, j] = float(sim)
    # the models raise these values to a power and take logs, so keep them positive
    values[values <= 0] = floor
    return values


def _read_log_frequencies(freq_path):
    """Return (labels, log-frequencies) read from a 'label,frequency' CSV."""
    import numpy as np
    import re

    labels = []
    log_freqs = []
    with open(freq_path, 'r') as freq_file:
        for line in freq_file:
            fields = re.sub('\n', '', line).split(',')
            labels.append(fields[0])
            ## append log of frequency if using psyrev
            log_freqs.append(np.log(float(fields[1])))
    return labels, np.array(log_freqs)

class forage:

    def prepareData(data):
        """Normalise the 'entry' column against the similarity labels.

        data: DataFrame with an 'entry' column.

        Each entry is stripped of non-word characters and then kept if it is
        a known label, shortened by one character if that yields a known
        label (plural -> singular), or replaced via the look-up table in
        corpus_dir + 'corrections.txt'. Entries that match nothing are
        removed and reported. Returns a new DataFrame; the frame passed in
        is left untouched.
        """
        import pandas as pd
        import re

        # load similarity labels (the handle is closed even if reading fails)
        with open(corpus_dir + 'similaritylabels.csv','r') as label_file:
            simlab = set(lab for line in label_file for lab in line.split())

        ### LOAD CORRECTIONS ###
        # This is a look-up list that maps incorrect words onto accepted words that are in the database
        corrections = pd.read_csv(corpus_dir + 'corrections.txt', header=None, delimiter='\t')
        corrections = corrections.set_index(corrections[0].values)
        corrections.columns = ['_from','_to']

        newlist = []
        notfound = []

        # Use look-up table to check and correct observed entries
        for ent in data['entry'].values:
            if not isinstance(ent, str):
                # e.g. a missing value: nothing to clean, treat as not found
                newlist.append('NA')
                notfound.append(str(ent))
                continue
            ent = re.sub(r'\W+', '', ent) # Alphanumericize it
            singular = ent[:-1]
            if ent in simlab:
                # If this entry is appropriate, keep it
                newlist.append(ent)
            elif singular in simlab:
                # If this entry is plural, correct to the singular version
                print(f"found the entry {singular} in simlab")
                newlist.append(singular)
            elif ent in corrections.index:
                # If this entry is correctable, correct it (the index holds the '_from' words)
                newlist.append(corrections.loc[ent]._to)
            else:
                # If this entry is not found in either list, mark for removal and warn user.
                newlist.append('NA')
                notfound.append(ent)

        # Work on a copy so the caller's DataFrame is not modified,
        # then remove the rows with inappropriate entries
        data = data.copy()
        data['entry'] = newlist
        data = data[data.entry!='NA']

        # Warn the user of removed entries
        if len(notfound) > 0:
            print('The following items were not found in the database, and were removed: [' +
                  str(len(notfound)) + ' entries removed] \n')
            print(sorted(set(notfound)))
        else:
            print('All items OK.')
        return data

    def model_static(beta, freql, freqh, siml, simh):
        """Negative log-likelihood of a response list under the static model.

        beta[0] weights frequency and beta[1] weights semantic similarity.
        The first item is scored on frequency alone; every later item on
        frequency combined with similarity to the preceding item. Each score
        is normalised by the same quantity summed over freqh[k] / simh[k].
        """
        import numpy as np
        neg_log_lik = 0
        for k in range(len(freql)):
            numerator = freql[k] ** beta[0]
            candidates = freqh[k] ** beta[0]
            if k != 0:
                # beyond the first item, similarity to the previous item also counts
                numerator = numerator * siml[k] ** beta[1]
                candidates = candidates * simh[k] ** beta[1]
            neg_log_lik -= np.log(numerator / sum(candidates))
        return neg_log_lik


    def model_dynamic_original(beta, freql, freqh, siml, simh):
        """Negative log-likelihood under the original dynamic (switching) model.

        The first item and every similarity "dip" (an item whose similarity is
        lower than that of both neighbours) are scored on frequency alone
        (beta[0]); all other items are scored on semantic similarity alone
        (beta[1]).
        """
        import numpy as np
        n_items = len(freql)
        neg_log_lik = 0
        for k in range(n_items):
            is_dip = 0 < k < n_items - 1 and siml[k + 1] > siml[k] and siml[k - 1] > siml[k]
            if k == 0 or is_dip:
                # frequency of this item relative to all candidates
                numerator = freql[k] ** beta[0]
                denominator = sum(freqh[k] ** beta[0])
            else:
                # similarity to the previous item relative to all candidates
                numerator = siml[k] ** beta[1]
                denominator = sum(simh[k] ** beta[1])
            neg_log_lik -= np.log(numerator / denominator)
        return neg_log_lik
        
    def model_dynamic_jack(beta, freql, freqh, siml, simh):
        """Negative log-likelihood under the "jack" dynamic model.

        The first item and every similarity dip (similarity lower than both
        neighbours) are scored on frequency alone (beta[0]); all other items
        on frequency combined with semantic similarity (beta[1]).
        """
        import numpy as np
        n_items = len(freql)
        neg_log_lik = 0
        for k in range(n_items):
            is_dip = 0 < k < n_items - 1 and siml[k + 1] > siml[k] and siml[k - 1] > siml[k]
            # the frequency term is part of every branch
            numerator = freql[k] ** beta[0]
            candidates = freqh[k] ** beta[0]
            if not (k == 0 or is_dip):
                numerator = numerator * siml[k] ** beta[1]
                candidates = candidates * simh[k] ** beta[1]
            neg_log_lik -= np.log(numerator / sum(candidates))
        return neg_log_lik
      
    def model_static_plocal(beta, freql, freqh, siml, simh, phonl, phonh):
        """Negative log-likelihood under the static model with a phonemic cue.

        beta[0], beta[1] and beta[2] weight frequency, semantic similarity and
        phonemic similarity. The first item is scored on frequency alone;
        every later item on the product of all three cues.
        """
        import numpy as np
        neg_log_lik = 0
        for k in range(len(freql)):
            numerator = freql[k] ** beta[0]
            candidates = freqh[k] ** beta[0]
            if k != 0:
                # beyond the first item, phonemic and semantic similarity to the
                # previous item are multiplied in
                numerator = numerator * phonl[k] ** beta[2] * siml[k] ** beta[1]
                candidates = candidates * phonh[k] ** beta[2] * simh[k] ** beta[1]
            neg_log_lik -= np.log(numerator / sum(candidates))
        return neg_log_lik

    def model_dynamic_plocal_jack(beta, freql, freqh, siml, simh, phonl, phonh):
        """Negative log-likelihood of the "jack" dynamic model with phonology as a local cue.

        The first item and every similarity dip (semantic similarity lower
        than both neighbours) are scored on frequency alone (beta[0]); all
        other items on frequency x phonemic similarity (beta[2]) x semantic
        similarity (beta[1]).
        """
        import numpy as np
        n_items = len(freql)
        neg_log_lik = 0
        for k in range(n_items):
            is_dip = 0 < k < n_items - 1 and siml[k + 1] > siml[k] and siml[k - 1] > siml[k]
            # the frequency term is part of every branch
            numerator = freql[k] ** beta[0]
            candidates = freqh[k] ** beta[0]
            if not (k == 0 or is_dip):
                numerator = numerator * phonl[k] ** beta[2] * siml[k] ** beta[1]
                candidates = candidates * phonh[k] ** beta[2] * simh[k] ** beta[1]
            neg_log_lik -= np.log(numerator / sum(candidates))
        return neg_log_lik

    def model_dynamic_pglobal_jack(beta, freql, freqh, siml, simh, phonl, phonh):
        """Negative log-likelihood of the "jack" dynamic model with phonology in every non-initial step.

        The first item is scored on frequency alone (beta[0]). At a similarity
        dip (semantic similarity lower than both neighbours) the score is
        frequency x phonemic similarity (beta[2]); otherwise it is frequency x
        phonemic similarity x semantic similarity (beta[1]).
        """
        import numpy as np
        n_items = len(freql)
        neg_log_lik = 0
        for k in range(n_items):
            numerator = freql[k] ** beta[0]
            candidates = freqh[k] ** beta[0]
            if k != 0:
                # phonemic similarity applies to every item after the first
                numerator = numerator * phonl[k] ** beta[2]
                candidates = candidates * phonh[k] ** beta[2]
                is_dip = k < n_items - 1 and siml[k + 1] > siml[k] and siml[k - 1] > siml[k]
                if not is_dip:
                    numerator = numerator * siml[k] ** beta[1]
                    candidates = candidates * simh[k] ** beta[1]
            neg_log_lik -= np.log(numerator / sum(candidates))
        return neg_log_lik

    def model_dynamic_pswitchonly_jack(beta, freql, freqh, siml, simh, phonl, phonh):
        """Negative log-likelihood of the "jack" dynamic model with phonology used only at switches.

        The first item is scored on frequency alone (beta[0]). At a similarity
        dip (semantic similarity lower than both neighbours) the score is
        frequency x phonemic similarity (beta[2]); otherwise it is frequency x
        semantic similarity (beta[1]).
        """
        import numpy as np
        n_items = len(freql)
        neg_log_lik = 0
        for k in range(n_items):
            numerator = freql[k] ** beta[0]
            candidates = freqh[k] ** beta[0]
            if k != 0:
                is_dip = k < n_items - 1 and siml[k + 1] > siml[k] and siml[k - 1] > siml[k]
                if is_dip:
                    # switch: phonemic similarity joins frequency
                    numerator = numerator * phonl[k] ** beta[2]
                    candidates = candidates * phonh[k] ** beta[2]
                else:
                    # within a patch: semantic similarity joins frequency
                    numerator = numerator * siml[k] ** beta[1]
                    candidates = candidates * simh[k] ** beta[1]
            neg_log_lik -= np.log(numerator / sum(candidates))
        return neg_log_lik
    
    def model_dynamic_plocal_original(beta, freql, freqh, siml, simh, phonl, phonh):
        """Negative log-likelihood of the original dynamic model with phonology as a local cue.

        The first item and every similarity dip (semantic similarity lower
        than both neighbours) are scored on frequency alone (beta[0]); all
        other items on phonemic similarity (beta[2]) x semantic similarity
        (beta[1]), without a frequency term.
        """
        import numpy as np
        n_items = len(freql)
        neg_log_lik = 0
        for k in range(n_items):
            is_dip = 0 < k < n_items - 1 and siml[k + 1] > siml[k] and siml[k - 1] > siml[k]
            if k == 0 or is_dip:
                # global cue only
                numerator = freql[k] ** beta[0]
                denominator = sum(freqh[k] ** beta[0])
            else:
                # local cues only
                numerator = phonl[k] ** beta[2] * siml[k] ** beta[1]
                denominator = sum(phonh[k] ** beta[2] * simh[k] ** beta[1])
            neg_log_lik -= np.log(numerator / denominator)
        return neg_log_lik

    def model_dynamic_pglobal_original(beta, freql, freqh, siml, simh, phonl, phonh):
        """Negative log-likelihood of the original dynamic model with phonology as a global cue.

        The first item is scored on frequency alone (beta[0]). At a similarity
        dip (semantic similarity lower than both neighbours) the score is
        frequency x phonemic similarity (beta[2]); all other items are scored
        on semantic similarity alone (beta[1]).
        """
        import numpy as np
        n_items = len(freql)
        neg_log_lik = 0
        for k in range(n_items):
            is_dip = 0 < k < n_items - 1 and siml[k + 1] > siml[k] and siml[k - 1] > siml[k]
            if k == 0:
                numerator = freql[k] ** beta[0]
                denominator = sum(freqh[k] ** beta[0])
            elif is_dip:
                # switch: frequency together with phonemic similarity
                numerator = freql[k] ** beta[0] * phonl[k] ** beta[2]
                denominator = sum(freqh[k] ** beta[0] * phonh[k] ** beta[2])
            else:
                # within a patch: semantic similarity only
                numerator = siml[k] ** beta[1]
                denominator = sum(simh[k] ** beta[1])
            neg_log_lik -= np.log(numerator / denominator)
        return neg_log_lik

    def getfits(freq_l, freq_h, sim_l, sim_h, phon_l, phon_h):
        """Fit every foraging model to one fluency list and collect the results.

        Each model is a function ``model(beta, *data)`` on ``forage`` that
        returns the total negative log-likelihood (-LL) of the data for the
        cue weights ``beta``. For every model this routine:

        1. optimizes ``beta`` with ``scipy.optimize.fmin`` (Nelder-Mead
           simplex) from a random starting point,
        2. re-evaluates the model at the optimized weights (optimal -LL),
        3. evaluates the model with all weights set to 0 as an unfitted
           baseline for comparison.

        Parameters
        ----------
        freq_l, freq_h : frequency inputs, passed unchanged to every model.
        sim_l, sim_h : semantic-similarity inputs, passed unchanged to every
            model.
        phon_l, phon_h : phonemic-similarity inputs, passed only to the
            three-cue (phonology-aware) models.

        Returns
        -------
        list of float
            42 values, one group per model, in this order:

            * two-cue models -> ``[beta_freq, beta_semantic, optimal, baseline]``
              for ``model_static``, ``model_dynamic_original``,
              ``model_dynamic_jack``;
            * three-cue models ->
              ``[beta_freq, beta_semantic, beta_phonemic, optimal, baseline]``
              for ``model_static_plocal``, ``model_dynamic_plocal_original``,
              ``model_dynamic_pglobal_original``, ``model_dynamic_plocal_jack``,
              ``model_dynamic_pglobal_jack``, ``model_dynamic_pswitchonly_jack``.
        """
        import numpy as np
        from scipy.optimize import fmin

        # Random starting weights. The three draws happen up front and in this
        # order so that results under a fixed NumPy seed are unchanged; every
        # model starts its search from the same point.
        r1 = np.random.rand()
        r2 = np.random.rand()
        r3 = np.random.rand()

        two_cue_data = (freq_l, freq_h, sim_l, sim_h)
        three_cue_data = (freq_l, freq_h, sim_l, sim_h, phon_l, phon_h)

        def _fit(model, start, data):
            """Return [optimized betas..., optimal -LL, baseline -LL] for one model."""
            # 1.) Optimize model parameters.
            optimum = fmin(model, start, args=data, ftol=0.001, disp=False)
            betas = [float(b) for b in optimum]
            # 2.) Model fit at the optimal parameters. Always uses THIS model's
            #     own betas: the previous hand-written version evaluated
            #     model_dynamic_jack with model_dynamic_original's betas.
            optimal_fit = model(betas, *data)
            # 3.) For comparison, model fit without parameter fits (all weights 0).
            baseline_fit = model([0] * len(betas), *data)
            return betas + [float(optimal_fit), float(baseline_fit)]

        # Models that weight frequency and semantic similarity only.
        two_cue_models = (
            forage.model_static,            # static: no dynamic switching between cues
            forage.model_dynamic_original,  # original dynamic model
            forage.model_dynamic_jack,      # "jack" dynamic model
        )

        # Models that additionally weight phonemic similarity.
        three_cue_models = (
            forage.model_static_plocal,             # static, phonology as a local cue
            forage.model_dynamic_plocal_original,   # original dynamic, phonology local
            forage.model_dynamic_pglobal_original,  # original dynamic, phonology global
            forage.model_dynamic_plocal_jack,       # jack dynamic, phonology local
            forage.model_dynamic_pglobal_jack,      # jack dynamic, phonology global
            forage.model_dynamic_pswitchonly_jack,  # jack dynamic, phonology "switch only"
        )

        results = []
        for model in two_cue_models:
            results.extend(_fit(model, [r1, r2], two_cue_data))
        for model in three_cue_models:
            results.extend(_fit(model, [r1, r2, r3], three_cue_data))

        return results

## run foraging models

In [None]:
# Input data file on the mounted Google Drive; this path only resolves after the
# `drive.mount('/content/drive', ...)` cell at the top of the notebook has run.
datapath = '/content/drive/My Drive/IU-Abhilasha-Mike/Fluency/optimal-foraging-model/data/corpus/nancy-animals.txt'
# Run the model-fitting entry point on that file, reading it as tab-delimited.
# NOTE(review): `modelFits` is defined outside this cell — presumably it fits the
# foraging models for each fluency list in the file; confirm against its definition.
modelFits(datapath, delimiter = "\t")