# Classifying vowels in Nomlaki

## Description of data
Anna's copy of the Nomlaki notebook. Based (heavily) on work at https://github.com/emilyremirez/ExemPy.

In [75]:
# Set up

%load_ext autoreload

# import ExemPy as gp
from ExemPy.ExemPy import *
from ExemPy.ExemPy.utils import *
from ExemPy.ExemPy.viz import *
from ExemPy.ExemPy.GCM import *
%aimport ExemPy.ExemPy, ExemPy.ExemPy.utils, ExemPy.ExemPy.viz, ExemPy.ExemPy.GCM
%autoreload 1
import math
import random
import matplotlib.pyplot as plt
#%matplotlib inline
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.optimize import minimize
import seaborn as sns

# Plot styling: apply the 'ticks' style at 'paper' scale first, then install
# the project's six-colour palette as the default colour cycle.
sns.set(context='paper', style='ticks')
colors = [
    "#045447",
    "#A6262E",
    "#FFBB00",
    "#253060",
    "#8EAB27",
    "#36B5AA",
]
sns.set_palette(colors)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read in data

In [76]:
# Load the file holding every Nomlaki vowel token (with and without official
# spellings) and rename the phoneme columns in the same step.
nom_raw = pd.read_csv("nomlaki-vowels.csv").rename(
    columns={'raw_phoneme': 'vowel', 'phoneme': 'spelling'}
)

# Keep only the rows whose label is one of the five vowels.
list_vowels = ['a','e','i','o','u']
is_vowel = nom_raw['vowel'].isin(list_vowels)
nom_vowels = nom_raw.loc[is_vowel]

# nomlaki is the table without the spelling column, n = 2138
nomlaki = nom_vowels.drop(columns=['time','word','spelling'])

# # nom_spell is the subset containing spellings, n = 316. Use nom_spell to
# # classify based on Nomlaki textbook spelling instead of IPA classification.

# nom_spell = nom_vowels.drop(['time','word','vowel'],axis=1)
# nom_spell = nom_spell.dropna(subset=['spelling'])


In [77]:
# Assign the exemplar cloud: use the vowel-labelled table (nomlaki).

exemplars = nomlaki

# If classifying by spelling, set exemplars = nom_spell instead (nom_spell is built by the commented-out lines in the previous cell)

## Optimize

Define a function to calculate the proportion of error

In [78]:
def calcerror(x,test,exemplars,catslist,fitdims,cval,anchordim=None):
    '''
    Categorizes a data set and returns the proportion of stimuli/test rows
    that were categorized inaccurately. A lower value means a lower amount of
    error. Designed to be used with parameter fitting functions to assign
    values to attention weighting for the dimensions.
    
    Required parameters:
    
    x = Array. Current guesses for the parameters, one per entry of fitdims
        and in the same order
    
    test = DataFrame. Stimuli to be categorized
    
    exemplars = DataFrame. Exemplar cloud to use for categorization
    
    catslist = List of strings. Each string should correspond to a category that
        should be assigned to the test. Only the first category is used to
        compute the error
    
    fitdims = List of strings. Each string should correspond to a dimension
        for which parameters should be fit.
    
    cval = Sensitivity value. Passed through unchanged to multicat
        
    Optional parameters:
    
    anchordim = String. Dimension for parameter which will not be fit, but will
        instead be hard-coded as 1. This helps constrain the set of possible
        solutions
    
    Returns:
    
    Float. Proportion of test rows marked 'n' in the "<category>Acc" column
        produced by checkaccuracy (0.0 when every row was categorized
        correctly)
    '''
    # Example with anchordim = z0 and fitdims = [z1, z2, z3]:
    #   x        = [z1guess, z2guess, z3guess]
    #   dimsvals = {z1: z1guess, z2: z2guess, z3: z3guess, z0: 1}
    dimsvals = {dim: x[i] for i, dim in enumerate(fitdims)}
    if anchordim is not None:
        dimsvals[anchordim] = 1
    
    # Categorize against the cloud that was passed in, not a notebook global
    choices = multicat(test, exemplars, catslist, dimsvals, cval)
    accuracy = checkaccuracy(choices, catslist)
    category = catslist[0]
    
    # Mean of a boolean column is the share of 'n' rows; unlike indexing
    # value_counts() with 'n', this also works when there are no errors
    misclassified = accuracy[category + "Acc"] == 'n'
    return float(misclassified.mean())

# Set initial parameters for categorization


In [79]:
# Sensitivity value handed to the categorization functions
cval = 25

# Starting attention weight for each dimension. Key order matters: the
# optimization cell treats the first key as the anchored dimension.
dimsvals = {
    'F1': .761,
    'F2': .681,
    'F3': .407,
    'duration': 1,
}
dimslist = [*dimsvals]
catslist = ['vowel']

# The exemplar cloud is the full exemplar table
cloud = exemplars

# Draw the test set from the exemplars using the 'vowel' column and a count of 20;
# if using spelling set 2nd argument to 'spelling'
test = gettestset(exemplars, 'vowel', 20)

In [80]:
fitdims = dimslist[1:]      # Fit all dimensions except item 0
anchordim = dimslist[0]     # Item 0 is not fit; its weight is fixed at 1

name = 'nom_vowel_optimization'   # base name of output spreadsheet (".csv" is appended)
n = 3                             # number of times that a random starting x is generated
seed = 0                          # seed for the starting guesses; set to None for a fresh draw every run
#-------------------------------------------

# Dedicated generator so the starting guesses are reproducible without
# touching the global `random` state used elsewhere
rng = random.Random(seed)

rows = []       # one [start, fit, error, evals] entry per trial
result = None   # stays None if n == 0, so the code below can tell no fit was run

print("----- Parameters -----")
if anchordim is not None:
    print("Anchored (1):  ", anchordim)
    
print("Optimized:     ", fitdims)
print("")
print("Trials: ",n)
print("")

for i in range(n):
    # Sample distinct integers from 0-299 and divide by 100 to get
    # starting weights between 0 and 2.99
    start = np.divide(rng.sample(range(0, 300), len(fitdims)), 100)
    result = minimize(calcerror,
                  start,  # the initial guess array
                  args=(test,cloud,catslist,fitdims,cval,anchordim), # arguments for the error function
                  method='Powell',
                  tol=0.05,  # a 'tolerance' value, smaller means more function evaluation, but potentially better fit
                 )
    fit = np.round(result.x,3)
    error = result.fun
    evals = result.nfev
    rows.append([start,fit,error,evals])
    
    print ("-----", (i+1) ," -----")
    print("Initial guess:    ", start)
    print("Optimized:        ", fit)
    print(" ")
    print("Number evals: ", evals)
    print("Error:        ", error)
    print("")

# Build the table with real column names instead of storing the header as a
# data row; the index runs 1..n so the CSV rows are numbered by trial
results = pd.DataFrame(rows,
                       columns=['start','fit','error','evals'],
                       index=pd.RangeIndex(1, n + 1))

# Save the (last set of) fitted parameters in a dimsvals dict
### with keys dimensions, values as attention weights
squeebo = {}
if result is not None:
    squeebo = {dim: result.x[j] for j, dim in enumerate(fitdims)}
    if anchordim is not None:
        # Only add the anchor when one was chosen, otherwise a None key would sneak in
        squeebo[anchordim] = 1

# write results to csv
## good for if you want to leave it running while you do something else!
results.to_csv(name+".csv")

----- Parameters -----
Anchored (1):   F1
Optimized:      ['F2', 'F3', 'duration']

Trials:  3

{'vowel':   vowel  probability
0     a          0.0
1     e          1.0
2     i          0.0
3     o          0.0
4     u          0.0}
hewwo3
  vowel  probability
0     a          0.0
1     e          1.0
2     i          0.0
3     o          0.0
4     u          0.0
testing_hewwo
0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
Name: probability, dtype: float64
hewwo
<class 'pandas.core.frame.DataFrame'>
  vowel  probability
1     e          1.0
{'vowel':   vowel   probability
0     a  0.000000e+00
1     e  1.000000e+00
2     i  6.906010e-22
3     o  0.000000e+00
4     u  0.000000e+00}
hewwo3
  vowel   probability
0     a  0.000000e+00
1     e  1.000000e+00
2     i  6.906010e-22
3     o  0.000000e+00
4     u  0.000000e+00
testing_hewwo
0    0.000000e+00
1    1.000000e+00
2    6.906010e-22
3    0.000000e+00
4    0.000000e+00
Name: probability, dtype: float64
hewwo
<class 'pandas.core.frame.Dat

ValueError: can only convert an array of size 1 to a Python scalar

In [None]:
# Build the set of items to categorize. gettestset is called by its bare,
# star-imported name (as in the parameter cell) because no `gp` alias is imported.
to_cat = gettestset(exemplars,'vowel',25) #if using spelling set 2nd arg to 'spelling'

# Categorize the test set with respect to the exemplar cloud

In [None]:
# multicat is called by its bare, star-imported name (as in calcerror)
# because no `gp` alias is imported in this notebook.
# NOTE(review): this uses the hand-set `dimsvals`, not the fitted weights
# saved in `squeebo` by the optimization cell - confirm that is intended.
choices=multicat(
    to_cat,       # what to categorize
    cloud,       # exemplar cloud
    catslist,        # categories to categorize on
    dimsvals,         # dictionary with keys as dimensions and values as attention weights
    cval,            # sensitivity value; kept at 25, which came from an earlier fit
    exclude_self=True,  # Don't compare exemplars to themselves
    N=1,             # give everything a resting activation of 1
    runnerup=False   # Don't give us the second choice category
    )

## Get confusion matrix

In [None]:
# Confusion matrix for the 'vowel' category. Uses the star-imported function
# directly because no `gp` alias is imported in this notebook.
confusion(choices,'vowel')['vowel'] #if using spelling replace 'vowel' with 'spelling'

# Check accuracy
 - check the accuracy of the categorization, get df with y or n for each cat
 - plot accuracy by category as bar graph
 - get proportion overall correct
 - get proportions by category

In [None]:
# Mark each categorized row as correct/incorrect, then plot the accuracy.
# Uses the star-imported functions directly because no `gp` alias is imported.
acc = checkaccuracy(choices,["vowel"]) #if using spelling replace 'vowel' with 'spelling'
accplot(acc,'vowel')

In [None]:
# Overall proportion correct, followed by the per-category proportions.
# Uses the star-imported functions directly because no `gp` alias is imported.
print("overall accuracy: " + str(overallacc(acc,'vowel'))) #if using spelling replace with 'spelling'
propcorr(acc,'vowel')

In [None]:
# Keep only the rows that were categorized incorrectly ('n' in the accuracy
# column), with the old index moved into a column.
# If using spelling, the accuracy column is 'spellingAcc' (category name + "Acc").
inaccuratephons = acc[acc['vowelAcc'] == "n"].reset_index()
inaccuratephons