# Trial Generation Worksheet

## Load SenteType List

In [2]:
import pandas as pd
from numpy.random import RandomState
from itertools import product

sentTypes=pd.read_csv('./mockTrialGen/mockList.csv')
sentTypes.head()

Unnamed: 0,sentType
0,1
1,2
2,3
3,4
4,5


## Steps
1. Assign Relatedness:
    - Shuffle, split list in 2
    - 1st Half: Rel, 2nd Half: Unrel
    - Store subsets *might wanna change this maybe to reduce storage steps*?
2. Assign Speaker:
    - For each subsets of Relatedness assigned lists:
        - shuffle, split in 2
        - First half: Native, 2nd Half: Non-Natuve
    - Store subsets
3. Join Lists:
    - Concatenate all sublists created from step 2
    - Re-shuffle for good measure.
4. From this global file, re-create the file name as a **UNIQUE ID**.
5. Merge this table with the file containing all other measures and variables?
    *This step is a maybe... After all if I have the info, I can always go back and find the variables like cloze ratings etc from the index files post-hoc, rather that making the programme always carry it.**
    -> Probably better to stick to only the experimentally needed info in that trial list.
    
    
## One thing I could do
- Use the assign block funtion, then replace the block numbers (Or make the assign block already assign the correct names.
- Make sure to turn off the suffling every time I run the add_block modification.
- create a function to replace based on grouping.

In [43]:
######## FUNCTIONS
#----------------------
def simple_shuffle(frame, block=None, times=10, seed=None, resetInd=False):
    """
    Shuffles trials a few times.
    
    :param pandas.DataFrame frame: Trials to be shuffled.
    :param block: Optional column to groupby before shuffling.
    :type block: str or None.
    :param int times: Number of times to shuffle. Defaults to 10.
    :param seed: Seed random number generator.
    :type seed: int or None
    :returns: Trial list with rows in random order.
    :rtype: pandas.DataFrame
    """
    prng = RandomState(seed)
    
    def _shuffle(chunk):
        for _ in range(times):
            chunk = chunk.reindex(prng.permutation(chunk.index))
        if resetInd is True:
            chunk=chunk.reset_index(drop=True)
        return chunk
    
    if block is None:
        return _shuffle(frame)
    else:
        return frame.groupby(block).apply(_shuffle)
    
def add_block(frame, size, name='block', start_at=0, id_col=None, shuffle=False, seed=None):
    """
    Creates a new column for block.
    
    :param pandas.DataFrame frame: Trials to be assigned blocks.
    :param int size: Length of each block.
    :param id_col: Column to group by before blocking. Assures that blocks 
        consist of approximately the same number of trials for each unique
        value in id_col
    :type id_col: str or None
    :param seed: Seed random number generator.
    :type seed: int or None
    :returns: Trial list with new column for block.
    :rtype: pandas.DataFrame
    """
    def _assigner(blocks, prng):
        prng.shuffle(blocks)
        i = 0
        while True:
            yield blocks[i]
            if (i+1)%len(blocks):
                prng.shuffle(blocks)
            i = (i+1)%len(blocks)
            
    prng = RandomState(seed)
    blocks = range(len(frame)/size)
    assigner = _assigner(blocks, prng)
    
    def _add(chunk):
        chunk[name] = [assigner.next() for _ in xrange(len(chunk))]
        return chunk
    
    if id_col is None:
        new_frame = _add(frame).sort_values(by=name)
    else:
        new_frame = frame.groupby(id_col).apply(_add).sort_values(by=name)
    
    new_frame[name] = new_frame[name] + start_at
    return new_frame

    

######## IMPLEMENTATIONS
#-------------------------   

#relAss=simple_shuffle(sentTypes,seed=1,resetInd=True)

test=add_block(sentTypes,50,start_at=1,seed=1)
test.head()

Unnamed: 0,sentType,block
0,1,1
37,38,1
40,41,1
42,43,1
43,44,1


In [18]:
gp=test.groupby('block')

100

In [20]:
relConds

Unnamed: 0,1,2
0,rel,unrel


## Maybe Just Create Function with Merging
- create a **basic** `pd` df with, one column being the block number and the other the condition and merge.


In [34]:
relConds=pd.DataFrame({'block':[1,2],'cond':['rel','unrel']})
relAss=pd.merge(test,relConds,how='left')
relAss.head(52)

Unnamed: 0,sentType,block,cond
0,1,1,rel
1,38,1,rel
2,41,1,rel
3,43,1,rel
4,44,1,rel
5,47,1,rel
6,49,1,rel
7,53,1,rel
8,54,1,rel
9,56,1,rel


## Then Merge again using the Native Non-Native


In [42]:
newBlock=add_block(relAss,25,id_col='cond',start_at=1,seed=1)
newBlock=newBlock.sort_values(by=['block','cond'])
speakConds=pd.DataFrame({'block':[1,2,3,4],'speaker':['nat','nonNat','nat','nonNat']})
speakAss=pd.merge(newBlock,speakConds,how='left')
#speakAss=speakAss.drop('block',1)
speakAss.head(50)
speakAss.to_csv('testTrialList.csv',index=False)

# Filenaming Cleanup

In [None]:
### If copying this implementation don't forget to uncomment the last line for the rename to
### actually take place.

import os, re

def renameForProcessing(path):
    os.chdir(path)
    filenames=os.listdir(path)
    #remove from experimental sentences
    for filename in filenames:
        # firstPattern, up to _part
        firstSubs=re.sub(r"\_[ab]\_(Bobby|Jurriaan|Johanneke|Victoria)",'',filename)
        #os.rename(filename,firstSubs)
        # second Sub to remobe '_part'
        newName=re.sub(r"\_part",'',firstSubs)
#        os.rename(filename,newName)

    #remove from control sentences
    filenames=os.listdir(path) #gotta somehow refresh filenames
    for filename in filenames:
        # firstPattern, up to _part
        controlSubs=re.sub(r"\_(Bobby|Jurriaan|Johanneke|Victoria)",'',filename)
#        os.rename(filename,controlSubs)
        


# Pre-Splicing Selection
## Generate List of filenames


In [133]:
fuck="Nat_107_related.TextGrid"
t=re.findall(r'related',filename)
print t
t==['related']

['related']


True

In [19]:
import os, re
import pandas as pd

path='/Users/boutonnetbpa/Dropbox/3.CurrentProjects/AThEME/accentedSpeech/AccPred/stimToProcess'

def listNames(path,addSpliceOptions=False):
    filenames=os.listdir(path)
    count = 0
    for filename in filenames:
        if filename.endswith(".wav"):
            curSentID=re.findall(r'\d+',filename)
            curRel=re.findall(r'(\_related|\_unrelated|\_control)',filename)
            curRel=re.sub(r"\_",'',curRel[0])
            #isControl=re.findall(r'control',filename)
            #if curRel!=['related'] and isControl!=['control']:
                #curRel='unrelated'
           # else:
                #curRel='control'
            if count<1:
                fileList=pd.DataFrame({'filename':filename,'sentID':curSentID,'rel':curRel})
            else:
                curFileRow=pd.DataFrame({'filename':filename,'sentID':curSentID,'rel':curRel})
                fileList=fileList.append(curFileRow)
            count += 1
    fileList=fileList.reset_index(drop=True)
    if addSpliceOptions==True:
        fileList['keepCont']=""
        fileList['splice']=""
    return fileList

test=listNames(path)

In [20]:
test

Unnamed: 0,filename,rel,sentID
0,Nat_107_related.wav,related,107
1,Nat_107_unrelated.wav,unrelated,107
2,Nat_10_related.wav,related,10
3,Nat_10_unrelated.wav,unrelated,10
4,Nat_111_related.wav,related,111
5,Nat_111_unrelated.wav,unrelated,111
6,Nat_118_related.wav,related,118
7,Nat_118_unrelated.wav,unrelated,118
8,Nat_121_related.wav,related,121
9,Nat_121_unrelated.wav,unrelated,121


In [6]:
import pandas as pd
import numpy as np
df=pd.DataFrame({'sentID':[10,10,11,11], 'fileName':['10_R','10_UR','11_R','11_UR'],'keepCont':[1,0,0,1],'splice':[0,1,1,0]})
df

Unnamed: 0,fileName,keepCont,sentID,splice
0,10_R,1,10,0
1,10_UR,0,10,1
2,11_R,0,11,1
3,11_UR,1,11,0


In [25]:
df.to_csv('test.csv',index=False)

## Context/Splicing Selector

In [7]:
df['rel']=['related','unrelated','related','unrelated']
df.keepCont=''
df.splice=''

In [25]:
split=np.array_split(df,2)
split

[  fileName keepCont  sentID splice        rel
 0     10_R               10           related
 1    10_UR               10         unrelated,
   fileName keepCont  sentID splice        rel
 2     11_R               11           related
 3    11_UR               11         unrelated]

In [35]:
import pandas as pd
import numpy as np

def assignSplicingRules(df):
    split=np.array_split(df,2)
    for curDf in xrange(len(split)):
        if curDf==0:
            split[curDf].loc[split[curDf].rel=='related',['keepCont','splice']]=1,0
            split[curDf].loc[split[curDf].rel=='unrelated',['keepCont','splice']]=0,1
        else:
            split[curDf].loc[split[curDf].rel=='related',['keepCont','splice']]=0,1
            split[curDf].loc[split[curDf].rel=='unrelated',['keepCont','splice']]=1,0


    concated=pd.concat(split)
    return concated


Unnamed: 0,fileName,keepCont,sentID,splice,rel
0,10_R,1,10,0,related
1,10_UR,0,10,1,unrelated
2,11_R,0,11,1,related
3,11_UR,1,11,0,unrelated


Unnamed: 0,fileName,keepCont,sentID,splice,rel
0,10_R,1.0,10,0.0,related
1,10_UR,,10,,unrelated


## File Merging/Splicing Generator

In [1]:
import pandas as pd
import easygui

def generateSplicePattern():
    filename=easygui.fileopenbox(filetypes=['*.csv'])
    df=pd.read_csv(filename)

    grouped=df.groupby('sentID')
    #rezDf=pd.DataFrame({'fileA','fileB'})
    count=0
    for k, gp in grouped:
        curSubTable=grouped.get_group(k)
        #keep Context Test
        val1=curSubTable['fileName'][(curSubTable['keepCont']==1)]
        #splice test
        val2=curSubTable['fileName'][(curSubTable['splice']==1)]
        if count<1:
            res=pd.DataFrame({'fileA':val1.values,'fileB':val2.values})
        else:
            curRow=pd.DataFrame({'fileA':val1.values,'fileB':val2.values})
            res=res.append(curRow)
        count+=1

    res=res.reset_index(drop=True)
    #res['outputFilename']=res.fileB.str.cat('_spliced',sep='')
    res['outputFilename']=res['fileB'].apply(lambda x: x+'_spliced')
    return res

if __name__=='__main__':
    pattern=generateSplicePattern()
    #pattern.to_csv('splicePattern.txt', index=False, sep='\t')

In [2]:
pattern

Unnamed: 0,fileA,fileB,outputFilename
0,10_R,10_UR,10_UR_spliced
1,11_UR,11_R,11_R_spliced
