# Trial Generation Implementation
## Base file
- This file will be generated for every participant, ensuring that everything is truly randomised.
- This file then generates the file name of each trial to be played.
- It needs to be merged with the general information/database of each stimulus:
    - durations (file generated by the `getDurations` Praat script).
    - `hasQuestion` + `questionText` from the work done by Marieke.

In [31]:
import pandas as pd
from numpy.random import RandomState
from itertools import product

######## FUNCTIONS
#----------------------
def simple_shuffle(frame, block=None, times=10, seed=None, resetInd=False):
    """Return the rows of ``frame`` in a random order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose rows are reordered. A new frame is returned.
    block : column label or list of labels, optional
        When given, ``frame`` is grouped by ``block`` and every group is
        shuffled separately through ``groupby(...).apply``.
    times : int
        How many successive random permutations are applied to each chunk.
    seed : optional
        Handed to ``numpy.random.RandomState``. One generator is created per
        call and shared by all chunks.
    resetInd : bool
        Only the literal ``True`` triggers ``reset_index(drop=True)`` on each
        shuffled chunk; any other value leaves the original labels in place.

    Returns
    -------
    The shuffled frame (or the result of the grouped ``apply`` when ``block``
    is given).
    """
    rng = RandomState(seed)

    def _permute_rows(rows):
        shuffled = rows
        for _ in range(times):
            # Draw a fresh ordering of the current index labels and realign.
            order = rng.permutation(shuffled.index)
            shuffled = shuffled.reindex(order)
        if resetInd is True:
            return shuffled.reset_index(drop=True)
        return shuffled

    if block is not None:
        return frame.groupby(block).apply(_permute_rows)
    return _permute_rows(frame)
    
def add_blocks(frame, size, name='block', condList=None, id_col=None, start_at=0):
    """Label consecutive runs of ``size`` rows with a block/condition value.

    Rows are counted in the order in which they are visited; the row at
    running position ``i`` belongs to block number ``i // size``. The label
    for that block is written to column ``name``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to label. When ``id_col`` is None the new column is written
        into ``frame`` itself (in place) before the sorted result is built.
    size : int
        Number of rows per block; must be at least 1.
    name : str
        Name of the column that receives the labels.
    condList : sequence, optional
        Labels to use: block ``k`` receives ``condList[k]``. When None the
        integer block number itself is the label, and a trailing partial
        block simply gets the next number.
    id_col : column label, optional
        When given, ``frame`` is grouped by ``id_col`` and the labels are
        handed out chunk by chunk. The running row position is shared across
        groups, i.e. it is not restarted for each group.
    start_at : int
        Currently ignored (the offset was never enabled); kept so existing
        calls that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        The labelled rows sorted by ``name`` (no ``id_col``) or by ``id_col``.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    IndexError
        If ``condList`` has fewer entries than the number of blocks needed.
    """
    if size < 1:
        raise ValueError('size must be at least 1 row per block')

    def _labels():
        # Endless stream of labels, one per visited row. Using floor division
        # (instead of a pre-built range of block numbers) works on Python 2
        # and 3 and also covers a final, shorter block.
        row = 0
        while True:
            block_no = row // size
            yield block_no if condList is None else condList[block_no]
            row += 1

    labels = _labels()

    def _add(chunk):
        # The list is built completely before the assignment, so a failure
        # (e.g. condList too short) leaves the chunk without the new column.
        chunk[name] = [next(labels) for _ in range(len(chunk))]
        return chunk

    if id_col is None:
        return _add(frame).sort_values(by=name)
    return frame.groupby(id_col).apply(_add).sort_values(by=id_col)

def genTr(frame,firstBlockSize, secondBlockSize, firstBlockName, secondBlockName,seedOfShuffle=None):
    """Add a relatedness column and a speaker column to ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Trial table to label.
    firstBlockSize : int
        Rows per block for the first labelling pass, which uses the labels
        'related' and 'unrelated'.
    secondBlockSize : int
        Rows per block for the second pass, which uses the labels
        'native', 'nonNative', 'native', 'nonNative'.
    firstBlockName, secondBlockName : str
        Column names for the first and second set of labels.
    seedOfShuffle : optional
        Currently unused; no shuffling is performed here.

    Returns
    -------
    pandas.DataFrame
        The labelled table sorted by ``firstBlockName``.
    """
    rel = add_blocks(frame, firstBlockSize, name=firstBlockName,
                     condList=['related', 'unrelated'])
    speak = add_blocks(rel, secondBlockSize, name=secondBlockName,
                       condList=['native', 'nonNative', 'native', 'nonNative'])
    # sort_values returns a new frame rather than sorting in place, so the
    # result has to be kept; a stable sort preserves the existing row order
    # inside each firstBlockName group.
    return speak.sort_values(by=firstBlockName, kind='mergesort')

######## IMPLEMENTATIONS
#-------------------------   

# Placeholder table of sentence IDs 1..120. ``range`` is used instead of the
# Python-2-only ``xrange``; pandas accepts either, so this runs on Python 2 and 3.
df=pd.DataFrame({'sentID':range(1,121)})
# add_blocks writes the new column into ``df`` itself, which is why the
# 'relatedness' column is still present when ``df`` is labelled a second time.
rel=add_blocks(df,60,name='relatedness',condList=['related','unrelated'])
speak=add_blocks(df,30,name='speaker',condList=['Nat','nonNat','Nat','nonNat'])
# Fix the column order: it determines the order of the parts of the file name.
speak=speak[['speaker','sentID','relatedness']]
##speak['ext']=".wav"
# File name = the row's non-null values joined with underscores,
# e.g. 'nonNat_96_unrelated'.
speak['filename']=speak.apply(lambda x: '_'.join(x.dropna().astype(str).values),axis=1)
speak=speak.sort_values(by="relatedness")
# Randomise the trial order (unseeded, so every run differs).
final=simple_shuffle(speak)
#final.to_csv('testGen.csv',index=False)

In [32]:
# Preview the first rows of the shuffled trial list.
final.head()

Unnamed: 0,speaker,sentID,relatedness,filename
95,nonNat,96,unrelated,nonNat_96_unrelated
67,Nat,68,unrelated,Nat_68_unrelated
73,Nat,74,unrelated,Nat_74_unrelated
53,nonNat,54,related,nonNat_54_related
35,nonNat,36,related,nonNat_36_related


In [7]:
# Load the stimulus database from a UTF-16 encoded text file
# (read_table's default separator is a tab).
dbase=pd.read_table('./stimDatabase/allAbove70.txt',encoding='utf-16')
# NOTE: this rebinds ``df`` to the 'sentID' column (a Series, not a DataFrame).
df=dbase['sentID']




In [8]:
# Number of sentence IDs currently in the database.
len(dbase['sentID'])

126

## Trim Database to 120
- A length of 126 is going to cause all sorts of problems, so here are the sentences I'll cut.
- Sentences were cut down simply by taking the ones with the lowest close ratings we had in the database.
### Eliminated sentences
1. 70
2. 105
3. 53
4. 113
5. 36
6. 117

In [24]:
# Sentence IDs removed to trim the database from 126 to 120 items.
excludedSents=[70,105,53,113,36,117]

# Keep every row whose sentID is not in the exclusion list. Filtering on
# ``excludedSents`` directly keeps the IDs in one place instead of repeating
# them inside a query string.
sub=dbase[~dbase['sentID'].isin(excludedSents)]
# Write the trimmed database as a UTF-16 CSV without the index column.
sub.to_csv('./stimDatabase/120allAbove70.csv',encoding='utf-16',index=False)

## GenTrials Based on sentIDs from Dbase

In [54]:
# Reload the trimmed database and keep only its sentence IDs.
# The block sizes below assume the file holds 120 rows (2 x 60, 4 x 30).
dbase=pd.read_csv('./stimDatabase/120allAbove70.csv',encoding='utf-16')
df=pd.DataFrame({'sentID':dbase['sentID']})
# add_blocks writes the new column into ``df`` itself; ``rel`` is not used
# again, but the 'relatedness' column it added to ``df`` is selected below.
rel=add_blocks(df,60,name='relatedness',condList=['related','unrelated'])
speak=add_blocks(df,30,name='speaker',condList=['Nat','nonNat','Nat','nonNat'])
# Column order here fixes the order of the parts of the file name.
speak=speak[['speaker','sentID','relatedness']]
# File name = the row's non-null values joined with underscores.
speak['filename']=speak.apply(lambda x: '_'.join(x.dropna().astype(str).values),axis=1)
speak=speak.sort_values(by="relatedness")
# Shuffle the trials (unseeded) and renumber the rows from 0.
final=simple_shuffle(speak).reset_index(drop=True)

In [55]:
# Preview the first rows of the shuffled, re-indexed trial list.
final.head()

Unnamed: 0,speaker,sentID,relatedness,filename
0,nonNat,95,unrelated,nonNat_95_unrelated
1,nonNat,157,unrelated,nonNat_157_unrelated
2,Nat,22,unrelated,Nat_22_unrelated
3,Nat,75,related,Nat_75_related
4,Nat,158,unrelated,Nat_158_unrelated


## Merge genned file with info from dbase
- hasQuestion
- Question
- yesOrNo

**NOTE:** Durations will need to be pulled from the database generated by the `getDurations` Praat script (or fed into the general database first). For now this is not a priority.