# Prep data file for analysis in R

In [1]:
import pandas as pd
import numpy as np

First get the list of partIDs and their language profiles.

In [54]:
# Load the participant roster (participant ID plus language profile).
roster_path = "../blp/data/parts.csv"
parts = pd.read_csv(roster_path)
# Lower-case the IDs so they line up with the lower-cased IDs in the task data.
parts['partID'] = parts['partID'].map(str.lower)

# Participants whose profile is anything other than "mono", and all participants.
not_mono = parts['lang_profile'] != "mono"
trilingual_list = parts.loc[not_mono, 'partID'].tolist()
part_list = parts['partID'].tolist()

Now read in the data from each language's perception task. Drop the first 9828 rows from the Spanish task which include pre-experiment testing. Also drop any rows with a partID not in the df `parts`.

In [55]:
# Number of leading rows in the Spanish log that are pre-experiment testing.
SPA_PRETEST_ROWS = 9828

# Load the Spanish perception-task log and discard the testing rows.
# `.copy()` makes the result an independent frame, so the column assignment
# below does not write into a slice of the raw data (which triggers
# SettingWithCopyWarning on pandas versions without copy-on-write).
spa = pd.read_csv("data/spanish.csv").iloc[SPA_PRETEST_ROWS:].copy()
# Lower-case participant IDs so they can be compared with the roster's IDs.
spa['partID'] = spa['partID'].str.lower()
# should be 50
spa.partID.nunique()

50

In [56]:
# Cross-check the IDs seen in the Spanish data against the trilingual roster.
spa_parts = spa.partID.values.tolist()
roster_not_in_spa = set(trilingual_list) - set(spa_parts)
spa_not_in_roster = set(spa_parts) - set(trilingual_list)

print("Trilingual participants that don't show up in Spanish perception data:")
print(list(roster_not_in_spa))
print("Spanish perception data participants that don't show up in trilingual part list:")
print(list(spa_not_in_roster))

Trilingual participants that don't show up in Spanish perception data:
['vrc053', 'usm074', 'aei079']
Spanish perception data participants that don't show up in trilingual part list:
['bad_value', 'oec103', 'anm099']


Fix the mismatched participant IDs listed above.

In [57]:
# IDs that were handed out by accident, mapped to the ID they should have been:
#   anm099 -> aei079, oec103 -> vrc053
accidental_ids = {"anm099": 'aei079', "oec103": 'vrc053'}
for wrong_id, right_id in accidental_ids.items():
    spa.loc[spa['partID'] == wrong_id, 'partID'] = right_id

# usm074 had an incorrect url, so its rows were logged as "bad_value";
# they are picked out by subjID.
usm074_rows = (spa["partID"] == "bad_value") & (spa["subjID"] == "2642c56c09d355")
spa.loc[usm074_rows, "partID"] = "usm074"

In [58]:
# Keep only rows whose participant ID is on the roster, then renumber the rows.
on_roster = spa['partID'].isin(part_list)
spa = spa.loc[on_roster].reset_index(drop = True)
# should be 50
spa['partID'].nunique()

50

Some participants could not complete the third trial and had to restart the experiment, so they show as having two rounds of trial one.

In [59]:
# Rebuild `block` as a running count: the k-th row (in file order) that shares
# a given (partID, trial) pair is labelled k - 1.
repeat_number = spa.groupby(['partID', 'trial'], sort = False).cumcount()
spa['block'] = repeat_number
spa.head()

Unnamed: 0,subjID,partID,list,block,trial,file1,filedur,loadtime,status,response,rt
0,263f10f5f44fad,rre101,2,0,0,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,429,1085,OK,pondis,3272
1,263f10f5f44fad,rre101,2,0,1,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,421,860,OK,pondís,1122
2,263f10f5f44fad,rre101,2,0,2,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,417,890,OK,pondís,591
3,263f10f5f44fad,rre101,2,0,3,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,420,895,OK,pondís,737
4,263f10f5f44fad,rre101,2,0,4,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,429,955,OK,pondis,1205


Similarly, for Catalan, remove the first 2826 rows.

In [60]:
# Load the Catalan perception-task log, skip the leading rows, and give the
# subject-ID column the same name it has in the other task files.
# NOTE(review): the accompanying text says the first 2826 rows are removed,
# but this slice removes 2825 -- confirm which count is intended.
cat = (
    pd.read_csv("data/catalan.csv")
    .iloc[2825:]
    .rename(columns={'SubjID': 'subjID'})
)
# Lower-case participant IDs so they can be compared with the roster's IDs.
cat['partID'] = cat['partID'].map(str.lower)
cat.partID.nunique()

51

In [61]:
# Cross-check the IDs seen in the Catalan data against the trilingual roster.
cat_parts = cat.partID.values.tolist()
roster_not_in_cat = set(trilingual_list) - set(cat_parts)
cat_not_in_roster = set(cat_parts) - set(trilingual_list)

print("Trilingual participants that don't show up in Catalan perception data:")
print(list(roster_not_in_cat))
print("Catalan perception data participants that don't show up in trilingual part list:")
print(list(cat_not_in_roster))

Trilingual participants that don't show up in Catalan perception data:
['ama302', 'lic063', 'rre101']
Catalan perception data participants that don't show up in trilingual part list:
['0', 'bad_value', 'aoa302', 'ark038']


Fix the mismatched participant IDs listed above.

In [62]:
# Correct participant IDs that were logged wrongly in the Catalan data.
# Every fix uses exact-match assignment, so only rows whose ID is exactly the
# wrong value are rewritten; a substring replace over the whole column could
# also alter any longer ID that merely contains that text.

# fixes ama302 (logged as aoa302)
cat.loc[cat['partID'] == "aoa302", 'partID'] = 'ama302'
# fixes rre101 (logged as "bad_value"; picked out by subjID)
rre101_rows = (cat["partID"] == "bad_value") & (cat["subjID"] == "263ff9a086a788")
cat.loc[rre101_rows, "partID"] = "rre101"
# fixes ark038 -- next below lic063 in spreadsheet, wrong link was copied and used
cat.loc[cat['partID'] == "ark038", 'partID'] = 'lic063'

In [63]:
# Keep only rows whose participant ID is on the roster, then renumber the rows.
on_roster = cat['partID'].isin(part_list)
cat = cat.loc[on_roster].reset_index(drop = True)
# should be 50
cat['partID'].nunique()

50

Some participants could not complete the third trial and had to restart the experiment, so they show as having two rounds of trial one.

In [64]:
# Rebuild `block` as a running count: the k-th row (in file order) that shares
# a given (partID, trial) pair is labelled k - 1.
repeat_number = cat.groupby(['partID', 'trial'], sort = False).cumcount()
cat['block'] = repeat_number
cat.head()

Unnamed: 0,subjID,partID,list,block,trial,file1,filedur,loadtime,status,response,rt
0,263e3de6050126,dis022,2,0,0,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,482,395,OK,pondis,3272
1,263e3de6050126,dis022,2,0,1,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,471,240,OK,pondis,1178
2,263e3de6050126,dis022,2,0,2,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,479,185,OK,pondis,930
3,263e3de6050126,dis022,2,0,3,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,472,220,OK,pondis,862
4,263e3de6050126,dis022,2,0,4,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,488,645,OK,pondis,928


The number of columns was changed for the English task partway through data collection. Also, drop the first 1100 rows which were testing before the experiment began.

In [65]:
# Column names are supplied explicitly because the number of columns in the
# English file changed partway through data collection. Since `names` is given
# without `header`, pandas reads the file's first line as data, not as a header.
ENG_COLUMNS = ['subjID', 'partID', 'list', 'block', 'trial',
               'file1', 'filedur', 'loadtime', 'status',
               'response', 'rt']
# NOTE(review): the accompanying text says 1100 testing rows are dropped, while
# this slice drops 1101 -- presumably the extra row is a header line that was
# read as data; confirm against the raw file.
ENG_PRETEST_ROWS = 1101

# `.copy()` makes the result an independent frame, so the column assignment
# below does not write into a slice of the raw data (which triggers
# SettingWithCopyWarning on pandas versions without copy-on-write).
eng = pd.read_csv('data/english.csv', names = ENG_COLUMNS).iloc[ENG_PRETEST_ROWS:].copy()
# Lower-case participant IDs so they can be compared with the roster's IDs.
eng['partID'] = eng['partID'].str.lower()
eng.partID.nunique()

72

In [66]:
# Cross-check the IDs seen in the English data against the full roster.
eng_parts = eng.partID.values.tolist()
roster_not_in_eng = set(part_list) - set(eng_parts)
eng_not_in_roster = set(eng_parts) - set(part_list)

print("Participants that don't show up in English perception data:")
print(list(roster_not_in_eng))
print("English perception data participants that don't show up in part list:")
print(list(eng_not_in_roster))

Participants that don't show up in English perception data:
[]
English perception data participants that don't show up in part list:
['0', 'bad_value']


In [67]:
# Keep only rows whose participant ID is on the roster, then renumber the rows.
on_roster = eng['partID'].isin(part_list)
eng = eng.loc[on_roster].reset_index(drop = True)
# should be 70
eng['partID'].nunique()

70

The English experiment also had an issue with the `block` column. Create a repeating counter only for the main trials (i.e., not the practice trials).

In [68]:
# Keep only the main trials: drop every row whose `file1` contains "practice".
is_practice = eng['file1'].str.contains("practice")
eng = eng.loc[~is_practice].copy()
# Rebuild `block` as a running count: the k-th row (in file order) that shares
# a given (partID, trial) pair is labelled k - 1.
repeat_number = eng.groupby(['partID', 'trial'], sort = False).cumcount()
eng['block'] = repeat_number
eng = eng.reset_index(drop = True)
eng.head()

Unnamed: 0,subjID,partID,list,block,trial,file1,filedur,loadtime,status,response,rt
0,263d1df1212401,acj047,2,0,0,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,429,320,OK,PONdiss,1751.0
1,263d1df1212401,acj047,2,0,1,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,407,255,OK,PONdiss,1043.0
2,263d1df1212401,acj047,2,0,2,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,411,275,OK,PONdiss,1038.0
3,263d1df1212401,acj047,2,0,3,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,408,300,OK,PONdiss,1106.0
4,263d1df1212401,acj047,2,0,4,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,422,355,slow_response,ponDISS,5577.0


In [69]:
# NOTE(review): everything in this cell is commented out, so nothing here runs.
# It sketches labelling the practice trials with block = 'practice' and
# concatenating them back onto the main trials. `eng_main` is not defined in
# any visible cell, so this would need rework before being re-enabled.
# eng_practice = eng[eng['file1'].str.contains("practice")].copy()
# eng_practice['block'] = 'practice'
# eng = pd.concat([eng_main, eng_practice], axis = 0)
# eng.sort_index(inplace = True)
# eng.head()

We want to remove responses that are not 'OK', code responses as 'pen' for penultimate stress and 'ult' for ultimate stress, create a new column with just the filename, and then create new columns listing the step of each measure. Each of these columns then needs to be centered, so that 1 is converted to -2, 2 to -1, 3 to 0, 4 to 1, and 5 to 2. Additionally, values of 0 should be converted to 3 in the raw measures (since this is the neutral midpoint) and kept as 0 in the centered column.

In [70]:
# Recode the raw response labels as "pen" / "ult". Spanish and Catalan share
# one pair of labels; English uses its own. Any other label becomes NaN.
romance_codes = {'pondis': "pen", 'pondís': "ult"}
english_codes = {'PONdiss': 'pen', 'ponDISS': "ult"}
for frame, codes in ((spa, romance_codes), (cat, romance_codes), (eng, english_codes)):
    frame["response_binary"] = frame["response"].map(codes)

In [71]:
# Report row counts, keep only rows whose status is exactly "OK", report again.
print("before spa: ",len(spa),"\nbefore cat: ", len(cat), "\nbefore eng: ", len(eng))
spa = spa.loc[spa["status"].eq("OK")].copy()
cat = cat.loc[cat["status"].eq("OK")].copy()
eng = eng.loc[eng["status"].eq("OK")].copy()
print("after spa: ",len(spa),"\nafter cat: ", len(cat), "\nafter eng: ", len(eng))

before spa:  11145 
before cat:  11236 
before eng:  15712
after spa:  11051 
after cat:  11097 
after eng:  15613


In [72]:
# `filename` is the piece of `file1` that follows the language folder
# (the second piece produced by splitting on the folder name).
language_folders = ((spa, 'spanish/'), (cat, 'catalan/'), (eng, 'english/'))
for frame, folder in language_folders:
    frame['filename'] = frame['file1'].str.split(folder, expand=True)[1]

In [73]:
def _cue_step(filename, marker):
    """Return the step encoded right after `marker` in `filename`.

    The step is the single character that follows the first occurrence of
    `marker`, converted to float. If `marker` does not occur in `filename`,
    3 is returned.
    """
    if marker not in filename:
        return 3
    return float(filename.split(marker)[1][0])


# Output column -> the marker that precedes its step digit in the filename.
_cue_markers = (('formant', "F"), ('spectilt', "st"), ('duration', "dur"))
for frame in (spa, cat, eng):
    for column, marker in _cue_markers:
        frame[column] = frame['filename'].apply(_cue_step, args=(marker,))

In [74]:
# Centre each step column by subtracting 3, so step 3 maps to 0.
_centered_columns = (
    ('formant', 'formant_center'),
    ('spectilt', 'spectilt_center'),
    ('duration', 'duration_center'),
)
for frame in (spa, cat, eng):
    for raw_column, centered_column in _centered_columns:
        frame[centered_column] = frame[raw_column] - 3

In [75]:
# Keep only rows whose reaction time lies strictly between 200 and 2500.
cat_rt_in_range = (cat['rt'] > 200) & (cat['rt'] < 2500)
spa_rt_in_range = (spa['rt'] > 200) & (spa['rt'] < 2500)
eng_rt_in_range = (eng['rt'] > 200) & (eng['rt'] < 2500)

cat_rt = cat.loc[cat_rt_in_range].copy()
spa_rt = spa.loc[spa_rt_in_range].copy()
eng_rt = eng.loc[eng_rt_in_range].copy()

print("init cat length:", len(cat), "final cat len:", len(cat_rt))
print("init spa length:", len(spa), "final spa len:", len(spa_rt))
print("init eng length:", len(eng), "final eng len:", len(eng_rt))

init cat length: 11097 final cat len: 10688
init spa length: 11051 final spa len: 10883
init eng length: 15613 final eng len: 15170


In [76]:
# Drop block 0 from each RT-filtered frame, keeping every later block.
cat_later = cat_rt.loc[cat_rt['block'].ne(0)].copy()
spa_later = spa_rt.loc[spa_rt['block'].ne(0)].copy()
eng_later = eng_rt.loc[eng_rt['block'].ne(0)].copy()

print("init cat length:", len(cat_rt), "final cat len:", len(cat_later))
print("init spa length:", len(spa_rt), "final spa len:", len(spa_later))
print("init eng length:", len(eng_rt), "final eng len:", len(eng_later))

init cat length: 10688 final cat len: 7260
init spa length: 10883 final spa len: 7278
init eng length: 15170 final eng len: 10217


## Adding BLP info to data

Use `03_blp_pca` in the blp folder to prepare the components from the blp data for each participant.

In [77]:
# Attach the roster columns to every trial row. A left join keeps all trial
# rows, whether or not the roster has a matching partID.
spa_later = pd.merge(spa_later, parts, on="partID", how="left")
cat_later = pd.merge(cat_later, parts, on="partID", how="left")
eng_later = pd.merge(eng_later, parts, on="partID", how="left")
spa_later.head(2)

Unnamed: 0,subjID,partID,list,block,trial,file1,filedur,loadtime,status,response,rt,response_binary,filename,formant,spectilt,duration,formant_center,spectilt_center,duration_center,lang_profile
0,263f10f5f44fad,rre101,2,1,0,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,420,15,OK,pondís,1128,ult,st3_dur1,3.0,3.0,1.0,0.0,0.0,-2.0,l3_eng
1,263f10f5f44fad,rre101,2,1,1,https://d3uxfe7dw0hhy7.cloudfront.net/annie-he...,427,20,OK,pondís,1361,ult,F2_dur5,2.0,3.0,5.0,-1.0,0.0,2.0,l3_eng


In [78]:
# Load the per-participant BLP principal-component scores.
blp_pca_path = "../blp/data/pca_blp.csv"
blp_pca = pd.read_csv(blp_pca_path)
blp_pca.head()

Unnamed: 0,pc1,pc2,partID,lang_profile
0,4.932414,-1.595612,eir057,l1_eng
1,5.450197,-0.136048,dis022,l1_eng
2,3.361367,-2.325886,irm067,l1_eng
3,6.165044,2.991184,afs116,l1_eng
4,3.892734,-2.726815,nir048,l1_eng


In [79]:
# Attach the BLP component scores, matching on both participant ID and
# language profile. A left join keeps all trial rows.
blp_keys = ["partID", 'lang_profile']
spa_later = pd.merge(spa_later, blp_pca, on=blp_keys, how="left")
cat_later = pd.merge(cat_later, blp_pca, on=blp_keys, how="left")
eng_later = pd.merge(eng_later, blp_pca, on=blp_keys, how="left")

In [80]:
# Write the three prepared frames to CSV, without the row index.
_exports = (
    (spa_later, "data/span_r.csv"),
    (cat_later, "data/cat_r.csv"),
    (eng_later, "data/eng_r.csv"),
)
for frame, out_path in _exports:
    frame.to_csv(out_path, index = False)