## Data wrangling

In [24]:
import pandas as pd
import numpy as np
import glob
import preprocessing as pre

First we need to extract data from the excel file. In order to construct the eventual for loop, we'll first need to try extracting data from one file

In [25]:
df_eeg = pre.process_all_excel_files()

Reorder columns of df

In [26]:
df_eeg = df_eeg[['id', 'brain_oscillation','freq_band','electrode','fft_abs_power']]

Remove participants (10,18, 52 and 215) because of missing Neuropsy data

In [27]:
df_eeg = df_eeg[(df_eeg.id != '10') & (df_eeg.id != '18') & (df_eeg.id != '52') & (df_eeg.id != '215')]

Verify data wrangling

In [28]:
print(df_eeg.shape)
df_eeg.head(5)

(28518, 5)


Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power
0,134,Delta,1.0-4.0Hz,FP1,15.565495
1,134,Delta,1.0-4.0Hz,FP2,13.945462
2,134,Delta,1.0-4.0Hz,F7,10.874886
3,134,Delta,1.0-4.0Hz,F3,13.581803
4,134,Delta,1.0-4.0Hz,Fz,15.644595


In [29]:
df_eeg.dtypes

id                   object
brain_oscillation    object
freq_band            object
electrode            object
fft_abs_power        object
dtype: object

In [30]:
print('# unique ids: {}'.format(len(df_eeg.id.unique())))
print(df_eeg.id.value_counts())

# unique ids: 97
121    294
13     294
68     294
413    294
63     294
      ... 
103    294
702    294
42     294
66     294
503    294
Name: id, Length: 97, dtype: int64


In [31]:
print(df_eeg.electrode.value_counts())
print(df_eeg.brain_oscillation.value_counts())
print(df_eeg.freq_band.value_counts())

Pz     1358
F8     1358
A1     1358
A2     1358
P3     1358
FP2    1358
F7     1358
FP1    1358
C3     1358
Fz     1358
T6     1358
T3     1358
T4     1358
Cz     1358
F4     1358
T5     1358
O2     1358
O1     1358
C4     1358
P4     1358
F3     1358
Name: electrode, dtype: int64
Beta1        2037
HighGamma    2037
Gamma1       2037
Beta2        2037
Alpha2       2037
Theta        2037
Gamma        2037
Gamma2       2037
Delta        2037
HighBeta     2037
Alpha1       2037
Beta         2037
Alpha        2037
Beta3        2037
Name: brain_oscillation, dtype: int64
30.0-35.0Hz    2037
30.0-40.0Hz    2037
8.0-10.0Hz     2037
18.0-25.0Hz    2037
15.0-18.0Hz    2037
12.0-25.0Hz    2037
12.0-15.0Hz    2037
1.0-4.0Hz      2037
8.0-12.0Hz     2037
4.0-8.0Hz      2037
25.0-30.0Hz    2037
35.0-40.0Hz    2037
40.0-50.0Hz    2037
10.0-12.0Hz    2037
Name: freq_band, dtype: int64


### Merge dataframes (Neuropsy data with df (eeg))

We now need to import the Neuropsydata

In [32]:
df_neuropsy = pd.read_excel("Neuropsy.xlsx", na_values=".")
print(df_neuropsy.shape)
df_neuropsy.head(5)

(100, 13)


Unnamed: 0,ID,Age,Gender,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,86.0,66.0,112.0,94.0,90.0,100.0
4,10,18,1,,,,,98.0,103.0,93.0,92.0,100.0,85.0


Then remove participants (10,18, 52 and 215) because of missing Neuropsy data

In [33]:
df_neuropsy= df_neuropsy.dropna(axis=0, how='any')

Verify it worked

In [34]:
print(df_neuropsy.shape)

(96, 13)


Rename ID variable

In [35]:
#rename id so it can be merged
df_neuropsy.rename(columns = {'ID':'id'}, inplace = True) #rename id so it can be merged
df_neuropsy.head(5)

Unnamed: 0,id,Age,Gender,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,86.0,66.0,112.0,94.0,90.0,100.0
5,11,21,1,16.0,26.0,13.0,10.0,45.0,33.0,78.0,69.0,51.0,94.0


#### Now let's merge the two dataframes together

In [36]:
 #change dtype of id column from df
df_eeg['id']=df_eeg['id'].astype(int)

In [37]:
df_full = pd.merge(left=df_eeg, right=df_neuropsy, left_on='id', right_on='id')

In [38]:
df_full.head()

Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power,Age,Gender,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,134,Delta,1.0-4.0Hz,FP1,15.565495,17,1,18.0,3.0,6.0,6.0,85.0,91.0,81.0,91.0,90.0,94.0
1,134,Delta,1.0-4.0Hz,FP2,13.945462,17,1,18.0,3.0,6.0,6.0,85.0,91.0,81.0,91.0,90.0,94.0
2,134,Delta,1.0-4.0Hz,F7,10.874886,17,1,18.0,3.0,6.0,6.0,85.0,91.0,81.0,91.0,90.0,94.0
3,134,Delta,1.0-4.0Hz,F3,13.581803,17,1,18.0,3.0,6.0,6.0,85.0,91.0,81.0,91.0,90.0,94.0
4,134,Delta,1.0-4.0Hz,Fz,15.644595,17,1,18.0,3.0,6.0,6.0,85.0,91.0,81.0,91.0,90.0,94.0


In [39]:
#sort by ids
df_full = df_full.sort_values(by=['id'])

In [40]:
#Reorder columns 
df_full = df_full[['id','Age', 'Gender','brain_oscillation','freq_band','electrode','fft_abs_power', 'cIM', 'cHR', 'cIE', 'cSC', 'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis']]
df_full.head()

Unnamed: 0,id,Age,Gender,brain_oscillation,freq_band,electrode,fft_abs_power,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
24987,1,21,1,Gamma2,35.0-40.0Hz,O2,0.307275,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
24894,1,21,1,Beta1,12.0-15.0Hz,Cz,2.836892,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
24893,1,21,1,Beta1,12.0-15.0Hz,C3,3.389257,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
24892,1,21,1,Beta1,12.0-15.0Hz,T3,1.932996,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
24891,1,21,1,Beta1,12.0-15.0Hz,F8,1.965587,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
