## Data wrangling

In [2]:
import pandas as pd
import numpy as np
import glob
import preprocessing as pre
import seaborn as sn
import matplotlib.pyplot as plt

%matplotlib inline

First, we need to extract the data from the Excel files. To build the eventual for loop over all files, the extraction was first tried on a single file; the cell below runs the full extraction with `pre.process_all_excel_files()`.

In [3]:
# Load the EEG data through the project-local `preprocessing` module (imported as `pre`).
# NOTE(review): judging by its name, the helper reads every Excel file and stacks
# the results into one DataFrame — confirm in preprocessing.py.
df_eeg = pre.process_all_excel_files()

Reorder the columns of `df_eeg`

In [4]:
# Select the five analysis columns in the desired order, participant id first.
eeg_columns = ['id', 'brain_oscillation', 'freq_band', 'electrode', 'fft_abs_power']
df_eeg = df_eeg[eeg_columns]

Remove participants 10, 18, 52, 215 and 617 because of missing Neuropsy data

In [5]:
# Participants to exclude. The ids are written as strings because the `id`
# column is compared against string labels, not integers.
excluded_ids = ['10', '18', '52', '215', '617']

# A single membership test replaces five chained `!=` comparisons: rows whose
# id appears in the list are dropped, every other row is kept.
df_eeg = df_eeg[~df_eeg['id'].isin(excluded_ids)]

Verify data wrangling

In [6]:
# Dimensions after the exclusions, followed by a preview of the first five rows
# (five is head()'s default).
print(df_eeg.shape)
df_eeg.head()

(7296, 5)


Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power
0,134,Delta,1.0-4.0Hz,FP1,15.565495
1,134,Delta,1.0-4.0Hz,FP2,13.945462
2,134,Delta,1.0-4.0Hz,F7,10.874886
3,134,Delta,1.0-4.0Hz,F3,13.581803
4,134,Delta,1.0-4.0Hz,Fz,15.644595


In [7]:
# Order the rows by participant id. `id` is an object (string) column, so the
# order is lexicographic (e.g. '07' sorts before '10'), not numeric.
# mergesort is a stable algorithm: rows belonging to the same participant keep
# the relative order they had before the sort, which the default quicksort
# does not guarantee. This keeps the exported row order reproducible.
df_eeg = df_eeg.sort_values(by='id', kind='mergesort')

In [8]:
# Inspect the dtype of every column.
# NOTE(review): the recorded output lists fft_abs_power as `object` rather than
# a float dtype — confirm the values are numeric before doing arithmetic on them.
df_eeg.dtypes

id                   object
brain_oscillation    object
freq_band            object
electrode            object
fft_abs_power        object
dtype: object

In [9]:
# Number of distinct participants (a missing id, if present, counts as one value),
# then the number of rows recorded for each participant.
n_participants = df_eeg['id'].nunique(dropna=False)
print('# unique ids: {}'.format(n_participants))

rows_per_id = df_eeg['id'].value_counts()
print(rows_per_id)

# unique ids: 96
57     76
402    76
213    76
07     76
702    76
       ..
84     76
395    76
99     76
24     76
80     76
Name: id, Length: 96, dtype: int64


In [10]:
# Print how many rows fall into each level of the three categorical columns.
for column in ('electrode', 'brain_oscillation', 'freq_band'):
    print(df_eeg[column].value_counts())

Pz     384
T3     384
F7     384
FP1    384
T5     384
F3     384
P4     384
O2     384
F4     384
O1     384
C4     384
Fz     384
FP2    384
T6     384
P3     384
F8     384
C3     384
T4     384
Cz     384
Name: electrode, dtype: int64
Theta    1824
Alpha    1824
Beta     1824
Delta    1824
Name: brain_oscillation, dtype: int64
1.0-4.0Hz      1824
8.0-12.0Hz     1824
12.0-25.0Hz    1824
4.0-8.0Hz      1824
Name: freq_band, dtype: int64


#### Export `df_eeg` to CSV format for SNF analysis

In [11]:
# Write the cleaned EEG table to disk. The target path carries no file
# extension, and the row index is written as the first column (to_csv default).
eeg_export_path = 'Data/df_eeg'
df_eeg.to_csv(eeg_export_path)

#### We now need to import the Neuropsy data

In [12]:
# Read the Neuropsy workbook; cells containing "." are treated as missing values.
neuropsy_path = "Neuropsy.xlsx"
df_neuropsy = pd.read_excel(neuropsy_path, na_values=".")

# Dimensions of the raw table and a preview of its first five rows.
print(df_neuropsy.shape)
df_neuropsy.head()

(100, 13)


Unnamed: 0,ID,Age,Gender,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,86.0,66.0,112.0,94.0,90.0,100.0
4,10,18,1,,,,,98.0,103.0,93.0,92.0,100.0,85.0


Then remove the participants with missing Neuropsy data (10, 18, 52 and 215) by dropping every row that contains a missing value

In [13]:
# Discard every participant (row) that has at least one missing value.
# axis=0 and how='any' are dropna's defaults, so they are left implicit.
df_neuropsy = df_neuropsy.dropna()

Verify it worked

In [14]:
# Shape after dropping incomplete rows; in the recorded run the table went
# from 100 rows to 96, with the 13 columns unchanged.
print(df_neuropsy.shape)

(96, 13)


Rename the `ID` column to `id`

In [15]:
# Give the identifier column the lower-case name `id` so it matches the key
# column name used in df_eeg and the tables can be merged later.
# Reassigning the result (instead of inplace=True) keeps the cell free of
# in-place mutation of a frame that earlier cells already displayed.
# NOTE(review): df_eeg stores ids as strings (e.g. '07') whereas the ids here
# appear as integers in the output — the key dtypes will probably need to be
# aligned before merging; confirm.
df_neuropsy = df_neuropsy.rename(columns={'ID': 'id'})
df_neuropsy.head()

Unnamed: 0,id,Age,Gender,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,86.0,66.0,112.0,94.0,90.0,100.0
5,11,21,1,16.0,26.0,13.0,10.0,45.0,33.0,78.0,69.0,51.0,94.0


#### Create two separate DataFrames: one for the behavioral data and one for the Connors data

In [16]:
# Columns shared by both subsets, followed by the columns specific to each one.
key_columns = ['id', 'Gender']
connors_columns = key_columns + ['cIM', 'cHR', 'cIE', 'cSC']
behavioral_columns = key_columns + ['Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis']

df_connors = df_neuropsy[connors_columns]
df_behavioral = df_neuropsy[behavioral_columns]

# Preview the first rows of each subset as plain text.
for subset in (df_connors, df_behavioral):
    print(subset.head())

   id  Gender   cIM   cHR   cIE   cSC
0   1       1  17.0  31.0  29.0   9.0
1   3       1  10.0   5.0  13.0   1.0
2   4       1  26.0  17.0   7.0  15.0
3   7       1  24.0   8.0   6.0  14.0
5  11       1  16.0  26.0  13.0  10.0
   id  Gender  Aqtot  Aqaudi  Aqvis  RCQtot  RCQaudi  RCQvis
0   1       1   90.0    91.0   92.0    94.0     80.0   110.0
1   3       1   27.0    34.0   41.0    25.0     31.0    38.0
2   4       1   93.0    89.0   96.0    90.0     92.0    90.0
3   7       1   86.0    66.0  112.0    94.0     90.0   100.0
5  11       1   45.0    33.0   78.0    69.0     51.0    94.0


#### Export as CSV

In [17]:
# Write both subsets into the Data/ folder. The paths carry no file extension,
# and the row index is written as the first column (to_csv default).
exports = {'Data/df_connors': df_connors, 'Data/df_behavioral': df_behavioral}
for export_path, frame in exports.items():
    frame.to_csv(export_path)