## Data wrangling

In [1]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn

import preprocessing as pre

%matplotlib inline

First, we need to extract the data from the Excel file. In order to construct the eventual for loop, we first need to try extracting the data from a single file.

In [2]:
# Show the notebook's working directory: the relative paths used below
# ("Neuropsy.xlsx", "Data/...") are resolved against it.
# os.getcwd() is used instead of the bare `pwd` automagic, which only works
# in a single-line cell and when automagic is enabled.
os.getcwd()

'/home/bea/bhs2020/Projet_BHS2020/SNF_ADHDsubtypes_project'

In [3]:
# Build the EEG table with the project's preprocessing helper.
# Presumably one row per participant x frequency band x electrode, as the
# preview below suggests — confirm against preprocessing.py.
df_eeg = pre.process_all_excel_files()
df_eeg

Unnamed: 0,electrode,brain_oscillation,fft_abs_power,freq_band,id
0,FP1,Delta,15.565495,1.0-4.0Hz,134
1,FP2,Delta,13.945462,1.0-4.0Hz,134
2,F7,Delta,10.874886,1.0-4.0Hz,134
3,F3,Delta,13.581803,1.0-4.0Hz,134
4,Fz,Delta,15.644595,1.0-4.0Hz,134
...,...,...,...,...,...
7671,Pz,Beta,4.780903,12.0-25.0Hz,87
7672,P4,Beta,4.276557,12.0-25.0Hz,87
7673,T6,Beta,3.603100,12.0-25.0Hz,87
7674,O1,Beta,12.159364,12.0-25.0Hz,87


Reorder columns of df

In [4]:
# Participant id first, then the band descriptors, then the electrode and its power value.
eeg_column_order = ['id', 'brain_oscillation', 'freq_band', 'electrode', 'fft_abs_power']
df_eeg = df_eeg[eeg_column_order]

Remove participants (10, 18, 52, 215 and 617) because of missing Neuropsy data.

In [5]:
# Participants excluded from the analysis because their Neuropsy data are missing.
excluded_ids = ['10', '18', '52', '215', '617']

# Compare ids as strings so the filter works whether `id` was loaded as text or
# as integers; a string-vs-int `!=` comparison would silently keep every row.
df_eeg = df_eeg[~df_eeg['id'].astype(str).isin(excluded_ids)]

Verify data wrangling

In [6]:
# Sanity check: table dimensions and the first rows after reordering and exclusion.
eeg_shape = df_eeg.shape
print(eeg_shape)
df_eeg.head()

(7296, 5)


Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power
0,134,Delta,1.0-4.0Hz,FP1,15.565495
1,134,Delta,1.0-4.0Hz,FP2,13.945462
2,134,Delta,1.0-4.0Hz,F7,10.874886
3,134,Delta,1.0-4.0Hz,F3,13.581803
4,134,Delta,1.0-4.0Hz,Fz,15.644595


In [7]:
# Sort the rows by participant id.
# NOTE(review): `id` still holds strings at this point (it is cast to int further
# down), so this ordering is lexicographic ('10' sorts before '2') — confirm that
# this is acceptable.
df_eeg = df_eeg.sort_values(by=['id'])

In [8]:
# Inspect the column dtypes to see which columns still need casting.
df_eeg.dtypes

id                   object
brain_oscillation    object
freq_band            object
electrode            object
fft_abs_power        object
dtype: object

In [9]:
# Number of distinct participants, then the number of rows each one contributes.
participant_ids = df_eeg['id']
print('# unique ids: {}'.format(len(participant_ids.unique())))
print(participant_ids.value_counts())

# unique ids: 96
134    76
23     76
400    76
413    76
403    76
       ..
103    76
207    76
48     76
49     76
80     76
Name: id, Length: 96, dtype: int64


In [10]:
# Row counts per level of each categorical column, to check the design is balanced.
for column in ('electrode', 'brain_oscillation', 'freq_band'):
    print(df_eeg[column].value_counts())

C3     384
Fz     384
P4     384
F7     384
F3     384
T3     384
T5     384
O1     384
F8     384
FP1    384
F4     384
C4     384
Cz     384
O2     384
FP2    384
T4     384
Pz     384
T6     384
P3     384
Name: electrode, dtype: int64
Theta    1824
Delta    1824
Alpha    1824
Beta     1824
Name: brain_oscillation, dtype: int64
8.0-12.0Hz     1824
12.0-25.0Hz    1824
1.0-4.0Hz      1824
4.0-8.0Hz      1824
Name: freq_band, dtype: int64


### Merge dataframes (Neuropsy data with df (eeg))

#### We now need to import the Neuropsydata

In [12]:
# Load the Neuropsy scores; cells containing "." are read as missing values (NaN).
neuropsy_path = "Neuropsy.xlsx"
df_neuropsy = pd.read_excel(neuropsy_path, na_values=".")
print(df_neuropsy.shape)
df_neuropsy.head()

(100, 13)


Unnamed: 0,ID,Age,Gender,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,86.0,66.0,112.0,94.0,90.0,100.0
4,10,18,1,,,,,98.0,103.0,93.0,92.0,100.0,85.0


Then remove the participants with missing Neuropsy data (10, 18, 52 and 215) by dropping every row that contains at least one missing value.

In [13]:
# Drop every participant row that has at least one missing score
# (dropna defaults: axis=0, how='any').
df_neuropsy = df_neuropsy.dropna()

Verify it worked

In [14]:
# Shape after dropping the incomplete rows.
print(df_neuropsy.shape)

(96, 13)


Rename ID variable

In [15]:
# Rename the key column so it matches the `id` column of df_eeg for the merge.
# Reassigning (rather than inplace=True) avoids mutating the frame in place,
# so the cell can safely be re-run.
df_neuropsy = df_neuropsy.rename(columns={'ID': 'id'})
df_neuropsy.head(5)

Unnamed: 0,id,Age,Gender,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,86.0,66.0,112.0,94.0,90.0,100.0
5,11,21,1,16.0,26.0,13.0,10.0,45.0,33.0,78.0,69.0,51.0,94.0


#### Now let's merge the two dataframes together

In [16]:
# Cast the EEG ids to int so they can be matched against the numeric ids of
# df_neuropsy in the merge. .assign() returns a new frame, which avoids a
# SettingWithCopyWarning when df_eeg is itself a filtered view of an earlier frame.
df_eeg = df_eeg.assign(id=df_eeg['id'].astype(int))

In [17]:
# Attach each participant's Neuropsy scores to every one of their EEG rows.
# validate='many_to_one' raises if an id appears more than once in df_neuropsy,
# which would otherwise silently duplicate EEG rows.
df_full = pd.merge(left=df_eeg, right=df_neuropsy, on='id', how='inner', validate='many_to_one')

# An inner join drops EEG rows whose id has no Neuropsy match; fail loudly if that happens.
assert len(df_full) == len(df_eeg), 'some EEG rows have no matching Neuropsy participant'

In [18]:
# Preview the merged table.
df_full.head()

Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power,Age,Gender,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,Delta,1.0-4.0Hz,FP1,15.376173,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
1,1,Theta,4.0-8.0Hz,F3,3.845777,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
2,1,Theta,4.0-8.0Hz,Fz,4.259641,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
3,1,Theta,4.0-8.0Hz,F4,3.55283,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
4,1,Theta,4.0-8.0Hz,F8,2.064215,21,1,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0


In [19]:
# Order the merged rows by participant id (numeric now that `id` is an int column).
df_full = df_full.sort_values(by='id')

In [20]:
# Demographics first, then the EEG measurement, then the questionnaire/test scores.
# Note that `freq_band` is not in the list, so it is dropped from df_full here.
full_column_order = [
    'id', 'Age', 'Gender',
    'brain_oscillation', 'electrode', 'fft_abs_power',
    'cIM', 'cHR', 'cIE', 'cSC',
    'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis',
]
df_full = df_full[full_column_order]
df_full.head()

Unnamed: 0,id,Age,Gender,brain_oscillation,electrode,fft_abs_power,cIM,cHR,cIE,cSC,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,Delta,FP1,15.376173,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
54,1,21,1,Delta,F7,7.968847,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
53,1,21,1,Beta,FP2,13.148877,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
52,1,21,1,Beta,T6,4.664137,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0
51,1,21,1,Beta,P4,6.975211,17.0,31.0,29.0,9.0,90.0,91.0,92.0,94.0,80.0,110.0


Adjust data types in dataframe

In [21]:
# Summarise dtypes and non-null counts to spot columns that still need casting.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7296 entries, 0 to 5851
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 7296 non-null   int64  
 1   Age                7296 non-null   int64  
 2   Gender             7296 non-null   int64  
 3   brain_oscillation  7296 non-null   object 
 4   electrode          7296 non-null   object 
 5   fft_abs_power      7296 non-null   object 
 6   cIM                7296 non-null   float64
 7   cHR                7296 non-null   float64
 8   cIE                7296 non-null   float64
 9   cSC                7296 non-null   float64
 10  Aqtot              7296 non-null   float64
 11  Aqaudi             7296 non-null   float64
 12  Aqvis              7296 non-null   float64
 13  RCQtot             7296 non-null   float64
 14  RCQaudi            7296 non-null   float64
 15  RCQvis             7296 non-null   float64
dtypes: float64(10), int64(3)

In [22]:
# fft_abs_power is still stored as object (see df_full.info() above); cast it to
# float so it can be used in numeric summaries. .assign() builds a new frame
# instead of writing into a column-sliced one, which avoids a SettingWithCopyWarning.
df_full = df_full.assign(fft_abs_power=df_full['fft_abs_power'].astype(float))
print(df_full.dtypes)

id                     int64
Age                    int64
Gender                 int64
brain_oscillation     object
electrode             object
fft_abs_power        float64
cIM                  float64
cHR                  float64
cIE                  float64
cSC                  float64
Aqtot                float64
Aqaudi               float64
Aqvis                float64
RCQtot               float64
RCQaudi              float64
RCQvis               float64
dtype: object


In [23]:
# Keep only the EEG-related columns (plus id and Gender); this frame is exported later.
eeg_export_columns = ['id', 'Gender', 'brain_oscillation', 'electrode', 'fft_abs_power']
df_full_eeg = df_full[eeg_export_columns]
df_full_eeg

Unnamed: 0,id,Gender,brain_oscillation,electrode,fft_abs_power
0,1,1,Delta,FP1,15.376173
54,1,1,Delta,F7,7.968847
53,1,1,Beta,FP2,13.148877
52,1,1,Beta,T6,4.664137
51,1,1,Beta,P4,6.975211
...,...,...,...,...,...
5798,702,1,Delta,Fz,20.934284
5797,702,1,Delta,F3,17.471239
5796,702,1,Delta,F7,11.253618
5803,702,1,Delta,Cz,23.281091


### Descriptive statistics according to gender (male and female)

Clean dataset and keep variables of interest only

In [24]:
# Variables used for the descriptive statistics: the grouping keys, the EEG power
# and the questionnaire/test scores.
analysis_columns = [
    'Gender', 'brain_oscillation', 'fft_abs_power',
    'cIM', 'cHR', 'cIE', 'cSC',
    'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis',
]
df_analysis = df_full[analysis_columns]
print(df_analysis.head())

    Gender brain_oscillation  fft_abs_power   cIM   cHR   cIE  cSC  Aqtot  \
0        1             Delta      15.376173  17.0  31.0  29.0  9.0   90.0   
54       1             Delta       7.968847  17.0  31.0  29.0  9.0   90.0   
53       1              Beta      13.148877  17.0  31.0  29.0  9.0   90.0   
52       1              Beta       4.664137  17.0  31.0  29.0  9.0   90.0   
51       1              Beta       6.975211  17.0  31.0  29.0  9.0   90.0   

    Aqaudi  Aqvis  RCQtot  RCQaudi  RCQvis  
0     91.0   92.0    94.0     80.0   110.0  
54    91.0   92.0    94.0     80.0   110.0  
53    91.0   92.0    94.0     80.0   110.0  
52    91.0   92.0    94.0     80.0   110.0  
51    91.0   92.0    94.0     80.0   110.0  


### Descriptive statistics by gender and brain oscillation

In [25]:
# Summary statistics for every numeric column, per gender and brain oscillation.
# NOTE(review): df_analysis has one row per EEG measurement, so each participant's
# questionnaire scores (cIM ... RCQvis) are repeated on every one of their rows.
# Their `count` and `std` below are therefore computed on repeated values —
# confirm whether per-participant statistics from df_neuropsy are intended instead.
df_analysis.groupby(['Gender', 'brain_oscillation']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,cIM,cIM,...,RCQaudi,RCQaudi,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Gender,brain_oscillation,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,Alpha,1083.0,12.443215,16.468519,0.83239,4.035082,7.087085,13.598982,181.820101,1083.0,22.701754,...,90.0,122.0,1083.0,74.614035,24.253945,33.0,56.0,67.0,94.0,122.0
1,Beta,1083.0,11.31365,10.134623,1.683068,5.653797,8.779399,13.54769,120.919824,1083.0,22.701754,...,90.0,122.0,1083.0,74.614035,24.253945,33.0,56.0,67.0,94.0,122.0
1,Delta,1083.0,13.76658,6.799273,2.72291,9.085052,12.565857,17.10716,83.603697,1083.0,22.701754,...,90.0,122.0,1083.0,74.614035,24.253945,33.0,56.0,67.0,94.0,122.0
1,Theta,1083.0,8.9696,5.476438,1.270426,5.142222,7.654293,11.070401,35.778731,1083.0,22.701754,...,90.0,122.0,1083.0,74.614035,24.253945,33.0,56.0,67.0,94.0,122.0
2,Alpha,741.0,15.069467,23.046294,0.818976,3.66443,7.431509,16.513614,258.114724,741.0,20.589744,...,97.0,132.0,741.0,72.153846,24.721652,8.0,55.0,75.0,93.0,111.0
2,Beta,741.0,7.966775,5.095113,1.319401,4.645349,6.663302,9.705918,41.659246,741.0,20.589744,...,97.0,132.0,741.0,72.153846,24.721652,8.0,55.0,75.0,93.0,111.0
2,Delta,741.0,12.187028,7.544434,2.363218,7.243324,10.514366,14.992159,78.432796,741.0,20.589744,...,97.0,132.0,741.0,72.153846,24.721652,8.0,55.0,75.0,93.0,111.0
2,Theta,741.0,9.037577,8.017893,1.348965,4.231535,6.75769,10.879996,75.112935,741.0,20.589744,...,97.0,132.0,741.0,72.153846,24.721652,8.0,55.0,75.0,93.0,111.0


#### Create two different df for behavioral data and connors

In [29]:
# Split the per-participant Neuropsy table into two frames that share id and Gender:
# the c* scores (presumably the Conners subscales — confirm) and the
# Aq*/RCQ* scores, which the notebook calls the behavioral data.
connors_columns = ['id', 'Gender', 'cIM', 'cHR', 'cIE', 'cSC']
behavioral_columns = ['id', 'Gender', 'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis']

df_connors = df_neuropsy[connors_columns]
df_behavioral = df_neuropsy[behavioral_columns]

# The bare `df_connors` expression that used to sit mid-cell displayed nothing
# (only a cell's last expression is rendered), so it was removed.
print(df_behavioral.head())

   id  Gender  Aqtot  Aqaudi  Aqvis  RCQtot  RCQaudi  RCQvis
0   1       1   90.0    91.0   92.0    94.0     80.0   110.0
1   3       1   27.0    34.0   41.0    25.0     31.0    38.0
2   4       1   93.0    89.0   96.0    90.0     92.0    90.0
3   7       1   86.0    66.0  112.0    94.0     90.0   100.0
5  11       1   45.0    33.0   78.0    69.0     51.0    94.0


#### Export as csv 

In [27]:
# to_csv does not create missing folders, so make sure the output directory exists;
# otherwise a fresh checkout fails here.
os.makedirs('Data', exist_ok=True)

# File names (no extension) are kept as-is because other steps may read them.
# The DataFrame index is written as the first, unnamed column (to_csv default).
df_connors.to_csv('Data/df_connors')
df_behavioral.to_csv('Data/df_behavioral')

#### Export df_eeg into csv format for SNF analysis

In [28]:
# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs('Data', exist_ok=True)

# File name (no extension) is kept as-is for the downstream SNF analysis.
# The DataFrame index is written as the first, unnamed column (to_csv default).
df_full_eeg.to_csv('Data/df_eeg')