## Data wrangling

In [1]:
import pandas as pd
import numpy as np
import glob
import preprocessing as pre
import seaborn as sn
import matplotlib.pyplot as plt

%matplotlib inline

First, we need to extract the data from the Excel files. To build the eventual `for` loop over all files, we first need to try extracting the data from a single file.

In [2]:
# Show the current working directory; the relative paths used in this notebook
# ("Neuropsy.xlsx", "Data/...") are resolved against it.
pwd

'/home/bea/bhs2020/Projet_BHS2020/SNF_ADHDsubtypes_project'

In [3]:
# Load the EEG measurements through the project's preprocessing helper.
# NOTE(review): presumably this reads every participant's Excel file and stacks the
# results into one long dataframe — confirm in the preprocessing module.
df_eeg = pre.process_all_excel_files()
# Display the resulting frame.
df_eeg

Unnamed: 0,electrode,brain_oscillation,fft_abs_power,freq_band,id
0,FP1,Delta,15.565495,1.0-4.0Hz,134
1,FP2,Delta,13.945462,1.0-4.0Hz,134
2,F7,Delta,10.874886,1.0-4.0Hz,134
3,F3,Delta,13.581803,1.0-4.0Hz,134
4,Fz,Delta,15.644595,1.0-4.0Hz,134
...,...,...,...,...,...
21104,Pz,Beta2,1.224356,15.0-18.0Hz,87
21105,P4,Beta2,1.139701,15.0-18.0Hz,87
21106,T6,Beta2,1.157928,15.0-18.0Hz,87
21107,O1,Beta2,2.315535,15.0-18.0Hz,87


Reorder the columns of `df_eeg`

In [4]:
# Put the participant id first, followed by the band descriptors, the electrode
# and finally the measured power.
eeg_column_order = ['id', 'brain_oscillation', 'freq_band', 'electrode', 'fft_abs_power']
df_eeg = df_eeg[eeg_column_order]

Remove participants 10, 18, 52, 215 and 617 because of missing Neuropsy data

In [5]:
# Drop the participants that have no usable Neuropsy data.
# The ids are compared as strings so the filter works whatever dtype the id column
# has: comparing an integer column against '10' would silently keep every row.
excluded_ids = ['10', '18', '52', '215', '617']
df_eeg = df_eeg[~df_eeg['id'].astype(str).isin(excluded_ids)]

Verify data wrangling

In [6]:
# Sanity check: dimensions after filtering, then a preview of the first rows.
row_count, column_count = df_eeg.shape
print((row_count, column_count))
df_eeg.head()

(20064, 5)


Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power
0,134,Delta,1.0-4.0Hz,FP1,15.565495
1,134,Delta,1.0-4.0Hz,FP2,13.945462
2,134,Delta,1.0-4.0Hz,F7,10.874886
3,134,Delta,1.0-4.0Hz,F3,13.581803
4,134,Delta,1.0-4.0Hz,Fz,15.644595


In [7]:
# Order the rows by participant id.
# NOTE(review): the id column is still object dtype at this point, so if the ids are
# strings the order is lexicographic rather than numeric.
df_eeg = df_eeg.sort_values('id')

In [8]:
# Inspect the dtype of every column of the EEG frame.
df_eeg.dtypes

id                   object
brain_oscillation    object
freq_band            object
electrode            object
fft_abs_power        object
dtype: object

In [9]:
# Number of distinct participants, then the number of rows each one contributes.
n_unique_ids = len(df_eeg['id'].unique())
print('# unique ids: {}'.format(n_unique_ids))
print(df_eeg['id'].value_counts())

# unique ids: 96
48     209
381    209
22     209
76     209
34     209
      ... 
208    209
71     209
17     209
41     209
206    209
Name: id, Length: 96, dtype: int64


In [10]:
# Row counts per electrode, per oscillation name and per frequency band, in that order.
for column in ('electrode', 'brain_oscillation', 'freq_band'):
    print(df_eeg[column].value_counts())

F3     1056
Pz     1056
F8     1056
O1     1056
O2     1056
FP1    1056
Cz     1056
T4     1056
F4     1056
T5     1056
P4     1056
P3     1056
T3     1056
C4     1056
FP2    1056
F7     1056
C3     1056
T6     1056
Fz     1056
Name: electrode, dtype: int64
Beta2        1824
Alpha1       1824
Gamma        1824
Beta1        1824
Alpha2       1824
HighBeta     1824
Delta        1824
Theta        1824
Beta         1824
Alpha        1824
HighGamma    1824
Name: brain_oscillation, dtype: int64
12.0-25.0Hz    1824
4.0-8.0Hz      1824
12.0-15.0Hz    1824
8.0-12.0Hz     1824
8.0-10.0Hz     1824
1.0-4.0Hz      1824
15.0-18.0Hz    1824
40.0-50.0Hz    1824
30.0-40.0Hz    1824
25.0-30.0Hz    1824
10.0-12.0Hz    1824
Name: freq_band, dtype: int64


### Merge dataframes (Neuropsy data with df (eeg))

#### We now need to import the Neuropsy data

In [11]:
# Load the Neuropsy sheet; cells containing "." are read as missing values.
df_neuropsy = pd.read_excel(
    "Neuropsy.xlsx",
    na_values=".",
)
# Dimensions of the raw table, followed by a preview of its first rows.
print(df_neuropsy.shape)
df_neuropsy.head()

(100, 15)


Unnamed: 0,ID,Age,Gender,cIM,cHR,cIE,cSC,inat,hyper,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,8.0,5.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,23.0,11.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,19.0,3.0,86.0,66.0,112.0,94.0,90.0,100.0
4,10,18,1,,,,,,,98.0,103.0,93.0,92.0,100.0,85.0


Then remove participants 10, 18, 52 and 215 because of missing Neuropsy data (every row that contains a missing value is dropped)

In [12]:
# Keep only the participants whose row has no missing value
# (dropna defaults: drop rows, as soon as any value is missing).
df_neuropsy = df_neuropsy.dropna()

Verify it worked

In [13]:
# Dimensions of the Neuropsy table after removing incomplete rows.
rows_and_columns = df_neuropsy.shape
print(rows_and_columns)

(96, 15)


Rename ID variable

In [14]:
# Rename the participant column to 'id' so it matches the merge key of the EEG frame.
# The result is rebound instead of using inplace=True, so the cell does not mutate
# a frame behind the reader's back.
df_neuropsy = df_neuropsy.rename(columns={'ID': 'id'})
df_neuropsy.head(5)

Unnamed: 0,id,Age,Gender,cIM,cHR,cIE,cSC,inat,hyper,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,8.0,5.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,23.0,11.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,19.0,3.0,86.0,66.0,112.0,94.0,90.0,100.0
5,11,21,1,16.0,26.0,13.0,10.0,17.0,13.0,45.0,33.0,78.0,69.0,51.0,94.0


#### Now let's merge the two dataframes together

In [15]:
# Cast the EEG participant ids to integers so both frames share the same key dtype.
df_eeg = df_eeg.astype({'id': int})

In [16]:
# Attach each participant's Neuropsy scores to every one of their EEG rows.
# how='inner' (the default, spelled out) keeps only ids present in both frames.
# validate='many_to_one' makes pandas raise a MergeError if an id occurs more than once
# in df_neuropsy, which would otherwise silently multiply the EEG rows.
df_full = pd.merge(
    left=df_eeg,
    right=df_neuropsy,
    on='id',
    how='inner',
    validate='many_to_one',
)

In [17]:
# Preview the first rows of the merged frame.
df_full.head(5)

Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power,Age,Gender,cIM,cHR,cIE,cSC,inat,hyper,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,Delta,1.0-4.0Hz,FP1,15.376173,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
1,1,HighBeta,25.0-30.0Hz,FP1,7.356346,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
2,1,Beta,12.0-25.0Hz,O2,5.587346,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
3,1,Beta,12.0-25.0Hz,O1,5.659504,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
4,1,Beta,12.0-25.0Hz,T6,4.664137,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0


In [18]:
# Order the merged rows by participant id.
df_full = df_full.sort_values('id')

### Subtype labelling according to clinical standard

Create a column for categorization of ADHD subtypes

In [19]:
# Label every row with an ADHD subtype: the project helper receives the row's
# 'inat' and 'hyper' values and its return value is stored in 'subtype'.
df_full['subtype'] = df_full.apply(
    lambda row: pre.categorize_subtypes(row['inat'], row['hyper']),
    axis=1,
)

df_full.head()

Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power,Age,Gender,cIM,cHR,cIE,cSC,inat,hyper,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis,subtype
0,1,Delta,1.0-4.0Hz,FP1,15.376173,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed
133,1,Beta2,15.0-18.0Hz,Pz,1.87092,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed
134,1,Beta2,15.0-18.0Hz,P3,1.69015,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed
135,1,Beta2,15.0-18.0Hz,T5,1.13845,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed
136,1,Beta2,15.0-18.0Hz,T4,2.171898,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed


In [20]:
# Encode the subtypes into numerical labels for the future SNF analysis / machine learning.
# LabelEncoder numbers the classes in sorted label order, which gives here:
#   hyperactive subtype = 0
#   inattentive subtype = 1
#   mixed subtype       = 2
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then apply it to the same column.
lb_make = LabelEncoder().fit(df_full["subtype"])
df_full["adhdtype"] = lb_make.transform(df_full["subtype"])
df_full[["subtype", "adhdtype"]].head(11)


Unnamed: 0,subtype,adhdtype
0,mixed,2
133,mixed,2
134,mixed,2
135,mixed,2
136,mixed,2
137,mixed,2
138,mixed,2
139,mixed,2
140,mixed,2
141,mixed,2


In [21]:
# Reorder the columns: participant info, subtype labels, EEG variables, then the
# remaining Neuropsy measures. 'freq_band' is not listed and is therefore dropped here.
# .copy() makes df_full an independent frame instead of a selection of the previous one,
# so later column assignments cannot run into pandas' SettingWithCopy checks.
df_full = df_full[['id','Age', 'Gender','inat','hyper',"subtype",'adhdtype', 'brain_oscillation','electrode','fft_abs_power', 'cIM', 'cHR', 'cIE', 'cSC', 'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis']].copy()
# Rows per encoded subtype. These are EEG rows, not participants: each participant
# appears on many rows of this frame.
df_full['adhdtype'].value_counts()

1    10032
2     9614
0      418
Name: adhdtype, dtype: int64

Adjust data types in dataframe

In [22]:
# Summarize the merged frame: index, column names, non-null counts and dtypes.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20064 entries, 0 to 16092
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 20064 non-null  int64  
 1   Age                20064 non-null  int64  
 2   Gender             20064 non-null  int64  
 3   inat               20064 non-null  float64
 4   hyper              20064 non-null  float64
 5   subtype            20064 non-null  object 
 6   adhdtype           20064 non-null  int64  
 7   brain_oscillation  20064 non-null  object 
 8   electrode          20064 non-null  object 
 9   fft_abs_power      20064 non-null  object 
 10  cIM                20064 non-null  float64
 11  cHR                20064 non-null  float64
 12  cIE                20064 non-null  float64
 13  cSC                20064 non-null  float64
 14  Aqtot              20064 non-null  float64
 15  Aqaudi             20064 non-null  float64
 16  Aqvis              200

In [23]:
# Convert the power values to floats, then list the dtypes to confirm the conversion.
df_full = df_full.astype({'fft_abs_power': float})
print(df_full.dtypes)

id                     int64
Age                    int64
Gender                 int64
inat                 float64
hyper                float64
subtype               object
adhdtype               int64
brain_oscillation     object
electrode             object
fft_abs_power        float64
cIM                  float64
cHR                  float64
cIE                  float64
cSC                  float64
Aqtot                float64
Aqaudi               float64
Aqvis                float64
RCQtot               float64
RCQaudi              float64
RCQvis               float64
dtype: object


In [24]:
# EEG-only view of the merged data: participant id, gender, encoded subtype and the
# band / electrode / power triplet.
eeg_export_columns = ['id', 'Gender', 'adhdtype', 'brain_oscillation', 'electrode', 'fft_abs_power']
df_full_eeg = df_full[eeg_export_columns]
df_full_eeg

Unnamed: 0,id,Gender,adhdtype,brain_oscillation,electrode,fft_abs_power
0,1,1,2,Delta,FP1,15.376173
133,1,1,2,Beta2,Pz,1.870920
134,1,1,2,Beta2,P3,1.690150
135,1,1,2,Beta2,T5,1.138450
136,1,1,2,Beta2,T4,2.171898
...,...,...,...,...,...,...
15958,702,1,1,Theta,Fz,15.355240
15959,702,1,1,Alpha,P3,26.927847
15960,702,1,1,Theta,F4,12.414647
15935,702,1,1,Theta,F3,12.058871


### Descriptive statistics according to gender (male and female)

Clean dataset and keep variables of interest only

In [31]:
# Variables kept for the descriptive analysis: identifiers and subtype labels,
# the EEG variables, then the Neuropsy measures.
analysis_columns = [
    'id', 'Gender', 'subtype', 'adhdtype',
    'brain_oscillation', 'electrode', 'fft_abs_power',
    'cIM', 'cHR', 'cIE', 'cSC',
    'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis',
]
df_analysis = df_full[analysis_columns]
print(df_analysis.head())

     id  Gender subtype  adhdtype brain_oscillation electrode  fft_abs_power  \
0     1       1   mixed         2             Delta       FP1      15.376173   
133   1       1   mixed         2             Beta2        Pz       1.870920   
134   1       1   mixed         2             Beta2        P3       1.690150   
135   1       1   mixed         2             Beta2        T5       1.138450   
136   1       1   mixed         2             Beta2        T4       2.171898   

      cIM   cHR   cIE  cSC  Aqtot  Aqaudi  Aqvis  RCQtot  RCQaudi  RCQvis  
0    17.0  31.0  29.0  9.0   90.0    91.0   92.0    94.0     80.0   110.0  
133  17.0  31.0  29.0  9.0   90.0    91.0   92.0    94.0     80.0   110.0  
134  17.0  31.0  29.0  9.0   90.0    91.0   92.0    94.0     80.0   110.0  
135  17.0  31.0  29.0  9.0   90.0    91.0   92.0    94.0     80.0   110.0  
136  17.0  31.0  29.0  9.0   90.0    91.0   92.0    94.0     80.0   110.0  


### Descriptive statistics by gender, subtype and brain oscillation

In [26]:
# Summary statistics of every numeric column for each
# Gender x subtype x brain oscillation group.
# NOTE(review): the frame holds one row per EEG measurement, so participant-level
# columns (and 'id' itself) are repeated on many rows — read the counts and spreads
# with that in mind.
group_keys = ['Gender', 'subtype', 'brain_oscillation']
df_analysis.groupby(group_keys).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,id,id,id,id,id,id,id,adhdtype,adhdtype,...,RCQaudi,RCQaudi,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Gender,subtype,brain_oscillation,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
1,inat,Alpha,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,Alpha1,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,Alpha2,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,Beta,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,Beta1,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,Beta2,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,Delta,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,Gamma,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,HighBeta,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,inat,HighGamma,646.0,179.764706,185.62102,4.0,64.0,84.0,220.0,702.0,646.0,1.0,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0


#### Create separate dataframes for the connors data, the behavioral data and the labels

In [27]:
# Split the analysis frame into thematic tables, each carrying the participant id.
# NOTE(review): df_analysis has one row per EEG measurement, so each participant's
# values are repeated on many rows of these tables — confirm downstream code expects that.
participant_keys = ['id', 'Gender']
df_connors = df_analysis[participant_keys + ['cIM', 'cHR', 'cIE', 'cSC']]
df_behavioral = df_analysis[participant_keys + ['Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis']]
df_labels = df_analysis[['id', 'adhdtype']]

df_labels

Unnamed: 0,id,adhdtype
0,1,2
133,1,2
134,1,2
135,1,2
136,1,2
...,...,...
15958,702,1
15959,702,1
15960,702,1
15935,702,1


#### Export as csv 

In [28]:
# Write the three tables into the Data/ folder as CSV text
# (the target file names carry no extension).
for table, destination in (
    (df_connors, 'Data/df_connors'),
    (df_behavioral, 'Data/df_behavioral'),
    (df_labels, 'Data/labels'),
):
    table.to_csv(destination)

#### Export df_eeg into csv format for SNF analysis

In [29]:
# Write the EEG table as CSV text for the SNF analysis (file name has no extension).
df_full_eeg.to_csv(path_or_buf='Data/df_eeg')

#### Export df_analysis into csv format for visualization

In [32]:
# Write the analysis table as CSV text for the visualization step (file name has no extension).
analysis_csv_path = 'Data/df_analysis'
df_analysis.to_csv(analysis_csv_path)