## Data wrangling

In [2]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.preprocessing import LabelEncoder

import preprocessing as pre

%matplotlib inline

First we need to extract the data from the Excel files. In order to construct the eventual for loop, we'll first need to try extracting data from one file.

In [3]:
# Show the notebook's working directory; the relative paths used below ("Neuropsy.xlsx", "Data/...") are resolved against it.
pwd

'/home/bea/bhs2020/Projet_BHS2020/SNF_ADHDsubtypes_project'

In [4]:
# Build the EEG table with the project's preprocessing helper. The result has one row per
# participant id, brain oscillation (with its frequency band) and electrode.
# NOTE(review): presumably the helper reads every participant's Excel export - confirm in preprocessing.py.
df_eeg = pre.process_all_excel_files()
df_eeg

Unnamed: 0,electrode,brain_oscillation,fft_abs_power,freq_band,id
0,FP1,Delta,15.565495,1.0-4.0Hz,134
1,FP2,Delta,13.945462,1.0-4.0Hz,134
2,F7,Delta,10.874886,1.0-4.0Hz,134
3,F3,Delta,13.581803,1.0-4.0Hz,134
4,Fz,Delta,15.644595,1.0-4.0Hz,134
...,...,...,...,...,...
7671,Pz,Beta,4.780903,12.0-25.0Hz,87
7672,P4,Beta,4.276557,12.0-25.0Hz,87
7673,T6,Beta,3.603100,12.0-25.0Hz,87
7674,O1,Beta,12.159364,12.0-25.0Hz,87


Reorder columns of df

In [5]:
# Participant id first, then the band descriptors, then the electrode and its measurement.
eeg_column_order = ['id', 'brain_oscillation', 'freq_band', 'electrode', 'fft_abs_power']
df_eeg = df_eeg.loc[:, eeg_column_order]

Remove participants 10, 18, 52, 215 and 617 because of missing Neuropsy data.

In [6]:
# Participants excluded because of missing Neuropsy data. The ids are still strings at this
# point (the column is only cast to int just before the merge), so the list holds strings.
excluded_ids = ['10', '18', '52', '215', '617']
df_eeg = df_eeg[~df_eeg['id'].isin(excluded_ids)]

Verify data wrangling

In [7]:
# Sanity check after the exclusion: dimensions first, then a peek at the first rows.
remaining_shape = df_eeg.shape
print(remaining_shape)
df_eeg.head()

(7296, 5)


Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power
0,134,Delta,1.0-4.0Hz,FP1,15.565495
1,134,Delta,1.0-4.0Hz,FP2,13.945462
2,134,Delta,1.0-4.0Hz,F7,10.874886
3,134,Delta,1.0-4.0Hz,F3,13.581803
4,134,Delta,1.0-4.0Hz,Fz,15.644595


In [8]:
# Bring each participant's rows together. The ids are still strings here, so the
# ordering is lexicographic rather than numeric.
df_eeg = df_eeg.sort_values('id')

In [9]:
# Inspect the column types: every column is still `object` at this stage, including the ids
# and the power values, which are cast further down.
df_eeg.dtypes

id                   object
brain_oscillation    object
freq_band            object
electrode            object
fft_abs_power        object
dtype: object

In [10]:
# Number of distinct participants, then the number of rows each one contributes.
participant_ids = df_eeg['id']
n_unique_ids = len(participant_ids.unique())
print('# unique ids: {}'.format(n_unique_ids))
print(participant_ids.value_counts())

# unique ids: 96
49     76
702    76
62     76
413    76
87     76
       ..
20     76
19     76
01     76
64     76
201    76
Name: id, Length: 96, dtype: int64


In [11]:
# Check that every electrode, oscillation and frequency band occurs equally often.
for column in ('electrode', 'brain_oscillation', 'freq_band'):
    print(df_eeg[column].value_counts())

Pz     384
FP2    384
P3     384
T6     384
Fz     384
T3     384
Cz     384
C3     384
F4     384
F7     384
O2     384
O1     384
F8     384
P4     384
T5     384
F3     384
FP1    384
C4     384
T4     384
Name: electrode, dtype: int64
Theta    1824
Alpha    1824
Beta     1824
Delta    1824
Name: brain_oscillation, dtype: int64
1.0-4.0Hz      1824
12.0-25.0Hz    1824
8.0-12.0Hz     1824
4.0-8.0Hz      1824
Name: freq_band, dtype: int64


### Merge dataframes (Neuropsy data with df (eeg))

#### We now need to import the Neuropsydata

In [12]:
# Load the Neuropsy sheet; cells containing "." are read as missing values.
neuropsy_path = "Neuropsy.xlsx"
df_neuropsy = pd.read_excel(neuropsy_path, na_values=".")
print(df_neuropsy.shape)
df_neuropsy.head()

(100, 15)


Unnamed: 0,ID,Age,Gender,cIM,cHR,cIE,cSC,inat,hyper,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,8.0,5.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,23.0,11.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,19.0,3.0,86.0,66.0,112.0,94.0,90.0,100.0
4,10,18,1,,,,,,,98.0,103.0,93.0,92.0,100.0,85.0


Then remove the participants with missing Neuropsy data (10, 18, 52 and 215) by dropping every row that contains a missing value.

In [13]:
# Drop every participant (row) with at least one missing score; these are dropna's defaults.
df_neuropsy = df_neuropsy.dropna()

Verify it worked

In [14]:
# The row count should fall from 100 to 96 once the incomplete participants are gone.
print(df_neuropsy.shape)

(96, 15)


Rename ID variable

In [15]:
# Rename the key column so it matches `df_eeg` and the two frames can be merged on `id`.
# Re-assigning the result (rather than `inplace=True`) avoids mutating the frame behind the
# reader's back and keeps the cell a plain function of its input.
df_neuropsy = df_neuropsy.rename(columns={'ID': 'id'})
df_neuropsy.head()

Unnamed: 0,id,Age,Gender,cIM,cHR,cIE,cSC,inat,hyper,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
1,3,20,1,10.0,5.0,13.0,1.0,8.0,5.0,27.0,34.0,41.0,25.0,31.0,38.0
2,4,18,1,26.0,17.0,7.0,15.0,23.0,11.0,93.0,89.0,96.0,90.0,92.0,90.0
3,7,23,1,24.0,8.0,6.0,14.0,19.0,3.0,86.0,66.0,112.0,94.0,90.0,100.0
5,11,21,1,16.0,26.0,13.0,10.0,17.0,13.0,45.0,33.0,78.0,69.0,51.0,94.0


#### Now let's merge the two dataframes together

In [16]:
# The EEG ids are strings (e.g. '01') while the Neuropsy ids are integers; cast the EEG
# side to int so that both key columns compare equal in the merge.
df_eeg = df_eeg.astype({'id': int})

In [17]:
# Attach each participant's Neuropsy scores to every one of their EEG rows.
# - how='inner' (the default, now explicit): keep only ids present in both frames.
# - validate='many_to_one': raise if an id occurs more than once in `df_neuropsy`,
#   which would otherwise silently multiply the EEG rows.
df_full = pd.merge(
    left=df_eeg,
    right=df_neuropsy,
    on='id',
    how='inner',
    validate='many_to_one',
)

In [18]:
# Inspect the merged frame: the EEG columns are followed by the participant's Neuropsy scores.
df_full.head()

Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power,Age,Gender,cIM,cHR,cIE,cSC,inat,hyper,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis
0,1,Delta,1.0-4.0Hz,FP1,15.376173,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
1,1,Theta,4.0-8.0Hz,F3,3.845777,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
2,1,Theta,4.0-8.0Hz,Fz,4.259641,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
3,1,Theta,4.0-8.0Hz,F4,3.55283,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0
4,1,Theta,4.0-8.0Hz,F8,2.064215,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0


In [19]:
# The ids are integers by now, so this orders the participants numerically.
df_full = df_full.sort_values('id')

### Subtype labelling according to clinical standard

Create a column for categorization of ADHD subtypes

In [20]:
def _subtype_for_row(row):
    """Return the subtype label for one row from its `inat` and `hyper` scores."""
    return pre.categorize_subtypes(row['inat'], row['hyper'])


# Label every row with the subtype computed by the project helper.
df_full['subtype'] = df_full.apply(_subtype_for_row, axis=1)

df_full.head()

Unnamed: 0,id,brain_oscillation,freq_band,electrode,fft_abs_power,Age,Gender,cIM,cHR,cIE,cSC,inat,hyper,Aqtot,Aqaudi,Aqvis,RCQtot,RCQaudi,RCQvis,subtype
0,1,Delta,1.0-4.0Hz,FP1,15.376173,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed
54,1,Delta,1.0-4.0Hz,F7,7.968847,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed
53,1,Beta,12.0-25.0Hz,FP2,13.148877,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed
52,1,Beta,12.0-25.0Hz,T6,4.664137,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed
51,1,Beta,12.0-25.0Hz,P4,6.975211,21,1,17.0,31.0,29.0,9.0,18.0,20.0,90.0,91.0,92.0,94.0,80.0,110.0,mixed


In [21]:
# Encode the subtype labels as integers for the later SNF analysis / machine learning.
# LabelEncoder numbers the classes in sorted order, which gives
#   hyper (hyperactive) -> 0
#   inat  (inattentive) -> 1
#   mixed               -> 2
# as long as all three labels occur in the data.
# `LabelEncoder` is imported with the other dependencies at the top of the notebook.
lb_make = LabelEncoder()
df_full["adhdtype"] = lb_make.fit_transform(df_full["subtype"])
df_full[["subtype", "adhdtype"]].head(11)


Unnamed: 0,subtype,adhdtype
0,mixed,2
54,mixed,2
53,mixed,2
52,mixed,2
51,mixed,2
50,mixed,2
49,mixed,2
48,mixed,2
47,mixed,2
46,mixed,2


In [22]:
# Put the participant descriptors and labels first, then the EEG measure, then the scores.
# `freq_band` is not in the list, so this selection also drops it.
df_full = df_full[[
    'id', 'Age', 'Gender', 'inat', 'hyper', 'subtype', 'adhdtype',
    'brain_oscillation', 'electrode', 'fft_abs_power',
    'cIM', 'cHR', 'cIE', 'cSC',
    'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis',
]]
# Row counts per subtype (the former mid-cell `df_full.head()` was never displayed and is removed).
df_full['subtype'].value_counts()

inat     3648
mixed    3496
hyper     152
Name: subtype, dtype: int64

Adjust data types in dataframe

In [23]:
# Review dtypes and non-null counts; `fft_abs_power` is still `object` and is cast in the next cell.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7296 entries, 0 to 5851
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 7296 non-null   int64  
 1   Age                7296 non-null   int64  
 2   Gender             7296 non-null   int64  
 3   inat               7296 non-null   float64
 4   hyper              7296 non-null   float64
 5   subtype            7296 non-null   object 
 6   adhdtype           7296 non-null   int64  
 7   brain_oscillation  7296 non-null   object 
 8   electrode          7296 non-null   object 
 9   fft_abs_power      7296 non-null   object 
 10  cIM                7296 non-null   float64
 11  cHR                7296 non-null   float64
 12  cIE                7296 non-null   float64
 13  cSC                7296 non-null   float64
 14  Aqtot              7296 non-null   float64
 15  Aqaudi             7296 non-null   float64
 16  Aqvis              7296 

In [24]:
# The power values are stored as `object`; cast them to float so they can be summarised numerically.
df_full = df_full.astype({'fft_abs_power': float})
print(df_full.dtypes)

id                     int64
Age                    int64
Gender                 int64
inat                 float64
hyper                float64
subtype               object
adhdtype               int64
brain_oscillation     object
electrode             object
fft_abs_power        float64
cIM                  float64
cHR                  float64
cIE                  float64
cSC                  float64
Aqtot                float64
Aqaudi               float64
Aqvis                float64
RCQtot               float64
RCQaudi              float64
RCQvis               float64
dtype: object


In [25]:
# EEG-only view (plus id and gender) that is exported for the SNF analysis further down.
eeg_export_columns = ['id', 'Gender', 'brain_oscillation', 'electrode', 'fft_abs_power']
df_full_eeg = df_full.loc[:, eeg_export_columns]
df_full_eeg

Unnamed: 0,id,Gender,brain_oscillation,electrode,fft_abs_power
0,1,1,Delta,FP1,15.376173
54,1,1,Delta,F7,7.968847
53,1,1,Beta,FP2,13.148877
52,1,1,Beta,T6,4.664137
51,1,1,Beta,P4,6.975211
...,...,...,...,...,...
5798,702,1,Delta,Fz,20.934284
5797,702,1,Delta,F3,17.471239
5796,702,1,Delta,F7,11.253618
5803,702,1,Delta,Cz,23.281091


### Descriptive statistics according to gender (male and female)

Clean dataset and keep variables of interest only

In [26]:
# Keep only the grouping variables and the measures summarised in the next section.
df_analysis = df_full[[
    'Gender', 'adhdtype', 'brain_oscillation', 'fft_abs_power',
    'cIM', 'cHR', 'cIE', 'cSC',
    'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis',
]]
# Rich display (like the other cells) instead of print(), so the wide frame is not wrapped as text.
df_analysis.head()

    Gender  adhdtype brain_oscillation  fft_abs_power   cIM   cHR   cIE  cSC  \
0        1         2             Delta      15.376173  17.0  31.0  29.0  9.0   
54       1         2             Delta       7.968847  17.0  31.0  29.0  9.0   
53       1         2              Beta      13.148877  17.0  31.0  29.0  9.0   
52       1         2              Beta       4.664137  17.0  31.0  29.0  9.0   
51       1         2              Beta       6.975211  17.0  31.0  29.0  9.0   

    Aqtot  Aqaudi  Aqvis  RCQtot  RCQaudi  RCQvis  
0    90.0    91.0   92.0    94.0     80.0   110.0  
54   90.0    91.0   92.0    94.0     80.0   110.0  
53   90.0    91.0   92.0    94.0     80.0   110.0  
52   90.0    91.0   92.0    94.0     80.0   110.0  
51   90.0    91.0   92.0    94.0     80.0   110.0  


### Descriptive statistics by gender, subtype and brain oscillations

In [27]:
# Summary statistics for every gender x subtype x oscillation cell.
# Caveat: the questionnaire/test scores are repeated on every EEG row of a participant, so
# their `count` reflects the number of EEG rows rather than the number of participants.
grouping_keys = ['Gender', 'adhdtype', 'brain_oscillation']
df_analysis.groupby(grouping_keys).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,fft_abs_power,cIM,cIM,...,RCQaudi,RCQaudi,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis,RCQvis
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Gender,adhdtype,brain_oscillation,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
1,1,Alpha,646.0,10.259383,12.4635,0.83239,3.227184,5.853061,12.304067,105.947306,646.0,23.970588,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,1,Beta,646.0,10.068747,9.129827,1.683068,4.828373,7.685771,12.644479,92.808932,646.0,23.970588,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,1,Delta,646.0,12.652896,5.467191,2.72291,8.342831,12.123746,15.807374,34.598893,646.0,23.970588,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,1,Theta,646.0,8.515123,5.044777,1.270426,4.841011,7.372008,10.701126,35.399409,646.0,23.970588,...,94.0,113.0,646.0,76.558824,24.811515,37.0,55.0,76.5,97.0,122.0
1,2,Alpha,437.0,15.671488,20.633477,1.667285,5.401762,7.965819,16.272682,181.820101,437.0,20.826087,...,85.0,122.0,437.0,71.73913,23.135335,33.0,57.0,65.0,94.0,122.0
1,2,Beta,437.0,13.153941,11.220124,2.649552,6.941925,10.370845,15.371943,120.919824,437.0,20.826087,...,85.0,122.0,437.0,71.73913,23.135335,33.0,57.0,65.0,94.0,122.0
1,2,Delta,437.0,15.412895,8.121238,3.507888,9.749313,13.497751,18.694702,83.603697,437.0,20.826087,...,85.0,122.0,437.0,71.73913,23.135335,33.0,57.0,65.0,94.0,122.0
1,2,Theta,437.0,9.641436,6.001697,1.548818,5.521364,7.992626,11.819243,35.778731,437.0,20.826087,...,85.0,122.0,437.0,71.73913,23.135335,33.0,57.0,65.0,94.0,122.0
2,0,Alpha,38.0,29.624061,37.935936,1.946109,3.797357,10.443736,39.252719,139.567341,38.0,5.5,...,106.0,106.0,38.0,85.0,12.161081,73.0,73.0,85.0,97.0,97.0
2,0,Beta,38.0,10.384103,6.530313,3.028548,5.835936,7.208286,12.462524,25.890655,38.0,5.5,...,106.0,106.0,38.0,85.0,12.161081,73.0,73.0,85.0,97.0,97.0


#### Create separate dataframes for the Connors scores, the behavioral data, and the subtype labels

In [32]:
# Split the merged frame into the views that are exported below. Each view still has one
# row per EEG measurement, so a participant's values appear on several rows.
connors_columns = ['id', 'Gender', 'cIM', 'cHR', 'cIE', 'cSC']
behavioral_columns = ['id', 'Gender', 'Aqtot', 'Aqaudi', 'Aqvis', 'RCQtot', 'RCQaudi', 'RCQvis']
label_columns = ['id', 'adhdtype']

df_connors = df_full.loc[:, connors_columns]
df_behavioral = df_full.loc[:, behavioral_columns]
df_labels = df_full.loc[:, label_columns]

df_labels

Unnamed: 0,id,adhdtype
0,1,2
54,1,2
53,1,2
52,1,2
51,1,2
...,...,...
5798,702,1
5797,702,1
5796,702,1
5803,702,1


#### Export as csv 

In [29]:
# Write each view to the Data/ directory (which must already exist). The file names carry
# no extension and the row index is written as the first column.
for frame, destination in (
    (df_connors, 'Data/df_connors'),
    (df_behavioral, 'Data/df_behavioral'),
    (df_labels, 'Data/labels'),
):
    frame.to_csv(destination)

#### Export df_eeg into csv format for SNF analysis

In [30]:
# EEG view for the SNF analysis; written with the row index, to a file without extension.
df_full_eeg.to_csv(path_or_buf='Data/df_eeg')

#### Export df_analysis into csv format for visualization

In [31]:
# Analysis view for the visualization step; written with the row index, to a file without extension.
df_analysis.to_csv(path_or_buf='Data/df_analysis')