## Data wrangling

In [1]:
import pandas as pd
import numpy as np
import glob
import preprocessing as pre
import seaborn as sn
import matplotlib.pyplot as plt




%matplotlib inline

First we need to extract the data from the Excel files. To build the eventual for loop, we first need to try extracting the data from a single file.

In [2]:
# Build the EEG DataFrame with the project-local `preprocessing` module (imported as `pre`).
# NOTE(review): presumably this reads every Excel file and stacks the results into
# one frame — confirm in preprocessing.py.
df_eeg = pre.process_all_excel_files()

Reorder columns of df

In [3]:
# Keep the EEG columns in a fixed order; indexing with a list raises KeyError
# if the frame does not contain them.
eeg_columns = ['id', 'brain_oscillation', 'freq_band', 'electrode', 'fft_abs_power']
df_eeg = df_eeg[eeg_columns]

KeyError: "None of [Index(['id', 'brain_oscillation', 'freq_band', 'electrode', 'fft_abs_power'], dtype='object')] are in the [columns]"

Remove participants (10, 18, 52 and 215) because of missing Neuropsy data

In [None]:
# Drop the four excluded participants; ids are compared as strings at this stage.
excluded_ids = ['10', '18', '52', '215']
df_eeg = df_eeg[~df_eeg['id'].isin(excluded_ids)]

Verify data wrangling

In [None]:
# Sanity check: (rows, columns) first, then the first five rows (head() defaults to 5).
print(df_eeg.shape)
df_eeg.head()

In [None]:
# Show the dtype of every column of the EEG frame.
df_eeg.dtypes

In [None]:
# Number of distinct participant ids (NaN counted, as unique() would), then rows per id.
id_column = df_eeg['id']
n_unique_ids = id_column.nunique(dropna=False)
print('# unique ids: {}'.format(n_unique_ids))
print(id_column.value_counts())

In [None]:
# Row counts per category for each of the three grouping columns.
for column in ('electrode', 'brain_oscillation', 'freq_band'):
    print(df_eeg[column].value_counts())

### Merge dataframes (Neuropsy data with df (eeg))

We now need to import the Neuropsy data

In [None]:
# Cells containing "." are read as missing values (NaN).
neuropsy_path = "Neuropsy.xlsx"
df_neuropsy = pd.read_excel(neuropsy_path, na_values=".")
print(df_neuropsy.shape)
df_neuropsy.head()

Then remove the participants with missing Neuropsy data (10, 18, 52 and 215) by dropping every row that contains at least one missing value

In [None]:
# dropna() defaults to axis=0 / how='any': a row is removed as soon as one value is missing.
df_neuropsy = df_neuropsy.dropna()

Verify it worked

In [None]:
# Shape after dropping rows, to compare with the shape printed right after loading the file.
print(df_neuropsy.shape)

Rename ID variable

In [None]:
# Rename 'ID' to 'id' so the key column has the same name as in df_eeg for the merge.
df_neuropsy = df_neuropsy.rename(columns={'ID': 'id'})
df_neuropsy.head()

#### Now let's merge the two dataframes together

In [None]:
 #change dtype of id column from df
# df_eeg comes out of a boolean filter, so writing a column into it with
# df_eeg['id'] = ... can emit SettingWithCopyWarning. assign() builds a new frame
# with the converted column in the same position and avoids the chained assignment.
# The ids are strings up to here; the merge key in df_neuropsy is expected to be
# an integer — TODO confirm df_neuropsy['id'] dtype.
df_eeg = df_eeg.assign(id=df_eeg['id'].astype(int))

In [None]:
# Join EEG and Neuropsy rows on the shared 'id' key (merge defaults to an inner join).
df_full = df_eeg.merge(df_neuropsy, on='id')

In [None]:
# Preview the first rows of the merged frame.
df_full.head()

In [None]:
# Order the merged rows by participant id.
df_full = df_full.sort_values('id')

In [None]:
# Reorder columns: identifiers and demographics first, then EEG, then Neuropsy scores.
# 'freq_band' is not part of this list, so it is no longer in df_full after this cell.
full_columns = [
    'id', 'Age', 'Gender',
    'brain_oscillation', 'electrode', 'fft_abs_power',
    'cIM', 'cHR', 'cIE', 'cSC',
    'Aqtot', 'Aqaudi', 'Aqvis',
    'RCQtot', 'RCQaudi', 'RCQvis',
]
df_full = df_full[full_columns]
df_full.head()

Adjust data types in dataframe

In [None]:
# Print the summary of the merged frame (columns, non-null counts, dtypes).
df_full.info()

In [None]:
# Convert the power values to float.
# df_full was produced by a column selection, so df_full['fft_abs_power'] = ...
# can emit SettingWithCopyWarning. assign() returns a new frame with the
# converted column in place and avoids the chained assignment.
df_full = df_full.assign(fft_abs_power=df_full['fft_abs_power'].astype(float))
print(df_full.dtypes)

## Descriptive statistics

#### Check for missing values

In [None]:
# First line: is any cell missing? Second line: how many cells are missing in total?
missing_mask = df_full.isnull().values
print(missing_mask.any())
print(missing_mask.sum())

### Descriptive statistics according to gender (male and female)

Clean dataset and keep variables of interest only

In [None]:
# Analysis frame: grouping variables, EEG power and the Neuropsy scores
# (id and Age are left out).
analysis_columns = [
    'Gender', 'electrode', 'brain_oscillation', 'fft_abs_power',
    'cIM', 'cHR', 'cIE', 'cSC',
    'Aqtot', 'Aqaudi', 'Aqvis',
    'RCQtot', 'RCQaudi', 'RCQvis',
]
df_analysis = df_full[analysis_columns]
print(df_analysis.head())

### Descriptive statistics by gender and brain oscillation

In [None]:
# describe() statistics for every Gender x brain_oscillation group.
by_gender_and_band = df_analysis.groupby(['Gender', 'brain_oscillation'])
by_gender_and_band.describe()

### Correlation matrix for each freq band

#### Alpha

In [None]:
### Not adjusted

# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# The frame is restricted to numeric dtypes first: recent pandas versions raise
# when corr() meets text columns instead of silently skipping them, and
# Gender / electrode / brain_oscillation may be stored as text here.
# NOTE(review): this section is titled "Alpha", but no band filter is applied —
# the matrix is computed over every row of df_analysis. Confirm whether a
# filter on brain_oscillation was intended.
numeric_columns = df_analysis.select_dtypes(include='number')
heatmap_alpha = numeric_columns.corr()
sn.heatmap(heatmap_alpha, annot=True)
plt.show()

T-tests

In [None]:
import numpy as np
import scipy.stats as stats

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from pylab import rcParams
from scipy.stats import f_oneway
from scipy.stats import ttest_ind

In [None]:
#Might be useful at some point?
# df_alpha, df_beta, df_delta and df_theta are not created by any earlier cell,
# so a fresh "Restart & Run All" stopped here with a NameError. They are rebuilt
# from df_analysis below.
# NOTE(review): assumes brain_oscillation holds the labels 'alpha', 'beta',
# 'delta' and 'theta' (matched case-insensitively) — confirm against the
# value_counts() output printed earlier in the notebook.
def _select_band(band):
    """Return the rows of df_analysis recorded for one oscillation band.

    Parameters
    ----------
    band : str
        Lower-case band label to look up in the brain_oscillation column.

    Returns
    -------
    pandas.DataFrame
        The rows of df_analysis whose label matches ``band``.

    Raises
    ------
    KeyError
        If no row carries that label, so a wrong label fails loudly instead
        of handing empty samples to the statistical tests.
    """
    labels = df_analysis['brain_oscillation'].astype(str).str.strip().str.lower()
    rows = df_analysis[labels == band]
    if rows.empty:
        raise KeyError(
            "no rows with brain_oscillation == {!r}; available labels: {}".format(
                band, sorted(labels.unique())
            )
        )
    return rows


df_alpha = _select_band('alpha')
df_beta = _select_band('beta')
df_delta = _select_band('delta')
df_theta = _select_band('theta')

# Power series per band, used by the t-tests and the ANOVA in the next cells.
df_alpha_fft = df_alpha['fft_abs_power']
df_beta_fft = df_beta['fft_abs_power']
df_delta_fft = df_delta['fft_abs_power']
df_theta_fft = df_theta['fft_abs_power']
#df_alpha_fft


In [None]:
from scipy import stats

# Related-samples t-test, alpha vs. beta power.
# NOTE(review): the two series are compared element by element, so they must have
# the same length and row order — confirm the rows are aligned across bands.
stats.ttest_rel(a=df_alpha_fft, b=df_beta_fft)


In [None]:
# Related-samples t-test, alpha vs. delta power.
stats.ttest_rel(a=df_alpha_fft, b=df_delta_fft)

In [None]:
# Related-samples t-test, alpha vs. theta power.
stats.ttest_rel(a=df_alpha_fft, b=df_theta_fft)

In [None]:
# One-way ANOVA across the four per-band power series.
band_samples = [df_alpha_fft, df_beta_fft, df_delta_fft, df_theta_fft]
stats.f_oneway(*band_samples)

In [None]:
#https://benalexkeen.com/comparative-statistics-in-python-using-scipy/
#Source for the 4 previous cells.
#However, these tests still need a correction for multiple comparisons.

In [None]:
#DO NOT RUN, IT TAKES A LONG TIME!
#Permutation-based corrections are nice, BUT since they take very long to run,
#we could focus on Bonferroni corrections and keep the permutations
#for the machine learning, where we have no choice but to use them.
#import numpy as np
#from mlxtend.evaluate import permutation_test

#x = df_alpha_fft
#y = df_beta_fft

#print('Observed pearson R: %.2f' % np.corrcoef(x, y)[1][0])


#p_value = permutation_test(x, y,
#                           method='exact',
#                           func=lambda x, y: np.corrcoef(x, y)[1][0],
 #                          seed=0)
#print('P value: %.2f' % p_value)

In [None]:
import pandas
from statsmodels.formula.api import ols

data = df_analysis

# Ordinary least squares: FFT absolute power explained by gender, the cIM score
# and the oscillation band.
model = ols('fft_abs_power ~ Gender + cIM + brain_oscillation', data).fit()
print(model.summary())

# Here, we don't need to define a contrast, as we are testing a single
# coefficient of our model, and not a combination of coefficients.
# However, defining a contrast, which would then be a 'unit contrast',
# will give us the same results
#
# f_test() has a required hypothesis argument, so the bare call f_test() raised
# a TypeError. The hypothesis is given as a string naming the coefficient, which
# keeps working whatever the number of dummy columns created for the
# categorical terms.
# NOTE(review): testing the cIM coefficient is a choice made here — replace it
# with the hypothesis you actually intend. Assumes cIM is numeric so that its
# coefficient is named 'cIM' — confirm with model.params.index.
print(model.f_test('cIM = 0'))

In [None]:
from statsmodels.formula.api import ols

# Reduced model: power explained by the oscillation band alone.
band_formula = "fft_abs_power ~ brain_oscillation"
model = ols(band_formula, data=df_analysis).fit()
print(model.summary())