# Evaluation of the Predictive Capabilities of Select Brain Attributes for Sex and Age Group

Copyright 2019 Ariana Olson and Emma Price

[MIT License](https://opensource.org/licenses/MIT)

In [1]:
%matplotlib inline
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import thinkstats2
import thinkplot

import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras.models
import keras.layers

import pickle

from warnings import simplefilter
simplefilter('ignore', FutureWarning)

Using TensorFlow backend.


### NKI SBA Volume Data
The Nathan Kline Institute collected MRI scans from which they measured the "volume of cortical and subcortical gray and white matter." This dataset displayed the largest Cohen's d effect sizes between men and women across the different areas measured.

Load the NKI SBA volume data

In [20]:
# Read the NKI SBA volume workbook into a DataFrame and preview the first rows.
volume_df = pd.read_excel(io='brain_mosaic_data/NKI_SBA_Volume.xlsx')
volume_df.head(n=5)

Unnamed: 0,bio_sex,age,Left-Lateral-Ventricle,Left-Inf-Lat-Vent,Left-Cerebellum-White-Matter,Left-Cerebellum-Cortex,Left-Thalamus-Proper,Left-Caudate,Left-Putamen,Left-Pallidum,...,rh_rostralanteriorcingulate_volume,rh_rostralmiddlefrontal_volume,rh_superiorfrontal_volume,rh_superiorparietal_volume,rh_superiortemporal_volume,rh_supramarginal_volume,rh_frontalpole_volume,rh_temporalpole_volume,rh_transversetemporal_volume,rh_insula_volume
0,2,41,5443,408,14526,47120,6506,2944,4535,1828,...,1151,13009,16499,10019,9201,7703,783,1945,923,6297
1,2,52,3228,309,13065,48090,6942,2918,5000,1734,...,1308,13780,18011,11602,12166,10272,747,1921,790,5537
2,2,54,5642,95,9687,38398,6385,2892,4351,1051,...,1903,13247,17477,10010,8808,8331,1193,2043,484,6033
3,2,56,4959,231,11442,41214,6195,3604,5135,1592,...,1753,12481,17378,9682,10091,9396,802,2336,799,6159
4,2,31,6664,161,12059,44711,6613,2981,4558,1462,...,1396,11924,21013,12134,10928,10362,699,2549,911,5853


Make the column names play nicely with StatsModels -- Code from Allen Downey.

In [21]:
def clean_name(name):
    """Return a StatsModels-friendly version of a column name.

    The name is lower-cased and every hyphen becomes an underscore. The two
    ventricle names that would otherwise start with a digit
    ('3rd_ventricle', '4th_ventricle') are spelled out instead.
    """
    spelled_out = {
        '3rd_ventricle': 'third_ventricle',
        '4th_ventricle': 'fourth_ventricle',
    }
    cleaned = name.lower().replace('-', '_')
    if cleaned in spelled_out:
        return spelled_out[cleaned]
    return cleaned

In [22]:
# Run every column label through clean_name, install the cleaned labels,
# and preview the renamed frame.
columns = list(map(clean_name, volume_df.columns))
volume_df.columns = columns
volume_df.head(n=5)

Unnamed: 0,bio_sex,age,left_lateral_ventricle,left_inf_lat_vent,left_cerebellum_white_matter,left_cerebellum_cortex,left_thalamus_proper,left_caudate,left_putamen,left_pallidum,...,rh_rostralanteriorcingulate_volume,rh_rostralmiddlefrontal_volume,rh_superiorfrontal_volume,rh_superiorparietal_volume,rh_superiortemporal_volume,rh_supramarginal_volume,rh_frontalpole_volume,rh_temporalpole_volume,rh_transversetemporal_volume,rh_insula_volume
0,2,41,5443,408,14526,47120,6506,2944,4535,1828,...,1151,13009,16499,10019,9201,7703,783,1945,923,6297
1,2,52,3228,309,13065,48090,6942,2918,5000,1734,...,1308,13780,18011,11602,12166,10272,747,1921,790,5537
2,2,54,5642,95,9687,38398,6385,2892,4351,1051,...,1903,13247,17477,10010,8808,8331,1193,2043,484,6033
3,2,56,4959,231,11442,41214,6195,3604,5135,1592,...,1753,12481,17378,9682,10091,9396,802,2336,799,6159
4,2,31,6664,161,12059,44711,6613,2981,4558,1462,...,1396,11924,21013,12134,10928,10362,699,2549,911,5853


Recode the `bio_sex` variable from its original 1/2 coding to a binary 0/1 variable for StatsModels (1 becomes 0, 2 becomes 1).

In [23]:
# Recode bio_sex from 1/2 to 0/1 (1 -> 0, 2 -> 1) in a single mapping and
# assign the result back to the frame. Calling replace(..., inplace=True) on
# a column selection is a chained in-place update: under pandas Copy-on-Write
# it no longer modifies volume_df, and the FutureWarning announcing that is
# silenced at the top of this notebook.
# NOTE: run this cell once per load; re-running it on already recoded data
# would map the new 1s to 0.
volume_df['bio_sex'] = volume_df['bio_sex'].replace({1: 0, 2: 1})

In [24]:
# With bio_sex recoded to 0/1, its mean is the fraction of rows coded 1.
volume_df['bio_sex'].mean()

0.6213235294117647

In [25]:
# Number of rows for each bio_sex code, as a check on the recode above.
volume_df['bio_sex'].value_counts()

1    169
0    103
Name: bio_sex, dtype: int64

Create a total volume column by summing all of the individual volume features

In [27]:
# Collect the individual '*volume' columns. 'total_volume' is excluded
# explicitly: it also ends with 'volume', so re-running this cell would
# otherwise add the previous total into the new sum.
features = [
    name for name in volume_df.columns
    if name.endswith('volume') and name != 'total_volume'
]
# Row-wise sum of the selected volume columns.
volume_df['total_volume'] = volume_df[features].sum(axis=1)
volume_df['total_volume'].describe()

count       272.000000
mean     455371.117647
std       56890.820457
min      336198.000000
25%      416941.500000
50%      450121.000000
75%      489089.250000
max      684464.000000
Name: total_volume, dtype: float64

Normalize the volume features to the total volume by dividing each column that is a volume column by the total volume column

In [35]:
def divide_by_vol(columns):
    """Add a '<column>_norm' column to volume_df for each name in `columns`.

    Each new column holds the original column divided row-wise by
    volume_df['total_volume']. The module-level volume_df is modified
    in place; nothing is returned.
    """
    for column in columns:
        volume_df[column + '_norm'] = volume_df[column]/volume_df['total_volume']

columns = [name for name in volume_df.columns if name.endswith('volume')]
divide_by_vol(columns)
# 'total_volume' itself ends with 'volume', so the loop above also creates
# 'total_volume_norm' (total divided by itself). DataFrame.drop returns a new
# frame, so the result must be assigned back for the column to actually be
# removed from volume_df before it is saved.
volume_df = volume_df.drop(columns='total_volume_norm')
volume_df.head()

Unnamed: 0,bio_sex,age,left_lateral_ventricle,left_inf_lat_vent,left_cerebellum_white_matter,left_cerebellum_cortex,left_thalamus_proper,left_caudate,left_putamen,left_pallidum,...,rh_rostralanteriorcingulate_volume_norm,rh_rostralmiddlefrontal_volume_norm,rh_superiorfrontal_volume_norm,rh_superiorparietal_volume_norm,rh_superiortemporal_volume_norm,rh_supramarginal_volume_norm,rh_frontalpole_volume_norm,rh_temporalpole_volume_norm,rh_transversetemporal_volume_norm,rh_insula_volume_norm
0,1,41,5443,408,14526,47120,6506,2944,4535,1828,...,0.003024,0.034181,0.043351,0.026325,0.024175,0.020239,0.002057,0.005110,0.002425,0.016545
1,1,52,3228,309,13065,48090,6942,2918,5000,1734,...,0.003255,0.034290,0.044819,0.028870,0.030274,0.025561,0.001859,0.004780,0.001966,0.013778
2,1,54,5642,95,9687,38398,6385,2892,4351,1051,...,0.004965,0.034565,0.045602,0.026119,0.022982,0.021738,0.003113,0.005331,0.001263,0.015742
3,1,56,4959,231,11442,41214,6195,3604,5135,1592,...,0.004770,0.033962,0.047287,0.026346,0.027459,0.025567,0.002182,0.006356,0.002174,0.016759
4,1,31,6664,161,12059,44711,6613,2981,4558,1462,...,0.003286,0.028069,0.049465,0.028564,0.025725,0.024392,0.001645,0.006000,0.002145,0.013778
5,1,23,2603,324,15231,51930,6527,3802,6341,1706,...,0.004042,0.029387,0.048673,0.029289,0.021248,0.018831,0.001774,0.004057,0.001842,0.015405
6,1,35,3478,87,17917,48317,8241,3474,5797,1534,...,0.005587,0.031515,0.043662,0.035192,0.024065,0.022613,0.002775,0.005000,0.001894,0.015696
7,1,61,6790,209,16639,46757,5972,2827,3913,1306,...,0.004684,0.031781,0.042980,0.032454,0.019897,0.019935,0.002245,0.007684,0.001402,0.014414
8,1,26,2796,248,16287,48580,6388,4187,4902,1228,...,0.004524,0.029925,0.047673,0.029907,0.026144,0.022742,0.001479,0.004177,0.001922,0.013678
9,1,43,9019,283,16504,46340,6348,3129,4018,1195,...,0.003979,0.028817,0.046295,0.028197,0.027354,0.018744,0.002226,0.006260,0.002246,0.014571


In [37]:
# Persist the processed volume frame to an HDF5 file under the key 'NKI'.
volume_df.to_hdf(path_or_buf='NKI_SBA_Volume.hdf5', key='NKI')

### 1000 Functional Connectomes Voxel-Based Morphometry
The 1000 Functional Connectomes project collects brain scans and makes them publicly available. Its data includes Voxel-Based Morphometry, which "measures differences in local concentrations of brain tissue, through a voxel-wise comparison of multiple brain images." ([Wikipedia](https://en.wikipedia.org/wiki/Voxel-based_morphometry)).

Load the 1000 Functional Connectomes VBM data set

In [38]:
# Read the 1000 Functional Connectomes VBM workbook and preview the first rows.
df_1000 = pd.read_excel(io='brain_mosaic_data/1000_Connectomes_age_18-26_VBM.xlsx')
df_1000.head(n=5)

Unnamed: 0,bio_sex,age,Precentral_L,Precentral_R,Frontal_Sup_L,Frontal_Sup_R,Frontal_Sup_Orb_L,Frontal_Sup_Orb_R,Frontal_Mid_L,Frontal_Mid_R,...,Cerebelum_00_L,Cerebelum_00_R,Vermis_0_2,Vermis_3,Vermis_4_5,Vermis_6,Vermis_7,Vermis_8,Vermis_9,Vermis_00
0,1,18,0.436035,0.406023,0.361779,0.369537,0.495171,0.47909,0.420118,0.444269,...,0.257104,0.247064,0.431084,0.471217,0.572555,0.590969,0.575713,0.558999,0.524192,0.226187
1,1,18,0.422395,0.430086,0.38327,0.390875,0.477471,0.450275,0.448891,0.460303,...,0.245738,0.197609,0.383902,0.421114,0.501804,0.56084,0.566743,0.572814,0.505971,0.202734
2,1,18,0.43954,0.430842,0.385297,0.376834,0.52061,0.4725,0.454426,0.417339,...,0.245801,0.246097,0.418835,0.424117,0.510495,0.597151,0.604129,0.618324,0.606951,0.285061
3,1,18,0.473102,0.449665,0.416976,0.417434,0.53688,0.532697,0.47111,0.502046,...,0.215978,0.202993,0.395247,0.431441,0.524903,0.577355,0.629283,0.642825,0.570416,0.231166
4,1,18,0.382771,0.380443,0.347929,0.359899,0.463163,0.453011,0.393531,0.393259,...,0.236,0.277785,0.370647,0.434968,0.583758,0.564699,0.543617,0.546831,0.565252,0.24779


Make a binary variable for StatsModels.

In [41]:
# Recode bio_sex from 1/2 to 0/1 (1 -> 0, 2 -> 1) in a single mapping and
# assign the result back to the frame. Calling replace(..., inplace=True) on
# a column selection is a chained in-place update: under pandas Copy-on-Write
# it no longer modifies df_1000, and the FutureWarning announcing that is
# silenced at the top of this notebook.
# NOTE: run this cell once per load; re-running it on already recoded data
# would map the new 1s to 0.
df_1000['bio_sex'] = df_1000['bio_sex'].replace({1: 0, 2: 1})

In [42]:
# With bio_sex recoded to 0/1, its mean is the fraction of rows coded 1.
df_1000['bio_sex'].mean()

0.616

In [43]:
# Number of rows for each bio_sex code, as a check on the recode above.
df_1000['bio_sex'].value_counts()

1    385
0    240
Name: bio_sex, dtype: int64

In [46]:
# Persist the recoded VBM frame to an HDF5 file under the key 'VBM'.
df_1000.to_hdf(path_or_buf='1000_Connectomes_age_18-26_VBM.hdf5', key='VBM')