# Are brains like faces?


Copyright 2019 Allen Downey

[MIT License](https://opensource.org/licenses/MIT)

In [3]:
%matplotlib inline
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import thinkstats2
import thinkplot

import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras.models
import keras.layers

import pickle

# Silence every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas and the other
# imported libraries; consider narrowing the filter once the notebook runs clean.
from warnings import simplefilter
simplefilter('ignore', FutureWarning)

Using TensorFlow backend.


Load the NKI SBA volume data, since it seems to have the most features with the highest Cohen effects.

In [4]:
# Read the NKI SBA volume spreadsheet into a DataFrame and preview the first rows.
volume_df = pd.read_excel('brain_mosaic_data/NKI_SBA_Volume.xlsx')
volume_df.head()

Unnamed: 0,bio_sex,age,Left-Lateral-Ventricle,Left-Inf-Lat-Vent,Left-Cerebellum-White-Matter,Left-Cerebellum-Cortex,Left-Thalamus-Proper,Left-Caudate,Left-Putamen,Left-Pallidum,...,rh_rostralanteriorcingulate_volume,rh_rostralmiddlefrontal_volume,rh_superiorfrontal_volume,rh_superiorparietal_volume,rh_superiortemporal_volume,rh_supramarginal_volume,rh_frontalpole_volume,rh_temporalpole_volume,rh_transversetemporal_volume,rh_insula_volume
0,2,41,5443,408,14526,47120,6506,2944,4535,1828,...,1151,13009,16499,10019,9201,7703,783,1945,923,6297
1,2,52,3228,309,13065,48090,6942,2918,5000,1734,...,1308,13780,18011,11602,12166,10272,747,1921,790,5537
2,2,54,5642,95,9687,38398,6385,2892,4351,1051,...,1903,13247,17477,10010,8808,8331,1193,2043,484,6033
3,2,56,4959,231,11442,41214,6195,3604,5135,1592,...,1753,12481,17378,9682,10091,9396,802,2336,799,6159
4,2,31,6664,161,12059,44711,6613,2981,4558,1462,...,1396,11924,21013,12134,10928,10362,699,2549,911,5853


Make the column names play nicely with StatsModels.

In [5]:
def clean_name(name):
    """Return a version of a column name that plays nicely with StatsModels.

    Hyphens are turned into underscores and the name is lowercased.  The two
    ventricle names that would otherwise begin with a digit are spelled out.

    name: string column name

    returns: string
    """
    spelled_out = {
        '3rd_ventricle': 'third_ventricle',
        '4th_ventricle': 'fourth_ventricle',
    }
    normalized = name.lower().replace('-', '_')
    if normalized in spelled_out:
        return spelled_out[normalized]
    return normalized

In [7]:
# Normalize every column label with clean_name, then preview the renamed frame.
columns = list(map(clean_name, volume_df.columns))
volume_df.columns = columns
volume_df.head()

Unnamed: 0,bio_sex,age,left_lateral_ventricle,left_inf_lat_vent,left_cerebellum_white_matter,left_cerebellum_cortex,left_thalamus_proper,left_caudate,left_putamen,left_pallidum,...,rh_rostralanteriorcingulate_volume,rh_rostralmiddlefrontal_volume,rh_superiorfrontal_volume,rh_superiorparietal_volume,rh_superiortemporal_volume,rh_supramarginal_volume,rh_frontalpole_volume,rh_temporalpole_volume,rh_transversetemporal_volume,rh_insula_volume
0,2,41,5443,408,14526,47120,6506,2944,4535,1828,...,1151,13009,16499,10019,9201,7703,783,1945,923,6297
1,2,52,3228,309,13065,48090,6942,2918,5000,1734,...,1308,13780,18011,11602,12166,10272,747,1921,790,5537
2,2,54,5642,95,9687,38398,6385,2892,4351,1051,...,1903,13247,17477,10010,8808,8331,1193,2043,484,6033
3,2,56,4959,231,11442,41214,6195,3604,5135,1592,...,1753,12481,17378,9682,10091,9396,802,2336,799,6159
4,2,31,6664,161,12059,44711,6613,2981,4558,1462,...,1396,11924,21013,12134,10928,10362,699,2549,911,5853


Make a binary variable for StatsModels.

In [10]:
# Indicator column: 1 where bio_sex equals 1, otherwise 0.
# The mean of the indicator is the fraction of rows with bio_sex == 1.
volume_df['male'] = volume_df['bio_sex'].eq(1).astype(int)
volume_df['male'].mean()

0.6213235294117647

In [11]:
# Recode bio_sex from {1, 2} to {0, 1}.
# The result is assigned back to the column.  Calling
# replace(..., inplace=True) on the selection volume_df['bio_sex'] is chained
# assignment: with pandas Copy-on-Write it modifies a temporary and leaves
# volume_df unchanged.
# Run this cell once per load: a second run would turn the new 1s into 0s.
volume_df['bio_sex'] = volume_df['bio_sex'].replace(1, 0).replace(2, 1)

Select the volumetric features (I don't know what the other features are).

In [13]:
# Keep the columns whose names end in 'volume'.
# 'total_volume' is a derived column added to volume_df further down; it is
# excluded so that re-running this cell after that column exists does not
# feed the total back into its own sum.
features = [
    name for name in volume_df.columns
    if name.endswith('volume') and name != 'total_volume'
]
len(features)

68

Compute total volume.

In [14]:
# Row-wise sum over the selected feature columns, followed by summary statistics.
volume_df['total_volume'] = volume_df.loc[:, features].sum(axis='columns')
volume_df['total_volume'].describe()

count       272.000000
mean     455371.117647
std       56890.820457
min      336198.000000
25%      416941.500000
50%      450121.000000
75%      489089.250000
max      684464.000000
Name: total_volume, dtype: float64

In [15]:
# Write volume_df to an HDF5 file under the key 'NKI'.
volume_df.to_hdf('NKI_SBA_Volume.hdf5', key='NKI')

Load the 1000 Connectomes VBM data set (ages 18-26).

In [16]:
# Read the 1000 Connectomes VBM spreadsheet (ages 18-26, per the file name)
# into a DataFrame and preview the first rows.
df_1000 = pd.read_excel('brain_mosaic_data/1000_Connectomes_age_18-26_VBM.xlsx')
df_1000.head()

Unnamed: 0,bio_sex,age,Precentral_L,Precentral_R,Frontal_Sup_L,Frontal_Sup_R,Frontal_Sup_Orb_L,Frontal_Sup_Orb_R,Frontal_Mid_L,Frontal_Mid_R,...,Cerebelum_00_L,Cerebelum_00_R,Vermis_0_2,Vermis_3,Vermis_4_5,Vermis_6,Vermis_7,Vermis_8,Vermis_9,Vermis_00
0,1,18,0.436035,0.406023,0.361779,0.369537,0.495171,0.47909,0.420118,0.444269,...,0.257104,0.247064,0.431084,0.471217,0.572555,0.590969,0.575713,0.558999,0.524192,0.226187
1,1,18,0.422395,0.430086,0.38327,0.390875,0.477471,0.450275,0.448891,0.460303,...,0.245738,0.197609,0.383902,0.421114,0.501804,0.56084,0.566743,0.572814,0.505971,0.202734
2,1,18,0.43954,0.430842,0.385297,0.376834,0.52061,0.4725,0.454426,0.417339,...,0.245801,0.246097,0.418835,0.424117,0.510495,0.597151,0.604129,0.618324,0.606951,0.285061
3,1,18,0.473102,0.449665,0.416976,0.417434,0.53688,0.532697,0.47111,0.502046,...,0.215978,0.202993,0.395247,0.431441,0.524903,0.577355,0.629283,0.642825,0.570416,0.231166
4,1,18,0.382771,0.380443,0.347929,0.359899,0.463163,0.453011,0.393531,0.393259,...,0.236,0.277785,0.370647,0.434968,0.583758,0.564699,0.543617,0.546831,0.565252,0.24779


In [17]:
# Normalize every column label with clean_name, then preview the renamed frame.
columns = list(map(clean_name, df_1000.columns))
df_1000.columns = columns
df_1000.head()

Unnamed: 0,bio_sex,age,precentral_l,precentral_r,frontal_sup_l,frontal_sup_r,frontal_sup_orb_l,frontal_sup_orb_r,frontal_mid_l,frontal_mid_r,...,cerebelum_00_l,cerebelum_00_r,vermis_0_2,vermis_3,vermis_4_5,vermis_6,vermis_7,vermis_8,vermis_9,vermis_00
0,1,18,0.436035,0.406023,0.361779,0.369537,0.495171,0.47909,0.420118,0.444269,...,0.257104,0.247064,0.431084,0.471217,0.572555,0.590969,0.575713,0.558999,0.524192,0.226187
1,1,18,0.422395,0.430086,0.38327,0.390875,0.477471,0.450275,0.448891,0.460303,...,0.245738,0.197609,0.383902,0.421114,0.501804,0.56084,0.566743,0.572814,0.505971,0.202734
2,1,18,0.43954,0.430842,0.385297,0.376834,0.52061,0.4725,0.454426,0.417339,...,0.245801,0.246097,0.418835,0.424117,0.510495,0.597151,0.604129,0.618324,0.606951,0.285061
3,1,18,0.473102,0.449665,0.416976,0.417434,0.53688,0.532697,0.47111,0.502046,...,0.215978,0.202993,0.395247,0.431441,0.524903,0.577355,0.629283,0.642825,0.570416,0.231166
4,1,18,0.382771,0.380443,0.347929,0.359899,0.463163,0.453011,0.393531,0.393259,...,0.236,0.277785,0.370647,0.434968,0.583758,0.564699,0.543617,0.546831,0.565252,0.24779


Make a binary variable for StatsModels.

In [18]:
# Indicator column: 1 where bio_sex equals 1, otherwise 0.
# The mean of the indicator is the fraction of rows with bio_sex == 1.
df_1000['male'] = df_1000['bio_sex'].eq(1).astype(int)
df_1000['male'].mean()

0.384

In [6]:
# Recode bio_sex from {1, 2} to {0, 1} in the 1000 Connectomes frame.
# This operates on df_1000: the bare name `df` is not defined by any cell shown
# in this notebook, so referring to it fails on a fresh kernel.
# The result is assigned back to the column because replace(..., inplace=True)
# on a column selection is chained assignment, which leaves the DataFrame
# unchanged under pandas Copy-on-Write.
# Run this cell once per load: a second run would turn the new 1s into 0s.
df_1000['bio_sex'] = df_1000['bio_sex'].replace(1, 0).replace(2, 1)

In [19]:
# Write df_1000 to an HDF5 file under the key 'NKI'.
# NOTE(review): 'NKI' is the same key used for the NKI volume file and looks
# copied from that cell; confirm no reader depends on it before renaming.
df_1000.to_hdf('1000_Connectomes_age_18-26_VBM.hdf5', key='NKI')