# Exctract demographic info and cognitive metrics from the HCP behavioral data

In [40]:
import os, sys, time, h5py, zipfile
import numpy as np
import pandas as pd
from statistics import median
import urllib
import csv
import requests

Loading the HCP spreadsheet and select the subset of columns we are interesting on working with:
- Subject ID
- Age
- Gender
- Mini Mental State Examination (MMSE) total score
- Penn matrix to measure fluid intelligence (number of correct answers)
- Speed of processing score

In [41]:
hcp_behav = pd.read_csv('/home/anna/Scrivania/NeuroHackademy2019/unrestricted_annatruzzi91_8_5_2019_13_21_16.csv')
hcp_behav.keys()
behav_subset = hcp_behav[['Subject','Age','Gender','MMSE_Score','PMAT24_A_CR','ProcSpeed_AgeAdj']]

Ages are expressed in intervals (the actual ace is a Tier 1 restricted information).
Therefore we need to extract the min age for each participants in order to create 4 groups (one per age interval)

In [42]:
min_age_vector=[]
max_age_vector=[]
for item in behav_subset['Age']:
    if '+' in item:
        min_age_vector.append(int(item[0:2]))
        max_age_vector.append(90)
    else:
        min_age_vector.append(int(item[0:2]))
        max_age_vector.append(int(item[-2:]))

min_age_vector=np.asarray(min_age_vector)
max_age_vector=np.asarray(max_age_vector)

Inspect variables of interest (i.e. range, count occurrences, median)

In [43]:
## Age
ageunique, agecounts = np.unique(min_age_vector, return_counts=True)
print('Age')
print(dict(zip(ageunique, agecounts)))
agelist = ageunique

## MMSE 
unique, counts = np.unique(np.array(hcp_behav['MMSE_Score']), return_counts=True)
print('MMSE')
print(dict(zip(unique, counts)))

## Pemm matrices
print('Pemm')
print('min Pemm score is '+str(min(hcp_behav['PMAT24_A_CR'])))
print('max Pemm score is '+str(max(hcp_behav['PMAT24_A_CR'])))
print('median Pemm score is '+str(median(hcp_behav['PMAT24_A_CR'])))

## Processing speed
print('Processing speed')
print('min Processing speed score is '+str(min(hcp_behav['ProcSpeed_AgeAdj'])))
print('max Processing speed score is '+str(max(hcp_behav['ProcSpeed_AgeAdj'])))
print('median Processing speed score is '+str(median(hcp_behav['ProcSpeed_AgeAdj'])))

Age
{22: 247, 26: 527, 31: 418, 36: 14}
MMSE
{23: 1, 24: 1, 26: 31, 27: 84, 28: 197, 29: 440, 30: 452}
Pemm
min Pemm score is 4.0
max Pemm score is 24.0
median Pemm score is 10.0
Processing speed
min Processing speed score is 45.31
max Processing speed score is 149.3
median Processing speed score is 103.13


Add columns to the dataframe in which to define the category each participant is in for each variable of interest:
- Age --> 0 (min age = 22),1 (min age = 26),2 (min age = 31),3 (min age = 36)
- MMSE --> o (lower than median), 1 (higher than median)
- Pemm matrices --> o (lower than median), 1 (higher than median)
- Processing speed --> o (lower than median), 1 (higher than median)

In [44]:
## Age
age_category = np.zeros(shape=len(behav_subset['Age']))
behav_subset['min_age'] = pd.Series(min_age_vector)
behav_subset['age_category'] = pd.Series(age_category)
behav_subset.loc[behav_subset['min_age'] == agelist[0],['age_category']] = 0
behav_subset.loc[behav_subset['min_age'] == agelist[1],['age_category']] = 1
behav_subset.loc[behav_subset['min_age'] == agelist[2],['age_category']] = 2
behav_subset.loc[behav_subset['min_age'] == agelist[3],['age_category']] = 3

## MMSE
MMSE_level = np.zeros(shape=len(behav_subset['MMSE_Score']))
behav_subset['MMSE_Level'] = pd.Series(MMSE_level)
behav_subset.loc[behav_subset['MMSE_Score'] <= median(behav_subset['MMSE_Score']),['MMSE_Level']] = 0
behav_subset.loc[behav_subset['MMSE_Score'] > median(behav_subset['MMSE_Score']),['MMSE_Level']] = 1

## Penn matrices
Penn_level = np.zeros(shape=len(behav_subset['PMAT24_A_CR']))
behav_subset['Penn_Level'] = pd.Series(Penn_level)
behav_subset.loc[behav_subset['PMAT24_A_CR'] <= median(behav_subset['PMAT24_A_CR']),['Penn_Level']] = 0
behav_subset.loc[behav_subset['PMAT24_A_CR'] > median(behav_subset['PMAT24_A_CR']),['Penn_Level']] = 1

## Processing speed
ProcSpeed_level = np.zeros(shape=len(behav_subset['ProcSpeed_AgeAdj']))
behav_subset['ProcSpeed_Level'] = pd.Series(ProcSpeed_level)
behav_subset.loc[behav_subset['ProcSpeed_AgeAdj'] <= median(behav_subset['ProcSpeed_AgeAdj']),['ProcSpeed_Level']] = 0
behav_subset.loc[behav_subset['ProcSpeed_AgeAdj'] > median(behav_subset['ProcSpeed_AgeAdj']),['ProcSpeed_Level']] = 1

behav_subset



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inst

Unnamed: 0,Subject,Age,Gender,MMSE_Score,PMAT24_A_CR,ProcSpeed_AgeAdj,min_age,age_category,MMSE_Level,Penn_Level,ProcSpeed_Level
0,100004,22-25,M,29,19.0,69.88,22,0.0,0.0,1.0,0.0
1,100206,26-30,M,30,20.0,131.89,26,1.0,1.0,1.0,1.0
2,100307,26-30,F,29,17.0,112.36,26,1.0,0.0,1.0,1.0
3,100408,31-35,M,30,7.0,96.00,31,2.0,1.0,0.0,0.0
4,100610,26-30,M,30,23.0,96.12,26,1.0,1.0,1.0,0.0
5,101006,31-35,F,28,11.0,71.99,31,2.0,0.0,1.0,0.0
6,101107,22-25,M,29,14.0,97.87,22,0.0,0.0,1.0,0.0
7,101208,31-35,F,30,8.0,96.97,31,2.0,1.0,0.0,0.0
8,101309,26-30,M,29,15.0,87.33,26,1.0,0.0,1.0,0.0
9,101410,26-30,M,29,19.0,91.83,26,1.0,0.0,1.0,0.0


In [46]:
behav_subset.to_csv('/home/anna/Scrivania/Neurohackademy2019/gan_dl_BehavioralData.csv',sep=',')

NotADirectoryError: [Errno 20] Not a directory: '/home/anna/Scrivania/Neurohackademy2019/gan_dl_BehavioralData.csv'