In [1]:
# Common imports

# from __future__ import print_function
import numpy as np, matplotlib.pyplot as plt, pandas as pd
import os
import re

# For output stability across multiple runs of the notebook
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

### Formatting data

We'll be taking the ADNI data and creating a pandas datafram out of it. The columns will be left hemisphere surface area and thickness followed by the sanme for right hemisphere. Thus we will have about 296 features. 

In [2]:

'''
######

Formatting data to look like

        LH_SA ... LH_TH ... RH_SA ... RH_TH
Subject

######
'''

BS = 0 # Brain Structure Name
SA = 2 # Surface Area (mm^2)
TH = 4 # Average Thickness
VOL = 3 # Volume in mm^3 in aseg.stats

columns = []
TEMPLATE = "ADNI/{subject}/stats/{hemisphere}.aparc.a2009s.stats"
VOLUME_FILE = "ADNI/{subject}/stats/aseg.stats"
DATA_DIR = "ADNI/"
TIMESTAMP="%Y%m%d%H%M%S%f"
PATTERN = "ADNI_(\d{3}_S_\d{4})_(\d*)"

# Might have stolen from stackoverflow
def decomment(file):
    for line in file:
        raw = line.split("#")[0].strip()
        if raw: yield raw

def load_sample(sample, hemi):
    
    with open(sample) as datafile:
        data = []
        for row in decomment(datafile):
            stats = row.split()
            data.append([stats[i] for i in [BS,SA,TH]])
    
    data = np.array(data)
    
    # Build columns if first entry
    if len(columns) == 0:
        for hemi in ["lh","rh"]:
            for structure in data[:,0]:
                columns.extend([structure+"_SA_"+hemi, structure+"_TH_"+hemi])
    
    return data

def load_data(subject_number):
    lh_data = TEMPLATE.format(subject=subject_number, hemisphere="lh")
    rh_data = TEMPLATE.format(subject=subject_number, hemisphere="rh")


    lh_features = load_sample(lh_data, "lh")
    rh_features = load_sample(rh_data, "rh")
    combined = np.concatenate((lh_features, rh_features), axis=0)
    
    raw_data = [combined[:,1:3].flatten(order="C")] # Column-wise collapse 2D array
    
    return raw_data

# Calculates Intra-Cranial Volume
def get_ICV(subject_number):
    file = VOLUME_FILE.format(subject=subject_number)
    ICV = 0
    
    with open(file) as datafile:
        for row in decomment(datafile):
            stats = row.split()
            ICV += float(stats[VOL])

    return ICV

In [3]:
subject_list = [x for x in filter(lambda dirname: not dirname.startswith("."), os.listdir(DATA_DIR))]

raw_data = []
for subject in subject_list:
    subject_data = load_data(subject)
    subject_ICV = get_ICV(subject)

    result = re.match(PATTERN,subject)
    pID = result.group(1)
    scan_date = result.group(2)
    subject_data = np.concatenate(([pID,scan_date, subject_ICV], subject_data),axis=None).reshape(1,-1)
    
    if len(raw_data) == 0:
        raw_data = subject_data
    else:
#         print(subject_data[0,:2],subject_data.shape)
        if subject_data.shape[1] == raw_data.shape[1]: # Some samples have incomplete data so feature columns do not match
            raw_data = np.append(raw_data, subject_data, axis=0)
raw_data.shape

(1093, 299)

In [4]:
# a=np.array([[1,2,3]])
# b=np.array([[4,5,6]])
# np.append([],b,axis=0)
subject_ICV

251762.59999999992

In [8]:
features = np.concatenate((["PTID", "scandate", "ICV"],columns),axis=None)

full_data = pd.DataFrame(raw_data, columns=features).astype({"PTID":str})
full_data.head()

Unnamed: 0,PTID,scandate,ICV,G_and_S_frontomargin_SA_lh,G_and_S_frontomargin_TH_lh,G_and_S_occipital_inf_SA_lh,G_and_S_occipital_inf_TH_lh,G_and_S_paracentral_SA_lh,G_and_S_paracentral_TH_lh,G_and_S_subcentral_SA_lh,...,S_suborbital_SA_rh,S_suborbital_TH_rh,S_subparietal_SA_rh,S_subparietal_TH_rh,S_temporal_inf_SA_rh,S_temporal_inf_TH_rh,S_temporal_sup_SA_rh,S_temporal_sup_TH_rh,S_temporal_transverse_SA_rh,S_temporal_transverse_TH_rh
0,094_S_2216,20110504083504461,307244.6000000001,936,1.984,1158,2.107,993,2.306,1226,...,254,1.421,1007,1.937,763,1.759,4349,2.025,249,1.579
1,099_S_2063,20141104100338447,263318.8,1021,2.267,1179,2.134,1139,1.915,1124,...,79,2.173,503,2.081,547,2.306,4723,2.269,252,2.297
2,029_S_2376,20110705181758518,303135.80000000005,855,2.16,1291,2.287,1137,1.961,1451,...,309,2.457,1195,1.804,635,1.97,4895,2.071,344,1.642
3,098_S_4003,20160504154447525,234729.1,849,2.122,909,2.272,788,2.43,717,...,250,2.222,626,1.87,784,1.826,3182,2.056,195,2.179
4,021_S_2077,20141021152650834,278496.20000000007,762,2.237,969,2.141,1153,2.13,1156,...,180,2.88,1127,2.245,789,1.924,4399,2.014,243,1.826


## Get the first scan for every patient. get earliest timestamp per patient

In [9]:
import re
import datetime

TIMESTAMP="%Y%m%d%H%M%S%f"

s=subject_list[0]
PATTERN = "ADNI_(\d{3}_S_\d{4})_(\d*)"
print("Original: ",s)

result=re.match(PATTERN,s)
pID = result.group(1)
scan_date = result.group(2)
a=pd.to_datetime(scan_date, format=TIMESTAMP)
# datetime.date.fromtimestamp(scan_date)
# scan_date

Original:  ADNI_094_S_2216_20110504083504461


In [10]:
old = np.datetime64("20110504083504461",format=TIMESTAMP)
a == pd.to_datetime("20110504083504461", format=TIMESTAMP)

True

In [26]:
import csv

filename="ADNI_labels.csv"
with open(filename) as f:
    df = pd.read_csv(f).astype({"PTID":str,"DX_bl":str, "EXAMDATE.x": np.datetime64})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 14 columns):
Unnamed: 0    520 non-null int64
ScanId        520 non-null object
PTID          520 non-null object
EXAMDATE.x    520 non-null datetime64[ns]
VISCODE       520 non-null object
DX_bl         520 non-null object
AGE           520 non-null float64
PTGENDER      520 non-null object
PTEDUCAT      520 non-null int64
PTETHCAT      520 non-null object
PTRACCAT      520 non-null object
PTMARRY       520 non-null object
CDRSB         418 non-null float64
MMSE          421 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(2), object(8)
memory usage: 57.0+ KB


**We will only keep the first date by sorting on patient IDs and then dates. Removing duplicates from patient IDs will then guarantee that only the first date remains.**

In [33]:
col_names = ["PTID","EXAMDATE.x", "DX_bl"]
# labels = df[col_names].astype([str,"date", str])
labels=df.sort_values(by=col_names[:2])
# labels=labels.drop_duplicates(subset="PTID").copy()

In [34]:
labels.head()
# labels.tail()

Unnamed: 0.1,Unnamed: 0,ScanId,PTID,EXAMDATE.x,VISCODE,DX_bl,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,PTMARRY,CDRSB,MMSE
0,78,S156236,003_S_2374,2012-05-16,m12,EMCI,81.3,Female,18,Not Hisp/Latino,White,Never married,0.5,29.0
3,481,S168567,005_S_4910,2012-09-21,bl,AD,81.4,Female,15,Not Hisp/Latino,White,Married,7.0,25.0
1,302,S177586,005_S_4910,2012-12-13,m03,AD,81.4,Female,15,Not Hisp/Latino,White,Married,,
2,303,S185876,005_S_4910,2013-04-03,m06,AD,81.4,Female,15,Not Hisp/Latino,White,Married,10.0,19.0
4,359,S177601,005_S_5038,2012-12-13,bl,AD,81.5,Male,18,Not Hisp/Latino,White,Married,5.0,25.0


**Now we can use the label files to get only the valid patients from our feature matrix**

Use the `isin` dtaframe method and then index using the boolean result

In [35]:
# df[df["PTID"] == labels["PTID"]]
label_filter = full_data["PTID"].isin(labels["PTID"])
cleaned_data = full_data[label_filter]
cleaned_data["scandate"] = pd.to_datetime(cleaned_data["scandate"], format=TIMESTAMP)
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 673 entries, 0 to 1090
Columns: 299 entries, PTID to S_temporal_transverse_TH_rh
dtypes: datetime64[ns](1), object(298)
memory usage: 1.5+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [36]:
cleaned_data.head()

Unnamed: 0,PTID,scandate,ICV,G_and_S_frontomargin_SA_lh,G_and_S_frontomargin_TH_lh,G_and_S_occipital_inf_SA_lh,G_and_S_occipital_inf_TH_lh,G_and_S_paracentral_SA_lh,G_and_S_paracentral_TH_lh,G_and_S_subcentral_SA_lh,...,S_suborbital_SA_rh,S_suborbital_TH_rh,S_subparietal_SA_rh,S_subparietal_TH_rh,S_temporal_inf_SA_rh,S_temporal_inf_TH_rh,S_temporal_sup_SA_rh,S_temporal_sup_TH_rh,S_temporal_transverse_SA_rh,S_temporal_transverse_TH_rh
0,094_S_2216,2011-05-04 08:35:04.461,307244.6000000001,936,1.984,1158,2.107,993,2.306,1226,...,254,1.421,1007,1.937,763,1.759,4349,2.025,249,1.579
2,029_S_2376,2011-07-05 18:17:58.518,303135.80000000005,855,2.16,1291,2.287,1137,1.961,1451,...,309,2.457,1195,1.804,635,1.97,4895,2.071,344,1.642
3,098_S_4003,2016-05-04 15:44:47.525,234729.1,849,2.122,909,2.272,788,2.43,717,...,250,2.222,626,1.87,784,1.826,3182,2.056,195,2.179
4,021_S_2077,2014-10-21 15:26:50.834,278496.20000000007,762,2.237,969,2.141,1153,2.13,1156,...,180,2.88,1127,2.245,789,1.924,4399,2.014,243,1.826
5,021_S_5099,2013-06-11 14:47:47.885,221848.6000000001,752,2.073,960,2.521,873,2.374,810,...,197,3.412,790,2.276,665,2.535,2914,2.168,183,2.166


In [37]:
cleaned_data.sort_values(by=["PTID"])
# cleaned_data=cleaned_data.drop_duplicates(subset="PTID").copy()
# cleaned_data.info()
cleaned_data[cleaned_data["PTID"].str.match("005_S_5119")]

Unnamed: 0,PTID,scandate,ICV,G_and_S_frontomargin_SA_lh,G_and_S_frontomargin_TH_lh,G_and_S_occipital_inf_SA_lh,G_and_S_occipital_inf_TH_lh,G_and_S_paracentral_SA_lh,G_and_S_paracentral_TH_lh,G_and_S_subcentral_SA_lh,...,S_suborbital_SA_rh,S_suborbital_TH_rh,S_subparietal_SA_rh,S_subparietal_TH_rh,S_temporal_inf_SA_rh,S_temporal_inf_TH_rh,S_temporal_sup_SA_rh,S_temporal_sup_TH_rh,S_temporal_transverse_SA_rh,S_temporal_transverse_TH_rh
306,005_S_5119,2013-03-28 16:47:21.347,212182.9,497,1.896,641,2.529,797,2.081,839,...,129,2.177,710,2.035,316,2.192,2451,2.005,251,1.727
764,005_S_5119,2015-06-26 16:04:53.575,214968.9,415,2.139,659,2.477,889,1.968,783,...,156,1.853,675,2.001,420,2.045,2226,1.863,268,1.908
1063,005_S_5119,2013-06-25 13:45:43.868,210950.5,483,2.009,618,2.57,859,1.93,795,...,188,2.117,687,1.92,197,2.051,2231,1.905,211,1.658


In [38]:
labels[labels["PTID"].str.match("005_S_5119")]

Unnamed: 0.1,Unnamed: 0,ScanId,PTID,EXAMDATE.x,VISCODE,DX_bl,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,PTMARRY,CDRSB,MMSE
10,489,S185458,005_S_5119,2013-03-28,bl,AD,77.0,Female,12,Not Hisp/Latino,White,Married,8.0,23.0
8,390,S193388,005_S_5119,2013-06-25,m03,AD,77.0,Female,12,Not Hisp/Latino,White,Married,,
9,391,S217193,005_S_5119,2014-04-22,m12,AD,77.0,Female,12,Not Hisp/Latino,White,Married,10.0,18.0


In [22]:
# Sanity check to see if all patients show up in both data frames
all(cleaned_data["PTID"].isin(labels["PTID"]))

True

In [23]:
labels_df = labels[["PTID","DX_bl"]]
final_cleaned_data = pd.merge(cleaned_data,labels_df, how="inner", on="PTID")

In [24]:
final_cleaned_data.describe()

Unnamed: 0,PTID,scandate,ICV,G_and_S_frontomargin_SA_lh,G_and_S_frontomargin_TH_lh,G_and_S_occipital_inf_SA_lh,G_and_S_occipital_inf_TH_lh,G_and_S_paracentral_SA_lh,G_and_S_paracentral_TH_lh,G_and_S_subcentral_SA_lh,...,S_suborbital_TH_rh,S_subparietal_SA_rh,S_subparietal_TH_rh,S_temporal_inf_SA_rh,S_temporal_inf_TH_rh,S_temporal_sup_SA_rh,S_temporal_sup_TH_rh,S_temporal_transverse_SA_rh,S_temporal_transverse_TH_rh,DX_bl
count,143,143,143.0,143.0,143.0,143.0,143.0,143.0,143.0,143.0,...,143.0,143.0,143.0,143.0,143.0,143.0,143.0,143.0,143.0,143
unique,143,143,143.0,123.0,129.0,125.0,133.0,117.0,131.0,132.0,...,137.0,125.0,131.0,129.0,133.0,135.0,121.0,100.0,132.0,5
top,016_S_4097,2012-12-21 14:13:35.107000,295583.4,773.0,1.931,921.0,2.336,979.0,2.393,952.0,...,2.082,865.0,2.102,750.0,2.335,3171.0,2.008,205.0,2.186,EMCI
freq,1,1,1.0,3.0,3.0,3.0,2.0,4.0,2.0,2.0,...,2.0,2.0,3.0,4.0,2.0,2.0,3.0,4.0,2.0,46
first,,2010-06-24 13:16:52.281000,,,,,,,,,...,,,,,,,,,,
last,,2016-06-15 15:50:31.715000,,,,,,,,,...,,,,,,,,,,


Can do some analytics on distribution now

In [25]:
s=final_cleaned_data.groupby("DX_bl")
s["PTID"].describe()

Unnamed: 0_level_0,count,unique,top,freq
DX_bl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AD,26,26,007_S_5196,1
CN,30,30,016_S_4097,1
EMCI,46,46,029_S_2395,1
LMCI,21,21,027_S_4804,1
SMC,20,20,027_S_5109,1


In [26]:
# savefile = "ICV_ADNI.csv"
final_cleaned_data.to_csv(savefile, index=False)

# DONE!

Now we can easily read data from the csv