In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
%matplotlib inline

First, we have to decide how to deal with longitudinality. For now, we will ignore longitudinal data and treat each row as a unique patient. This means we need to take out any 'baseline' data.

In [87]:
df = pd.read_csv('ADNIMERGE.csv')
all_cols = list(df)
df.dtypes

# baseline data is indicated by '_bl' OR described as a baseline measurement in ADNIMERGE_DICT
important_cols = [i for i in all_cols[8:-3] if i[-2:] != 'bl']

In [88]:
important_cols = ['RID'] + important_cols

In [89]:
data = df[important_cols]

`AGE` is probably an important predictor for a diagnosis. However, ADNIMERGE_DICT says that `AGE` is the age of the patient at baseline. Because we are treating each row as a separate patient, we should use `Month_bl` (months from baseline) to calculate a 'new age' for each row. That is, a 65 year old who gets a second exam 12 months later will be treated as a different, 66 year old person.

In [91]:
data['EXACT_AGE'] = df.AGE + (df.Month_bl/12)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [79]:
# now we need to one-hot encode the categorical predictors
# BUT since we will eventually do a classification with DX,
# we'll first convert those values and then one-hot encode
# things like marriage, etc

DX_class = []
for i in data.DX.values:
    if i == 'CN':
        DX_class.append(0)
    elif i == 'MCI':
        DX_class.append(1)
    elif i == 'Dementia':
        DX_class.append(2)
    else:
        DX_class.append(i) # for NaNs. will deal with missingness next.
# len(DX_class),len(data.DX.values)
# now drop the original DX column
data.drop('DX',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [83]:
# get columns with categorical data
cat_cols = [i for i in data.columns if isinstance(data[i][0], str)]

['PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY', 'FLDSTRENG', 'FSVERSION']

In [31]:
# now we need to deal with missingness
data.DX.isnull().mean()

0.31251440424060845

In [59]:
# what percent of each column is missing?
def percent_in_each_col(df):
    percent_missing = [df[i].isnull().mean()*100 for i in list(df)]
    percent_missing = np.asarray(percent_missing)
    in_each = pd.DataFrame(percent_missing,df.columns,columns=["Percent"])
    print(in_each[in_each.Percent > 0])
    print("Average percent missing for all columns: ", in_each[in_each.Percent > 0].mean())


In [60]:
percent_in_each_col(data)

                         Percent
APOE4                   0.453253
FDG                    74.241377
PIB                    98.286856
AV45                   83.398633
CDRSB                  30.736729
ADAS11                 31.174618
ADAS13                 31.835292
MMSE                   29.522932
RAVLT_immediate        30.383345
RAVLT_learning         30.383345
RAVLT_forgetting       30.583084
RAVLT_perc_forgetting  31.120842
FAQ                    29.000538
MOCA                   60.052239
EcogPtMem              59.360836
EcogPtLang             59.483752
EcogPtVisspat          59.798725
EcogPtPlan             59.560575
EcogPtOrgan            60.382577
EcogPtDivatt           59.798725
EcogPtTotal            59.453023
EcogSPMem              59.099639
EcogSPLang             59.061228
EcogSPVisspat          59.829454
EcogSPPlan             59.545210
EcogSPOrgan            60.866559
EcogSPDivatt           60.121380
EcogSPTotal            59.099639
FLDSTRENG              45.586541
FSVERSION 

In [61]:
# In lecture, Kevin mentioned that imputing using the median
# will often give you the best results

from sklearn.preprocessing import Imputer

imp = Imputer(strategy="median")
imputed = imp.fit_transform(data)
full_data = pd.DataFrame(imputed,columns = important_cols)

percent_in_each_col(full_data)

ValueError: could not convert string to float: 'CN'