# Data Extraction  
This script is intended to pull relevant variables from the raw data set.

The final usable data, split into train/test/validate sets, appears in the "result" folder.

This code can also be used to ingest NEW data in the NACC format and convert it into data readable by our model.

## Import Data
To start, place the raw data in the raw data folder. The git ignore file should keep it from being added to the repository.

In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import Imputer
from __future__ import print_function

# Show the working directory so the relative "raw/" and "result/" paths can be checked.
print(os.getcwd())

# variables: raw columns copied unchanged; var_dict: recoded columns keyed by name.
variables, var_dict = [], {}

In [None]:
# Load the raw NACC export. low_memory=False has pandas infer each column's
# type from the whole file instead of chunk by chunk.
rawdata = pd.read_csv("raw/patel01222016.csv", low_memory=False)

*NACCID*

In [None]:
# NACCID is copied from the raw data unchanged when the dataframe is assembled.
variables.append("NACCID")

### Informant Interview Scores

*MEMORY, ORIENT, JUDGMENT, COMMUN, HOMEHOBB, PERSCARE, COMPORT, CDRLANG, CDRGLOB*  
0.0 = No impairment  
0.5 = Questionable impairment  
1.0 = Mild impairment  
2.0 = Moderate impairment  
3.0 = Severe impairment  

*CDRSUM* is 0 to 18

These variables are scored based on structured interviews with caretakers. They describe memory, orientation, judgement, community involvement, home and hobbies, and personal care. *CDRGLOB* is a total impairment score computed from the previous values. CDRSUM is the simple total of the previous values.

In [None]:
# The CDR interview scores are used as-is, so they are only registered for
# copying from the raw data.
variables.extend([
    "MEMORY",
    "ORIENT",
    "JUDGMENT",
    "COMMUN",
    "HOMEHOBB",
    "PERSCARE",
    "CDRSUM",
    "CDRGLOB",
])

*NACCGDS*  
Geriatric Depression Scale (higher is more depression)
Score 0-15  
88 = couldn't be calculated  
-4 = not available  
Recoding this variable to fill in error codes with NaN  
**NOT CURRENTLY AVAILABLE**

# Recode NACCGDS: keep scores in the valid 0-15 range and replace everything
# else (88 = could not be calculated, -4 = not available) with NaN.
# The column is iterated directly rather than looked up by label once per row,
# and the row count is reported once instead of printing on every row.
gds = [score if 0 <= score <= 15 else np.nan for score in rawdata['NACCGDS']]
print(str(len(gds)) + '/' + str(len(rawdata)))
var_dict['gds'] = gds

### Demographics  

*NACCAGE*

Age at time of visit from 18 to 120. Birth year and month are collected, but not day. Therefore, age is estimated using birth day set to 1.

In [None]:
# NACCAGE is copied from the raw data unchanged when the dataframe is assembled.
variables.append('NACCAGE')

*RACE*  
Race  
1 = White  
2 = Black or African American  
3 = American Indian or Alaska Native  
4 = Native Hawaiian or Pacific Islander  
5 = Asian  
50 = Other (specify)  
99 = Unknown  

*RACEX*  
If Race = 50, the specify text is listed here.

Recode into *race*  
Same codes as above, except 50 and 99 are now NaN.

In [None]:
# Codes above 5 (50 = other, 99 = unknown) become NaN; all other values pass through.
var_dict['race'] = [
    np.nan if code > 5 else code
    for code in rawdata['RACE']
]

*SEX*  
1=Male  
2=Female  

Recode into *sex*  
Male  
Female  
NaN (any other code)

In [None]:
# Translate the numeric SEX code to text; anything other than 1 or 2 becomes NaN.
sex_names = {1: 'Male', 2: 'Female'}
var_dict['sex'] = [sex_names.get(code, np.nan) for code in rawdata['SEX']]

*HANDED*  
1=left  
2=right  
3=ambidextrous  
9=unknown  

Recode into *handed*  
left  
right  
ambidextrous  
NaN  

In [None]:
# Recode HANDED (1 = left, 2 = right, 3 = ambidextrous) to text labels.
# Every row must contribute exactly one entry: 9 (unknown) and any other
# code or blank become NaN. With independent `if`s and no fallback, rows
# holding an unlisted code appended nothing, which left this column shorter
# than the data set.
handed_names = {1: 'left', 2: 'right', 3: 'ambidextrous'}
var_dict['handed'] = [handed_names.get(code, np.nan) for code in rawdata['HANDED']]

*EDUC*  
0 to 36  
99 = Unknown  

Recode to *educ*  
[0 - 12) = elementary  
[12 - 16) = high school  
[16 - 18) = college  
[18 - 20) = masters  
[20 - 37) = doctorate  
[37 - $\infty$ ) = NaN

In [None]:
# Bucket years of education into the documented half-open ranges.
# Anything outside [0, 37) is treated as missing: 99 (unknown), blanks, and
# negative codes, which a bare `< 12` test would have classed as 'elementary'.
educ = []
for edu in rawdata['EDUC']:
    if not (0 <= edu < 37):
        educ.append(np.nan)
    elif edu < 12:
        educ.append('elementary')
    elif edu < 16:
        educ.append('high school')
    elif edu < 18:
        educ.append('college')
    elif edu < 20:
        educ.append('masters')
    else:
        educ.append('doctorate')
var_dict['educ'] = educ
del educ

*NACCLIVS*  
1 = Lives alone  
2 = Lives with spouse or partner  
3 = Lives with relative or friend  
4 = Lives with group  
5 = Other  
6 = Unknown

Recode into *living_sit*  
alone  
partner  
friend  
group  
other  
NaN

In [None]:
# Map the NACCLIVS code to a short label; 6 (unknown) and anything else become NaN.
living_names = {
    1: 'alone',
    2: 'partner',
    3: 'friend',
    4: 'group',
    5: 'other',
}
var_dict['living_sit'] = [
    living_names.get(code, np.nan) for code in rawdata['NACCLIVS']
]

*INDEPEND*  

1=Living independently  
2=Requires some assistance with complex activities  
3=Requires some assistance with basic activities  
4=Completely dependent  
9=Unknown  

Recode to *independ*  
independent  
some assistance  
significant assistance  
fully assisted

In [None]:
# Map the INDEPEND code to a level of independence; 9 (unknown) and anything
# else become NaN.
independence_names = {
    1: 'independent',
    2: 'some assistance',
    3: 'significant assistance',
    4: 'fully assisted',
}
var_dict['independ'] = [
    independence_names.get(code, np.nan) for code in rawdata['INDEPEND']
]

### Diagnosis Labels

*NACCALZD*  
0 = No (assumed assessed and found not present)  
1 = Yes, subject with any cognitive impairment and AD indicated as the etiologic diagnosis.  
8 = No cognitive impairment  

*NACCBVFT*  
0 = No, subjects with dementia syndrome or etiologic diagnosis other than bvFTD.  
1 = Yes, subject with bvFTD dementia syndrome.  
8 = No cognitive impairment

Recode into *labels*.  
AD  
bvFTD  
NaN

In [None]:
# Build the diagnosis label for every visit:
#   'AD'    - Alzheimer's disease flagged, bvFTD not flagged
#   'bvFTD' - bvFTD flagged, Alzheimer's disease not flagged
#   NaN     - neither flagged, or both flagged at once
# The two columns are walked in lockstep instead of being looked up by label
# on every row, so the result no longer depends on rawdata having a 0..n-1 index.
labels = []
for alzd, bvft in zip(rawdata["NACCALZD"], rawdata["NACCBVFT"]):
    has_ad = alzd == 1
    has_bvftd = bvft == 1
    if has_ad and not has_bvftd:
        labels.append('AD')
    elif has_bvftd and not has_ad:
        labels.append('bvFTD')
    else:
        labels.append(np.nan)
var_dict['labels'] = labels
del labels

### Physical/Neurological Exam Findings

*FOCLDEF*, *GAITDIS*, *EYEMOVE*, *PARKSIGN*, *RESTTRL*, *RESTTRR*, *SLOWINGL*, *SLOWINGR*, *RIGIDL*, *RIGIDR*, *BRADY*, *PARKGAIT*, *POSTINST*, *CVDSIGNS*, *CORTDEF*, *SIVDFIND*, *CVDMOTL*, *CVDMOTR*, *CORTVISL*, *CORTVISR*, *SOMATL*, *SOMATR*, *POSTCORT*

0 = No, symptom not present  
1 = Yes, symptom present  
9/-4/8 = Unknown/Not available/skipped  
Recode to Yes/No/NaN  

In [None]:
# Physical/neurological exam columns that share the 0 = No / 1 = Yes coding.
exam_columns = ["FOCLDEF", "GAITDIS", "EYEMOVE", "PARKSIGN", "RESTTRL", "RESTTRR",
                "SLOWINGL", "SLOWINGR", "RIGIDL", "RIGIDR", "BRADY", "PARKGAIT",
                "POSTINST", "CVDSIGNS", "CORTDEF", "SIVDFIND", "CVDMOTL", "CVDMOTR",
                "CORTVISL", "CORTVISR", "SOMATL", "SOMATR", "POSTCORT"]

# Any code other than 0 or 1 (unknown / not available / skipped) becomes NaN.
exam_answers = {0: 'No', 1: 'Yes'}
for column in exam_columns:
    var_dict[column] = [exam_answers.get(code, np.nan) for code in rawdata[column]]

*HEIGHT*  
36 to 87.9, all others are recoded to error.  

*WEIGHT*  
50-400, all others are recoded to error.  

*BMI* = WEIGHT * 703 / HEIGHT^2  

In [None]:
# Compute BMI = WEIGHT * 703 / HEIGHT**2 for rows where both measurements are
# inside their valid ranges (height 36-87.9, weight 50-400); otherwise NaN.
# `**` is the power operator: `^` is bitwise XOR, which binds after the
# division and is not defined for floats, so `h^2` never squared the height.
# 703.0 keeps the division floating-point even for integer inputs.
bmi = []
for h, w in zip(rawdata['HEIGHT'], rawdata['WEIGHT']):
    if 36.0 <= h <= 87.9 and 50 <= w <= 400:
        bmi.append(w * 703.0 / h ** 2)
    else:
        bmi.append(np.nan)

var_dict['bmi'] = bmi
del bmi

*COGMODE*  
0=No impairment in cognition  
1=gradual  
2=subacute  
3=abrupt  
4=other  
99=unknown

In [None]:
# Map the COGMODE code to the mode of onset of cognitive decline.
# 4 = other is kept as its own category, matching how MOMODE is recoded;
# previously it fell through to NaN together with 99 (unknown).
# NOTE(review): confirm that 'Other' should be retained rather than dropped.
decline_names = {
    0: 'No Impairment',
    1: 'Gradual',
    2: 'Subacute',
    3: 'Abrupt',
    4: 'Other',
}
var_dict['decline'] = [
    decline_names.get(code, np.nan) for code in rawdata['COGMODE']
]

*DECAGE*  
Estimated age at which cog decline began.  
15-110  
All other values recode to NaN

In [None]:
# Keep DECAGE only inside its documented valid range of 15-110; every other
# value becomes NaN. The lower bound was 0, which let ages 0-14 through.
var_dict['decage'] = [
    age if 15 <= age <= 110 else np.nan
    for age in rawdata['DECAGE']
]

*DECCLBE, BEAPATHY, BEDEP, BEVHALL, BEVWELL, BEAHALL, BEDEL, BEDISIN, BEIRRIT, BEAGIT, BEPERCH, BEREM, BEANX*  
0 = No  
1 = Yes  
9/-4/8 = Unknown/Not available/skipped  
Recode to Yes/No/NaN  

In [None]:
# Behavioural symptom columns that share the 0 = No / 1 = Yes coding.
behavior_columns = ["DECCLBE", "BEAPATHY", "BEDEP", "BEVHALL", "BEVWELL",
                    "BEAHALL", "BEDEL", "BEDISIN", "BEIRRIT", "BEAGIT", "BEPERCH",
                    "BEREM", "BEANX"]

# Any code other than 0 or 1 (unknown / not available / skipped) becomes NaN.
behavior_answers = {0: 'No', 1: 'Yes'}
for column in behavior_columns:
    var_dict[column] = [behavior_answers.get(code, np.nan) for code in rawdata[column]]

*NACCBEHF*  
Predominant symptom that was first recognized as a decline in behavior.  
0=No behavioral symptoms  
1=Apathy/withdrawal  
2=Depressed mood  
3=Psychosis  
4=Disinhibition  
5=Irritability  
6=Agitation  
7=Personality change  
8=REM sleep behavior disorder  
9=Anxiety  
10=Other  

In [None]:
# Map NACCBEHF to the name of the first-recognised behavioural symptom.
# Code 2 is 'Depressed mood'; it was emitted as 'ApathyDepressed mood'
# (a paste error from the line above). Unlisted codes become NaN.
first_behavior_names = {
    0: 'No behavioral symptoms',
    1: 'Apathy/withdrawal',
    2: 'Depressed mood',
    3: 'Psychosis',
    4: 'Disinhibition',
    5: 'Irritability',
    6: 'Agitation',
    7: 'Personality change',
    8: 'REM sleep behavior disorder',
    9: 'Anxiety',
    10: 'Other',
}
var_dict['naccbehf'] = [
    first_behavior_names.get(code, np.nan) for code in rawdata['NACCBEHF']
]

*BEMODE*  
Mode of behavioral symptom onset.  
0=No symptoms  
1=Gradual  
2=Subacute  
3=Abrupt  
4=Other  
99=Unknown

In [None]:
# Map BEMODE to the mode of behavioural symptom onset, following the codebook:
# 0 = no symptoms, 1 = gradual, 2 = subacute, 3 = abrupt, 4 = other.
# The labels were shifted by one (2 -> 'Abrupt', 3 -> 'Other'), so 'Subacute'
# never appeared and code 4 was dropped. 99 (unknown) and anything else -> NaN.
bemode_names = {
    0: 'No behavior symptoms',
    1: 'Gradual',
    2: 'Subacute',
    3: 'Abrupt',
    4: 'Other',
}
var_dict['bemode'] = [
    bemode_names.get(code, np.nan) for code in rawdata['BEMODE']
]

*BEAGE*  
Age of behavioral symptom onset.  
15 to 110, all other values are error codes.

In [None]:
# Keep BEAGE only inside its valid range of 15-110; error codes become NaN.
# The two branches were swapped, which kept the error codes and blanked out
# every valid age.
var_dict['beage'] = [
    age if 15 <= age <= 110 else np.nan
    for age in rawdata['BEAGE']
]
    

*DECCLMOT* - Is the subject experiencing any motor symptoms?  
*MOGAIT* - Meaningful changes in motor gait?  
*MOFALLS* - Meaningful changes in falls?  
*MOTREM* - Meaningful changes in tremors?  
*MOSLOW* - Meaningful changes in slowness?  
0=No  
1=Yes

In [None]:
# Motor symptom columns that share the 0 = No / 1 = Yes coding.
motor_columns = ['DECCLMOT', 'MOGAIT', 'MOFALLS', 'MOTREM', 'MOSLOW']

# Any code other than 0 or 1 becomes NaN.
motor_answers = {0: "No", 1: "Yes"}
for column in motor_columns:
    var_dict[column] = [motor_answers.get(code, np.nan) for code in rawdata[column]]

*NACCMOTF*  - Predominant motor symptom  
0=No Motor symptoms  
1=Gait disorder  
2=falls  
3=tremor  
4=slowness  

In [None]:
# Map NACCMOTF to the predominant first motor symptom.
# Code 0 means no *motor* symptoms; it was labelled 'No behavior symptoms',
# carried over from the behavioural cells. Unlisted codes become NaN.
motor_symptom_names = {
    0: 'No motor symptoms',
    1: 'Gait disorder',
    2: 'Falls',
    3: 'Tremor',
    4: 'Slowness',
}
var_dict['naccmotf'] = [
    motor_symptom_names.get(code, np.nan) for code in rawdata['NACCMOTF']
]

*MOMODE* - mode of motor symptom onset  
0=No motor symptoms  
1=Gradual  
2=Subacute  
3=Abrupt  
4=Other  
99=Unknown

In [None]:
# Map MOMODE to the mode of motor symptom onset.
# Code 0 means no *motor* symptoms; it was labelled 'No behavior symptoms',
# carried over from the behavioural cells. 99 (unknown) and others -> NaN.
momode_names = {
    0: 'No motor symptoms',
    1: 'Gradual',
    2: 'Subacute',
    3: 'Abrupt',
    4: 'Other',
}
var_dict['momode'] = [
    momode_names.get(code, np.nan) for code in rawdata['MOMODE']
]

*FRSTCHG* - Primary first domain that was recognized as changed  
1=Cognition  
2=Behavior  
3=Motor function  

In [None]:
# Map FRSTCHG to the domain in which change was first recognised;
# any code other than 1-3 becomes NaN.
first_change_names = {1: 'Cognition', 2: 'Behavior', 3: 'Motor function'}
var_dict['FRSTCHG'] = [
    first_change_names.get(code, np.nan) for code in rawdata['FRSTCHG']
]

### Neuropsychological Battery Scores
*MMSEORDA*  - Orientation, Time  
0-5  

In [None]:
# MMSE orientation to time: keep scores from 0 to 5, everything else becomes NaN.
var_dict['mmseorda'] = [
    score if 0 <= score <= 5 else np.nan
    for score in rawdata['MMSEORDA']
]

*MMSEORLO* - Orientation, Place  
0-5

In [None]:
# MMSE orientation to place: keep scores from 0 to 5, everything else becomes NaN.
var_dict['mmseorlo'] = [
    score if 0 <= score <= 5 else np.nan
    for score in rawdata['MMSEORLO']
]

*PENTAGON* - Drew the MMSE pentagon  
0-1

In [None]:
# MMSE pentagon drawing: keep values from 0 to 1, everything else becomes NaN.
var_dict['pentagon'] = [
    score if 0 <= score <= 1 else np.nan
    for score in rawdata['PENTAGON']
]

*NACCMMSE* - Total MMSE score  
0-30

In [None]:
# Total MMSE score: keep scores from 0 to 30, everything else becomes NaN.
var_dict['naccmmse'] = [
    score if 0 <= score <= 30 else np.nan
    for score in rawdata['NACCMMSE']
]

*LOGIMEM* - Story units recalled  
0-25

In [None]:
# Story units recalled: keep counts from 0 to 25, everything else becomes NaN.
var_dict['logimem'] = [
    score if 0 <= score <= 25 else np.nan
    for score in rawdata['LOGIMEM']
]

*MEMUNITS* - Story units recalled, delayed recall  
0-25

In [None]:
# Story units recalled after a delay: keep counts from 0 to 25, everything
# else becomes NaN.
var_dict['memunits'] = [
    score if 0 <= score <= 25 else np.nan
    for score in rawdata['MEMUNITS']
]

*UDSBENTC* - Benson figure copy  
0-17

In [None]:
# Benson figure copy: keep scores from 0 to 17, everything else becomes NaN.
var_dict['udsbentc'] = [
    score if 0 <= score <= 17 else np.nan
    for score in rawdata['UDSBENTC']
]

*UDSBENTD* - Benson figure delayed recall  
0-17

In [None]:
# Benson figure delayed recall: keep scores from 0 to 17, everything else
# becomes NaN.
var_dict['udsbentd'] = [
    score if 0 <= score <= 17 else np.nan
    for score in rawdata['UDSBENTD']
]

*UDSBENRS* - Benson figure recognition  
0=No  
1=Yes

In [None]:
# Benson figure recognition: 0 -> "No", 1 -> "Yes", any other code -> NaN.
recognition_answers = {0: "No", 1: "Yes"}
var_dict['udsbenrs'] = [
    recognition_answers.get(code, np.nan) for code in rawdata['UDSBENRS']
]

*DIGIFLEN* - Digits forwards length  
0-8

In [None]:
# DIGIFLEN digit-span length: keep values from 0 to 8, everything else becomes NaN.
var_dict['digiflen'] = [
    score if 0 <= score <= 8 else np.nan
    for score in rawdata['DIGIFLEN']
]

*DIGIBLEN* - Digits backward length  
0-8

In [None]:
# DIGIBLEN digit-span length: keep values from 0 to 8, everything else becomes NaN.
var_dict['digiblen'] = [
    score if 0 <= score <= 8 else np.nan
    for score in rawdata['DIGIBLEN']
]

*ANIMALS* - Animals named in a minute  
0-77

In [None]:
# Animals named in a minute: keep counts from 0 to 77, everything else becomes NaN.
var_dict['animals'] = [
    score if 0 <= score <= 77 else np.nan
    for score in rawdata['ANIMALS']
]

*VEG* - Vegetables named in a minute  
0-77

In [None]:
# Vegetables named in a minute: keep counts from 0 to 77, everything else
# becomes NaN.
var_dict['vegetables'] = [
    score if 0 <= score <= 77 else np.nan
    for score in rawdata['VEG']
]

*TRAILA* - Seconds to complete trail making test A  
0-150

In [None]:
# Trail Making Test A time: keep values from 0 to 150 seconds, everything
# else becomes NaN.
var_dict['traila'] = [
    score if 0 <= score <= 150 else np.nan
    for score in rawdata['TRAILA']
]

*TRAILARR* - Errors made when completing trail making test A  
0-40

In [None]:
# Trail Making Test A errors: keep counts from 0 to 40, everything else
# becomes NaN.
var_dict['trailarr'] = [
    score if 0 <= score <= 40 else np.nan
    for score in rawdata['TRAILARR']
]

*TRAILALI* - Correct lines drawn when completing trail making test A  
0-24

In [None]:
# Trail Making Test A correct lines: keep counts from 0 to 24, everything
# else becomes NaN.
var_dict['trailali'] = [
    score if 0 <= score <= 24 else np.nan
    for score in rawdata['TRAILALI']
]

*TRAILB* - Seconds to complete trail making test B  
0-300

In [None]:
# TRAILB completion time: keep values from 0 to 300 seconds, everything else
# becomes NaN.
var_dict['trailb'] = [
    score if 0 <= score <= 300 else np.nan
    for score in rawdata['TRAILB']
]

*TRAILBRR* - Errors made when completing trail making test B  
0-40

In [None]:
# TRAILBRR error count: keep counts from 0 to 40, everything else becomes NaN.
var_dict['trailbrr'] = [
    score if 0 <= score <= 40 else np.nan
    for score in rawdata['TRAILBRR']
]

*TRAILBLI* - Correct lines drawn when completing trail making test B  
0-24

In [None]:
# TRAILBLI correct-line count: keep counts from 0 to 24, everything else
# becomes NaN.
var_dict['trailbli'] = [
    score if 0 <= score <= 24 else np.nan
    for score in rawdata['TRAILBLI']
]

*WAIS* - wais-r digit symbol test  
0-93

In [None]:
# WAIS-R digit symbol: keep scores from 0 to 93, everything else becomes NaN.
var_dict['wais'] = [
    score if 0 <= score <= 93 else np.nan
    for score in rawdata['WAIS']
]

*BOSTON* - Boston Naming Test  
0-30

In [None]:
# Boston Naming Test: keep scores from 0 to 30, everything else becomes NaN.
var_dict['boston'] = [
    score if 0 <= score <= 30 else np.nan
    for score in rawdata['BOSTON']
]

*UDSVERFC* - Words starting with F generated in a minute  
0-40  

In [None]:
# F-words generated in a minute: keep counts from 0 to 40, everything else
# becomes NaN.
var_dict['UDSVERFC'] = [
    score if 0 <= score <= 40 else np.nan
    for score in rawdata['UDSVERFC']
]

*UDSVERLC* - Words starting with L generated in a minute  
0-40

In [None]:
# L-words generated in a minute: keep counts from 0 to 40, everything else
# becomes NaN.
var_dict['udsverlc'] = [
    score if 0 <= score <= 40 else np.nan
    for score in rawdata['UDSVERLC']
]

*MOCATOTS* - MoCA total raw score  
0-30

In [None]:
# MoCA total raw score: keep scores from 0 to 30, everything else becomes NaN.
var_dict['mocatots'] = [
    score if 0 <= score <= 30 else np.nan
    for score in rawdata['MOCATOTS']
]

# NOTE FOR NIHAR-
# CONTINUE FROM MOCATOTS

# Intermediate Data
Some variables require recoding after the dataset is assembled. These variables are added to the dataset here.

In [None]:
# SMOKYRS is copied raw here; it is recoded after the dataframe is assembled.
variables.append('SMOKYRS')

# Create Dataframe

In [None]:
# Bring over the columns that need no recoding, straight from the raw data.
var_dict.update((name, rawdata[name]) for name in variables)

# Assemble every collected column into a single dataframe.
selectdata = pd.DataFrame(var_dict)

## *SMOKYRS*

Total years smoked cigarettes.  
0-87  
88=Not applicable  
99=Unknown  
-4=Not available  

Assessment skip patterns may preclude response to question.

Recode for all subjects, taking the oldest value in the dataset, and removing error codes.

In [None]:
# Order each subject's visits from youngest to oldest.
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
selectdata = selectdata.sort_values(['NACCID', 'NACCAGE'], ascending=[True, True])
selectdata.index = range(1, len(selectdata) + 1)

# Remove the error codes (88, 99, -4): only 0-87 is a valid number of years.
# The previous loop tested the fixed row 10 on every pass and walked labels
# 0..n-1 although the index runs 1..n, so the codes were never cleared row by row.
smoked = selectdata['SMOKYRS']
selectdata['SMOKYRS'] = smoked.where((smoked >= 0) & (smoked <= 87))

# Fill visits where the question was skipped with the same subject's most
# recent earlier answer. The previous loop had the NaN test inverted: it
# overwrote valid answers and left the gaps empty, and it stopped one row short.
# NOTE(review): assumes the intent is to fill missing answers - confirm.
selectdata['SMOKYRS'] = selectdata.groupby('NACCID')['SMOKYRS'].ffill()
print(str(len(selectdata)) + '/' + str(len(selectdata)))


# Order each subject's visits from oldest to youngest and fill again, so
# gaps before a subject's first answer take the next later answer.
selectdata = selectdata.sort_values(['NACCID', 'NACCAGE'], ascending=[True, False])
selectdata.index = range(1, len(selectdata) + 1)

selectdata['SMOKYRS'] = selectdata.groupby('NACCID')['SMOKYRS'].ffill()
print(str(len(selectdata)) + '/' + str(len(selectdata)))


# Filter subjects  

Load and filter for only the previously selected NACCID's

In [None]:
# Keep only the visits of subjects listed in the randomization file.
randomization = pd.read_csv("randomization/allsubjects.csv", low_memory=False)
is_selected = selectdata['NACCID'].isin(randomization['NACCID'])
selectdata = selectdata[is_selected]

# Report how many visits carry each label, counting unlabeled ones too.
print(selectdata['labels'].value_counts(dropna=False))

Assign each subject entry with the latest diagnosis.

In [None]:
# Order each subject's visits from oldest to youngest, so the first row of
# every subject is the most recent visit.
# DataFrame.sort and set_value no longer exist in pandas.
selectdata = selectdata.sort_values(['NACCID', 'NACCAGE'], ascending=[True, False])
selectdata.index = range(1, len(selectdata) + 1)

# Give every visit the label of that subject's most recent visit (NaN included).
# drop_duplicates keeps the first row per NACCID, i.e. the latest visit.
# The previous row-by-row loop stopped one row short of the 1-based index,
# so the final visit in the table was never updated.
latest_label = (
    selectdata.drop_duplicates(subset='NACCID')
    .set_index('NACCID')['labels']
)
selectdata['labels'] = selectdata['NACCID'].map(latest_label)
print(str(len(selectdata)) + '/' + str(len(selectdata)))
selectdata['labels'].value_counts(dropna=False)

Remove the non-coded values.

In [None]:
# Drop the visits without a diagnosis label. Labels are the strings
# 'AD' / 'bvFTD' or NaN, so the filter tests for missing values directly;
# comparing strings with `> -1` is a TypeError on Python 3.
selectdata = selectdata[selectdata['labels'].notnull()]
print(selectdata['labels'].value_counts(dropna=False))

## Preview Results

In [None]:
# Display the first 15 rows of the assembled data.
selectdata.head(15)

# Split into train/test/validate

In [None]:
# Reload the subject-to-group assignment.
randomization = pd.read_csv("randomization/allsubjects.csv", low_memory=False)
assigned_group = randomization['group']
subject_ids = randomization['NACCID']

# A visit belongs to a split when its NACCID is assigned to that group.
train = selectdata[selectdata['NACCID'].isin(subject_ids[assigned_group == "train"])]
test = selectdata[selectdata['NACCID'].isin(subject_ids[assigned_group == "test"])]
validate = selectdata[selectdata['NACCID'].isin(subject_ids[assigned_group == "validate"])]

# Save the results to csv.
Final results are in the "result" folder.

In [None]:
# Write the full table and each split to the result folder, without the index column.
outputs = (
    (selectdata, 'result/selectdata.csv'),
    (train, 'result/train.csv'),
    (test, 'result/test.csv'),
    (validate, 'result/validate.csv'),
)
for frame, path in outputs:
    frame.to_csv(path, index=False)