### Use the preprocessed data for ML model building. 


In [201]:
# Setup
import numpy as np 
import pandas as pd 
import os

# Seed NumPy's global random number generator so that anything drawing from
# np.random produces the same output on every run of this notebook.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Label font sizes for the axes and for the x/y tick labels.
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Directory that receives saved figures; created up front if it does not exist.
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(
    PROJECT_ROOT_DIR,
    "images/ml_modeling_images",
)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into ``IMAGES_PATH``.

    Args:
        fig_id: Base file name of the image, without extension.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension; also passed to ``savefig`` as ``format``.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    path = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Directory holding the preprocessed CSV files, and the base names to load from it.
ALZHEIMERS_PATH = "dataset/afterpreprocessing"
file_names = [
    "Overall_Health",
    "Mental_Health",
    "Smoking_and_Alcohol_Use",
    "Screenings_and_Vaccines",
    "Nutrition_Physical_Activity_Obesity",
    "Caregiving",
    "Cognitive_Decline",
]

# One data frame per entry of file_names, kept in the same order.
alzheimers_dfs = [
    pd.read_csv(os.path.join(ALZHEIMERS_PATH, name + '.csv'))
    for name in file_names
]

alzheimers_dfs[0].head()

# Load all_alzheimers_data.csv and replace missing values with 0.
all_alzheimers_data = pd.read_csv(
    os.path.join(ALZHEIMERS_PATH, 'all_alzheimers_data.csv')
).fillna(0)


### Full Feature Set With Decision Tree

In [202]:
# Training and Visualizing a Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source

def DecisionTreeModel(df):
    """Fit a depth-2 decision tree on each data frame and render it with graphviz.

    Args:
        df: Sequence of data frames (despite the singular name). Entry ``i`` is
            paired with ``file_names[i]``. Each frame must contain a
            ``LocationDesc`` column, which is used as the target; every other
            column is used as a feature.

    Side effects:
        Renders one PNG per frame to ``IMAGES_PATH/decision_tree/<file_names[i]>``.
    """
    for i, frame in enumerate(df):
        features = frame.drop(['LocationDesc'], axis=1)
        target = frame['LocationDesc']

        # A fresh estimator per frame keeps the fits independent of each other.
        tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
        tree_clf.fit(features, target)

        dot_source = export_graphviz(
            tree_clf,
            out_file=None,
            feature_names=list(features.columns),
            # class_names must list the distinct classes in ascending order, which
            # is exactly what classes_ holds. Passing the raw target column (one
            # entry per row) can attach the wrong class name to a node.
            class_names=[str(label) for label in tree_clf.classes_],
            rounded=True,
            filled=True,
        )
        graph = Source(dot_source)
        graph.format = 'png'
        graph.render(os.path.join(IMAGES_PATH, 'decision_tree', file_names[i]), view=False)

DecisionTreeModel(alzheimers_dfs)

In [203]:
# split input and target into training and testing sets
from sklearn.model_selection import train_test_split
def train_test_decision_tree(df):
    """Print the train/test shapes of an 80/20 split for every data frame.

    Args:
        df: Sequence of data frames; entry ``i`` is labelled with
            ``file_names[i]``. ``LocationDesc`` is the target column and the
            remaining columns are the inputs.

    The splits themselves are discarded; only their shapes are printed.
    """
    separator = '----------------------------------------------------'
    for idx, frame in enumerate(df):
        features = frame.drop(['LocationDesc'], axis=1)
        labels = frame['LocationDesc']
        split = train_test_split(features, labels, test_size=0.2, random_state=42)
        input_train, input_test, target_train, target_test = split
        print(
            f'{file_names[idx]} \n Input Train and Test Shape',
            input_train.shape,
            input_test.shape,
            '\n Target Train and Test Shape',
            target_train.shape,
            target_test.shape,
        )
        print(separator)
        
train_test_decision_tree(alzheimers_dfs)


Overall_Health 
 Input Train and Test Shape (141, 13) (36, 13) 
 Target Train and Test Shape (141,) (36,)
----------------------------------------------------
Mental_Health 
 Input Train and Test Shape (140, 3) (36, 3) 
 Target Train and Test Shape (140,) (36,)
----------------------------------------------------
Smoking_and_Alcohol_Use 
 Input Train and Test Shape (140, 3) (36, 3) 
 Target Train and Test Shape (140,) (36,)
----------------------------------------------------
Screenings_and_Vaccines 
 Input Train and Test Shape (141, 11) (36, 11) 
 Target Train and Test Shape (141,) (36,)
----------------------------------------------------
Nutrition_Physical_Activity_Obesity 
 Input Train and Test Shape (141, 5) (36, 5) 
 Target Train and Test Shape (141,) (36,)
----------------------------------------------------
Caregiving 
 Input Train and Test Shape (122, 6) (31, 6) 
 Target Train and Test Shape (122,) (31,)
----------------------------------------------------
Cognitive_Decline 
 

### Reduced Feature Set With Random Forest

In [204]:
# top feature selector
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def top_feature_selector(df, top_num=20):
    """Score every feature against ``LocationDesc`` with chi-squared and print the best.

    Args:
        df: Data frame with a ``LocationDesc`` target column; every other column
            is scored as a feature. NOTE(review): chi2 expects non-negative
            feature values -- confirm the preprocessing guarantees this.
        top_num: Number of highest-scoring features to print.

    Returns:
        A single-column DataFrame with the names of *all* feature columns in
        their original order (not only the top-scoring ones).
    """
    features = df.drop(['LocationDesc'], axis=1)
    target = df['LocationDesc']

    # Only the per-feature scores are read below, so no feature needs to be
    # selected away. k='all' also avoids the error/warning a fixed k raises when
    # a frame has fewer feature columns than k.
    fit = SelectKBest(score_func=chi2, k='all').fit(features, target)

    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(features.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # name the two columns
    print(featureScores.nlargest(top_num, 'Score'))  # print the top_num best features
    print('----------------------------------------------------')
    return dfcolumns


We are removing features that would not help us predict the probability of a patient having Alzheimer's:

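- "Duration of caregiving among older adults" - Many older adults seek caregiving due to an Alzheimer's diagnosis, so this feature is likely a consequence of the disease rather than a predictor of it.
- "Intensity of caregiving among older adults" - Many older adults seek caregiving due to an Alzheimer's diagnosis, so this feature is likely a consequence of the disease rather than a predictor of it.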
- "Talked with health care professional about subjective cognitive decline or memory loss" - Our risk prediction model is for those considering seeking testing for Alzheimer's, so speaking with a health care professional is implied.
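- "Expect to provide care for someone in the next two years" - This feature describes the respondent's role as a caregiver rather than their own health, so it is irrelevant to their Alzheimer's risk.
- "Provide care for a friend or family member in past month" - This feature describes the respondent's role as a caregiver rather than their own health, so it is irrelevant to their Alzheimer's risk.
- "Provide care for someone with cognitive impairment within the past month" - This feature describes the respondent's role as a caregiver rather than their own health, so it is irrelevant to their Alzheimer's risk.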
- "Self-rated health (fair to poor health)" - This feature is subjective, so we believe it might not always be accurate.
- "Self-rated health (good to excellent health)" - This feature is subjective, so we believe it might not always be accurate.