### Use the preprocessed data for ML model building. 


In [367]:
# Setup
import numpy as np 
import pandas as pd 
import os
import pickle

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images/ml_modeling_images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

def save_model(model, folder_path="models", file_name="untitled_model.sav"):
    pickle.dump(model, open(os.path.join(folder_path, file_name), 'wb'))

ALZHEIMERS_PATH = "dataset/afterpreprocessing"
alzheimers_dfs = []
file_names = ["Overall_Health", "Mental_Health", "Smoking_and_Alcohol_Use", "Screenings_and_Vaccines", "Nutrition_Physical_Activity_Obesity", "Caregiving", "Cognitive_Decline"]

for i in range(len(file_names)):
    csv_path = os.path.join(ALZHEIMERS_PATH, f'{file_names[i]}.csv')
    df = pd.read_csv(csv_path)
    alzheimers_dfs.append(df)

alzheimers_dfs[0].head()

all_alzheimers_data = pd.read_csv(os.path.join(ALZHEIMERS_PATH, f'all_alzheimers_data.csv'))

all_alzheimers_data.fillna(0, inplace=True)


### Full Feature Set With Decision Tree

In [368]:
# Training and Visualizing a Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source

MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models")
DECISION_TREE_PATH = os.path.join(MODELS_PATH, 'decision_tree')

def FullDecisionTreeModel(df):
    tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
    for i in range(len(df)):
        input = df[i].drop(['LocationDesc'], axis=1)
        target = df[i]['LocationDesc']
        tree_clf.fit(input, target)
        graph = Source(export_graphviz(tree_clf, out_file=None, feature_names=input.columns, class_names=target, rounded=True, filled=True))
        graph.format = 'png'
        graph.render( IMAGES_PATH + '/decision_tree/' + file_names[i], view=False)
        filename = f'{file_names[i]}_decision_tree_model.sav'
        save_model(tree_clf, DECISION_TREE_PATH, filename)

FullDecisionTreeModel(alzheimers_dfs)

In [369]:
# split input and target into training and testing sets
from sklearn.model_selection import train_test_split
def train_test_decision_tree(df):
    for i in range(len(df)):
        input = df[i].drop(['LocationDesc'], axis=1)
        target = df[i]['LocationDesc']
        input_train, input_test, target_train, target_test = train_test_split(input, target, test_size=0.2, random_state=42)
        print(f'{file_names[i]} \n Input Train and Test Shape', input_train.shape, input_test.shape,
              '\n Target Train and Test Shape',target_train.shape, target_test.shape)
        print('----------------------------------------------------')
        
train_test_decision_tree(alzheimers_dfs)


Overall_Health 
 Input Train and Test Shape (141, 13) (36, 13) 
 Target Train and Test Shape (141,) (36,)
----------------------------------------------------
Mental_Health 
 Input Train and Test Shape (140, 3) (36, 3) 
 Target Train and Test Shape (140,) (36,)
----------------------------------------------------
Smoking_and_Alcohol_Use 
 Input Train and Test Shape (140, 3) (36, 3) 
 Target Train and Test Shape (140,) (36,)
----------------------------------------------------
Screenings_and_Vaccines 
 Input Train and Test Shape (141, 11) (36, 11) 
 Target Train and Test Shape (141,) (36,)
----------------------------------------------------
Nutrition_Physical_Activity_Obesity 
 Input Train and Test Shape (141, 5) (36, 5) 
 Target Train and Test Shape (141,) (36,)
----------------------------------------------------
Caregiving 
 Input Train and Test Shape (122, 6) (31, 6) 
 Target Train and Test Shape (122,) (31,)
----------------------------------------------------
Cognitive_Decline 
 

### Reduced Feature Set With K-Means

In [370]:
# top feature selector
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def top_feature_selector(df, top_num=20):
    input = df.drop(['LocationDesc'], axis=1)
    target = df['LocationDesc']
    bestfeatures = SelectKBest(score_func=chi2, k=3)
    fit = bestfeatures.fit(input,target)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(input.columns)
    featureScores = pd.concat([dfcolumns,dfscores],axis=1)
    featureScores.columns = ['Specs','Score']  #naming the dataframe columns
    print(featureScores.nlargest(top_num,'Score'))  #print 10 best features
    print('----------------------------------------------------')
    return dfcolumns


We are removing features that would not help us predict the probability of a patient having Alzheimer's:

- "Duration of caregiving among older adults" - Many older adults seek caregiving due to an Alzheimer's diagnosis.
- "Intensity of caregiving among older adults" - Many older adults seek caregiving due to an Alzheimer's diagnosis.
- "Talked with health care professional about subjective cognitive decline or memory loss" - Our risk prediction model is for those considering seeking testing for Alzheimer's, so speaking with a health care professional is implied.
- "Expect to provide care for someone in the next two years" - This feature is irrelevant.
- "Provide care for a friend or family member in past month" - This feature is irrelevant.
- "Provide care for someone with cognitive impairment within the past month" - This feature is irrelevant.
- "Self-rated health (fair to poor health)" - This feature is subjective, so we believe it might not always be accurate.
- "Self-rated health (good to excellent health)" - This feature is subjective, so we believe it might not always be accurate.

In [371]:
# Dropping irrelevant features
all_alzheimers_data = all_alzheimers_data.drop(columns=["Duration of caregiving among older adults", 
                                                        "Intensity of caregiving among older adults", 
                                                        "Talked with health care professional about subjective cognitive decline or memory loss",
                                                        "Expect to provide care for someone in the next two years",
                                                        "Provide care for a friend or family member in past month",
                                                        "Provide care for someone with cognitive impairment within the past month",
                                                        "Self-rated health (fair to poor health)",
                                                        "Self-rated health (good to excellent health)",
                                                        ], axis=1)

We are separating the dataset by age group before running feature selection, as feature correlation may differ between age groups.

In [372]:
# Separate dataframe by AgeGroup
alzheimers_data_age0 = all_alzheimers_data.loc[all_alzheimers_data['AgeGroup'] == 0].reset_index().drop(['index', 'AgeGroup'], axis=1) # 50-64 y/o
alzheimers_data_age1 = all_alzheimers_data.loc[all_alzheimers_data['AgeGroup'] == 1].reset_index().drop(['index', 'AgeGroup'], axis=1) # 65+ y/o
alzheimers_data_age2 = all_alzheimers_data.loc[all_alzheimers_data['AgeGroup'] == 2].reset_index().drop(['index', 'AgeGroup'], axis=1) # Overall (50+ y/o)

alzheimers_data_age0.head()
alzheimers_data_age1.head()
alzheimers_data_age2.head()

# Get top features for each AgeGroup
age0_features = top_feature_selector(alzheimers_data_age0)
age1_features = top_feature_selector(alzheimers_data_age1)
age2_features = top_feature_selector(alzheimers_data_age2)

alzheimers_data_age0 = alzheimers_data_age0.drop('LocationDesc', axis=1)
alzheimers_data_age1 = alzheimers_data_age1.drop('LocationDesc', axis=1)
alzheimers_data_age2 = alzheimers_data_age2.drop('LocationDesc', axis=1)


                                                Specs       Score
29  Need assistance with day-to-day activities bec...  172.647783
28  Functional difficulties associated with subjec...  162.803707
8   Severe joint pain among older adults with arth...  103.098425
2   Fair or poor health among older adults with ar...   83.298946
13                                    Current smoking   77.589801
1   Disability status, including sensory or mobili...   66.913574
26  No leisure-time physical activity within past ...   62.545320
23  Up-to-date with recommended vaccines and scree...   56.635056
12                 Binge drinking within past 30 days   52.592358
11                   Lifetime diagnosis of depression   50.982909
30  Subjective cognitive decline or memory loss am...   47.942302
19                 Influenza vaccine within past year   47.361629
18                           High blood pressure ever   46.016760
0                        Arthritis among older adults   43.591150
24        

In [373]:
# Keeping the top 20 features
alzheimers_data_age0 = alzheimers_data_age0.drop(columns=age0_features[0][20:].tolist(), axis=1)
alzheimers_data_age1 = alzheimers_data_age1.drop(columns=age1_features[0][20:].tolist(), axis=1)
alzheimers_data_age2 = alzheimers_data_age2.drop(columns=age2_features[0][20:].tolist(), axis=1)

alzheimers_data_age0.head()

Unnamed: 0,Arthritis among older adults,"Disability status, including sensory or mobility limitations",Fair or poor health among older adults with arthritis,Fall with injury within last year,Oral health: tooth retention,Physically unhealthy days (mean number of days),Prevalence of sufficient sleep,Recent activity limitations in past month,Severe joint pain among older adults with arthritis,Taking medication for high blood pressure,Frequent mental distress,Lifetime diagnosis of depression,Binge drinking within past 30 days,Current smoking,Cholesterol checked in past 5 years,Colorectal cancer screening,Diabetes screening within past 3 years,Ever had pneumococcal vaccine,High blood pressure ever,Influenza vaccine within past year
0,46.67,41.5,43.045,12.16,68.626667,6.2,59.5,8.183333,47.006667,87.88,16.233333,26.063333,10.083333,22.173333,92.986667,66.133333,65.876667,39.536667,57.92,41.433333
1,32.14,27.6,30.965,14.235714,79.52,4.943333,65.466667,5.731034,26.23,70.314286,11.033333,18.368966,14.87931,22.963333,87.993333,60.64,61.336,38.532143,40.666667,36.49
2,33.278571,34.494286,43.808696,10.711765,79.647619,5.974359,65.633333,7.3,38.647059,79.883333,13.022857,19.85,12.209677,16.969697,91.12,57.69,64.58,41.04,44.342857,38.9475
3,43.83,42.736,47.335,14.986667,68.586667,6.86,62.026667,8.193333,46.88,88.693333,16.441379,26.443333,10.888889,24.04,91.671429,63.293333,65.12,44.206667,58.12,43.58
4,27.788889,27.967647,36.572727,9.9,85.895238,4.782927,62.219048,6.618919,36.964706,79.055,12.4,17.113889,12.784375,12.988889,92.385714,62.514286,59.56,37.405882,41.742857,40.259524


Encode LocationDesc

In [374]:
# from sklearn.preprocessing import OneHotEncoder
# enc = OneHotEncoder(sparse=False)

# def encode_location(df):
#     location = df.loc[:, ['LocationDesc']]
#     location_enc = enc.fit_transform(location)
#     # location_enc.toarray()
#     print(len(df.loc[:, ['LocationDesc']]))
#     print(len(location_enc))
#     df.loc[i, ['LocationDesc']] = location_enc[i]
#     return df

# # Encode location
# alzheimers_data_age0 = encode_location(alzheimers_data_age0)

test = alzheimers_data_age0.iloc[0, :].tolist()
print(test)
alzheimers_data_age0.head()

[46.67, 41.5, 43.045, 12.16, 68.62666666666668, 6.2, 59.5, 8.183333333333334, 47.00666666666667, 87.88000000000001, 16.233333333333334, 26.063333333333333, 10.083333333333334, 22.173333333333336, 92.98666666666666, 66.13333333333334, 65.87666666666667, 39.53666666666666, 57.92, 41.43333333333333]


Unnamed: 0,Arthritis among older adults,"Disability status, including sensory or mobility limitations",Fair or poor health among older adults with arthritis,Fall with injury within last year,Oral health: tooth retention,Physically unhealthy days (mean number of days),Prevalence of sufficient sleep,Recent activity limitations in past month,Severe joint pain among older adults with arthritis,Taking medication for high blood pressure,Frequent mental distress,Lifetime diagnosis of depression,Binge drinking within past 30 days,Current smoking,Cholesterol checked in past 5 years,Colorectal cancer screening,Diabetes screening within past 3 years,Ever had pneumococcal vaccine,High blood pressure ever,Influenza vaccine within past year
0,46.67,41.5,43.045,12.16,68.626667,6.2,59.5,8.183333,47.006667,87.88,16.233333,26.063333,10.083333,22.173333,92.986667,66.133333,65.876667,39.536667,57.92,41.433333
1,32.14,27.6,30.965,14.235714,79.52,4.943333,65.466667,5.731034,26.23,70.314286,11.033333,18.368966,14.87931,22.963333,87.993333,60.64,61.336,38.532143,40.666667,36.49
2,33.278571,34.494286,43.808696,10.711765,79.647619,5.974359,65.633333,7.3,38.647059,79.883333,13.022857,19.85,12.209677,16.969697,91.12,57.69,64.58,41.04,44.342857,38.9475
3,43.83,42.736,47.335,14.986667,68.586667,6.86,62.026667,8.193333,46.88,88.693333,16.441379,26.443333,10.888889,24.04,91.671429,63.293333,65.12,44.206667,58.12,43.58
4,27.788889,27.967647,36.572727,9.9,85.895238,4.782927,62.219048,6.618919,36.964706,79.055,12.4,17.113889,12.784375,12.988889,92.385714,62.514286,59.56,37.405882,41.742857,40.259524


In [375]:
def ReducedDecisionTreeModel(df):
    tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
    input =  df.drop(columns='LocationDesc')
    target = df['LocationDesc']
    tree_clf.fit(input, target)
    graph = Source(export_graphviz(tree_clf, out_file=None, feature_names=input.columns, class_names=target, rounded=True, filled=True))
    graph.format = 'png'
    graph.render( IMAGES_PATH + '/decision_tree/all_alzheimers_data', view=False)
    filename = 'all_alzheimers_data_decision_tree_model.sav'
    save_model(tree_clf, DECISION_TREE_PATH, filename)

ReducedDecisionTreeModel(all_alzheimers_data)

In [376]:
from sklearn.cluster import KMeans

KMEANS_PATH = os.path.join(MODELS_PATH, 'kmeans')

kmeans_clf = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(alzheimers_data_age0)

save_model(kmeans_clf, KMEANS_PATH, 'alzheimers_kmeans_model.sav')


In [377]:
# Load models
loaded_decision_tree_model = pickle.load(open(os.path.join(DECISION_TREE_PATH, "Overall_Health_decision_tree_model.sav"), 'rb'))
loaded_kmeans_model = pickle.load(open(os.path.join(KMEANS_PATH, "alzheimers_kmeans_model.sav"), 'rb'))

### Save your built models as .model files
```python
# Fit the model on training set
model = Any model of your choice
model.fit(X_train, Y_train)
# save the model to disk
import pickle
filename = 'finalized_model_M1.sav'
pickle.dump(model, open(filename, 'wb'))
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)
```