### Use the preprocessed data for ML model building. 

In [214]:
# Setup
import numpy as np 
import pandas as pd 
import os
import pickle

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images/ml_modeling_images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

def save_model(model, folder_path="models", file_name="untitled_model.sav"):
    pickle.dump(model, open(os.path.join(folder_path, file_name), 'wb'))

ALZHEIMERS_PATH = "dataset/afterpreprocessing"
alzheimers_dfs = []
file_names = ["Overall_Health", "Mental_Health", "Smoking_and_Alcohol_Use", "Screenings_and_Vaccines", "Nutrition_Physical_Activity_Obesity", "Caregiving", "Cognitive_Decline"]

for i in range(len(file_names)):
    csv_path = os.path.join(ALZHEIMERS_PATH, f'{file_names[i]}.csv')
    df = pd.read_csv(csv_path)
    alzheimers_dfs.append(df)

alzheimers_dfs[0].head()

all_alzheimers_data = pd.read_csv(os.path.join(ALZHEIMERS_PATH, f'all_alzheimers_data.csv'))

all_alzheimers_data.fillna(0, inplace=True)

### Full Feature Set With Decision Tree

In [215]:
# Training and Visualizing a Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source

MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models")
DECISION_TREE_PATH = os.path.join(MODELS_PATH, 'decision_tree')

def FullDecisionTreeModel(df):
    tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
    for i in range(len(df)):
        input = df[i].drop(['LocationDesc'], axis=1)
        target = df[i]['LocationDesc']
        tree_clf.fit(input, target)
        graph = Source(export_graphviz(tree_clf, out_file=None, feature_names=input.columns, class_names=target, rounded=True, filled=True))
        graph.format = 'png'
        graph.render( IMAGES_PATH + '/decision_tree/' + file_names[i], view=False)
        filename = f'{file_names[i]}_decision_tree_model.sav'
        save_model(tree_clf, DECISION_TREE_PATH, filename)

FullDecisionTreeModel(alzheimers_dfs)

### Full Feature Set With Apriori

We converted our data from continuous to discrete using `KBinsDiscretizer`, since Apriori works better with categorical/discrete data.

In [217]:
# Binning values to convert data from continuous to discrete/categorical
from sklearn.preprocessing import KBinsDiscretizer

data = all_alzheimers_data.iloc[:, 2:]

kbins = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
kbins.fit(data)
binned = kbins.transform(data)
df = pd.DataFrame(binned, columns = all_alzheimers_data.columns[2:])
df.head()
apriori_all_alzheimers_data = all_alzheimers_data
apriori_all_alzheimers_data.iloc[:, 2:] = df
apriori_all_alzheimers_data


Unnamed: 0,LocationDesc,AgeGroup,Arthritis among older adults,"Disability status, including sensory or mobility limitations",Fair or poor health among older adults with arthritis,Fall with injury within last year,Oral health: tooth retention,Physically unhealthy days (mean number of days),Prevalence of sufficient sleep,Recent activity limitations in past month,...,Obesity,Duration of caregiving among older adults,Expect to provide care for someone in the next two years,Intensity of caregiving among older adults,Provide care for a friend or family member in past month,Provide care for someone with cognitive impairment within the past month,Functional difficulties associated with subjective cognitive decline or memory loss among older adults,Need assistance with day-to-day activities because of subjective cognitive decline or memory loss,Subjective cognitive decline or memory loss among older adults,Talked with health care professional about subjective cognitive decline or memory loss
0,Alabama,0.0,11.0,9.0,8.0,13.0,10.0,11.0,8.0,16.0,...,18.0,17.0,18.0,10.0,18.0,6.0,15.0,13.0,18.0,13.0
1,Alabama,1.0,18.0,13.0,8.0,8.0,3.0,11.0,14.0,11.0,...,10.0,19.0,10.0,11.0,13.0,7.0,10.0,10.0,18.0,10.0
2,Alabama,2.0,15.0,12.0,8.0,11.0,7.0,13.0,11.0,14.0,...,14.0,18.0,15.0,10.0,16.0,7.0,13.0,12.0,18.0,11.0
3,Alaska,0.0,2.0,2.0,3.0,18.0,15.0,4.0,11.0,8.0,...,10.0,17.0,15.0,9.0,15.0,7.0,10.0,11.0,10.0,12.0
4,Alaska,1.0,13.0,10.0,3.0,13.0,9.0,6.0,16.0,9.0,...,10.0,18.0,12.0,10.0,13.0,9.0,8.0,7.0,16.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,Wisconsin,1.0,14.0,7.0,2.0,5.0,9.0,5.0,16.0,7.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,Wisconsin,2.0,10.0,5.0,3.0,8.0,12.0,6.0,13.0,11.0,...,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174,Wyoming,0.0,4.0,3.0,3.0,7.0,15.0,4.0,12.0,10.0,...,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175,Wyoming,1.0,11.0,10.0,2.0,9.0,9.0,4.0,18.0,10.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The Apriori model takes too long to run, so we will choose a quicker technique.

In [220]:
# Apriori https://github.com/ymoch/apyori

from apyori import apriori


# def NewAprioriModel(df):
#     association_rules = apriori(df, min_support=0.6, min_confidence=0.8, min_lift=1.0, max_length=None)
#     association_results = list(association_rules)
#     return association_results
# NewAprioriModel(apriori_all_alzheimers_data)

def AprioriModel(df):
    for i in range(2, len(df)):
        records = []
        records.append(df.iloc[:, i].values)
        print(records)
        association_rules = apriori(records, min_support=0.8, min_confidence=0.8, min_lift=1.0, min_length=None)
        association_results = list(association_rules)
        print(association_results)
        
AprioriModel(all_alzheimers_data)



[array([11., 18., 15.,  2., 13.,  6.,  3., 11.,  6.,  9., 15., 12.,  0.,
       11.,  5.,  2., 12.,  6.,  1., 12.,  5.,  4., 13.,  8.,  0., 11.,
        4.,  3., 12.,  8.,  4., 13.,  7.,  0.,  5.,  3.,  0.,  8.,  4.,
        3., 13.,  7.,  3., 13.,  7.,  6., 14.,  9.,  3., 13.,  7.,  5.,
       12.,  9., 11., 16., 13.,  6., 16., 10.,  6., 14., 11.,  1., 12.,
        5.,  2., 13.,  7.,  9., 16., 12.,  4., 13.,  7.,  1., 11.,  5.,
        8., 16., 11.,  7., 15., 10.,  6., 13.,  9.,  3., 12.,  7.,  0.,
       10.,  4.,  3., 13.,  7.,  1., 10.,  4.,  3., 10.,  6.,  2., 11.,
        5.,  4., 14.,  8.,  5., 13.,  9.,  2., 13.,  6.,  6., 14., 11.,
        6., 14.,  9.,  4., 12.,  7.,  7., 15., 11.,  2., 10.,  6.,  4.,
       13.,  7.,  4., 13.,  7.,  7., 15., 10.,  4., 11.,  7.,  8., 16.,
       12.,  1., 13.,  6.,  3., 12.,  7.,  3., 12.,  7.,  4., 12.,  8.,
        8.,  8.,  8.,  4., 14.,  8.,  3., 11.,  7.,  1., 11.,  5., 15.,
       19., 17.,  5., 14., 10.,  4., 11.,  7.])]


KeyboardInterrupt: 

### Full Feature Set with Anomaly Detection

In [221]:
# Anomaly Detection algorthim 

from sklearn.ensemble import IsolationForest

ISOLATION_FOREST_PATH = os.path.join(MODELS_PATH, 'isolation_forest')

def IsolationForestModel(df):
    for i in range(len(df)):
        input = df[i].drop(['LocationDesc'], axis=1)
        target = df[i]['LocationDesc']
        clf = IsolationForest(max_samples=100, random_state=42)
        clf.fit(input)
        filename = f'{file_names[i]}_isolation_forest_model.sav'
        save_model(clf, ISOLATION_FOREST_PATH, filename)

IsolationForestModel(alzheimers_dfs)

### Full Feature Set With Support Vector Machines

In [222]:
from sklearn.svm import OneClassSVM

OneClassSVM_PATH = os.path.join(MODELS_PATH, 'one_class_svm')

def FullOneClassSVMModel(df):
    input = df.drop(columns=['LocationDesc','AgeGroup'])
    # define and fit outlier detection model
    one_class_svm_clf = OneClassSVM(gamma='auto').fit(input)
    save_model(one_class_svm_clf, OneClassSVM_PATH, 'alzheimers_one_class_svm_model.sav')

FullOneClassSVMModel(all_alzheimers_data)


## Top Feature Selector

In [223]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def top_feature_selector(df, top_num=20):
    input = df.drop(['LocationDesc'], axis=1)
    target = df['LocationDesc']
    bestfeatures = SelectKBest(score_func=chi2, k=3)
    fit = bestfeatures.fit(input,target)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(input.columns)
    featureScores = pd.concat([dfcolumns,dfscores],axis=1)
    featureScores.columns = ['Specs','Score']  #naming the dataframe columns
    print(featureScores.nlargest(top_num,'Score'))  #print 10 best features
    print('----------------------------------------------------')
    return dfcolumns

In [None]:
# Dropping irrelevant features
all_alzheimers_data = all_alzheimers_data.drop(columns=["Duration of caregiving among older adults", 
                                                        "Intensity of caregiving among older adults", 
                                                        "Talked with health care professional about subjective cognitive decline or memory loss",
                                                        "Expect to provide care for someone in the next two years",
                                                        "Provide care for a friend or family member in past month",
                                                        "Provide care for someone with cognitive impairment within the past month",
                                                        "Self-rated health (fair to poor health)",
                                                        "Self-rated health (good to excellent health)",
                                                        ], axis=1)

In [None]:
# Separate dataframe by AgeGroup
alzheimers_data_age0 = all_alzheimers_data.loc[all_alzheimers_data['AgeGroup'] == 0].reset_index().drop(['index', 'AgeGroup'], axis=1) # 50-64 y/o
alzheimers_data_age1 = all_alzheimers_data.loc[all_alzheimers_data['AgeGroup'] == 1].reset_index().drop(['index', 'AgeGroup'], axis=1) # 65+ y/o
alzheimers_data_age2 = all_alzheimers_data.loc[all_alzheimers_data['AgeGroup'] == 2].reset_index().drop(['index', 'AgeGroup'], axis=1) # Overall (50+ y/o)

alzheimers_data_age0.head()
alzheimers_data_age1.head()
alzheimers_data_age2.head()

# Get top features for each AgeGroup
age0_features = top_feature_selector(alzheimers_data_age0)
age1_features = top_feature_selector(alzheimers_data_age1)
age2_features = top_feature_selector(alzheimers_data_age2)

alzheimers_data_age0 = alzheimers_data_age0.drop('LocationDesc', axis=1)
alzheimers_data_age1 = alzheimers_data_age1.drop('LocationDesc', axis=1)
alzheimers_data_age2 = alzheimers_data_age2.drop('LocationDesc', axis=1)


In [None]:
# Keeping the top 20 features
alzheimers_data_age0 = alzheimers_data_age0.drop(columns=age0_features[0][20:].tolist(), axis=1)
alzheimers_data_age1 = alzheimers_data_age1.drop(columns=age1_features[0][20:].tolist(), axis=1)
alzheimers_data_age2 = alzheimers_data_age2.drop(columns=age2_features[0][20:].tolist(), axis=1)

alzheimers_data_age0.head()

### Reduced Feature Set With Decision Tree

In [None]:
def ReducedDecisionTreeModel(df):
    tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
    input =  df.drop(columns='LocationDesc')
    target = df['LocationDesc']
    tree_clf.fit(input, target)
    graph = Source(export_graphviz(tree_clf, out_file=None, feature_names=input.columns, class_names=target, rounded=True, filled=True))
    graph.format = 'png'
    graph.render( IMAGES_PATH + '/decision_tree/all_alzheimers_data', view=False)
    filename = 'all_alzheimers_data_decision_tree_model.sav'
    save_model(tree_clf, DECISION_TREE_PATH, filename)

ReducedDecisionTreeModel(all_alzheimers_data)

### Reduced Feature Set With K-Means

In [None]:
from sklearn.cluster import KMeans

KMEANS_PATH = os.path.join(MODELS_PATH, 'kmeans')

kmeans_clf = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(alzheimers_data_age0)

save_model(kmeans_clf, KMEANS_PATH, 'alzheimers_kmeans_model.sav')


### Reduced Feature Set With Support Vector Machines

In [None]:
from sklearn.svm import OneClassSVM

def ReducedOneClassSVMModel(df):
    input = df
    # define and fit outlier detection model
    one_class_svm_clf = OneClassSVM(gamma='auto').fit(input)
    save_model(one_class_svm_clf, OneClassSVM_PATH, 'alzheimers_one_class_svm_model.sav')

ReducedOneClassSVMModel(alzheimers_data_age0)


## Load Models

In [None]:

loaded_decision_tree_model = pickle.load(open(os.path.join(DECISION_TREE_PATH, "Overall_Health_decision_tree_model.sav"), 'rb'))
loaded_kmeans_model = pickle.load(open(os.path.join(KMEANS_PATH, "alzheimers_kmeans_model.sav"), 'rb'))
loaded_one_class_svm_model = pickle.load(open(os.path.join(OneClassSVM_PATH, "alzheimers_one_class_svm_model.sav"), 'rb'))
loaded_isolation_forest_model = pickle.load(open(os.path.join(ISOLATION_FOREST_PATH, "Overall_Health_isolation_forest_model.sav"), 'rb'))