In [19]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sfunc import run_forest, one_hot, answer, intensity, quality

In [20]:
alzheimers = pd.read_csv('Datasets/alzheimers.csv')

In [21]:
pd.set_option('display.max_columns', 100)

In [22]:
print(len(alzheimers))

74283


In [23]:
alzheimers.columns = alzheimers.columns.str.strip().str.replace('’', '', regex=False)
alzheimers = alzheimers.rename(columns={
    'Genetic Risk Factor (APOE-ε4 allele)': 'Genetic Risk',
    'Urban vs Rural Living': 'Residence Type' 
})
print(alzheimers.columns)
print(alzheimers['Education Level'].value_counts())
alzheimers.head()

Index(['Country', 'Age', 'Gender', 'Education Level', 'BMI',
       'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption',
       'Diabetes', 'Hypertension', 'Cholesterol Level',
       'Family History of Alzheimers', 'Cognitive Test Score',
       'Depression Level', 'Sleep Quality', 'Dietary Habits',
       'Air Pollution Exposure', 'Employment Status', 'Marital Status',
       'Genetic Risk', 'Social Engagement Level', 'Income Level',
       'Stress Levels', 'Residence Type', 'Alzheimers Diagnosis'],
      dtype='object')
Education Level
14    3810
1     3782
6     3780
15    3774
7     3747
4     3741
8     3718
9     3715
3     3715
17    3711
2     3709
16    3704
11    3701
12    3686
10    3685
5     3685
18    3677
13    3664
0     3646
19    3633
Name: count, dtype: int64


Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimers,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk,Social Engagement Level,Income Level,Stress Levels,Residence Type,Alzheimers Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,Normal,No,90,Low,Poor,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,Normal,No,65,Low,Good,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,Normal,No,43,High,Good,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,Normal,No,81,Medium,Average,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,Normal,No,49,High,Poor,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [24]:
#drop unnecessary columns
alzheimers.drop(['Country', 'Employment Status', 'Marital Status'], axis=1, inplace=True)

In [26]:
#initialize label encoder for columns where order preservation is unimportant

#manual encoding for columns where order preservation is important
intensity = {'Low': 0, 'Medium': 1, 'High': 2}
quality = {'Poor': 0, 'Average': 1, 'Good': 2}
answer = {'No': 0, 'Yes': 1}

#one-hot encoding

print(alzheimers.columns)
alzheimers = one_hot(alzheimers, ['Gender', 'Residence Type'])

alzheimers['Physical Activity Level'] = alzheimers['Physical Activity Level'].map(intensity)
 
alzheimers['Smoking Status'] = alzheimers['Smoking Status'].map({'Never': 0, 'Former': 1, 'Current': 2}) #Other custom encoding maps

alzheimers['Alcohol Consumption'] = alzheimers['Alcohol Consumption'].map({'Never': 0, 'Occasionally': 1, 'Regularly': 2})

alzheimers['Cholesterol Level'] = alzheimers['Cholesterol Level'].map({'Normal': 0, 'High': 1})

alzheimers['Family History of Alzheimers'] = alzheimers['Family History of Alzheimers'].map(answer)

alzheimers['Depression Level'] = alzheimers['Depression Level'].map(intensity)

alzheimers['Sleep Quality'] = alzheimers['Sleep Quality'].map(quality)

alzheimers['Dietary Habits'] = alzheimers['Dietary Habits'].map({'Healthy': 0, 'Average': 1, 'Unhealthy': 2})

alzheimers['Air Pollution Exposure'] = alzheimers['Air Pollution Exposure'].map(intensity)

alzheimers['Genetic Risk'] = alzheimers['Genetic Risk'].map(answer)

alzheimers['Social Engagement Level'] = alzheimers['Social Engagement Level'].map(intensity)

alzheimers['Income Level'] = alzheimers['Income Level'].map(intensity)

alzheimers['Stress Levels'] = alzheimers['Stress Levels'].map(intensity)

#target variables
alzheimers['Alzheimers Diagnosis'] = alzheimers['Alzheimers Diagnosis'].map(answer)
alzheimers['Diabetes'] = alzheimers['Diabetes'].map(answer)
alzheimers['Hypertension'] = alzheimers['Hypertension'].map(answer)

Index(['Age', 'Gender', 'Education Level', 'BMI', 'Physical Activity Level',
       'Smoking Status', 'Alcohol Consumption', 'Diabetes', 'Hypertension',
       'Cholesterol Level', 'Family History of Alzheimers',
       'Cognitive Test Score', 'Depression Level', 'Sleep Quality',
       'Dietary Habits', 'Air Pollution Exposure', 'Genetic Risk',
       'Social Engagement Level', 'Income Level', 'Stress Levels',
       'Residence Type', 'Alzheimers Diagnosis'],
      dtype='object')


KeyError: "None of [Index([('Gender', 'Residence Type')], dtype='object')] are in the [columns]"

In [None]:
print(alzheimers['Alzheimers Diagnosis'].value_counts(normalize=True))

Alzheimers Diagnosis
0    0.586541
1    0.413459
Name: proportion, dtype: float64
Risk
0    0.689387
1    0.310613
Name: proportion, dtype: float64
Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64


In [None]:
test_forest = run_forest(alzheimers, 'Alzheimers Diagnosis')
cleaned_alzheimers = alzheimers[['Age', 'BMI', 'Genetic Risk', 'Education Level', 'Sleep Quality', 'Social Engagement Level', 'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption', 'Depression Level', 'Alzheimers Diagnosis',]]
alzheimers_forest = run_forest(cleaned_alzheimers, 'Outcome')

Age                             0.274886
BMI                             0.105542
Cognitive Test Score            0.098700
Education Level                 0.079469
Genetic Risk                    0.037206
Sleep Quality                   0.029636
Social Engagement Level         0.029620
Physical Activity Level         0.029548
Alcohol Consumption             0.029545
Smoking Status                  0.029498
Depression Level                0.029492
Air Pollution Exposure          0.029236
Stress Levels                   0.029164
Income Level                    0.028926
Dietary Habits                  0.028846
Family History of Alzheimers    0.022538
Residence Type                  0.017797
Cholesterol Level               0.016620
Hypertension                    0.016293
Diabetes                        0.014026
Gender_Female                   0.011713
Gender_Male                     0.011699
dtype: float64
0.7155549572592044
CV Accuracy: 0.7205955173298314 (+/- 0.004321680682027182)

[[0.