In [13]:
import pandas as pd
import numpy as np

from sfunc import run_forest, one_hot, answer, intensity, quality

In [14]:
# Load the raw dataset into a DataFrame. The path is relative, so it is resolved
# against the kernel's current working directory (which must contain Datasets/).
alzheimers = pd.read_csv('Datasets/alzheimers.csv')

In [15]:
# Let wide frames render up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100

In [16]:
# Number of rows in the loaded frame.
print(alzheimers.shape[0])

74283


In [17]:
# Tidy the headers: trim surrounding whitespace, then remove the typographic
# apostrophe (’) so column names can be typed with plain ASCII.
tidy_headers = alzheimers.columns.str.strip()
tidy_headers = tidy_headers.str.replace('’', '', regex=False)
alzheimers.columns = tidy_headers

# Shorter names for the two most unwieldy headers.
shorter_names = {
    'Genetic Risk Factor (APOE-ε4 allele)': 'Genetic Risk',
    'Urban vs Rural Living': 'Residence Type',
}
alzheimers = alzheimers.rename(columns=shorter_names)

# Show the resulting headers, the distribution of 'Education Level', and a preview.
print(alzheimers.columns)
print(alzheimers['Education Level'].value_counts())
alzheimers.head()

Index(['Country', 'Age', 'Gender', 'Education Level', 'BMI',
       'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption',
       'Diabetes', 'Hypertension', 'Cholesterol Level',
       'Family History of Alzheimers', 'Cognitive Test Score',
       'Depression Level', 'Sleep Quality', 'Dietary Habits',
       'Air Pollution Exposure', 'Employment Status', 'Marital Status',
       'Genetic Risk', 'Social Engagement Level', 'Income Level',
       'Stress Levels', 'Residence Type', 'Alzheimers Diagnosis'],
      dtype='object')
Education Level
14    3810
1     3782
6     3780
15    3774
7     3747
4     3741
8     3718
9     3715
3     3715
17    3711
2     3709
16    3704
11    3701
12    3686
10    3685
5     3685
18    3677
13    3664
0     3646
19    3633
Name: count, dtype: int64


Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimers,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk,Social Engagement Level,Income Level,Stress Levels,Residence Type,Alzheimers Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,Normal,No,90,Low,Poor,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,Normal,No,65,Low,Good,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,Normal,No,43,High,Good,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,Normal,No,81,Medium,Average,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,Normal,No,49,High,Poor,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [18]:
#drop unnecessary columns
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
alzheimers = alzheimers.drop(
    columns=['Country', 'Employment Status', 'Marital Status'],
    errors='ignore',
)

In [19]:
#manual encoding for columns where order preservation is important
# NOTE(review): intensity, quality and answer are also imported from sfunc in the
# import cell; the definitions below shadow those imports - confirm which copy
# is meant to be authoritative.
intensity = {'Low': 0, 'Medium': 1, 'High': 2}
quality = {'Poor': 0, 'Average': 1, 'Good': 2}
answer = {'No': 0, 'Yes': 1}

print(alzheimers.columns)

#one-hot encoding
alzheimers = one_hot(alzheimers, ['Residence Type'])

# Single table of column -> {category: integer code}, replacing one .map line per column.
column_encodings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Physical Activity Level': intensity,
    #Other custom encoding maps
    'Smoking Status': {'Never': 0, 'Former': 1, 'Current': 2},
    'Alcohol Consumption': {'Never': 0, 'Occasionally': 1, 'Regularly': 2},
    'Cholesterol Level': {'Normal': 0, 'High': 1},
    'Family History of Alzheimers': answer,
    'Depression Level': intensity,
    'Sleep Quality': quality,
    'Dietary Habits': {'Healthy': 0, 'Average': 1, 'Unhealthy': 2},
    'Air Pollution Exposure': intensity,
    'Genetic Risk': answer,
    'Social Engagement Level': intensity,
    'Income Level': intensity,
    'Stress Levels': intensity,
    #target variables
    'Alzheimers Diagnosis': answer,
    'Diabetes': answer,
    'Hypertension': answer,
}

# Encode everything first and validate before touching the frame: Series.map
# turns any value missing from the dict into NaN, which would otherwise pass
# silently (for example an unexpected category, or already-encoded integers
# when this cell is re-run).
encoded_columns = {}
for column, encoding in column_encodings.items():
    encoded = alzheimers[column].map(encoding)
    # Values that were present in the input but have no code in the mapping.
    unmapped = alzheimers.loc[encoded.isna() & alzheimers[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains values with no encoding: {list(unmapped)}"
        )
    encoded_columns[column] = encoded

# All columns validated - write the encoded values back in place of the labels.
for column, encoded in encoded_columns.items():
    alzheimers[column] = encoded

Index(['Age', 'Gender', 'Education Level', 'BMI', 'Physical Activity Level',
       'Smoking Status', 'Alcohol Consumption', 'Diabetes', 'Hypertension',
       'Cholesterol Level', 'Family History of Alzheimers',
       'Cognitive Test Score', 'Depression Level', 'Sleep Quality',
       'Dietary Habits', 'Air Pollution Exposure', 'Genetic Risk',
       'Social Engagement Level', 'Income Level', 'Stress Levels',
       'Residence Type', 'Alzheimers Diagnosis'],
      dtype='object')


In [20]:
# Share of each class in the target column (proportions rather than raw counts).
diagnosis_share = alzheimers['Alzheimers Diagnosis'].value_counts(normalize=True)
print(diagnosis_share)

Alzheimers Diagnosis
0    0.586541
1    0.413459
Name: proportion, dtype: float64


In [21]:
# Summary statistics for the encoded frame, as a sanity check: a 'count' below
# the total row count would indicate missing values in that column.
alzheimers.describe()

Unnamed: 0,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimers,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Genetic Risk,Social Engagement Level,Income Level,Stress Levels,Alzheimers Diagnosis,Residence Type Rural,Residence Type Urban
count,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0,74283.0
mean,71.964703,0.501447,9.487514,26.780639,1.001333,1.002356,0.99829,0.198646,0.298171,0.300338,0.299921,64.654241,0.998977,1.00972,0.999865,1.003366,0.198188,0.999596,0.999663,0.999152,0.413459,0.500828,0.499172
std,12.980748,0.500001,5.75702,4.764679,0.817201,0.817594,0.817168,0.398983,0.457458,0.458408,0.458226,20.153247,0.815825,0.816823,0.815215,0.816825,0.398637,0.815694,0.818141,0.815471,0.492457,0.500003,0.500003
min,50.0,0.0,0.0,18.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,61.0,0.0,4.0,22.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,72.0,1.0,9.0,26.8,1.0,1.0,1.0,0.0,0.0,0.0,0.0,65.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
75%,83.0,1.0,14.0,30.9,2.0,2.0,2.0,0.0,1.0,1.0,1.0,82.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0
max,94.0,1.0,19.0,35.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,99.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0


In [None]:
# First run: pass every encoded column to run_forest with the diagnosis as target.
# NOTE(review): run_forest comes from sfunc and is not visible here - presumably
# it trains a forest model and reports feature importances and a score; confirm.
test_forest = run_forest(alzheimers, 'Alzheimers Diagnosis')

# Second run: a hand-picked subset of features plus the target column.
selected_columns = [
    'Age',
    'BMI',
    'Sleep Quality',
    'Air Pollution Exposure',
    'Physical Activity Level',
    'Alcohol Consumption',
    'Smoking Status',
    'Depression Level',
    'Alzheimers Diagnosis',
]
cleaned_alzheimers = alzheimers[selected_columns]
alzheimers_forest = run_forest(cleaned_alzheimers, 'Alzheimers Diagnosis')

Age                             0.275766
BMI                             0.106085
Cognitive Test Score            0.098902
Education Level                 0.078921
Genetic Risk                    0.037349
Dietary Habits                  0.030127
Social Engagement Level         0.030071
Smoking Status                  0.030064
Sleep Quality                   0.029675
Alcohol Consumption             0.029234
Physical Activity Level         0.029218
Air Pollution Exposure          0.029019
Depression Level                0.028987
Stress Levels                   0.028899
Income Level                    0.028538
Family History of Alzheimers    0.022576
Gender                          0.018660
Cholesterol Level               0.016503
Hypertension                    0.016162
Diabetes                        0.013221
Residence Type Rural            0.011047
Residence Type Urban            0.010977
dtype: float64
0.7148145655246685
