In [782]:

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [783]:
alzheimers = pd.read_csv('Datasets/alzheimers.csv')
hyper = pd.read_csv('Datasets/hypertension2.csv')
diabetes = pd.read_csv('Datasets/diabetes.csv')

In [784]:
pd.set_option('display.max_columns', 100)

In [801]:
print(len(alzheimers))
print(len(hyper))
print(len(diabetes))

74283
4240
768


In [785]:
alzheimers.columns = alzheimers.columns.str.strip().str.replace('’', '', regex=False)
alzheimers = alzheimers.rename(columns={
    'Genetic Risk Factor (APOE-ε4 allele)': 'Genetic Risk',
    'Urban vs Rural Living': 'Residence Type' 
})
print(alzheimers.columns)
alzheimers.head()

Index(['Country', 'Age', 'Gender', 'Education Level', 'BMI',
       'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption',
       'Diabetes', 'Hypertension', 'Cholesterol Level',
       'Family History of Alzheimers', 'Cognitive Test Score',
       'Depression Level', 'Sleep Quality', 'Dietary Habits',
       'Air Pollution Exposure', 'Employment Status', 'Marital Status',
       'Genetic Risk', 'Social Engagement Level', 'Income Level',
       'Stress Levels', 'Residence Type', 'Alzheimers Diagnosis'],
      dtype='object')


Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimers,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk,Social Engagement Level,Income Level,Stress Levels,Residence Type,Alzheimers Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,Normal,No,90,Low,Poor,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,Normal,No,65,Low,Good,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,Normal,No,43,High,Good,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,Normal,No,81,Medium,Average,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,Normal,No,49,High,Poor,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [786]:
print(hyper.columns)
hyper.head()

Index(['male', 'age', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'diabetes',
       'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'Risk'],
      dtype='object')


Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
0,1,39,0,0.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [787]:
print(diabetes.columns)
diabetes.head()

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [788]:
#drop unnecessary columns
alzheimers.drop(['Country', 'Employment Status', 'Marital Status'], axis=1, inplace=True)

In [789]:
#initialize label encoder for columns where order preservation is unimportant
le = LabelEncoder()

#manual encoding for columns where order preservation is important
intensity = {'Low': 0, 'Medium': 1, 'High': 2}
quality = {'Poor': 0, 'Average': 1, 'Good': 2}
answer = {'No': 0, 'Yes': 1}

#one-hot encoding
alzheimers = pd.get_dummies(alzheimers, columns=['Gender'], prefix='Gender', dtype=int)

alzheimers['Physical Activity Level'] = alzheimers['Physical Activity Level'].map(intensity)
 
alzheimers['Smoking Status'] = alzheimers['Smoking Status'].map({'Never': 0, 'Former': 1, 'Current': 2}) #Other custom encoding maps

alzheimers['Alcohol Consumption'] = alzheimers['Alcohol Consumption'].map({'Never': 0, 'Occasionally': 1, 'Regularly': 2})

alzheimers['Cholesterol Level'] = alzheimers['Cholesterol Level'].map({'Normal': 0, 'High': 1})

alzheimers['Family History of Alzheimers'] = alzheimers['Family History of Alzheimers'].map(answer)

alzheimers['Depression Level'] = alzheimers['Depression Level'].map(intensity)

alzheimers['Sleep Quality'] = alzheimers['Sleep Quality'].map(quality)

alzheimers['Dietary Habits'] = alzheimers['Dietary Habits'].map({'Healthy': 0, 'Average': 1, 'Unhealthy': 2})

alzheimers['Air Pollution Exposure'] = alzheimers['Air Pollution Exposure'].map(intensity)

alzheimers['Genetic Risk'] = alzheimers['Genetic Risk'].map(answer)

alzheimers['Social Engagement Level'] = alzheimers['Social Engagement Level'].map(intensity)

alzheimers['Income Level'] = alzheimers['Income Level'].map(intensity)

alzheimers['Stress Levels'] = alzheimers['Stress Levels'].map(intensity)

#Rural = 0, urban = 1
alzheimers['Residence Type'] = le.fit_transform(alzheimers['Residence Type'])

#target variables
alzheimers['Alzheimers Diagnosis'] = alzheimers['Alzheimers Diagnosis'].map(answer)
alzheimers['Diabetes'] = alzheimers['Diabetes'].map(answer)
alzheimers['Hypertension'] = alzheimers['Hypertension'].map(answer)

In [790]:
print(alzheimers.corr()['Alzheimers Diagnosis'].sort_values(key=lambda x: x.abs(), ascending=False))

Alzheimers Diagnosis            1.000000
Age                             0.419923
Genetic Risk                    0.194484
Family History of Alzheimers    0.140885
Stress Levels                   0.004393
Dietary Habits                  0.004364
Residence Type                 -0.004104
Air Pollution Exposure         -0.003995
Education Level                 0.003732
Smoking Status                 -0.003690
Social Engagement Level        -0.003003
Diabetes                       -0.002673
Physical Activity Level         0.002310
Cholesterol Level              -0.002104
Alcohol Consumption             0.001790
BMI                            -0.001703
Gender_Male                     0.001364
Gender_Female                  -0.001364
Hypertension                   -0.001178
Cognitive Test Score           -0.001142
Sleep Quality                  -0.001122
Depression Level                0.000483
Income Level                   -0.000390
Name: Alzheimers Diagnosis, dtype: float64


In [791]:
print(hyper.corr()['Risk'].sort_values(key=lambda x: x.abs(), ascending=False))

Risk             1.000000
sysBP            0.696656
diaBP            0.615840
age              0.306799
BMI              0.301344
BPMeds           0.261067
totChol          0.163632
heartRate        0.146815
currentSmoker   -0.103710
glucose          0.086656
diabetes         0.077752
cigsPerDay      -0.066645
male             0.005853
Name: Risk, dtype: float64


In [792]:
print(diabetes.corr()['Outcome'].sort_values(key=lambda x: x.abs(), ascending=False))

Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64


In [793]:
#scale data
def scale_data(df, target_column):
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame")
    
    # Separate features and target
    features = df.drop(columns=[target_column])
    target = df[target_column]

    s = StandardScaler()
    
    #transform data
    scaled_data = s.fit_transform(features)
    scaled_df = pd.DataFrame(data = scaled_data, columns = features.columns)
    
    #add target column
    scaled_df[target_column] = target.values
    return scaled_df

alzheimers = scale_data(alzheimers, 'Alzheimers Diagnosis')
hyper = scale_data(hyper, 'Risk')
diabetes = scale_data(diabetes, 'Outcome')

In [800]:
def run_forest(df, target_column):
    forest = RandomForestClassifier()

    X_train, X_test, y_train, y_test = train_test_split(df, df[target_column], test_size=0.2, random_state = 3)

    forest.fit(X_train, y_train)
    accuracy = forest.score(X_test, y_test)
    prob = forest.predict_proba(X_test)
    print(accuracy)
    print(prob)
    
    return forest

al = run_forest(alzheimers, 'Alzheimers Diagnosis')
hy = run_forest(hyper, 'Risk')
dia = run_forest(diabetes, 'Outcome')


1.0
[[0.05 0.95]
 [0.02 0.98]
 [1.   0.  ]
 ...
 [0.   1.  ]
 [0.99 0.01]
 [0.04 0.96]]
1.0
[[1.   0.  ]
 [1.   0.  ]
 [0.   1.  ]
 ...
 [0.99 0.01]
 [1.   0.  ]
 [1.   0.  ]]
1.0
[[1.   0.  ]
 [0.   1.  ]
 [1.   0.  ]
 [0.05 0.95]
 [0.96 0.04]
 [0.99 0.01]
 [1.   0.  ]
 [1.   0.  ]
 [0.99 0.01]
 [0.98 0.02]
 [0.03 0.97]
 [0.99 0.01]
 [1.   0.  ]
 [0.02 0.98]
 [0.01 0.99]
 [1.   0.  ]
 [1.   0.  ]
 [1.   0.  ]
 [0.03 0.97]
 [0.02 0.98]
 [0.03 0.97]
 [0.99 0.01]
 [0.04 0.96]
 [0.06 0.94]
 [1.   0.  ]
 [0.01 0.99]
 [1.   0.  ]
 [1.   0.  ]
 [0.01 0.99]
 [0.03 0.97]
 [0.87 0.13]
 [0.98 0.02]
 [0.04 0.96]
 [0.99 0.01]
 [0.01 0.99]
 [0.99 0.01]
 [1.   0.  ]
 [0.99 0.01]
 [0.04 0.96]
 [0.02 0.98]
 [0.98 0.02]
 [0.99 0.01]
 [0.01 0.99]
 [0.98 0.02]
 [0.97 0.03]
 [0.03 0.97]
 [0.97 0.03]
 [0.98 0.02]
 [0.88 0.12]
 [0.99 0.01]
 [0.01 0.99]
 [0.96 0.04]
 [0.98 0.02]
 [0.   1.  ]
 [0.94 0.06]
 [1.   0.  ]
 [0.04 0.96]
 [0.02 0.98]
 [0.02 0.98]
 [0.97 0.03]
 [0.01 0.99]
 [1.   0.  ]
 [0.04 0.96]
 