In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import shapiro, zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Render DataFrames without truncation: no cap on displayed columns or rows.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [None]:
# Location of the patient CSV and the DataFrame loaded from it.
file_path = '/content/alzheimers_disease_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
null_values = data.isna().sum()
print("Null values in each column:\n", null_values)

Null values in each column:
 PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
Difficulty

In [None]:
# Number of rows that exactly repeat an earlier row.
duplicate_rows = data.duplicated()
duplicate_rows.sum()

0

In [None]:
# Continuous measurements used for outlier screening, correlation and the power transform.
numerical_columns = [
    'Age',
    'BMI',
    'AlcoholConsumption',
    'PhysicalActivity',
    'SystolicBP',
    'DiastolicBP',
    'CholesterolTotal',
    'CholesterolLDL',
    'CholesterolHDL',
    'CholesterolTriglycerides',
    'MMSE',
]

In [None]:
# One box per numerical column, to eyeball potential outliers.
fig = px.box(data, y=numerical_columns).update_layout(
    title="Outlier Detection in Numerical Columns",
    template="plotly_dark",
)
fig.show()

In [None]:
# Keep only the rows in which every numerical column lies strictly within
# `threshold` standard deviations of its column mean.
threshold = 3
z_scores = np.abs(zscore(data[numerical_columns]))
within_threshold = (z_scores < threshold).all(axis=1)
data = data[within_threshold]

In [None]:
# Age histogram, overlaid per diagnosis group.
fig = px.histogram(
    data,
    x='Age',
    color='Diagnosis',
    barmode='overlay',
    labels={'Diagnosis': 'Diagnosis (0=No, 1=Yes)'},
    template="plotly_dark",
).update_layout(title="Distribution of Ages Among Patients with and without Alzheimer's")
fig.show()

In [None]:
# Gender counts, with one bar per diagnosis group placed side by side.
gender_labels = {
    'Gender': 'Gender (0=Male, 1=Female)',
    'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
}
fig = px.histogram(
    data,
    x='Gender',
    color='Diagnosis',
    barmode='group',
    labels=gender_labels,
    template="plotly_dark",
)
fig.update_layout(title="Gender Distribution Comparison Between Patients with and without Alzheimer's")
fig.show()

In [None]:
# Share (in percent) of each FamilyHistoryAlzheimers value, shown as a pie chart.
family_history_percentage = data['FamilyHistoryAlzheimers'].value_counts(normalize=True).mul(100)
fig = px.pie(
    values=family_history_percentage,
    names=family_history_percentage.index,
    title="Percentage of Patients with Family History of Alzheimer's",
    template="plotly_dark",
)
fig.show()

In [None]:
# Mean BMI per diagnosis group, kept as a two-column frame (Diagnosis, BMI).
avg_bmi = data.groupby('Diagnosis', as_index=False)['BMI'].mean()
avg_bmi

Unnamed: 0,Diagnosis,BMI
0,0,27.515092
1,1,27.91267


In [None]:
# Bar chart of the per-diagnosis mean BMI computed in `avg_bmi`.
fig = px.bar(
    avg_bmi,
    x='Diagnosis',
    y='BMI',
    labels={'Diagnosis': 'Diagnosis (0=No, 1=Yes)', 'BMI': 'Average BMI'},
    template="plotly_dark",
).update_layout(title="Average BMI for Patients with and without Alzheimer's")
fig.show()

In [None]:
# Patient counts for every (Diagnosis, Diabetes) combination, drawn as grouped bars.
diabetes_counts = (
    data
    .groupby(['Diagnosis', 'Diabetes'])
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    diabetes_counts,
    x='Diagnosis',
    y='counts',
    color='Diabetes',
    barmode='group',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'Diabetes': 'Diabetes (0=No, 1=Yes)',
        'counts': 'Number of Patients',
    },
    template="plotly_dark",
)
fig.update_layout(title="Number of Patients with Diabetes by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Pairwise correlations between the numerical columns, rendered as an annotated heatmap.
corr_matrix = data[numerical_columns].corr()
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    title='Correlation Matrix',
    template="plotly_dark",
)
# Fixed square canvas with generous margins so the axis labels are not clipped.
fig.update_layout(
    autosize=False,
    width=800,
    height=800,
    margin={"l": 100, "r": 100, "b": 100, "t": 100, "pad": 4},
)
fig.show()

In [None]:
# Average MMSE score per age bucket.
#
# pd.cut builds right-closed intervals by default, which made the buckets
# (60, 70], (70, 80], (80, 90]: a 70-year-old was labelled '60-69', a 60-year-old
# was dropped entirely, and a 90-year-old was labelled '80-89'. Left-closed
# intervals make every label describe the ages it actually holds, and the
# open-ended last bucket keeps patients aged 90 or more instead of discarding them.
age_groups = pd.cut(
    data['Age'],
    bins=[60, 70, 80, np.inf],
    labels=['60-69', '70-79', '80+'],
    right=False,
)
# observed=False is the long-standing behaviour for categorical groupers (empty
# buckets stay in the result); stating it explicitly avoids the pandas
# FutureWarning about the changing default.
avg_mmse_by_age = data.groupby(age_groups, observed=False)['MMSE'].mean().reset_index()
fig = px.bar(
    avg_mmse_by_age,
    x='Age',
    y='MMSE',
    labels={'Age': 'Age Group', 'MMSE': 'Average MMSE Score'},
    template="plotly_dark",
)
fig.update_layout(title="Average MMSE Score by Age Group")
fig.show()





In [None]:
# Physical-activity histogram restricted to rows with Diagnosis == 1.
diagnosed_patients = data[data['Diagnosis'] == 1]
fig = px.histogram(
    diagnosed_patients,
    x='PhysicalActivity',
    labels={'PhysicalActivity': 'Physical Activity (hours/week)'},
    template="plotly_dark",
)
fig.update_layout(title="Distribution of Physical Activity Levels Among Patients with Alzheimer's")
fig.show()

In [None]:
# Alcohol consumption against BMI, one point per patient, coloured by diagnosis.
fig = px.scatter(
    data,
    x='AlcoholConsumption',
    y='BMI',
    color='Diagnosis',
    labels={
        'AlcoholConsumption': 'Alcohol Consumption (units/week)',
        'BMI': 'Body Mass Index',
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
    },
    template="plotly_dark",
).update_layout(title="Relationship Between Alcohol Consumption and BMI Among Patients")
fig.show()

In [None]:
# Systolic blood pressure histogram, overlaid per hypertension flag.
fig = px.histogram(
    data,
    x='SystolicBP',
    color='Hypertension',
    barmode='overlay',
    labels={
        'SystolicBP': 'Systolic Blood Pressure',
        'Hypertension': 'Hypertension (0=No, 1=Yes)',
    },
    template="plotly_dark",
)
fig.update_layout(title="Distribution of Systolic Blood Pressure for Patients with and without Hypertension")
fig.show()

In [None]:
# Patient counts for every (Diagnosis, MemoryComplaints) combination, as grouped bars.
memory_complaints_counts = (
    data
    .groupby(['Diagnosis', 'MemoryComplaints'])
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    memory_complaints_counts,
    x='Diagnosis',
    y='counts',
    color='MemoryComplaints',
    barmode='group',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'MemoryComplaints': 'Memory Complaints (0=No, 1=Yes)',
        'counts': 'Number of Patients',
    },
    template="plotly_dark",
).update_layout(title="Number of Patients with Memory Complaints by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Mean total / LDL / HDL cholesterol per diagnosis group.
avg_cholesterol = data.groupby('Diagnosis')[['CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL']].mean().reset_index()
# Long format (Diagnosis, variable, value) so each cholesterol type gets its own bar colour.
cholesterol_long = avg_cholesterol.melt(id_vars='Diagnosis')
fig = px.bar(
    cholesterol_long,
    x='Diagnosis',
    y='value',
    color='variable',
    barmode='group',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'value': 'Average Cholesterol Level',
        'variable': 'Cholesterol Type',
    },
    template="plotly_dark",
)
fig.update_layout(title="Average Cholesterol Levels (Total, LDL, HDL) by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Patient counts for every (Diagnosis, BehavioralProblems) combination, as grouped bars.
behavioral_problems_counts = (
    data
    .groupby(['Diagnosis', 'BehavioralProblems'])
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    behavioral_problems_counts,
    x='Diagnosis',
    y='counts',
    color='BehavioralProblems',
    barmode='group',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'BehavioralProblems': 'Behavioral Problems (0=No, 1=Yes)',
        'counts': 'Number of Patients',
    },
    template="plotly_dark",
)
fig.update_layout(title="Number of Patients with Behavioral Problems by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# ADL score spread for each diagnosis group.
fig = px.box(
    data,
    x='Diagnosis',
    y='ADL',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'ADL': 'Activities of Daily Living Score',
    },
    template="plotly_dark",
).update_layout(title="Distribution of ADL Scores Among Patients with and without Alzheimer's")
fig.show()

In [None]:
# Diet-quality spread for each education level.
fig = px.box(
    data,
    x='EducationLevel',
    y='DietQuality',
    labels={
        'EducationLevel': 'Education Level',
        'DietQuality': 'Diet Quality Score',
    },
    template="plotly_dark",
)
fig.update_layout(title="Diet Quality by Education Level")
fig.show()

In [None]:
# Patient counts for every (Diagnosis, Confusion) combination, as grouped bars.
confusion_counts = (
    data
    .groupby(['Diagnosis', 'Confusion'])
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    confusion_counts,
    x='Diagnosis',
    y='counts',
    color='Confusion',
    barmode='group',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'Confusion': 'Confusion (0=No, 1=Yes)',
        'counts': 'Number of Patients',
    },
    template="plotly_dark",
).update_layout(title="Number of Patients with Confusion Symptoms by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Mean systolic and diastolic pressure per diagnosis group, one subplot per measure.
avg_bp = data.groupby('Diagnosis')[['SystolicBP', 'DiastolicBP']].mean().reset_index()
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Average Systolic Blood Pressure', 'Average Diastolic Blood Pressure'),
)
# Systolic goes in the first column, diastolic in the second.
for panel, measure in enumerate(('SystolicBP', 'DiastolicBP'), start=1):
    fig.add_trace(go.Bar(x=avg_bp['Diagnosis'], y=avg_bp[measure], name=measure), row=1, col=panel)
fig.update_layout(title="Average Blood Pressure by Alzheimer's Diagnosis Status", template="plotly_dark")
fig.show()

In [None]:
# Value counts of each comorbidity flag, one bar subplot per condition.
diseases = ['Diabetes', 'Hypertension', 'CardiovascularDisease']
fig = make_subplots(rows=1, cols=len(diseases), subplot_titles=diseases)
for panel, disease in enumerate(diseases, start=1):
    disease_counts = data[disease].value_counts()
    bars = go.Bar(x=disease_counts.index, y=disease_counts.values, name=disease)
    fig.add_trace(bars, row=1, col=panel)
fig.update_layout(
    title="Number of Patients with Diabetes, Hypertension, and Cardiovascular Disease",
    template="plotly_dark",
)
fig.show()

In [None]:
# Patient counts for every (Diagnosis, PersonalityChanges) combination, as grouped bars.
personality_changes_counts = (
    data
    .groupby(['Diagnosis', 'PersonalityChanges'])
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    personality_changes_counts,
    x='Diagnosis',
    y='counts',
    color='PersonalityChanges',
    barmode='group',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'PersonalityChanges': 'Personality Changes (0=No, 1=Yes)',
        'counts': 'Number of Patients',
    },
    template="plotly_dark",
)
fig.update_layout(title="Number of Patients with Personality Changes by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Mean physical activity per diagnosis group, as a two-column frame and a bar chart.
avg_physical_activity = data.groupby('Diagnosis', as_index=False)['PhysicalActivity'].mean()
fig = px.bar(
    avg_physical_activity,
    x='Diagnosis',
    y='PhysicalActivity',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'PhysicalActivity': 'Average Physical Activity (hours/week)',
    },
    template="plotly_dark",
).update_layout(title="Average Physical Activity Level by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Patient counts for every (Diagnosis, DifficultyCompletingTasks) combination, as grouped bars.
difficulty_completing_tasks_counts = (
    data
    .groupby(['Diagnosis', 'DifficultyCompletingTasks'])
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    difficulty_completing_tasks_counts,
    x='Diagnosis',
    y='counts',
    color='DifficultyCompletingTasks',
    barmode='group',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'DifficultyCompletingTasks': 'Difficulty Completing Tasks (0=No, 1=Yes)',
        'counts': 'Number of Patients',
    },
    template="plotly_dark",
)
fig.update_layout(title="Number of Patients with Difficulty Completing Tasks by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Forgetfulness flag histogram, overlaid per diagnosis group.
fig = px.histogram(
    data,
    x='Forgetfulness',
    color='Diagnosis',
    barmode='overlay',
    labels={
        'Forgetfulness': 'Forgetfulness (0=No, 1=Yes)',
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
    },
    template="plotly_dark",
).update_layout(title="Distribution of Forgetfulness Symptoms Among Patients with and without Alzheimer's")
fig.show()

In [None]:
# Count, per diagnosis group, the patients whose sleep quality is strictly below the overall mean.
avg_sleep_quality = data['SleepQuality'].mean()
sleeps_below_average = data['SleepQuality'].lt(avg_sleep_quality)
below_avg_sleep_quality = data[sleeps_below_average]
below_avg_sleep_quality_counts = (
    below_avg_sleep_quality
    .groupby('Diagnosis')
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    below_avg_sleep_quality_counts,
    x='Diagnosis',
    y='counts',
    labels={'Diagnosis': 'Diagnosis (0=No, 1=Yes)', 'counts': 'Number of Patients'},
    template="plotly_dark",
)
fig.update_layout(title="Number of Patients with Below Average Sleep Quality by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Patient counts for every (Diagnosis, Smoking) combination, as grouped bars.
smoking_counts = (
    data
    .groupby(['Diagnosis', 'Smoking'])
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    smoking_counts,
    x='Diagnosis',
    y='counts',
    color='Smoking',
    barmode='group',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'Smoking': 'Smoking (0=No, 1=Yes)',
        'counts': 'Number of Patients',
    },
    template="plotly_dark",
).update_layout(title="Number of Patients Who Smoke by Alzheimer's Diagnosis Status")
fig.show()

In [None]:
# Functional-assessment score spread for each diagnosis group.
fig = px.box(
    data,
    x='Diagnosis',
    y='FunctionalAssessment',
    labels={
        'Diagnosis': 'Diagnosis (0=No, 1=Yes)',
        'FunctionalAssessment': 'Functional Assessment Score',
    },
    template="plotly_dark",
)
fig.update_layout(title="Distribution of Functional Assessment Scores Among Patients with and without Alzheimer's")
fig.show()

In [None]:
# Mean MMSE score for each education level, as a two-column frame and a bar chart.
avg_mmse_by_education = data.groupby('EducationLevel', as_index=False)['MMSE'].mean()
fig = px.bar(
    avg_mmse_by_education,
    x='EducationLevel',
    y='MMSE',
    labels={'EducationLevel': 'Education Level', 'MMSE': 'Average MMSE Score'},
    template="plotly_dark",
).update_layout(title="Average MMSE Score by Education Level")
fig.show()

In [None]:
# Age against each cholesterol measure for rows with Diagnosis == 1, one scatter subplot per measure.
cholesterol_columns = ['CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL']
fig = make_subplots(rows=1, cols=3, subplot_titles=cholesterol_columns)
# Select the diagnosed rows once; every subplot plots the same subset.
diagnosed_patients = data[data['Diagnosis'] == 1]
for panel, column in enumerate(cholesterol_columns, start=1):
    markers = go.Scatter(
        x=diagnosed_patients['Age'],
        y=diagnosed_patients[column],
        mode='markers',
        name=column,
    )
    fig.add_trace(markers, row=1, col=panel)
    fig.update_xaxes(title_text='Age', row=1, col=panel)
    fig.update_yaxes(title_text=column, row=1, col=panel)
fig.update_layout(
    title="Relationship Between Age and Cholesterol Levels Among Patients with Alzheimer's",
    template="plotly_dark",
)
fig.show()

In [None]:
# Fit a PowerTransformer (default settings) on the numerical columns and overwrite
# those columns in `data` with the transformed values.
# NOTE(review): the transformer is fitted on the whole frame, before the train/test
# split performed later in the notebook, and re-running this cell transforms the
# already-transformed columns again — confirm both are acceptable.
pt = PowerTransformer()
transformed_values = pt.fit_transform(data[numerical_columns])
data[numerical_columns] = transformed_values

In [None]:
# Histogram of every numerical column after the transformation, laid out on a 4x3 grid.
fig = make_subplots(
    rows=4,
    cols=3,
    subplot_titles=numerical_columns,
    vertical_spacing=0.1,
    horizontal_spacing=0.1,
)

# Fill the grid row by row: position 0 -> (1, 1), position 1 -> (1, 2), position 3 -> (2, 1), ...
for position, column in enumerate(numerical_columns):
    grid_row, grid_col = divmod(position, 3)
    # Only the first trace (the histogram itself) of the express figure is reused.
    histogram_trace = px.histogram(data, x=column, marginal="box").data[0]
    fig.add_trace(histogram_trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    title_text="Distribution of Numerical Columns After Transformation",
    showlegend=False,
    template="plotly_dark",
)
fig.show()

In [None]:
# Target is the Diagnosis column; features are everything except the target and
# the PatientID / DoctorInCharge columns.
y = data['Diagnosis']
excluded_columns = ['PatientID', 'Diagnosis', 'DoctorInCharge']
X = data.drop(columns=excluded_columns)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the same
# scaling to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Random forest with default hyper-parameters and a fixed seed, trained on the scaled training set.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Score the held-out test set and print each metric under its heading.
y_pred = model.predict(X_test)
for heading, metric in (
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Accuracy Score:\n", accuracy_score),
):
    print(heading, metric(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.95       277
           1       0.96      0.83      0.89       153

    accuracy                           0.93       430
   macro avg       0.94      0.91      0.92       430
weighted avg       0.93      0.93      0.93       430

Confusion Matrix:
 [[272   5]
 [ 26 127]]
Accuracy Score:
 0.9279069767441861


In [None]:
# Predict the diagnosis for one hand-entered patient record.
#
# The record is written as name -> raw value so that every value is tied to its feature.
# The earlier positional tuple had three problems:
#   * it began with the PatientID (4751) although PatientID is dropped from X, so every
#     value was read as the feature one position to its left;
#   * one feature had been deleted to force the length to 32;
#   * raw values went straight to the scaler, whereas the training data was first
#     power-transformed with `pt`.
input_record = {
    'Age': 72,
    'Gender': 0,
    'Ethnicity': 0,
    'EducationLevel': 2,
    'BMI': 22.927,
    'Smoking': 0,
    'AlcoholConsumption': 13.297,
    'PhysicalActivity': 6.327,
    'DietQuality': 1.3472,
    'SleepQuality': 9.025,
    'FamilyHistoryAlzheimers': 0,
    'CardiovascularDisease': 0,
    'Diabetes': 1,
    'Depression': 1,
    'HeadInjury': 0,
    'Hypertension': 0,
    'SystolicBP': 142,
    'DiastolicBP': 72,
    'CholesterolTotal': 242.366,
    'CholesterolLDL': 56.15,
    'CholesterolHDL': 33.68,
    'CholesterolTriglycerides': 162.18,
    'MMSE': 21.463,
    'FunctionalAssessment': 6.51,
    'MemoryComplaints': 0,
    'BehavioralProblems': 0,
    # NOTE(review): the original tuple had no value that fits ADL (all remaining values
    # were 0/1 flags), so ADL appears to be the element that was removed. The cohort
    # median is used as a stand-in — replace it with the patient's real score.
    'ADL': data['ADL'].median(),
    'Confusion': 0,
    'Disorientation': 0,
    'PersonalityChanges': 0,
    'DifficultyCompletingTasks': 1,
    'Forgetfulness': 0,
}
# Order the raw values exactly like the training features; a feature missing from
# the record raises KeyError instead of silently shifting the others.
input_data = tuple(input_record[feature] for feature in X.columns)

# A named one-row frame lets the fitted transformers check the columns and avoids the
# "X does not have valid feature names" warning.
input_frame = pd.DataFrame([input_data], columns=X.columns, dtype=float)
# Same preprocessing chain as the training data: power transform, then standardise.
input_frame[numerical_columns] = pt.transform(input_frame[numerical_columns])
std_data = scaler.transform(input_frame)
prediction = model.predict(std_data)
print(prediction)

if prediction[0] == 0:
  print("The Person does not have alzheimer Disease")
else:
  print("The person has alzheimer.")

[0]
The Person does not have alzheimer Disease



X does not have valid feature names, but StandardScaler was fitted with feature names



In [None]:
import pickle

In [None]:
# Persist the trained classifier to disk.
# The context manager closes the file (and flushes the pickle) even if dump() raises;
# the previous bare open() left the handle open.
# NOTE(review): only `model` is saved. Predictions in this notebook also need the
# fitted `pt` and `scaler`, which are not persisted here — confirm whether they
# should be stored alongside the model.
filename = 'alzheimers_data.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Restore the classifier from disk; the context manager closes the file handle that
# the previous bare open() leaked.
# Unpickling can execute arbitrary code, so only load files produced by this notebook.
with open('alzheimers_data.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Repeat the single-record prediction with the classifier restored from disk.
# Assumes `input_data` holds one raw value per feature, in X.columns order.
# A named one-row frame lets the fitted transformers check the columns and avoids the
# "X does not have valid feature names" warning.
input_frame = pd.DataFrame([input_data], columns=X.columns, dtype=float)
# The model was trained on power-transformed, then standardised features, so the raw
# record must go through the same two steps (the power transform was previously skipped).
input_frame[numerical_columns] = pt.transform(input_frame[numerical_columns])
std_data = scaler.transform(input_frame)
prediction = loaded_model.predict(std_data)
print(prediction)

if prediction[0] == 0:
  print("The Person does not have alzheimers Disease")
else:
  print("The person has alzheimers.")