In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
dataset = pd.read_csv("alzheimers_disease_data.csv")

dataset.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [4]:
dataset.describe()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,5825.0,74.908795,0.506282,0.697534,1.286645,27.655697,0.288506,10.039442,4.920202,4.993138,...,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536,0.353653
std,620.507185,8.990221,0.500077,0.996128,0.904527,7.217438,0.453173,5.75791,2.857191,2.909055,...,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032,0.478214
min,4751.0,60.0,0.0,0.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,...,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0,0.0
25%,5288.0,67.0,0.0,0.0,1.0,21.611408,0.0,5.13981,2.570626,2.458455,...,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0,0.0
50%,5825.0,75.0,1.0,0.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,...,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0,0.0
75%,6362.0,83.0,1.0,1.0,2.0,33.869778,1.0,15.157931,7.427899,7.558625,...,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0,1.0
max,6899.0,90.0,1.0,3.0,3.0,39.992767,1.0,19.989293,9.987429,9.998346,...,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
print("Missing values per column:\n", dataset.isnull().sum())


Missing values per column:
 PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyC

In [6]:
# Drop columns that are not needed for analysis (e.g., administrative columns)
if 'DoctorInCharge' in dataset.columns:
    dataset.drop(columns=['DoctorInCharge'], inplace=True)

In [11]:
# Encode categorical variables if needed
# For example, convert 'Diagnosis' to numeric if it is not already
le = LabelEncoder()
if dataset['Diagnosis'].dtype == 'object':
    dataset['Diagnosis'] = le.fit_transform(dataset['Diagnosis'])

In [12]:
# Age Distribution Histogram
fig_age = px.histogram(dataset, x="Age", nbins=20, title="Age Distribution",
                       labels={"Age": "Age", "count": "Frequency"})
fig_age.show()

# BMI Distribution Histogram
fig_bmi = px.histogram(dataset, x="BMI", nbins=20, title="BMI Distribution",
                       labels={"BMI": "BMI", "count": "Frequency"})
fig_bmi.show()

# Count Plot for Gender
fig_gender = px.histogram(dataset, x="Gender", title="Gender Distribution",
                          category_orders={"Gender": sorted(dataset["Gender"].unique())},
                          labels={"Gender": "Gender", "count": "Count"})
fig_gender.show()


In [13]:
# Correlation Matrix Heatmap
corr_matrix = dataset.corr()
fig_corr = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                     title="Correlation Matrix", color_continuous_scale="RdBu")
fig_corr.show()

# Scatter Plot: Age vs. MMSE Score
fig_scatter = px.scatter(dataset, x="Age", y="MMSE", title="Age vs MMSE Score",
                         labels={"Age": "Age", "MMSE": "MMSE Score"})
fig_scatter.show()

In [14]:
# ------------------------------
# Modeling & Model Evaluation
# ------------------------------

# Define features (X) and target (y)
# Dropping PatientID (identifier) and Diagnosis from X
X = dataset.drop(columns=['PatientID', 'Diagnosis'])
y = dataset['Diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build and train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model with a classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Create a confusion matrix and display it using Plotly
cm = confusion_matrix(y_test, y_pred)
fig_cm = px.imshow(cm, text_auto=True, color_continuous_scale="Blues",
                     labels=dict(x="Predicted", y="Actual", color="Count"),
                     x=["Predicted 0", "Predicted 1"],
                     y=["Actual 0", "Actual 1"],
                     title="Confusion Matrix")
fig_cm.show()

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.87       277
           1       0.79      0.71      0.75       153

    accuracy                           0.83       430
   macro avg       0.82      0.80      0.81       430
weighted avg       0.83      0.83      0.83       430



In [16]:
# =============================================================================
# CMPUT 195 Mini Project: Alzheimer's Disease Dataset Analysis
# =============================================================================

# 1. Data Acquisition & Preprocessing
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import ttest_ind

# Load the dataset
dataset = pd.read_csv("alzheimers_disease_data.csv")

# Display initial records and summary statistics
print("----- First Five Rows -----")
print(dataset.head())
print("\n----- Summary Statistics -----")
print(dataset.describe())

# Check for missing values
print("\n----- Missing Values per Column -----")
print(dataset.isnull().sum())

# Drop administrative columns (e.g., DoctorInCharge)
if 'DoctorInCharge' in dataset.columns:
    dataset.drop(columns=['DoctorInCharge'], inplace=True)

# Encode categorical variables if needed (e.g., 'Diagnosis')
le = LabelEncoder()
if dataset['Diagnosis'].dtype == 'object':
    dataset['Diagnosis'] = le.fit_transform(dataset['Diagnosis'])

# -----------------------------------------------------------------------------
# 2. Exploratory Data Analysis (EDA) with Plotly
# -----------------------------------------------------------------------------

# Age Distribution Histogram
fig_age = px.histogram(dataset, x="Age", nbins=20, title="Age Distribution",
                       labels={"Age": "Age", "count": "Frequency"})
fig_age.show()

# BMI Distribution Histogram
fig_bmi = px.histogram(dataset, x="BMI", nbins=20, title="BMI Distribution",
                       labels={"BMI": "BMI", "count": "Frequency"})
fig_bmi.show()

# Count Plot for Gender
fig_gender = px.histogram(dataset, x="Gender", title="Gender Distribution",
                          category_orders={"Gender": sorted(dataset["Gender"].unique())},
                          labels={"Gender": "Gender", "count": "Count"})
fig_gender.show()

# Correlation Matrix Heatmap
corr_matrix = dataset.corr()
fig_corr = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                     title="Correlation Matrix", color_continuous_scale="RdBu")
fig_corr.show()

# Scatter Plot: Age vs. MMSE Score
fig_scatter = px.scatter(dataset, x="Age", y="MMSE", title="Age vs MMSE Score",
                         labels={"Age": "Age", "MMSE": "MMSE Score"})
fig_scatter.show()

# Box Plot: MMSE Score by Diagnosis Group
fig_box = px.box(dataset, x="Diagnosis", y="MMSE", title="MMSE Scores by Diagnosis Group",
                 labels={"Diagnosis": "Diagnosis (0/1)", "MMSE": "MMSE Score"})
fig_box.show()

# -----------------------------------------------------------------------------
# 3. Hypothesis Testing: Comparing MMSE Scores between Diagnosis Groups
# -----------------------------------------------------------------------------
group0 = dataset[dataset['Diagnosis'] == 0]['MMSE']
group1 = dataset[dataset['Diagnosis'] == 1]['MMSE']

t_stat, p_value = ttest_ind(group0, group1)
print("\n----- Hypothesis Testing: MMSE Difference -----")
print(f"T-statistic: {t_stat:.3f}, p-value: {p_value:.3f}")
if p_value < 0.05:
    print("Result: Significant difference in MMSE scores between groups (p < 0.05).")
else:
    print("Result: No significant difference in MMSE scores between groups (p >= 0.05).")

# -----------------------------------------------------------------------------
# 4. Modeling & Model Evaluation
# -----------------------------------------------------------------------------
# Prepare features (X) and target variable (y)
X = dataset.drop(columns=['PatientID', 'Diagnosis'])
y = dataset['Diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features (for models sensitive to scale, e.g., Logistic Regression and SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# Model 1: Logistic Regression
# ---------------------------
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)
y_pred_logreg = logreg.predict(X_test_scaled)
print("\n----- Logistic Regression Classification Report -----")
print(classification_report(y_test, y_pred_logreg))
cm_logreg = confusion_matrix(y_test, y_pred_logreg)
fig_cm_logreg = px.imshow(cm_logreg, text_auto=True, color_continuous_scale="Blues",
                          labels=dict(x="Predicted", y="Actual", color="Count"),
                          x=["Predicted 0", "Predicted 1"],
                          y=["Actual 0", "Actual 1"],
                          title="Logistic Regression: Confusion Matrix")
fig_cm_logreg.show()

# Visualize Logistic Regression Coefficients (Feature Importance)
coef = logreg.coef_[0]
features = X.columns
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': coef})
coef_df = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)
fig_coef = px.bar(coef_df, x='Feature', y='Coefficient',
                  title="Logistic Regression Feature Coefficients")
fig_coef.show()

# ---------------------------
# Model 2: Random Forest Classifier
# ---------------------------
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)  # Tree-based; scaling not strictly required
y_pred_rf = rf.predict(X_test)
print("\n----- Random Forest Classification Report -----")
print(classification_report(y_test, y_pred_rf))
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig_cm_rf = px.imshow(cm_rf, text_auto=True, color_continuous_scale="Blues",
                      labels=dict(x="Predicted", y="Actual", color="Count"),
                      x=["Predicted 0", "Predicted 1"],
                      y=["Actual 0", "Actual 1"],
                      title="Random Forest: Confusion Matrix")
fig_cm_rf.show()

# Feature Importance for Random Forest
feat_importance = rf.feature_importances_
feat_imp_df = pd.DataFrame({'Feature': X.columns, 'Importance': feat_importance})
feat_imp_df = feat_imp_df.sort_values(by='Importance', ascending=False)
fig_rf_imp = px.bar(feat_imp_df, x='Feature', y='Importance',
                    title="Random Forest Feature Importance")
fig_rf_imp.show()

# Optional: Hyperparameter tuning for Random Forest using GridSearchCV
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42),
                       param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train, y_train)
print("\n----- Best Parameters from GridSearchCV (Random Forest) -----")
print(grid_rf.best_params_)
print("Best CV Score:", grid_rf.best_score_)

# ---------------------------
# Model 3: Support Vector Machine (SVM)
# ---------------------------
svm = SVC(kernel='rbf', probability=True, random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)
print("\n----- SVM Classification Report -----")
print(classification_report(y_test, y_pred_svm))
cm_svm = confusion_matrix(y_test, y_pred_svm)
fig_cm_svm = px.imshow(cm_svm, text_auto=True, color_continuous_scale="Blues",
                       labels=dict(x="Predicted", y="Actual", color="Count"),
                       x=["Predicted 0", "Predicted 1"],
                       y=["Actual 0", "Actual 1"],
                       title="SVM: Confusion Matrix")
fig_cm_svm.show()

# =============================================================================
# End of Analysis Pipeline
# =============================================================================

# Interpretation & Conclusion:
# - The hypothesis testing confirms a significant difference in MMSE scores between diagnosis groups.
# - Logistic Regression shows strong overall performance (83% accuracy), though it may miss some instances of class 1.
# - Consider comparing these results with Random Forest and SVM to see if they improve recall or overall performance.
# - Document the feature importances, model comparisons, and any limitations for your final report.


----- First Five Rows -----
   PatientID  Age  Gender  Ethnicity  EducationLevel        BMI  Smoking  \
0       4751   73       0          0               2  22.927749        0   
1       4752   89       0          0               0  26.827681        0   
2       4753   73       0          3               1  17.795882        0   
3       4754   74       1          0               1  33.800817        1   
4       4755   89       0          0               0  20.716974        0   

   AlcoholConsumption  PhysicalActivity  DietQuality  ...  MemoryComplaints  \
0           13.297218          6.327112     1.347214  ...                 0   
1            4.542524          7.619885     0.518767  ...                 0   
2           19.555085          7.844988     1.826335  ...                 0   
3           12.209266          8.428001     7.435604  ...                 0   
4           18.454356          6.310461     0.795498  ...                 0   

   BehavioralProblems       ADL  Confusi


----- Hypothesis Testing: MMSE Difference -----
T-statistic: 11.310, p-value: 0.000
Result: Significant difference in MMSE scores between groups (p < 0.05).

----- Logistic Regression Classification Report -----
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       277
           1       0.79      0.71      0.75       153

    accuracy                           0.83       430
   macro avg       0.82      0.80      0.81       430
weighted avg       0.83      0.83      0.83       430




----- Random Forest Classification Report -----
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       277
           1       0.96      0.82      0.89       153

    accuracy                           0.93       430
   macro avg       0.94      0.90      0.92       430
weighted avg       0.93      0.93      0.92       430




----- Best Parameters from GridSearchCV (Random Forest) -----
{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Best CV Score: 0.933092074038918

----- SVM Classification Report -----
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       277
           1       0.80      0.71      0.75       153

    accuracy                           0.83       430
   macro avg       0.82      0.80      0.81       430
weighted avg       0.83      0.83      0.83       430

