# Framingham Heart Disease Prediction Project

### 1. Load and Explore Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Framingham CSV (path is relative to the working directory)
csv_path = "framingham.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows
df.head(n=5)

### 2. Clean Data

In [None]:
# Per-column count of missing values in the raw data
print(df.isna().sum())

# Keep only rows where every column is populated
df_clean = df.dropna(axis=0, how="any")

# Every count printed here should now be zero
print(df_clean.isna().sum())

# Column dtypes and non-null counts of the cleaned frame
df_clean.info()

### 3. Visualize Glucose Distribution

In [None]:
# Histogram (30 bins) with a KDE overlay of the glucose column
plt.figure(figsize=(8, 5))
hist_ax = sns.histplot(df_clean['glucose'], kde=True, bins=30, color='blue')
hist_ax.set_title('Glucose Distribution')
hist_ax.set_xlabel('Glucose')
hist_ax.set_ylabel('Frequency')
plt.show()

# Box plot of glucose for each value of the TenYearCHD outcome
plt.figure(figsize=(8, 5))
box_ax = sns.boxplot(data=df_clean, x='TenYearCHD', y='glucose')
box_ax.set_title('Glucose Levels vs Heart Disease')
box_ax.set_xlabel('Heart Disease (0 = No, 1 = Yes)')
box_ax.set_ylabel('Glucose')
plt.show()

### 4. Feature Selection

In [None]:
# Section 7 adds a categorical 'glucose_range' column to df_clean. It is
# derived from 'glucose' and is not numeric, so it must not reach corr() or
# the feature matrix. Dropping it here (only if present) keeps this cell
# safe to re-run after section 7 has been executed.
model_frame = df_clean.drop(columns=['glucose_range'], errors='ignore')

# Correlation matrix of all remaining columns, including the target
plt.figure(figsize=(12, 10))
sns.heatmap(model_frame.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Features and target: every remaining column except the label is a feature
X = model_frame.drop('TenYearCHD', axis=1)
y = model_frame['TenYearCHD']

### 5. Train Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# One seed shared by every stochastic step. The split was already seeded;
# the tree-based models were not, so their metrics and feature importances
# changed from run to run.
RANDOM_STATE = 42

# Train/test split: hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Scale: fit on the training split only, then reuse the fitted scaler on the
# test split so no test-set statistics influence training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train models
log_model = LogisticRegression()
tree_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)

log_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

### 6. Evaluate Models

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Display name -> fitted estimator, in the order results are printed
models = {}
models["Logistic Regression"] = log_model
models["Decision Tree"] = tree_model
models["Random Forest"] = rf_model

# Score each model on the held-out test split
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)

    print(f"\n{name} Results:")

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    report = classification_report(y_test, y_pred)
    print(report)

### 7. Glucose Insight

In [None]:
# Group people by glucose ranges and see heart disease risk.
# The top bin is open-ended: with a finite upper edge of 200, pd.cut labels
# any higher reading NaN and the groupby below silently drops those rows
# from the 'High' group.
glucose_bins = [0, 80, 100, 125, np.inf]
glucose_labels = ['Low', 'Normal', 'Pre-diabetic', 'High']

# assign() rebinds df_clean to a new frame instead of writing a column into
# the frame returned by dropna(), which can trigger SettingWithCopyWarning.
df_clean = df_clean.assign(
    glucose_range=pd.cut(df_clean['glucose'], bins=glucose_bins, labels=glucose_labels)
)

# observed=False keeps every range in the result even if it has no rows, and
# makes explicit a default that newer pandas versions warn about.
glucose_risk = df_clean.groupby('glucose_range', observed=False)['TenYearCHD'].mean()

print("Heart Disease Rate by Glucose Range:")
print(glucose_risk)

### 8. Feature Importance

In [None]:
# Importance scores from the fitted Random Forest, labelled with the column
# names of the feature frame X
importances = rf_model.feature_importances_
features = X.columns

# Horizontal bar chart: one bar per feature
plt.figure(figsize=(10, 6))
importance_ax = sns.barplot(x=importances, y=features)
importance_ax.set_title("Feature Importances (Random Forest)")
plt.show()

## Conclusion
The analysis of this dataset showed that the Random Forest model performed well in predicting the 10-year risk of coronary heart disease. Glucose was also identified as a significant factor influencing that risk.