# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Location of the CSV this analysis reads. Kept in one place so the error
# message below always names the path that was actually attempted.
DATA_PATH = '/content/healthcare-dataset-stroke-data (1).csv'

# Only the read itself can raise FileNotFoundError, so only it sits in `try`.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: '{DATA_PATH}' not found.")
    print("Please ensure the file is in the correct directory.")
    # `exit()` is an interactive-shell helper injected by `site`; raising
    # SystemExit works everywhere and signals failure with a non-zero status.
    raise SystemExit(1) from None

print("Dataset Loaded Successfully!")
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst 5 Rows:\n", df.head())

print("\n--- Basic Info ---")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n--- Summary Statistics ---")
print(df.describe(include='all'))

print("\n--- Target Variable Distribution (heart_disease) ---")
if 'heart_disease' not in df.columns:
    print("Error: 'heart_disease' column not found in the dataset.")
    raise SystemExit(1)
print(df['heart_disease'].value_counts(normalize=True))

print("\nMissing Values Before:\n", df.isnull().sum())
# Fill gaps in the target with its most frequent value. The result is assigned
# back to the column: a chained `df[col].fillna(..., inplace=True)` operates on
# an intermediate object and does not reliably update `df`.
# NOTE(review): only the target is imputed here; missing values in any other
# column are left as they are.
target_mode = df['heart_disease'].mode()
if not target_mode.empty:  # mode() is empty when every value is missing
    df['heart_disease'] = df['heart_disease'].fillna(target_mode.iloc[0])

print("\nMissing Values After:\n", df.isnull().sum())

def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of *data* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``data[column]``.

    Args:
        data: DataFrame to filter; it is not modified.
        column: Name of the column the fences are computed on.
        factor: Multiplier applied to the IQR; the default 1.5 matches the
            previous hard-coded value.

    Returns:
        An independent copy of the retained rows, so the caller can assign to
        it without writing into (or being warned about) a slice of *data*.

    Note:
        Rows where *column* is missing are dropped as well, because a missing
        value fails both fence comparisons.
    """
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    within_fences = (data[column] >= lower) & (data[column] <= upper)
    return data[within_fences].copy()

# Trim IQR outliers from each measurement in turn. The order matters: the
# second column's quartiles are computed on the rows that survived the first.
# A missing column is reported for both features, so a skipped step is never
# silent.
for col in ('bmi', 'avg_glucose_level'):
    if col in df.columns:
        df = remove_outliers_iqr(df, col)
    else:
        print(f"Warning: '{col}' column not found for outlier removal.")


# ----------------------------------------
# Data visualization: target balance, per-class box plots, correlations
# ----------------------------------------
if 'heart_disease' not in df.columns:
    print("Warning: 'heart_disease' column not found for visualization.")
else:
    # Class balance of the target.
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x='heart_disease')
    plt.title("Heart Disease Distribution")
    plt.show()

    # One box plot per feature, grouped by target class:
    # (feature column, plot title, message shown when the column is absent).
    boxplot_specs = (
        (
            'avg_glucose_level',
            "Avg Glucose Level vs Heart Disease",
            "Warning: 'avg_glucose_level' column not found for visualization.",
        ),
        (
            'bmi',
            "BMI vs Heart Disease",
            "Warning: 'bmi' column not found for visualization.",
        ),
    )
    for feature, title, missing_msg in boxplot_specs:
        if feature not in df.columns:
            print(missing_msg)
            continue
        plt.figure(figsize=(6, 4))
        sns.boxplot(data=df, x='heart_disease', y=feature)
        plt.title(title)
        plt.show()

    # The heatmap is drawn only when all three key columns are present; it
    # then covers every numeric column of the frame.
    heatmap_inputs = ('heart_disease', 'avg_glucose_level', 'bmi')
    if any(name not in df.columns for name in heatmap_inputs):
        print("Warning: Some columns required for correlation heatmap are missing.")
    else:
        plt.figure(figsize=(10, 6))
        correlations = df.corr(numeric_only=True)
        sns.heatmap(correlations, annot=True, cmap='coolwarm')
        plt.title("Correlation Heatmap")
        plt.show()

# Check for the target first: every step below needs it, and checking here
# gives the readable message instead of a KeyError from dropna().
if 'heart_disease' not in df.columns:
    print("Error: 'heart_disease' column not found after preprocessing.")
    # `exit()` is an interactive-shell helper; SystemExit works everywhere.
    raise SystemExit(1)

# Drop the row identifier if present. drop() returns a new frame, so the
# column assignments below never write into a filtered slice of an older one.
df = df.drop(columns=['id'], errors='ignore')

# Coerce the columns that must be numeric; unparseable entries become NaN.
# Done before encoding so a numeric column stored as text is parsed rather
# than label-encoded.
for col in ['age', 'hypertension', 'heart_disease']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Label-encode the known categorical columns plus every other non-numeric
# feature: the classifier is fitted on all columns of X, so any column left
# as text would make model.fit() fail.
categorical_cols = ['gender', 'ever_married']
existing_categorical_cols = [col for col in categorical_cols if col in df.columns]
remaining_text_cols = [
    col
    for col in df.select_dtypes(exclude=['number', 'bool']).columns
    if col not in existing_categorical_cols and col != 'heart_disease'
]
# One encoder per column, kept by name, so each mapping can be inverted later
# (a single shared encoder only remembers the last column it was fitted on).
label_encoders = {}
for col in existing_categorical_cols + remaining_text_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Keep only rows with a valid binary target.
df = df.dropna(subset=['heart_disease'])
df = df[df['heart_disease'].isin([0, 1])]

# Feature matrix and target vector.
X = df.drop('heart_disease', axis=1)
y = df['heart_disease']

# Split BEFORE scaling so the scaler's mean/variance come from the training
# rows only; fitting it on the full data would leak test-set statistics into
# the training features. Same seed, test fraction and stratification as
# before, so the row assignment to train/test is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Work on copies so the assignments below do not write into views of X.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
numeric_cols = X.select_dtypes(include=np.number).columns

# Full feature matrix scaled with the training statistics; kept under its
# original name for any later use. X itself is left unscaled.
X_scaled = X.copy()
if len(numeric_cols) > 0:  # StandardScaler rejects an empty column selection
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
    X_scaled[numeric_cols] = scaler.transform(X[numeric_cols])

print("\nTraining Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)

# Fit a random forest (fixed seed for reproducibility) and predict the
# held-out rows.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute the metrics first, then report them.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("\n Model Evaluation Results ")
print("Accuracy:", accuracy_pct, "%")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

# Bar chart of the forest's feature importances, largest first.
importances = pd.Series(model.feature_importances_, index=X.columns)
ranked_importances = importances.sort_values(ascending=False)
ranked_importances.plot(kind='bar', figsize=(10, 5))
plt.title("Feature Importance")
plt.show()