# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, silhouette_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier  # Importing CatBoost

# Read the raw records from disk and report the table's dimensions.
df = pd.read_csv("dataset.csv")
print("Initial Data Shape:", df.shape)

# Count rows that exactly repeat an earlier row. They are removed in place,
# and the follow-up message is printed, only when at least one was found.
duplicates = df.duplicated().sum()
print("Number of duplicate rows:", duplicates)
if duplicates:
    df.drop_duplicates(inplace=True)
    print("Duplicates dropped. New shape:", df.shape)

# Report how many values are missing in each column before imputing.
print("\nMissing values per column:")
print(df.isnull().sum())

# Replace missing numeric values with the column mean.
# NOTE(review): the imputer is fitted on the whole table, before the
# train/test split performed later in this script, so rows that end up in the
# test set contribute to the imputed means.
imputer = SimpleImputer(strategy='mean')
num_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
df[num_cols] = imputer.fit_transform(df[num_cols])

# Replace missing categorical values with the most frequent value.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: that chained in-place form operates on an
# intermediate object, emits a FutureWarning on pandas 2.x and leaves df
# unchanged when Copy-on-Write is enabled.
for col in ('SmokingHabit', 'EchoECG'):
    df[col] = df[col].fillna(df[col].mode()[0])

# Drop rows falling outside the 1.5 * IQR fences, one numeric column at a time.
# The filter is applied sequentially: each column's quartiles are computed on
# the rows that survived the previous columns' filters.
for col in num_cols:
    column = df[col]
    Q1, Q3 = column.quantile(0.25), column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # between() is inclusive on both ends, i.e. lower <= value <= upper.
    df = df[column.between(lower_bound, upper_bound)]

print("Shape after outlier removal:", df.shape)

# Encoding Categorical Features
# Each column gets its own LabelEncoder, kept in `label_encoders`, so the
# category -> integer mapping of every column stays available afterwards
# (label_encoders[col].classes_ / .transform(...)). Refitting one shared
# encoder would keep only the mapping of the last column processed.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'SmokingHabit', 'EchoECG']
label_encoders = {}
for col in categorical_cols:
    # `encoder` is rebound on every iteration; after the loop it refers to the
    # encoder fitted on the last column in categorical_cols.
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Separate the target from the predictors; 'Patient Name' is left out of the
# feature matrix together with the target column.
y = df['HeartDisease']
X = df.drop(['HeartDisease', 'Patient Name'], axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features: statistics are learned from the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit each classifier on the scaled training split, predict the held-out
# split, and record the resulting accuracy.

# Random forest
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

# Logistic regression
lr = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
lr_acc = accuracy_score(y_true=y_test, y_pred=y_pred_lr)

# XGBoost
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
).fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
xgb_acc = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)

# CatBoost (verbose=0 silences its per-iteration training log)
catboost = CatBoostClassifier(verbose=0, random_state=42).fit(X_train, y_train)
y_pred_cb = catboost.predict(X_test)
cb_acc = accuracy_score(y_true=y_test, y_pred=y_pred_cb)

# Cluster the scaled training features into two groups (unsupervised).
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans_labels_train = kmeans.fit_predict(X_train)

# The silhouette score is only computed when more than one distinct cluster
# label was assigned; otherwise the score falls back to 0.
if np.unique(kmeans_labels_train).size > 1:
    kmeans_silhouette = silhouette_score(X_train, kmeans_labels_train)
else:
    kmeans_silhouette = 0

# Collect every metric under a display label: four test-set accuracies plus
# the KMeans silhouette score.
results = {
    "Random Forest Accuracy": rf_acc,
    "Logistic Regression Accuracy": lr_acc,
    "XGBoost Accuracy": xgb_acc,
    "CatBoost Accuracy": cb_acc,
    "KMeans Silhouette Score": kmeans_silhouette
}

# Print one "label: value" line per metric, rounded to four decimals.
print("\nModel Performance:")
print(
    *(f"{model}: {score:.4f}" for model, score in results.items()),
    sep="\n",
)

# Selecting the Best Model
# Only the supervised classifiers are candidates. `results` also holds the
# KMeans silhouette score, which is not an accuracy and has no classifier
# behind it, so the maximum is taken over the classifier labels only. Each
# label maps directly to its fitted model, so a Logistic Regression win
# selects `lr` instead of falling through to another model.
# Ties resolve to the earliest entry, in the same order as `results`.
candidate_models = {
    "Random Forest Accuracy": rf,
    "Logistic Regression Accuracy": lr,
    "XGBoost Accuracy": xgb,
    "CatBoost Accuracy": catboost,
}
best_model = max(candidate_models, key=results.get)
final_model = candidate_models[best_model]

# Predict on User Input
def predict_heart_disease():
    """Prompt for one patient's feature values and print the prediction.

    One value is requested for every column of the module-level feature
    frame ``X``, in column order. Pressing Enter without typing anything
    uses the sample value for that column. The collected row is scaled
    with the module-level ``scaler`` and classified by ``final_model``.

    Categorical features must be entered as numbers, because the models
    were trained on label-encoded columns.

    Invalid entries (non-numeric text, or an empty entry for a column that
    has no sample value) are rejected with a message and the prompt is
    repeated instead of raising ``ValueError`` / ``KeyError``.

    Returns:
        None. The result is printed as "Yes" or "No".
    """
    sample_values = {
        "Age": 45,
        "Sex": 1,
        "ChestPainType": 2,
        "RestingBP": 120,
        "Cholesterol": 220,
        "FastingBS": 0,
        "RestingECG": 1,
        "MaxHR": 150,
        "ExerciseAngina": 0,
        "Oldpeak": 1.5,
        "ST_Slope": 2,
        "SmokingHabit": 1,
        "EchoECG": 1
    }

    user_data = []
    for col in X.columns:
        # Keep asking until this column has a usable numeric value.
        while True:
            value = input(f"Enter {col} (Sample: {sample_values.get(col, 'N/A')}): ").strip()
            if not value:
                # Empty entry: fall back to the sample value when one exists.
                if col in sample_values:
                    user_data.append(sample_values[col])
                    break
                print(f"No sample value is available for {col}; please enter a number.")
                continue
            try:
                user_data.append(float(value))
            except ValueError:
                print(f"Invalid number for {col}: {value!r}. Please try again.")
            else:
                break

    # Rebuild a one-row frame with the training column names so the scaler
    # receives the features in the layout it was fitted on.
    user_data = pd.DataFrame([user_data], columns=X.columns)
    user_data = scaler.transform(user_data)
    prediction = final_model.predict(user_data)[0]
    print("\nPredicted Heart Disease:", "Yes" if prediction == 1 else "No")

# Start the interactive prompt only when this file is executed as a script.
# The loading, training and evaluation statements above sit at module level
# and therefore run regardless of this guard.
if __name__ == "__main__":
    predict_heart_disease()
