In [None]:
# Basic libraries
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ML model and metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier


In [None]:
# Mount Google Drive at /content/drive; the dataset, model and report paths
# used by the later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [None]:

# List the top level of the mounted Drive (sanity check that the mount is visible).
!ls /content/drive/MyDrive/


In [None]:
# List the project folder that holds the dataset read by the following cells.
!ls '/content/drive/MyDrive/DE_Mini_Project/'


In [None]:
import pandas as pd

# Load dataset
data = pd.read_csv('/content/drive/MyDrive/DE_Mini_Project/Heart_Diseased_Dataset.csv')

# Set options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Preview only the first rows. With display.max_rows set to None, displaying
# the whole frame would render every record into the notebook output, which
# bloats the file and can expose patient-level data when it is shared.
print("Dataset shape:", data.shape)
display(data.head(10))


In [None]:
# Summary info and statistics
# A notebook cell only renders its LAST bare expression, so describe() must be
# passed to display() explicitly or its table is computed and thrown away.
data.info()                 # prints column dtypes and non-null counts
display(data.describe())    # numeric summary statistics
data.head()                 # first rows (rendered as the cell output)


In [None]:
# Missing values per column. Shown via display() because a bare expression
# that is not the last line of the cell produces no output.
display(data.isnull().sum())

# Number of fully duplicated rows (rendered as the cell output).
data.duplicated().sum()


In [None]:
# Remove exact duplicate rows, then forward-fill any remaining gaps.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in current pandas releases and emits a FutureWarning.
# Note: forward-fill cannot fill a gap in the very first row of a column.
data = data.drop_duplicates()
data = data.ffill()  # or handle manually


In [None]:
# Re-check row count, dtypes and non-null counts after the cleaning step.
data.info()


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd

# Pull the label column out first, then keep every remaining column as a feature.
y = data['target']
X = data.drop(columns=['target'])

# Report how many rows are available and how the two classes are balanced.
print("Original dataset size:", len(data))
print(y.value_counts())


In [None]:
import pandas as pd

# Inspect the cleaned frame instead of re-reading the CSV.
# Loading the file again at this point would overwrite `data` and silently
# undo the duplicate removal and forward-fill applied earlier, so the model
# below would be trained and evaluated on raw rows (duplicates included, which
# can land on both sides of the train/test split and inflate the accuracy).

# Check shape and info
print("Dataset shape:", data.shape)
data.info()


In [None]:
# Class balance of the label column (number of rows per target value).
data['target'].value_counts()


In [None]:
# Step 1: Import Libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Separate label and features, then hold out 20% of the rows for testing
y = data['target']
X = data.drop(columns=['target'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Configure the gradient-boosted classifier
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = xgb.XGBClassifier(**xgb_params)

# Step 4: Fit on the training split
model.fit(X_train, y_train)

# Step 5: Predict labels for the held-out split
y_pred = model.predict(X_test)

# Step 6: Score the predictions against the true labels
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("✅ Model Accuracy:", round(acc * 100, 2), "%")
print("\n📊 Confusion Matrix:\n", cm)
print("\n📈 Classification Report:\n", report)


In [None]:
# Plot per-feature importance of the fitted model, ranked by the 'gain' importance type.
xgb.plot_importance(model, importance_type='gain')


In [None]:
import numpy as np

# Example: new patient data, keyed by feature name rather than by position.
patient_record = {
    'age': 58,
    'sex': 1,
    'cp': 2,
    'trestbps': 130,
    'chol': 250,
    'fbs': 0,
    'restecg': 1,
    'thalach': 160,
    'exang': 0,
    'oldpeak': 1.0,
    'slope': 2,
    'ca': 0,
    'thal': 2,
}

# Selecting by the training columns puts the values in exactly the order the
# model was fitted on and raises a KeyError if a feature name is missing.
# A bare positional array carries no names, so a column-order mismatch would
# go unnoticed and produce a wrong prediction.
new_patient = pd.DataFrame([patient_record])[list(X_train.columns)]

# Predict risk
prediction = model.predict(new_patient)

if prediction[0] == 1:
    print("⚠️ High-Risk Patient — Immediate medical attention recommended.")
else:
    print("✅ Low-Risk Patient — Normal condition, routine monitoring advised.")


In [None]:
import pandas as pd

# Three illustrative patients, one tuple per patient, using the same feature
# columns as the training data (everything except 'target').
feature_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
patient_rows = [
    (58, 1, 2, 130, 250, 0, 1, 160, 0, 1.0, 2, 0, 2),
    (45, 0, 3, 120, 210, 0, 0, 180, 0, 0.2, 2, 0, 3),
    (63, 1, 1, 140, 260, 1, 1, 150, 1, 2.3, 1, 1, 3),
]
sample_data = pd.DataFrame(patient_rows, columns=feature_names)

# Write the sample to Drive without the row index so it reloads with the same columns
sample_data.to_csv('/content/drive/MyDrive/DE_Mini_Project/New_Patient_Data.csv', index=False)

print("✅ Sample patient file created successfully!")


In [None]:
# Read the patient file back from Drive and score every row with the model.
new_data = pd.read_csv('/content/drive/MyDrive/DE_Mini_Project/New_Patient_Data.csv')
predictions = model.predict(new_data)

# Attach the predicted class as a new column and persist the scored table.
new_data = new_data.assign(Risk_Prediction=predictions)
new_data.to_csv(
    '/content/drive/MyDrive/DE_Mini_Project/Predicted_Patient_Risks.csv',
    index=False,
)

print("✅ Predictions saved successfully!")
new_data


In [None]:
# Remove every column this notebook derives from the model, so only the
# original feature columns are passed to predict_proba. Both columns must be
# dropped: after the first run `new_data` already carries 'Risk_Probability',
# and leaving it in would hand the model an extra, unknown feature when the
# cell is executed again.
derived_columns = ['Risk_Prediction', 'Risk_Probability']
new_data_features = new_data.drop(columns=derived_columns, errors='ignore')

# Probability of the positive class (column 1 of predict_proba)
probabilities = model.predict_proba(new_data_features)[:, 1]

# Add probability column
new_data['Risk_Probability'] = probabilities

# Display results
new_data[['age', 'sex', 'chol', 'thalach', 'oldpeak', 'Risk_Prediction', 'Risk_Probability']]


In [None]:
# Bar chart of how many patients fall into each predicted class.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x='Risk_Prediction', data=new_data, palette='coolwarm', ax=ax)
ax.set_title('Overall Patient Risk Distribution')
ax.set_xlabel('Predicted Risk (0 = Low, 1 = High)')
ax.set_ylabel('Count of Patients')
plt.show()


In [None]:
# Histogram (with a KDE overlay) of the predicted positive-class probabilities.
fig, ax = plt.subplots(figsize=(7,4))
sns.histplot(new_data['Risk_Probability'], bins=20, kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Predicted Risk Probabilities')
ax.set_xlabel('Risk Probability')
ax.set_ylabel('Number of Patients')
plt.show()


In [None]:
# Ten rows with the highest predicted probability, highest first.
top_risk_patients = new_data.sort_values(by='Risk_Probability', ascending=False).head(10)

# Use the row index as a STRING label for the bar axis. With a numeric index
# both x and y are numeric, and seaborn then defaults to vertical bars that
# treat the probabilities as categories and the index as the bar height.
# String labels plus orient='h' give one horizontal bar per patient whose
# length is the predicted probability.
plot_frame = top_risk_patients.assign(Patient=top_risk_patients.index.astype(str))

plt.figure(figsize=(10,5))
sns.barplot(x='Risk_Probability', y='Patient', data=plot_frame, orient='h', palette='Reds_r')
plt.title('Top 10 Predicted High-Risk Patients')
plt.xlabel('Predicted Risk Probability')
plt.ylabel('Patient Index')
plt.show()


In [None]:
# Scatter of age against predicted probability, coloured by the predicted class.
fig, ax = plt.subplots(figsize=(7,5))
sns.scatterplot(
    x='age',
    y='Risk_Probability',
    hue='Risk_Prediction',
    data=new_data,
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Age vs Predicted Risk Probability')
ax.set_xlabel('Age')
ax.set_ylabel('Risk Probability')
ax.legend(title='Risk Level')
plt.show()


In [None]:
import joblib

# Persist the fitted classifier to Drive so it can be reloaded without retraining.
model_path = '/content/drive/MyDrive/DE_Mini_Project/patient_risk_model.pkl'
joblib.dump(model, model_path)

print("✅ Model saved successfully at: /content/drive/MyDrive/DE_Mini_Project/patient_risk_model.pkl")


In [None]:
import joblib

# Load the saved model
# This rebinds `model` to the object stored in the file written by the save cell.
# NOTE(review): joblib files are pickle-based, so only load files you created
# or otherwise trust.
model = joblib.load('/content/drive/MyDrive/DE_Mini_Project/patient_risk_model.pkl')

print("✅ Model loaded successfully and ready for predictions!")


In [None]:
import matplotlib.pyplot as plt

# Plain-matplotlib histogram of the predicted positive-class probabilities.
fig, ax = plt.subplots(figsize=(6,4))
ax.hist(new_data['Risk_Probability'], bins=20, color='skyblue', edgecolor='black')
ax.set_title("Predicted Patient Risk Probability Distribution")
ax.set_xlabel("Risk Probability")
ax.set_ylabel("Number of Patients")
plt.show()


In [None]:
!pip install matplotlib seaborn joblib pandas numpy scikit-learn jinja2


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
from jinja2 import Template

# --- 1️⃣ Evaluate model ---
# Score the current `model` on the held-out split; the report is requested as
# a dict so the HTML template can iterate over its per-class metrics.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    output_dict=True,
)

# --- 2️⃣ Save confusion matrix plot ---
# Draw the matrix as an annotated heatmap, write it next to the report, and
# close the figure so it is not also rendered inline.
class_labels = ['No Disease', 'High Risk']
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/DE_Mini_Project/confusion_matrix.png')
plt.close(fig)

# --- 3️⃣ Save risk probability distribution ---
# Histogram with KDE of the predicted probabilities, written to Drive for the
# HTML report and then closed so it is not also rendered inline.
fig, ax = plt.subplots(figsize=(6,4))
sns.histplot(new_data['Risk_Probability'], bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title('Predicted Patient Risk Probability Distribution')
ax.set_xlabel('Risk Probability')
ax.set_ylabel('Number of Patients')
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/DE_Mini_Project/risk_distribution.png')
plt.close(fig)

# --- 4️⃣ Prepare HTML template ---
# Jinja2 template for the report. The two <img> tags use relative file names,
# so the HTML must sit in the same folder as the PNGs saved above.
# Accuracy is converted to a percentage BEFORE rounding: a filter binds tighter
# than `*`, so `accuracy | round(4) * 100` multiplies the rounded value and can
# print float artifacts such as 57.379999999999995.
html_template = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Predictive Patient Risk Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; background-color: #f7f9fc; }
        h1, h2 { color: #2b4c7e; }
        table { border-collapse: collapse; width: 80%; margin: 20px 0; }
        th, td { border: 1px solid #aaa; padding: 8px; text-align: center; }
        th { background-color: #dde7f0; }
        img { max-width: 600px; margin-top: 15px; border-radius: 10px; box-shadow: 0 0 8px rgba(0,0,0,0.2); }
        .highlight { background-color: #eaf4ff; padding: 10px; border-left: 5px solid #2b4c7e; }
    </style>
</head>
<body>
    <h1>Predictive Patient Risk Report</h1>

    <div class="highlight">
        <h2>Model Performance</h2>
        <p><b>Accuracy:</b> {{ (accuracy * 100) | round(2) }} %</p>
    </div>

    <h2>Confusion Matrix</h2>
    <img src="confusion_matrix.png" alt="Confusion Matrix">

    <h2>Classification Report</h2>
    <table>
        <tr><th>Class</th><th>Precision</th><th>Recall</th><th>F1-score</th><th>Support</th></tr>
        {% for label, metrics in report.items() if label in ['0','1'] %}
        <tr>
            <td>{{ label }}</td>
            <td>{{ metrics['precision'] | round(2) }}</td>
            <td>{{ metrics['recall'] | round(2) }}</td>
            <td>{{ metrics['f1-score'] | round(2) }}</td>
            <td>{{ metrics['support'] }}</td>
        </tr>
        {% endfor %}
    </table>

    <h2>Risk Probability Distribution</h2>
    <img src="risk_distribution.png" alt="Risk Distribution">

    <h2>Sample Predicted Patient Risks</h2>
    {{ new_data.head(10).to_html(index=False) }}

    <p><i>Report auto-generated using Python (XGBoost, Pandas, Matplotlib, Seaborn, Jinja2)</i></p>
</body>
</html>
"""

# --- 5️⃣ Render HTML ---
# Fill the template with the metrics computed above and the scored patients table.
template = Template(html_template)
html_output = template.render(
    accuracy=accuracy,
    report=report,
    new_data=new_data
)

# --- 6️⃣ Save final report ---
# Write with an explicit UTF-8 encoding so the bytes on disk match the charset
# declared in the page, whatever the platform's default encoding is.
with open('/content/drive/MyDrive/DE_Mini_Project/Patient_Risk_Report.html', 'w', encoding='utf-8') as f:
    f.write(html_output)

print("✅ HTML Report saved successfully at: /content/drive/MyDrive/DE_Mini_Project/Patient_Risk_Report.html")
