In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go
import plotly.express as px

# Load the heart-disease dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='heart.csv')



In [77]:
# Show the loaded frame as a quick sanity check of the CSV contents.
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


# <center><div style="font-family: Trebuchet MS; background-color: #00000; color: #545955; padding: 12px; line-height: 1;"> Exploratory Data Analysis </div></center>

In [17]:
# Preview the first five rows.
df.iloc[:5]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [51]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [53]:
# List the column labels available for feature selection.
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [19]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

# <center><div style="font-family: Trebuchet MS; background-color: #00000; color: #545955; padding: 12px; line-height: 1;">Visualization</div></center>

In [46]:
# Histogram of patient age (20 bins) with a rug plot in the margin.
fig = px.histogram(
    df,
    x='Age',
    nbins=20,
    color_discrete_sequence=['#2ca02c'],
    marginal='rug',
    title='Distribution of Age',
)

# Both main axes share one look: black text on a light-gray grid.
axis_style = dict(
    title_font=dict(color='black'),
    tickfont=dict(color='black'),
    showgrid=True,
    gridcolor='lightgray',
)

fig.update_layout(
    paper_bgcolor='white',
    plot_bgcolor='white',
    title={'text': 'Distribution of Age', 'font': {'color': 'black'}},
    xaxis_title='Age',
    yaxis_title='Counts',
    xaxis=axis_style,
    yaxis=axis_style,
    margin=dict(l=40, r=20, t=40, b=30),
)

fig.show()


In [49]:
# Number of patients per sex, drawn as a pie chart labelled with count and share.
sex_counts = df['Sex'].value_counts()
colors = ['#ff9999', '#66b3ff']

# update_traces returns the figure itself, so the label format can be chained on.
fig = px.pie(
    values=sex_counts,
    names=sex_counts.index,
    color_discrete_sequence=colors,
    title="Distribution of Gender",
    template='plotly_white',
).update_traces(textinfo='value+percent')

fig.show()


In [50]:
# Patients per (HeartDisease, Sex) pair, flattened into a tidy frame for plotly.
grouped_counts = (
    df.groupby('HeartDisease')['Sex']
    .value_counts()
    .reset_index(name='counts')
)
colors = ['#ff9999', '#66b3ff']

# Side-by-side bars: one group per heart-disease status, one bar per sex.
fig = px.bar(
    data_frame=grouped_counts,
    x='HeartDisease',
    y='counts',
    color='Sex',
    barmode='group',
    color_discrete_sequence=colors,
    title="Distribution of Sex by Heart Disease Status",
    template='plotly_white',
)
fig.show()


# <center><div style="font-family: Trebuchet MS; background-color: #00000; color: #545955; padding: 12px; line-height: 1;">Modelling</div></center>

In [61]:
# Separate the target column from the predictors.
y = df['HeartDisease']
X = df.drop(columns=['HeartDisease'])

In [62]:
# Partition predictor columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include='object').columns

In [63]:
# Standardise the numerical columns and one-hot encode the categorical ones;
# categories unseen at fit time are ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [64]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [65]:
# One pipeline per candidate classifier, each starting with the shared preprocessing step.
# Every estimator gets a fixed random_state so that the scores below (and the grid search
# that reuses these pipelines) are reproducible from run to run:
#   - RandomForest / GradientBoosting are stochastic by construction,
#   - SVC(probability=True) shuffles data internally for its probability estimates,
#   - LogisticRegression uses the seed under the 'liblinear' / 'saga' solvers tried during tuning.
models = {
    'Logistic Regression': Pipeline(steps=[('preprocessor', preprocessor),
                                           ('classifier', LogisticRegression(random_state=42))]),
    'Random Forest': Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', RandomForestClassifier(random_state=42))]),
    'Gradient Boosting': Pipeline(steps=[('preprocessor', preprocessor),
                                         ('classifier', GradientBoostingClassifier(random_state=42))]),
    'Support Vector Classifier': Pipeline(steps=[('preprocessor', preprocessor),
                                                 ('classifier', SVC(probability=True, random_state=42))])
}

In [66]:
# Fit every baseline pipeline on the training split and score it on the held-out split.
scores = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class (HeartDisease == 1).
    positive_proba = model.predict_proba(X_test)[:, 1]
    scores[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, positive_proba),
    }

# <center><div style="font-family: Trebuchet MS; background-color: #00000; color: #545955; padding: 12px; line-height: 1;">Accuracy before Tuning</div></center>

In [67]:
# Models as rows, metrics as columns.
scores_df = pd.DataFrame(scores).transpose()

print(scores_df)

                           Accuracy   ROC AUC
Logistic Regression        0.884058  0.945176
Random Forest              0.873188  0.942454
Gradient Boosting          0.873188  0.942890
Support Vector Classifier  0.884058  0.949750


In [68]:
# Rebuild the metrics table so this cell does not depend on the previous one.
scores_df = pd.DataFrame(scores).transpose()

# One bar series per metric, grouped by model.
fig = go.Figure(
    data=[
        go.Bar(x=scores_df.index, y=scores_df[metric], name=metric)
        for metric in scores_df.columns
    ]
)
fig.update_layout(
    title='Model Performance Comparison',
    xaxis_title='Model',
    yaxis_title='Score',
    barmode='group',
)

fig.show()


# <center><div style="font-family: Trebuchet MS; background-color: #00000; color: #545955; padding: 12px; line-height: 1;">Tuning Model Parameters</div></center>

In [54]:
from sklearn.model_selection import GridSearchCV

# Search space per model. Keys follow the pipeline's "<step>__<param>" convention,
# so every entry targets a parameter of the 'classifier' step.
param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['liblinear', 'saga'],
    },
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
    },
    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
    },
    'Support Vector Classifier': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto'],
    },
}

# Exhaustive 5-fold grid search per pipeline on the training split, ranked by ROC AUC.
best_models = {}
for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    grid_search = GridSearchCV(
        model,
        param_grids[model_name],
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

# Score each tuned pipeline on the held-out split, with the same metrics as the baseline run.
tuned_scores = {}
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]
    tuned_scores[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, positive_proba),
    }




Tuning Logistic Regression...
Best parameters for Logistic Regression: {'classifier__C': 0.1, 'classifier__solver': 'liblinear'}
Tuning Random Forest...
Best parameters for Random Forest: {'classifier__max_depth': 30, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 50}
Tuning Support Vector Classifier...
Best parameters for Support Vector Classifier: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}


# <center><div style="font-family: Trebuchet MS; background-color: #00000; color: #545955; padding: 12px; line-height: 1;">Accuracy After Tuning</div></center>

In [55]:
# Tabulate the post-tuning metrics (models as rows, metrics as columns).
# This chart belongs to the "after tuning" section, so it must read from `tuned_scores`;
# plotting `scores_df` here would simply repeat the pre-tuning chart.
tuned_scores_df = pd.DataFrame(tuned_scores).T

fig = go.Figure()

fig.add_trace(go.Bar(
    x=tuned_scores_df.index,
    y=tuned_scores_df['Accuracy'],
    name='Accuracy',
    marker_color='royalblue'
))

fig.add_trace(go.Bar(
    x=tuned_scores_df.index,
    y=tuned_scores_df['ROC AUC'],
    name='ROC AUC',
    marker_color='lightcoral'
))

fig.update_layout(
    title='Model Performance Comparison',
    xaxis_title='Model',
    yaxis_title='Score',
    barmode='group'
)

fig.show()

In [56]:
# Build the post-tuning metrics table here so the cell works on a fresh kernel
# (no other cell in the notebook is guaranteed to have defined it).
tuned_scores_df = pd.DataFrame(tuned_scores).T
print(tuned_scores_df)

                           Accuracy   ROC AUC
Logistic Regression        0.887681  0.944196
Random Forest              0.898551  0.946755
Gradient Boosting          0.869565  0.945993
Support Vector Classifier  0.869565  0.938589


# <center><div style="font-family: Trebuchet MS; background-color: #00000; color: #545955; padding: 12px; line-height: 1;">Remodelling Based on Tuned Parameters</div></center>

In [73]:
from joblib import dump

# Rebuild predictors / target and the same 70/30 split (same seed) used for tuning.
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessor on the training split only and persist it on its own.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
dump(preprocessor, 'preprocessor.joblib')

# Final pipelines, using the best hyper-parameters reported by the grid search.
# random_state is fixed so the saved artifacts can be regenerated exactly.
log_reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(C=0.1, solver='liblinear', random_state=42))
])
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(max_depth=30, min_samples_split=10,
                                          n_estimators=200, random_state=42))
])
gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(learning_rate=0.1, max_depth=3,
                                              n_estimators=50, random_state=42))
])
svc_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(C=0.1, gamma='scale', kernel='linear'))
])

# Train on the training split only. Fitting on the full X, y would (a) leak the held-out
# rows into the saved models, making any later evaluation on X_test optimistic, and
# (b) leave the models' embedded preprocessor inconsistent with the standalone
# 'preprocessor.joblib' saved above, which is fitted on X_train.
for pipeline, model_path in (
    (log_reg_pipeline, 'logistic_regression_model.joblib'),
    (random_forest_pipeline, 'random_forest_model.joblib'),
    (gradient_boosting_pipeline, 'gradient_boosting_model.joblib'),
    (svc_pipeline, 'svc_model.joblib'),
):
    pipeline.fit(X_train, y_train)
    dump(pipeline, model_path)

print("Models have been trained and saved successfully.")

Models have been trained and saved successfully.


In [72]:
from joblib import dump

# Build the scaler/encoder transformer and fit it on the training split in one step
# (fit returns the fitted transformer itself).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
).fit(X_train)

# Persist the fitted preprocessor; dump's return value is the cell's output.
dump(preprocessor, 'preprocessor.joblib')


['preprocessor.joblib']

In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from joblib import dump
# Reload the dataset so this cell can run on its own.
df = pd.read_csv('heart.csv')
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Define features
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Split data (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocessing: scale numeric columns, one-hot encode categorical ones.
# handle_unknown='ignore' matches the encoders used earlier in the notebook, so the
# persisted pipelines do not raise on a category that was absent from the training split.
# With drop='first', such an unknown value is encoded as all zeros, i.e. the same
# encoding as the dropped first category.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

# Define pipelines with the tuned hyper-parameters.
# random_state is fixed on the estimators that use randomness so that the saved
# models and the evaluation reports below are reproducible across runs.
log_reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(C=0.1, solver='liblinear', random_state=42))
])
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(max_depth=30, min_samples_split=10,
                                          n_estimators=200, random_state=42))
])
gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(learning_rate=0.1, max_depth=3,
                                              n_estimators=50, random_state=42))
])
svc_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(C=0.1, gamma='scale', kernel='linear'))
])

# Each pipeline paired with the file it is persisted to.
pipeline_files = (
    (log_reg_pipeline, 'logistic_regression_model.joblib'),
    (random_forest_pipeline, 'random_forest_model.joblib'),
    (gradient_boosting_pipeline, 'gradient_boosting_model.joblib'),
    (svc_pipeline, 'svc_model.joblib'),
)

# Train all pipelines on the training split first...
for pipeline, _ in pipeline_files:
    pipeline.fit(X_train, y_train)

# ...then write the fitted pipelines, followed by the preprocessor they share.
for pipeline, model_path in pipeline_files:
    dump(pipeline, model_path)
dump(preprocessor, 'preprocessor.joblib')

# Run the fitted preprocessor over the training split, then record its output column
# names: the scaler's names first, the one-hot encoder's names second.
X_train_encoded = preprocessor.transform(X_train)
numeric_names = preprocessor.named_transformers_['num'].get_feature_names_out()
encoded_names = preprocessor.named_transformers_['cat'].get_feature_names_out()
feature_columns = [*numeric_names, *encoded_names]
dump(feature_columns, 'feature_columns.joblib')

# Evaluate models
def evaluate_model(model, X_test, y_test):
    """Print a classification report for a fitted pipeline.

    Args:
        model: Fitted pipeline exposing ``predict`` and a ``named_steps`` mapping
            with a ``'classifier'`` entry (used only for the report heading).
        X_test: Feature rows to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        None. The heading and the report are written to stdout.
    """
    predictions = model.predict(X_test)
    classifier_name = type(model.named_steps['classifier']).__name__
    print(f"Evaluation report for {classifier_name}:")
    print(classification_report(y_test, predictions))

# Report held-out performance for each fitted pipeline, in the order they were defined.
for fitted_pipeline in (
    log_reg_pipeline,
    random_forest_pipeline,
    gradient_boosting_pipeline,
    svc_pipeline,
):
    evaluate_model(fitted_pipeline, X_test, y_test)

print("Models, preprocessor, and feature columns have been saved successfully.")

Evaluation report for LogisticRegression:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       112
           1       0.92      0.87      0.89       164

    accuracy                           0.87       276
   macro avg       0.87      0.87      0.87       276
weighted avg       0.88      0.87      0.87       276

Evaluation report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.86      0.85      0.85       112
           1       0.90      0.90      0.90       164

    accuracy                           0.88       276
   macro avg       0.88      0.88      0.88       276
weighted avg       0.88      0.88      0.88       276

Evaluation report for GradientBoostingClassifier:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       112
           1       0.89      0.90      0.89       164

    accuracy                           0.87 

In [78]:
# Inspect which columns were selected as numeric (int64/float64) features.
numeric_features

['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

In [79]:
# Inspect the feature matrix (all columns except the HeartDisease target).
X

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


In [80]:
# Inspect the target series (HeartDisease).
y

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64