In [1]:
import pandas as pd 
import joblib
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# load Data

In [2]:
df = pd.read_csv('..\\data\\heart_disease.csv')
df.head()

Unnamed: 0,PC1,PC5,PC6,PC10,PC7,PC9,PC12,PC13,PC8,num
0,-0.01755,0.292001,0.746005,-0.34369,-0.727584,0.184558,0.350511,0.598685,-0.910318,0
1,-1.134815,0.174615,0.095826,0.239566,0.764537,-0.343816,-0.552569,-0.67735,-0.511039,1
2,-1.865945,0.059297,-0.125068,0.854383,-0.099045,-0.041109,0.011509,0.022689,-0.030261,1
3,0.897661,0.388291,0.411199,0.125395,0.406543,-0.521288,-0.390647,-0.211833,-0.039662,0
4,1.314315,0.308332,-0.268851,0.029966,-0.274715,0.800398,-0.180443,0.074683,0.190862,0


In [3]:
y= df['num']
x= df.drop('num',axis=1)

In [4]:
X_train , X_test , y_train , y_test =train_test_split(x,y, test_size=0.2,random_state=42)

In [5]:
param_grid = {
    'C': [0.1,1,10,100,1000]
    ,'gamma':[1,0.1,0.01,0.001,0.0001]
    ,'kernel':['sigmoid', 'poly', 'linear', 'rbf']
}

# GridSearchCV

In [6]:
grid =GridSearchCV(SVC(random_state=42),param_grid,verbose=3,cv=5 , scoring='accuracy' , refit=True)
grid.fit(X_train, y_train)
print("GridSearch Best Params:", grid.best_params_)
print("GridSearch Best estimator:", grid.best_estimator_)
print("Best CV Accuracy:", grid.best_score_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.833 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.771 total time=   0.0s
[CV 3/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.872 total time=   0.0s
[CV 4/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.745 total time=   0.0s
[CV 5/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.766 total time=   0.0s
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.792 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.792 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.872 total time=   0.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.745 total time=   0.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.851 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.854 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linea

# RandomizedSearchCV

In [7]:
rand =RandomizedSearchCV(SVC(random_state=42),param_grid,verbose=3,cv=5 , scoring='accuracy',n_iter=10,random_state=42,refit=True)
rand.fit(X_train, y_train)
print("RandomizedSearchCV Best Params:", rand.best_params_)
print("RandomizedSearchCV Best estimator:", rand.best_estimator_)
print("Best CV Accuracy:", rand.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.750 total time=   0.0s
[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.745 total time=   0.0s
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.702 total time=   0.0s
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.723 total time=   0.0s
[CV 1/5] END ....C=10, gamma=0.001, kernel=poly;, score=0.521 total time=   0.0s
[CV 2/5] END ....C=10, gamma=0.001, kernel=poly;, score=0.521 total time=   0.0s
[CV 3/5] END ....C=10, gamma=0.001, kernel=poly;, score=0.532 total time=   0.0s
[CV 4/5] END ....C=10, gamma=0.001, kernel=poly;, score=0.532 total time=   0.0s
[CV 5/5] END ....C=10, gamma=0.001, kernel=poly;, score=0.511 total time=   0.0s
[CV 1/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.750 total time=   0.0s
[CV 2/5] END ..C=100, gamma=0.01, kernel=linear;

In [8]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print("Optimized Accuracy:", accuracy_score(y_test, y_pred))

Optimized Accuracy: 0.9166666666666666


#  saving model pipline

In [10]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel



In [13]:
data = pd.read_csv('..\\data\\heart_disease.csv')

X = data.drop('num', axis=1)
y = data['num']

categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols),
    ('num', MinMaxScaler(), numerical_cols)
])



pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=13)),
    ('feature_selector', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), max_features=9, threshold=None)),
    ('model', best_model)
])  

pipeline.fit(X, y)

joblib.dump(pipeline, '../models/final_model.pkl')
print("✅ Pipeline saved as heart_disease_model.pkl")




✅ Pipeline saved as heart_disease_model.pkl
