In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data
data = pd.read_csv('/content/heart.csv')  # Replace with your actual filename

# Separate features and target variable
X = data.drop('target', axis=1)
y = data['target']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Perform Grid Search to find the best hyperparameters
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_rf = grid_search.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = best_rf.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.9902597402597403
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       159
           1       1.00      0.98      0.99       149

    accuracy                           0.99       308
   macro avg       0.99      0.99      0.99       308
weighted avg       0.99      0.99      0.99       308



In [None]:
import joblib

joblib.dump(scaler,'scaler.joblib')
joblib.dump(best_rf,'model.joblib')

['model.joblib']

In [None]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [None]:
data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [None]:
data[data['target'] == 1]['thal'].value_counts()


Unnamed: 0_level_0,count
thal,Unnamed: 1_level_1
2,412
3,90
1,21
0,3


In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train_scaled,y_train)

pred = lr.predict(X_test_scaled)

In [None]:
accuracy_score(y_test,pred)*100


80.51948051948052

In [None]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train_scaled,y_train)

svc_pred = svc.predict(X_test_scaled)

accuracy_score(y_test,svc_pred)*100

88.96103896103897

In [None]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [None]:
joblib.dump(svc,'svc.joblib')

['svc.joblib']

In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle



# Function to create synthetic data
def create_synthetic_data(df, num_samples=100):
    synthetic_data = pd.DataFrame(columns=df.columns)

    for col in df.columns:
        if df[col].dtype == 'object' or col in ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']:
            # For categorical columns, sample with replacement from original values
            synthetic_data[col] = np.random.choice(df[col], num_samples, replace=True)
        else:
            # For continuous columns, add Gaussian noise to mean of the original data
            mean = df[col].mean()
            std = df[col].std()
            synthetic_data[col] = np.random.normal(mean, std, num_samples)

    # Ensure integer columns remain integers
    int_columns = ['age', 'sex', 'cp', 'trestbps', 'fbs', 'restecg', 'thalach', 'exang', 'slope', 'ca', 'thal', 'target']
    synthetic_data[int_columns] = synthetic_data[int_columns].round().astype(int)

    # Shuffle the synthetic dataset for randomness
    synthetic_data = shuffle(synthetic_data).reset_index(drop=True)

    return synthetic_data

# Generate synthetic data
df = create_synthetic_data(data, num_samples=300)

df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,56,0,3,126,236.084806,0,1,139,0,1.075606,1,1,2,1
1,48,0,0,138,212.729743,0,0,124,0,3.305827,2,2,2,1
2,47,1,0,130,156.585263,0,0,177,0,1.352582,0,1,1,0
3,57,1,0,122,305.478093,0,0,153,0,1.758438,2,0,1,1
4,54,0,1,146,205.730612,0,0,153,0,-0.75218,2,0,2,0


In [None]:
df.shape

(300, 14)

In [None]:

df_new = pd.concat([data, df], axis=0, ignore_index=True)

df_new.shape


(1325, 14)

In [None]:
from sklearn.model_selection import train_test_split

x = df_new.drop('target', axis=1)
y = df_new['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
svc_new = SVC()
svc_new.fit(x_train,y_train)

svc_pred_new = svc_new.predict(x_test)

accuracy_score(y_test,svc_pred_new)*100

64.15094339622641

In [None]:
rf = RandomForestClassifier(random_state=42)

# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Perform Grid Search to find the best hyperparameters
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Get the best model
best_rf_new = grid_search.best_estimator_

from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
rf_pred = best_rf.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, rf_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, rf_pred))

Accuracy: 0.6150943396226415
              precision    recall  f1-score   support

           0       0.56      0.87      0.68       125
           1       0.77      0.39      0.51       140

    accuracy                           0.62       265
   macro avg       0.67      0.63      0.60       265
weighted avg       0.67      0.62      0.59       265



