# Experimenting if eliminating class imbalance can increase accuracy

## Importing the dependencies

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

## Loading the data

In [2]:
heart_disease_data = pd.read_csv('heart_disease_data.csv')

In [3]:
heart_disease_data.shape

(303, 14)

## Data Exploration and Elimination of Class Imbalance

In [4]:
heart_disease_data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
heart_disease_negative = heart_disease_data[heart_disease_data['target'] == 0]

In [7]:
heart_disease_positive = heart_disease_data[heart_disease_data['target'] == 1]

In [8]:
heart_disease_positive_sample = heart_disease_positive.sample(heart_disease_negative.shape[0])

In [9]:
heart_disease_positive_sample.shape

(138, 14)

In [10]:
merged_df = pd.concat([heart_disease_positive_sample, heart_disease_negative], axis=0)

In [12]:
merged_df.shape

(276, 14)

In [13]:
merged_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
65,35,0,0,138,183,0,1,182,0,1.4,2,0,2,1
63,41,1,1,135,203,0,1,132,0,0.0,1,0,1,1
152,64,1,3,170,227,0,0,155,0,0.6,1,0,3,1
124,39,0,2,94,199,0,1,179,0,0.0,2,0,2,1
42,45,1,0,104,208,0,0,148,1,3.0,1,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## Splitting the features and labels

In [15]:
X = merged_df.drop(columns=['target'])
y = merged_df['target']

## Splitting the dataset into training set and test set

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

## Scaling the dataset

In [23]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

## Building the models followed by model evaluation

In [24]:
classifiers = [
    {
        'name': 'Logistic Regression',
        'classifier': LogisticRegression(max_iter=2000),
        'params': {
            'C': [0.1, 1.0, 10.0],
            'solver': ['liblinear', 'lbfgs']
        },
        'train_data': 'X_train_scaled'
    },
    {
        'name': 'Decision Tree',
        'classifier': DecisionTreeClassifier(),
        'params': {
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10]
        },
        'train_data': 'X_train'
    },
    {
        'name': 'Support Vector Machine',
        'classifier': SVC(max_iter=2000),
        'params': {
            'C': [0.1, 1.0, 10.0],
            'kernel': ['linear', 'rbf']
        },
        'train_data': 'X_train_scaled'
    },
    {
        'name': 'Random Forest',
        'classifier': RandomForestClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 5, 10]
        },
        'train_data': 'X_train'
    },
    {
        'name': 'Gaussian Naive Bayes',
        'classifier': GaussianNB(),
        'params': {},
        'train_data': 'X_train'
    }
]


In [25]:
results_df = pd.DataFrame(columns=['Classifier', 'Best Parameters', 'Accuracy'])

In [26]:
for classifier in classifiers:
    if classifier['train_data'] == 'X_train':
        X_train_data = X_train
    elif classifier['train_data'] == 'X_train_scaled':
        X_train_data = X_train_scaled
    else:
        raise ValueError("Invalid train_data value. Must be 'X_train' or 'X_train_scaled'.")
    
    grid_search = GridSearchCV(classifier['classifier'], classifier['params'], cv=5)
    grid_search.fit(X_train_data, y_train)
    best_classifier = grid_search.best_estimator_
    if classifier['train_data'] == 'X_train':
        y_pred = best_classifier.predict(X_test)
    elif classifier['train_data'] == 'X_train_scaled':
        X_test_scaled = scaler.transform(X_test)
        y_pred = best_classifier.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    
    # Append the results to the DataFrame
    results_df = results_df.append({
        'Classifier': classifier['name'],
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy
    }, ignore_index=True)


  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({


In [27]:
results_df

Unnamed: 0,Classifier,Best Parameters,Accuracy
0,Logistic Regression,"{'C': 0.1, 'solver': 'lbfgs'}",0.857143
1,Decision Tree,"{'max_depth': None, 'min_samples_split': 10}",0.732143
2,Support Vector Machine,"{'C': 10.0, 'kernel': 'linear'}",0.857143
3,Random Forest,"{'max_depth': 5, 'n_estimators': 100}",0.803571
4,Gaussian Naive Bayes,{},0.785714


In [29]:
results_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Classifier,Best Parameters,Accuracy
0,Logistic Regression,"{'C': 0.1, 'solver': 'lbfgs'}",0.857143
2,Support Vector Machine,"{'C': 10.0, 'kernel': 'linear'}",0.857143
3,Random Forest,"{'max_depth': 5, 'n_estimators': 100}",0.803571
4,Gaussian Naive Bayes,{},0.785714
1,Decision Tree,"{'max_depth': None, 'min_samples_split': 10}",0.732143


Booyah! Eliminating class imbalance did the trick...increased the accuracy score

Logistic Regression and SVC have the same accuracy on test data

In [30]:
model_svc = SVC(C=10, kernel='linear')
model_svc.fit(X_train_scaled, y_train)

SVC(C=10, kernel='linear')

In [32]:
y_pred_train = model_svc.predict(X_train_scaled)
accuracy_score(y_train, y_pred_train)

0.8545454545454545

In [34]:
model_lr = LogisticRegression(C=0.1, solver='lbfgs')
model_lr.fit(X_train_scaled, y_train)

LogisticRegression(C=0.1)

In [36]:
y_pred_train = model_lr.predict(X_train_scaled)
accuracy_score(y_train, y_pred_train)

0.8318181818181818

SVC model seems to have lesser overfitting than Logistic Regression. I'll go with SVC