In [4]:
import pandas as pd # type: ignore
from datetime import datetime

# Load the heart-disease dataset. The file carries an ".xls" suffix but is
# parsed here as comma-separated text by pd.read_csv.
HEART_DATA_FILE = 'heart.csv.xls'
heart_df = pd.read_csv(HEART_DATA_FILE)

# Preview the first rows; as the cell's last expression it is displayed.
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
# Data preprocessing and cleaning
# Flag every missing cell, then total the flags per column and print the tally
null_mask = heart_df.isna()
missing_values = null_mask.sum()
print(missing_values)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [7]:
# Display dataframe information: the index range, each column's non-null
# count and dtype, and the frame's memory usage
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [None]:
# Reading datasets, data preprocessing, EDA, visualization
# Feature engineering

In [9]:
# Feature Engineering

# Keep the 10 features that score highest on the chi-squared test
from sklearn.feature_selection import SelectKBest, chi2

# X holds every column except the label; y is the label column itself
X = heart_df.drop(columns=['target'])
y = heart_df.loc[:, 'target']

# SelectKBest scores each feature with the given function (chi2) and keeps
# the k best ones; k=10 keeps the top 10
selector = SelectKBest(score_func=chi2, k=10)

# Fit the selector on (X, y), then reduce X to the retained columns
selector.fit(X, y)
X_new = selector.transform(X)

# get_support() yields a boolean mask over X's columns marking the kept ones;
# indexing X.columns with it recovers their names
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

print('Selected Features:', selected_features)  # the 10 chi-squared picks

# Returns the 10 selected feature columns (the target column is excluded).
# Feature selection: check that the columns chosen are sensible.

# Why do feature selection?
# Feature scaling

# Whenever you get a dataset, identify the target column: the column the
# model will be trained to predict.
# Which column is the target depends on the dataset and the question. E.g. for a
# heart-disease dataset such as heart.csv, any relevant column (trestbps, ca, ...)
# could serve as the target; this notebook uses the `target` column.

# How do you select the target column? It is often the last column, but its position can vary.

Selected Features: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak',
       'slope', 'ca'],
      dtype='object')


In [10]:
# Filter out columns with gaps / blank spaces
# Notes on the dataset:
# The target column is the heart-disease status: 0 = no heart disease, 1 = presence of heart disease
# Code for accessing a column whose name contains a space, so that it does not throw errors

# Check whether all columns are labelled well, then do feature selection.

In [11]:
# Feature scaling and normalization
# Scaling puts all features on a comparable scale so that columns with large
# raw values do not dominate. It does not remove outliers (min-max scaling in
# particular is sensitive to them).

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import MinMaxScaler

# Standardize the features (per column: zero mean, unit variance).
# Each scaler gets its own name so the fitted StandardScaler is not discarded
# when the MinMaxScaler is created and can still be applied to new data later.
standard_scaler = StandardScaler()
X_scaled = standard_scaler.fit_transform(X)

# Normalize the features (per column: rescaled to the [0, 1] range)
minmax_scaler = MinMaxScaler()
X_normalized = minmax_scaler.fit_transform(X)

# Backward compatibility: this cell used to leave `scaler` bound to the
# fitted MinMaxScaler, so keep that name pointing at the same object.
scaler = minmax_scaler

# NOTE(review): both scalers are fitted on every row of X. If a train/test
# split follows, fit them on the training rows only and reuse them to
# transform the test rows, so test data does not leak into the scaling.
# NOTE(review): X holds all feature columns, not the chi-squared selection
# (X_new / selected_features) - confirm which set is meant to be scaled.

# Convert the scaled features back to a DataFrame.
# The array has one column per column of X, so it must be labelled with
# X.columns; `selected_features` names only the 10 selected columns and
# would not match the array's width.
# X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [12]:
# Data Splitting 
# Questions focus on Normalization, feature selection, 

# Do this by next time

# Data Splitting
# Training and evaluation of a model
# Most common ratios: 20% testing / 80% training, or 30% / 70%
# Import scikit-learn's train_test_split helper to do the split

# random_state seeds the shuffle applied before the data is split into training and test sets, making the split reproducible.

# Cross-validation - evaluate the performance of the model across multiple folds of the data
# Validation helps us know the performance of a model

# Types of cross-validation: besides a single train/test split we can use K-Fold cross-validation
# Stratified K-Fold preserves the distribution of the target variable in each fold; particularly useful for imbalanced datasets
# Cross-validation helps in reducing variance in model performance estimates
# It provides a comprehensive evaluation of model performance
# You should show the algorithm being used in your work.

# Supervised learning algorithms
# 1. Regression: linear or polynomial
# 2. Classification (problems can be solved with):
#    logistic regression, k-Nearest Neighbors (kNN), Support Vector Machine (SVM), decision trees and random forest

# Model evaluation
# We have metrics:
# Metrics: accuracy, precision, recall, F1-score
# Confusion matrix
# ROC and AUC: the Receiver Operating Characteristic curve and the Area Under that Curve