In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
df=pd.read_csv("heart.csv")

In [18]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [6]:
# a) Standard deviation and variance
print("\n📊 Standard Deviation of numeric attributes:")
print(df.std(numeric_only=True))


📊 Standard Deviation of numeric attributes:
age          9.072290
sex          0.460373
cp           1.029641
trestbps    17.516718
chol        51.592510
fbs          0.356527
restecg      0.527878
thalach     23.005724
exang        0.472772
oldpeak      1.175053
slope        0.617755
ca           1.030798
thal         0.620660
target       0.500070
dtype: float64


In [7]:
# Manual Mean
def mean_cal(col):
    s = 0
    for i in range(df.shape[0]):
        s += df[col][i]
    return s / df.shape[0]
    

In [8]:
# Manual Variance
def var_cal(col):
    m = mean_cal(col)
    s = 0
    for i in range(df.shape[0]):
        s += (df[col][i] - m) ** 2
    return s / df.shape[0]

In [9]:
# Manual Standard Deviation
def std_dev(col):
    return var_cal(col) ** 0.5

In [13]:
# a) Variance & Standard Deviation
print("Manual Variance & Standard Deviation")
for i in df:
    print(f"{i} → Variance: {var_cal(i):.3f} | Std Dev: {std_dev(i):.3f}")


Manual Variance & Standard Deviation
age → Variance: 82.226 | Std Dev: 9.068
sex → Variance: 0.212 | Std Dev: 0.460
cp → Variance: 1.059 | Std Dev: 1.029
trestbps → Variance: 306.536 | Std Dev: 17.508
chol → Variance: 2659.190 | Std Dev: 51.567
fbs → Variance: 0.127 | Std Dev: 0.356
restecg → Variance: 0.278 | Std Dev: 0.528
thalach → Variance: 528.747 | Std Dev: 22.994
exang → Variance: 0.223 | Std Dev: 0.473
oldpeak → Variance: 1.379 | Std Dev: 1.174
slope → Variance: 0.381 | Std Dev: 0.617
ca → Variance: 1.062 | Std Dev: 1.030
thal → Variance: 0.385 | Std Dev: 0.620
target → Variance: 0.250 | Std Dev: 0.500


In [14]:

# Manual Covariance
def covariance(col1, col2):
    mean1 = mean_cal(col1)
    mean2 = mean_cal(col2)
    cov = 0
    for i in range(df.shape[0]):
        cov += (df[col1][i] - mean1) * (df[col2][i] - mean2)
    return cov / df.shape[0]

# Manual Correlation Coefficient
def correlation(col1, col2):
    cov = covariance(col1, col2)
    std1 = std_dev(col1)
    std2 = std_dev(col2)
    return cov / (std1 * std2)

In [21]:
print("\n🔗 Covariance & Correlation:")

cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

for col in cols:
    cov = covariance(col, col)
    corr = correlation(col, col)
    print(f"{col} → Covariance: {cov:.2f}, Correlation: {corr:.2f}")



🔗 Covariance & Correlation:
age → Covariance: 82.23, Correlation: 1.00
sex → Covariance: 0.21, Correlation: 1.00
cp → Covariance: 1.06, Correlation: 1.00
trestbps → Covariance: 306.54, Correlation: 1.00
chol → Covariance: 2659.19, Correlation: 1.00
fbs → Covariance: 0.13, Correlation: 1.00
restecg → Covariance: 0.28, Correlation: 1.00
thalach → Covariance: 528.75, Correlation: 1.00
exang → Covariance: 0.22, Correlation: 1.00
oldpeak → Covariance: 1.38, Correlation: 1.00
slope → Covariance: 0.38, Correlation: 1.00
ca → Covariance: 1.06, Correlation: 1.00
thal → Covariance: 0.38, Correlation: 1.00
target → Covariance: 0.25, Correlation: 1.00


In [25]:
# Independent features = total features - target
independent_features = df.drop('target', axis=1).shape[1]
print(f" Independent features: {independent_features}")


 Independent features: 13


In [28]:
# Check if any column has low variance (maybe not useful)
for col in df.columns:
    if df[col].nunique() == 1:
        print(f"Unwanted feature (constant): {col}")


In [29]:
# Split 'age' into 4 bins with equal frequency
df['age_binned'] = pd.qcut(df['age'], q=4, labels=False)
print(df[['age', 'age_binned']].head())


   age  age_binned
0   52           1
1   53           1
2   70           3
3   61           2
4   62           3


In [39]:

# Min-Max Normalization
for col in cols:
    df[f'{col}_minmax'] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    
print(df[[f'{c}_minmax' for c in cols] ].head())


   trestbps_minmax  chol_minmax  thalach_minmax
0         0.292453     0.196347        0.740458
1         0.433962     0.175799        0.641221
2         0.481132     0.109589        0.412214
3         0.509434     0.175799        0.687023
4         0.415094     0.383562        0.267176


In [40]:
# Z-score Normalization
for col in cols:
    mean = df[col].mean()
    std = df[col].std()
    df[f'{col}_zscore'] = (df[col] - mean) / std

print(df[[f'{c}_zscore' for c in cols] ].head())

   trestbps_zscore  chol_zscore  thalach_zscore
0        -0.377451    -0.659010        0.820920
1         0.478874    -0.833454        0.255843
2         0.764315    -1.395551       -1.048180
3         0.935580    -0.833454        0.516648
4         0.364697     0.930368       -1.874062


In [41]:

# Decimal Scaling Normalization
for col in cols:
    max_val = abs(df[col]).max()
    j = len(str(int(max_val)))
    df[f'{col}_decimal'] = df[col] / (10 ** j)

print(df[[f'{c}_decimal' for c in cols] ].head())

   trestbps_decimal  chol_decimal  thalach_decimal
0             0.125         0.212            0.168
1             0.140         0.203            0.155
2             0.145         0.174            0.125
3             0.148         0.203            0.161
4             0.138         0.294            0.106
