In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report


In [5]:
# Load heart disease dataset
heart = pd.read_csv("heart.csv")

# Load air quality dataset (UCI format)
air = pd.read_csv("AirQuality.csv", sep=';', decimal=',')


In [7]:
heart

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [8]:
air

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,",,,,,"
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,","
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,",,,,,"
9467,,,,,,,,,,,,,,,,,",,,,,"
9468,,,,,,,,,,,,,,,,,",,,,,"
9469,,,,,,,,,,,,,,,,,",,,,,"


In [11]:
for col in heart.columns:
    print(f"Missing values in {col}: {heart[col].isnull().sum()}")

# Drop duplicates
heart = heart.drop_duplicates()

# Fill missing values with column mean
heart = heart.fillna(heart.mean(numeric_only=True))

Missing values in id: 0
Missing values in age: 0
Missing values in sex: 0
Missing values in dataset: 0
Missing values in cp: 0
Missing values in trestbps: 59
Missing values in chol: 30
Missing values in fbs: 90
Missing values in restecg: 2
Missing values in thalch: 55
Missing values in exang: 55
Missing values in oldpeak: 62
Missing values in slope: 309
Missing values in ca: 611
Missing values in thal: 486
Missing values in num: 0


In [12]:
air = air.dropna(how="all", axis=1)

# Display missing values per column (fixed)
for col in air.columns:
    print(f"Missing values in {col}: {air[col].isnull().sum()}")

# Drop rows with any missing values
air = air.dropna()

# Drop duplicate rows
air = air.drop_duplicates()

# Convert numeric columns to float if needed
air_numeric_cols = air.select_dtypes(include='object').columns

for col in air_numeric_cols:
    try:
        air[col] = air[col].str.replace(',', '.').astype(float)
    except:
        pass  # skip if conversion fails

# Fill remaining NaNs (if any) with mean
air = air.fillna(air.mean(numeric_only=True))

Missing values in Date: 114
Missing values in Time: 114
Missing values in CO(GT): 114
Missing values in PT08.S1(CO): 114
Missing values in NMHC(GT): 114
Missing values in C6H6(GT): 114
Missing values in PT08.S2(NMHC): 114
Missing values in NOx(GT): 114
Missing values in PT08.S3(NOx): 114
Missing values in NO2(GT): 114
Missing values in PT08.S4(NO2): 114
Missing values in PT08.S5(O3): 114
Missing values in T: 114
Missing values in RH: 114
Missing values in AH: 114
Missing values in ,,,,,: 6915


In [13]:
# Add dummy key column to join
heart["key"] = 1
air["key"] = 1

# Merge based on dummy key (for demonstration purpose only)
merged_data = pd.merge(heart, air, on="key")
merged_data.drop("key", axis=1, inplace=True)


In [15]:
# Scale numeric features (on merged dataset)
numeric_cols = merged_data.select_dtypes(include=np.number).columns
scaler = StandardScaler()
merged_data[numeric_cols] = scaler.fit_transform(merged_data[numeric_cols])


In [16]:
# Filter out invalid values (example: negative age or negative CO levels)
merged_data = merged_data[merged_data['age'] > 0]
if 'CO(GT)' in merged_data.columns:
    merged_data = merged_data[merged_data['CO(GT)'] >= 0]


In [22]:
merged_data

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,...,PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,",,,,,"
0,-1.730169,1.007386,Male,Cleveland,typical angina,0.698041,0.311021,True,lv hypertrophy,0.495698,...,0.395563,0.354381,1.099057,0.910153,0.405386,0.329213,0.316467,0.397904,0.413409,","
1,-1.730169,1.007386,Male,Cleveland,typical angina,0.698041,0.311021,True,lv hypertrophy,0.495698,...,-0.201981,0.202283,1.739444,0.808415,0.073582,-0.125712,0.283703,0.534084,0.414031,","
3,-1.730169,1.007386,Male,Cleveland,typical angina,0.698041,0.311021,True,lv hypertrophy,0.495698,...,0.279432,0.617771,1.011836,1.037326,0.343724,0.573594,0.284963,0.505298,0.413622,","
4,-1.730169,1.007386,Male,Cleveland,typical angina,0.698041,0.311021,True,lv hypertrophy,0.495698,...,-0.292774,0.146637,1.996517,0.662166,-0.045339,-0.460326,0.252199,0.542941,0.412576,","
8,-1.730169,1.007386,Male,Cleveland,typical angina,0.698041,0.311021,True,lv hypertrophy,0.495698,...,-0.301220,0.202283,1.663699,0.738470,-0.013039,0.041595,0.253460,0.573941,0.413028,","
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2246635,1.730169,0.901224,Male,VA Long Beach,atypical angina,-0.658158,0.503861,False,lv hypertrophy,-1.772974,...,-2.043176,4.524105,-2.054677,2.398075,-2.177106,-1.873976,-2.371418,-2.344503,-2.381598,",,,"
2246636,1.730169,0.901224,Male,VA Long Beach,atypical angina,-0.658158,0.503861,False,lv hypertrophy,-1.772974,...,-2.043176,3.908292,-2.054677,2.131012,-2.177106,-1.873976,-2.371418,-2.344503,-2.381598,",,,"
2246637,1.730169,0.901224,Male,VA Long Beach,atypical angina,-0.658158,0.503861,False,lv hypertrophy,-1.772974,...,-2.043176,3.960228,-2.054677,2.162806,-2.177106,-1.873976,-2.371418,-2.344503,-2.381598,",,,"
2246638,1.730169,0.901224,Male,VA Long Beach,atypical angina,-0.658158,0.503861,False,lv hypertrophy,-1.772974,...,-2.043176,2.350210,-2.054677,1.692266,-2.177106,-1.873976,-2.371418,-2.344503,-2.381598,",,,"


In [23]:
heart = heart.drop(columns=['id', 'key'])

# Convert target column: num → binary
heart['num'] = heart['num'].apply(lambda x: 1 if x > 0 else 0)

# Convert categorical/string columns to numeric (one-hot encoding)
heart_encoded = pd.get_dummies(heart, drop_first=True)

# Separate features and target
X = heart_encoded.drop('num', axis=1)
y = heart_encoded['num']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
print("Heart Disease Model Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Heart Disease Model Accuracy: 0.8369565217391305
              precision    recall  f1-score   support

           0       0.78      0.84      0.81        75
           1       0.88      0.83      0.86       109

    accuracy                           0.84       184
   macro avg       0.83      0.84      0.83       184
weighted avg       0.84      0.84      0.84       184



In [24]:
air_numeric = air.select_dtypes(include=np.number)
air_scaled = pd.DataFrame(scaler.fit_transform(air_numeric), columns=air_numeric.columns)

kmeans = KMeans(n_clusters=3, random_state=42)
air_scaled['Cluster'] = kmeans.fit_predict(air_scaled)

print("Air Quality Clustering Distribution:")
print(air_scaled['Cluster'].value_counts())


Air Quality Clustering Distribution:
Cluster
0    1961
1     366
2     115
Name: count, dtype: int64
