# Cleaning the Dataset & Creating the Database Schema for the Input CSV File

In [7]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [9]:
def safe_convert_to_int(x):
    """Convert a value to ``int`` by way of ``float``, or return NaN on failure.

    Converting to ``float`` first lets string and float representations such
    as ``"3.0"`` or ``3.0`` be accepted; ``int`` then truncates any
    fractional part.

    Args:
        x: The value to convert.

    Returns:
        The converted ``int``, or ``np.nan`` when ``x`` cannot be converted
        (unsupported type, unparsable string, NaN, or infinity).
    """
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None.
        # ValueError: unparsable string, or NaN passed to int().
        # OverflowError: infinity passed to int(), or an int too large for float().
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return np.nan

# Load the CSV file
df = pd.read_csv('heart_disease.csv')

# Specify only the columns you need (a missing column raises KeyError at selection)
columns_needed = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

# Restrict to the first 899 rows (positions 0-898).
# NOTE(review): the earlier comment said 900, but the slice has always kept 899 - confirm the intended count.
# Take an explicit copy so the column assignments below operate on an
# independent frame instead of a selection of the loaded data, which can
# trigger pandas' SettingWithCopyWarning.
df = df[columns_needed].iloc[:899].copy()

# List of columns that should be integers based on your database schema
integer_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'slope', 'ca', 'thal']
float_columns = ['oldpeak']  # This column should be treated as float

# Convert integer columns. Every name here is also in columns_needed, so the
# column is guaranteed to exist after the selection above.
for column in integer_columns:
    # Unconvertible values become NaN, then the nullable Int64 dtype stores
    # them as <NA> while keeping the remaining values as integers.
    df[column] = df[column].apply(safe_convert_to_int).astype('Int64')

# Convert float columns; values that cannot be parsed become NaN
for column in float_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Verify data types and print the first few rows to inspect them
print(df.dtypes)
print(df.head())

# Save the cleaned CSV file
df.to_csv('/home/ubuntu/de300/DE300/HW01/columns_needed_heart_disease.csv', index=False)

# Load the cleaned frame into PostgreSQL, replacing the table if it already exists.
# NOTE(review): the connection credentials are hard-coded in this URL; consider
# loading them from configuration or the environment instead.
engine = create_engine('postgresql://admin:password@localhost:5433/heartdisease')
df.to_sql('cleaned_patient_data', con=engine, if_exists='replace', index=False)

age           Int64
sex           Int64
cp            Int64
trestbps      Int64
chol          Int64
fbs           Int64
restecg       Int64
thalach       Int64
exang         Int64
oldpeak     float64
slope         Int64
ca            Int64
thal          Int64
dtype: object
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

   ca  thal  
0   0     6  
1   3     3  
2   2     7  
3   0     3  
4   0     3  
