In [23]:
import pandas as pd


# Columns parsed as timestamps. Parsing uses errors='coerce', so values that
# cannot be parsed become NaT instead of raising.
DATE_COLUMNS = ["scheduledday", "appointmentday"]


def clean_appointments(raw):
    """Return a cleaned copy of the appointments table.

    The input frame is not modified. Cleaning steps, in order:

    1. Column names are stripped, lower-cased, and have "-" and " "
       replaced by "_".
    2. The columns in ``DATE_COLUMNS`` are parsed to datetimes; unparseable
       values become NaT and are counted.
    3. ``gender`` is upper-cased/stripped, ``no_show`` is stripped/capitalized.
    4. Fully duplicated rows are dropped.
    5. ``patientid`` is cast to ``int64``.

    Args:
        raw: DataFrame whose (normalised) column names include
            ``patientid``, ``gender``, ``scheduledday``, ``appointmentday``
            and ``no_show``.

    Returns:
        A ``(cleaned, coerced_dates)`` tuple. ``coerced_dates`` maps each date
        column to the number of non-missing input values that could not be
        parsed (counted before duplicate removal).

    Raises:
        KeyError: if an expected column is absent after renaming.
        ValueError: if ``patientid`` contains missing values, which cannot be
            represented as ``int64``.
    """
    cleaned = raw.copy()

    # Step 1: Rename columns to be lowercase with underscores.
    # regex=False pins literal replacement regardless of the pandas version's
    # default for the `regex` argument.
    cleaned.columns = (
        cleaned.columns.str.strip()
        .str.lower()
        .str.replace("-", "_", regex=False)
        .str.replace(" ", "_", regex=False)
    )

    # Step 2: Convert date columns to datetime, tracking values lost to coercion
    coerced_dates = {}
    for column in DATE_COLUMNS:
        parsed = pd.to_datetime(cleaned[column], errors="coerce")
        # Only count NaT values that were not already missing in the input.
        newly_missing = parsed.isna() & cleaned[column].notna()
        coerced_dates[column] = int(newly_missing.sum())
        cleaned[column] = parsed

    # Step 3: Standardize text values
    cleaned["gender"] = cleaned["gender"].str.upper().str.strip()
    cleaned["no_show"] = cleaned["no_show"].str.strip().str.capitalize()

    # Step 4: Remove duplicate rows
    cleaned = cleaned.drop_duplicates()

    # Step 5: Fix data types
    missing_ids = int(cleaned["patientid"].isna().sum())
    if missing_ids:
        raise ValueError(
            f"patientid has {missing_ids} missing value(s); "
            "cannot convert the column to int64"
        )
    # NOTE: a float -> int64 cast truncates any fractional part.
    cleaned["patientid"] = cleaned["patientid"].astype("int64")

    return cleaned, coerced_dates


# The guard is true when run as a notebook cell or as a script, so the same
# module-level names (df, summary, ...) are defined as before, while importing
# this code elsewhere no longer triggers file I/O.
if __name__ == "__main__":
    # Load the dataset
    file_path = '/Users/ankita/Downloads/KaggleV2-May-2016.csv'
    df = pd.read_csv(file_path)

    # Steps 1-5: clean the data and measure how many duplicate rows were dropped
    df_before = df.shape[0]
    df, coerced_dates = clean_appointments(df)
    df_after = df.shape[0]
    duplicates_removed = df_before - df_after

    # Step 6: Save the cleaned file
    cleaned_file_path = '/Users/ankita/Downloads/cleaned_data.csv'
    df.to_csv(cleaned_file_path, index=False, encoding='utf-8')

    # Optional: Print summary of changes
    summary = {
        "Duplicates removed": duplicates_removed,
        "Columns renamed": list(df.columns),
        "Date columns converted": list(DATE_COLUMNS),
        "Standardized columns": ["gender", "no_show"],
        "Data type fixes": {"patientid": "int64"}
    }

    print("✅ Cleaning complete. Summary of changes:")
    print(summary)

    # Surface silent data loss from errors='coerce' only when it happened.
    if any(coerced_dates.values()):
        print(f"Warning: unparseable dates coerced to NaT: {coerced_dates}")






✅ Cleaning complete. Summary of changes:
{'Duplicates removed': 0, 'Columns renamed': ['patientid', 'appointmentid', 'gender', 'scheduledday', 'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension', 'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no_show'], 'Date columns converted': ['scheduledday', 'appointmentday'], 'Standardized columns': ['gender', 'no_show'], 'Data type fixes': {'patientid': 'int64'}}
