# Medical Insurance Cost Prediction - Step 1: Data Loading & Cleaning

This notebook will:
1. Load our insurance data
2. Check for any problems in the data
3. Clean the data for analysis

In [None]:
# === 1. IMPORT LIBRARIES ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# === 2. LOAD RAW DATA ===
# Every later step needs `raw_df`.  If loading fails we report the error and
# then re-raise: carrying on would either hit a confusing NameError further
# down or, in a notebook session, silently reuse a stale `raw_df` left over
# from an earlier run.
try:
    raw_df = pd.read_csv('../data/Medical_Insurance_New.csv')
except Exception as e:
    print("❌ Error loading data:", e)
    raise
else:
    # Only reached when the read succeeded, so `raw_df` is guaranteed bound.
    print("✅ Data loaded successfully. Shape:", raw_df.shape)

# === 3. INITIAL INSPECTION ===
# Quick look at the raw table: a sample of rows, the dtype of each column,
# and how many missing entries each column has.
print("\n=== INITIAL DATA CHECK ===", "First 5 rows:", sep="\n")
display(raw_df.head(5))
print("\nData types:", raw_df.dtypes, sep="\n")
print("\nMissing values:", raw_df.isna().sum(), sep="\n")

# === 4. DATA VALIDATION ===
def validate_data(df):
    """Print a pass/fail report for common data quality issues.

    Each check is True when at least one row of ``df`` violates the rule;
    a True check is reported as FAILED, a False one as PASSED.

    Args:
        df: DataFrame with 'age', 'bmi', 'children', 'smoker' and 'sex'
            columns.

    Returns:
        None. The report is written to stdout.
    """
    # Negate the membership masks element-wise *before* reducing.  Applying
    # `~` to the scalar result of `.all()` is only correct while that scalar
    # is a numpy bool; on a plain Python bool `~True` is -2 (truthy), which
    # would report the check as failed no matter what the data holds.
    # Everything is coerced to `bool` so the report logic below never depends
    # on the scalar type pandas hands back.
    tests = {
        'Negative ages': bool((df['age'] < 0).any()),
        'Invalid BMI (0-10 or >60)': bool(((df['bmi'] < 10) | (df['bmi'] > 60)).any()),
        'Invalid children count': bool((df['children'] < 0).any()),
        'Invalid smoker values': bool((~df['smoker'].isin(['yes', 'no'])).any()),
        'Invalid sex values': bool((~df['sex'].isin(['male', 'female'])).any()),
    }

    print("\n=== DATA VALIDATION RESULTS ===")
    for test_name, test_result in tests.items():
        status = "❌ FAILED" if test_result else "✅ PASSED"
        print(f"{status}: {test_name}")

# Baseline quality report on the data as loaded, before any cleaning.
validate_data(raw_df)

# === 5. DATA CLEANING ===
def clean_data(df):
    """Return a cleaned copy of the insurance data.

    Steps, in order:
      1. Standardize the text columns ('smoker' and 'sex' lower-cased,
         'region' title-cased, all stripped of surrounding whitespace).
      2. Drop duplicate rows, keeping the first occurrence.
      3. Drop rows with any missing value and print how many were dropped.
      4. Cast 'children' to int.
      5. Keep only rows with age > 0, 10 <= bmi <= 60 and children >= 0.

    Args:
        df: DataFrame with 'age', 'bmi', 'children', 'smoker', 'sex' and
            'region' columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified. Surviving rows keep
        their original index labels.
    """
    clean_df = df.copy()

    # Standardize categorical values FIRST, so that rows differing only in
    # letter case or stray whitespace are recognised as duplicates below.
    clean_df['smoker'] = clean_df['smoker'].str.lower().str.strip()
    clean_df['sex'] = clean_df['sex'].str.lower().str.strip()
    clean_df['region'] = clean_df['region'].str.title().str.strip()

    # Remove duplicates (keeping first occurrence)
    clean_df = clean_df.drop_duplicates(keep='first')

    # Handle missing values.  Count ROWS that contain at least one null;
    # summing nulls over all cells would over-report whenever a single row
    # has several missing fields.
    rows_with_missing = int(clean_df.isnull().any(axis=1).sum())
    clean_df = clean_df.dropna()
    print(f"\nRemoved {rows_with_missing} rows with missing values")

    # Fix data types.  `assign` builds a new frame instead of writing into
    # one derived by row selection, which avoids SettingWithCopyWarning.
    clean_df = clean_df.assign(children=clean_df['children'].astype(int))

    # Drop rows with out-of-range values
    in_range = (
        (clean_df['age'] > 0)
        & clean_df['bmi'].between(10, 60)
        & (clean_df['children'] >= 0)
    )
    # Return an independent copy so callers can add or modify columns on the
    # result without pandas warning about writing to a slice.
    return clean_df[in_range].copy()

cleaned_df = clean_data(raw_df)

# === 6. POST-CLEANING VALIDATION ===
# Show the size of the cleaned table and its per-column missing-value
# counts, then re-run the same quality checks on the cleaned data.
print("\n=== POST-CLEANING REPORT ===")
print("New shape:", cleaned_df.shape)
print("\nMissing values after cleaning:", cleaned_df.isna().sum(), sep="\n")

validate_data(cleaned_df)

# === 7. EXPLORATORY CHECKS ===
print("\n=== BASIC STATISTICS ===")
print(cleaned_df.describe())

# One row of three histograms (with a KDE overlay), one per key column.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, col in zip(axes, ['age', 'bmi', 'charges']):
    sns.histplot(cleaned_df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()
plt.show()

# === 8. SAVE CLEANED DATA ===
# Saving is best-effort: a failure is reported but does not stop the rest
# of the cell from running.
try:
    cleaned_df.to_csv('../data/insurance_cleaned.csv', index=False)
except Exception as e:
    print("\n❌ Error saving cleaned data:", e)
else:
    print("\n💾 Cleaned data saved to '../data/insurance_cleaned.csv'")

# === 9. DATA QUALITY REPORT ===
# Summarize how many records the cleaning step discarded.
n_original = len(raw_df)
n_cleaned = len(cleaned_df)
n_removed = n_original - n_cleaned
# Guard the percentage: an input with a header but no rows would otherwise
# raise ZeroDivisionError here, after all the real work has already run.
pct_removed = n_removed / n_original * 100 if n_original else 0.0

print("\n=== FINAL DATA QUALITY REPORT ===")
print(f"Original records: {n_original}")
print(f"Final cleaned records: {n_cleaned}")
print(f"\nRecords removed: {n_removed} ({pct_removed:.1f}%)")