## Day 3 -- Missing Value & Data Validation
Fokus: Missing Value & Data Validation

Dataset: Customer_dirty_data_task3.csv (tab-separated)

In [None]:
import pandas as pd
import numpy as np

# The file is tab-delimited, so the separator is passed explicitly
# instead of relying on read_csv's comma default.
df = pd.read_csv(
    '/Users/axzolotle/Code/self-learn/learning-intern/dataset/Customer_dirty_data_task3.csv',
    sep='\t',
)

# Quick orientation: size, column names, then the first rows.
print(f"Dataset Shape: {df.shape}")
print(f"\nColumn Names: {df.columns.tolist()}")
print("\nFirst 5 rows:")
df.head()

In [None]:
# Dataset info: print pandas' built-in summary of the loaded frame
# (columns, non-null counts and dtypes) to spot missing data early.
df.info()

In [None]:
# Missing-value audit: per-column null count and share of all rows
print("=== Missing Value Audit ===")
missing_count = df.isna().sum()
# The percentage is derived from the counts above, rounded to two decimals.
missing_pct = (missing_count / len(df) * 100).round(2)
missing_df = pd.DataFrame(
    {
        'Missing Count': missing_count,
        'Missing %': missing_pct,
    }
)
# Only show columns that actually contain missing values.
print(missing_df.loc[missing_df['Missing Count'] > 0])

### Handle Missing (Logic-based)

Contoh logic (boleh kamu modif):

- income missing + employment_status = unemployed → 0
- gender missing → "unknown"
- age missing → biarkan dulu

In [None]:
def fill_null(df):
    """Return a copy of ``df`` with missing values filled by business rules.

    Rules applied:
      * ``income`` is set to 0 where it is missing and ``employment_status``
        is exactly ``'unemployed'`` (case-sensitive match).
      * Missing ``gender`` is replaced with ``'unknown'``.

    Every other missing value (for example ``age``, or ``income`` for any
    other employment status) is left as-is.

    Args:
        df: DataFrame containing the ``employment_status``, ``income`` and
            ``gender`` columns.

    Returns:
        A new DataFrame with the rules applied. The input frame is not
        modified.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    filled = df.copy()

    # An unemployed customer with no recorded income is treated as earning 0.
    unemployed_without_income = (
        (filled['employment_status'] == 'unemployed')
        & filled['income'].isnull()
    )
    filled.loc[unemployed_without_income, 'income'] = 0

    # Missing gender becomes the explicit 'unknown' category.
    filled['gender'] = filled['gender'].fillna('unknown')

    return filled

# Fill on a copy so the frame loaded into `df` is left as it was read
data_filled = fill_null(df.copy())

# Remaining nulls per column after the fill rules have run
print("After filling null values:")
print(data_filled.isna().sum())

### Data Validation

Cari data:

- age < 0 atau age > 100 (invalid age)
- income < 0 (invalid income)
- age < 18 tapi employment_status = employed (young employed - could be valid, marked as warning)

In [None]:
def check_validation(df):
    """
    Split out rows that break the validation rules and print a short report.

    A row is invalid when ANY of these holds (conditions are OR-ed):
    - age is below 0
    - age is above 100
    - income is below 0

    Rows with age under 18 and employment_status equal to 'employed' are
    collected separately as a warning; they are not counted as invalid
    unless they also match a rule above.

    Returns a tuple (invalid_data, young_employed) of DataFrames.
    """
    age = df['age']

    # Hard errors: impossible age or negative income.
    age_out_of_range = (age < 0) | (age > 100)
    negative_income = df['income'] < 0
    invalid_data = df[age_out_of_range | negative_income]

    # Soft warning: employed minors may be legitimate, so only flag them.
    is_minor_employed = (age < 18) & (df['employment_status'] == 'employed')
    young_employed = df[is_minor_employed]

    invalid_total = len(invalid_data)

    print("=== Validation Results ===")
    print(f"Total invalid records: {invalid_total}")
    print(f"Total young employed records (warning): {len(young_employed)}")

    if invalid_total:
        print("\nInvalid records found:")
        print(invalid_data)
    else:
        print("✓ All data passed validation!")

    return invalid_data, young_employed

# Run validation on the filled data; keep both result frames
# (invalid rows, young-employed warning rows) for later reporting
df_validation, df_young_employed = check_validation(data_filled)

In [None]:
# Summary report: headline data-quality counts, then describe() statistics
print("=== Data Quality Summary ===\n")
print("Total Records:", len(data_filled))
# Sum of per-column null counts = total missing cells in the frame.
print("Missing Values:", data_filled.isna().sum().sum())
print("Invalid Records:", len(df_validation))
print("Young Employed (Warning):", len(df_young_employed))
print("\n=== Column Statistics ===")
data_filled.describe()