In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Constants: dataset locations. Both paths are relative, so they resolve
# against the working directory the notebook kernel was started in.
DATA_PATH = Path("../dataset/train.csv")  # raw CSV read by load_raw_data()
# NOTE(review): presumably the destination for the cleaned frame; it is not
# used in the cells visible here - confirm against the rest of the notebook.
CLEANED_DATA_PATH = Path("../dataset/cleaned_train.csv")

In [6]:
def load_raw_data(path=None):
    """Load the raw dataset from a CSV file and report its shape.

    Parameters
    ----------
    path : str or pathlib.Path, optional
        CSV file to read. When omitted, the module-level ``DATA_PATH`` is
        used, which is the original behaviour of this function.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv`` with its default
        options.
    """
    # Resolve the default at call time (not at definition time) so that
    # re-assigning DATA_PATH in the config cell is honoured without having to
    # re-run this definition.
    source = DATA_PATH if path is None else path
    df = pd.read_csv(source)
    print(f"Loaded raw data: {df.shape}")
    return df

# Keep the frame exactly as loaded in df_raw; the cleaning cells below operate
# on the independent copy df, so the raw data stays available for comparison.
df_raw = load_raw_data()
df = df_raw.copy()

Loaded raw data: (1460, 81)


In [6]:
def analyze_missing_values(df):
    """Summarise the columns of ``df`` that contain at least one null.

    Returns a DataFrame indexed by column name with three columns:
    ``missing_count`` (number of nulls), ``missing_percent`` (nulls as a
    percentage of the row count) and ``missing_dtype`` (the column's dtype).
    Columns without nulls are omitted; the rest are ordered by
    ``missing_count``, largest first.
    """
    null_counts = df.isnull().sum()

    summary = pd.DataFrame({
        'missing_count': null_counts,
        'missing_percent': (null_counts / len(df)) * 100,
        'missing_dtype': df.dtypes,
    })

    # Drop fully populated columns, then rank the remainder.
    has_nulls = summary['missing_count'] > 0
    return summary[has_nulls].sort_values(by='missing_count', ascending=False)

# Summarise the columns that contain nulls (most affected first) and print the
# table as plain text.
missing_values = analyze_missing_values(df)
print("Missing values analysis:")
print(missing_values)

Missing values analysis:
              missing_count  missing_percent missing_dtype
PoolQC                 1453        99.520548        object
MiscFeature            1406        96.301370        object
Alley                  1369        93.767123        object
Fence                  1179        80.753425        object
MasVnrType              872        59.726027        object
FireplaceQu             690        47.260274        object
LotFrontage             259        17.739726       float64
GarageType               81         5.547945        object
GarageYrBlt              81         5.547945       float64
GarageFinish             81         5.547945        object
GarageQual               81         5.547945        object
GarageCond               81         5.547945        object
BsmtExposure             38         2.602740        object
BsmtFinType2             38         2.602740        object
BsmtQual                 37         2.534247        object
BsmtCond                 37    

In [31]:
def handle_missing_values(df):
    """Return a copy of ``df`` with missing values imputed.

    Steps, applied in order:

    1. ``LotFrontage``: nulls are filled with the median ``LotFrontage`` of
       the row's ``Neighborhood`` (skipped when either column is absent).
    2. Numeric columns: any nulls still present are filled with the column
       median. This also covers ``LotFrontage`` rows left unfilled by step 1.
    3. "Absence" categoricals (``absence_features`` below): nulls are filled
       with the literal ``"NA"``, or ``"None"`` for ``MasVnrType``.
    4. ``Electrical`` and every other object-dtype column: nulls are filled
       with the column's most frequent value (the first one returned by
       ``Series.mode`` when several are tied).

    An entirely-null column has no median or mode to impute with; in steps
    2 and 4 such a column is left unchanged instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``df``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # LotFrontage -> median of the same Neighborhood. fillna only touches the
    # missing entries, so observed values are never overwritten. Without a
    # Neighborhood column the numeric pass below falls back to the global median.
    if "LotFrontage" in df.columns and "Neighborhood" in df.columns:
        neighborhood_median = df.groupby("Neighborhood")["LotFrontage"].transform("median")
        df["LotFrontage"] = df["LotFrontage"].fillna(neighborhood_median)

    # Numerical -> column median.
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())

    # Absence categories -> fill with 'NA'/'None'.
    # NOTE: pandas.read_csv treats the string "NA" (and, in recent pandas
    # versions, "None") as missing by default, so these fills do not survive a
    # CSV round trip unless the reader passes keep_default_na=False.
    absence_features = [
        "MasVnrType", "BsmtQual", "BsmtCond", "BsmtExposure",
        "BsmtFinType1", "BsmtFinType2", "GarageType", "GarageFinish",
        "GarageQual", "GarageCond", "FireplaceQu",
        "PoolQC", "Fence", "Alley", "MiscFeature"]

    for col in absence_features:
        if col in df.columns:
            fill_value = 'None' if col == "MasVnrType" else "NA"
            df[col] = df[col].fillna(fill_value)

    # Electrical plus every object column not handled above -> mode.
    object_cols = set(df.select_dtypes(include=["object"]).columns)
    mode_cols = ["Electrical"] if "Electrical" in df.columns else []
    mode_cols += sorted(object_cols - set(absence_features) - {"Electrical"})

    for col in mode_cols:
        if not df[col].isnull().any():
            continue
        modes = df[col].mode()
        if modes.empty:
            # Entirely-null column: there is no most-frequent value to use.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df

In [32]:
# Impute, then count the nulls remaining across the whole frame as a sanity
# check (expected to be 0).
df = handle_missing_values(df)
df.isnull().sum().sum()

np.int64(0)

In [9]:
# Columns coerced to a numeric dtype / to datetimes by fix_datatypes().
# GarageYrBlt appears in both lists; fix_datatypes() treats it as a year column.
numeric_columns = ['LotFrontage', 'LotArea', 'MasVnrArea', 'GarageYrBlt']
year_columns = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']


def _years_to_datetime(values):
    """Convert a Series of calendar years to datetimes (1 January of each year).

    Values that are not whole-number years in 1..9999, or that fall outside
    the representable timestamp range, become ``NaT``.
    """
    if pd.api.types.is_datetime64_any_dtype(values):
        # Already converted (e.g. the cell was re-run); leave untouched.
        return values

    years = pd.to_numeric(values, errors="coerce")
    # Keep plausible whole-number years only; everything else ends up as NaT.
    years = years.where(years.mod(1).eq(0) & years.between(1, 9999))
    # Render as integer text ("2003", not "2003.0"): a float-typed column
    # formatted directly would never match "%Y" and would be coerced to NaT.
    year_text = years.astype("Int64").astype("string")
    return pd.to_datetime(year_text, format="%Y", errors="coerce")


def fix_datatypes(df):
    """Return a copy of ``df`` with columns converted to appropriate dtypes.

    * Columns in ``numeric_columns`` are passed through ``pd.to_numeric``;
      unparseable entries become ``NaN``.
    * Columns in ``year_columns`` are converted from a year number to a
      datetime at 1 January of that year; unparseable entries become ``NaT``.

    Columns missing from ``df`` are skipped. The function is safe to call more
    than once: columns that are already datetimes are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Converted copy of ``df``.
    """
    # Copy so that re-running the calling cell never compounds conversions on
    # a frame that an earlier cell already displayed.
    df = df.copy()

    # Convert numeric columns that were read as object. Year columns are
    # skipped here because the year pass below does its own numeric coercion
    # (and must not see datetimes turned into integer nanoseconds on a re-run).
    for col in numeric_columns:
        if col in df.columns and col not in year_columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Convert year columns to datetime.
    for col in year_columns:
        if col in df.columns:
            df[col] = _years_to_datetime(df[col])

    return df

In [10]:
# Apply data type fixes
df = fix_datatypes(df)

# Verify changes. GarageYrBlt is listed in both numeric_columns and
# year_columns, so it shows up twice in the printout.
# NOTE(review): dtypes alone do not reveal values that errors="coerce" turned
# into NaN/NaT - consider also checking .isnull().sum() on these columns.
print("\nData Types After Conversion:")
print(df[numeric_columns + year_columns].dtypes)


Data Types After Conversion:
LotFrontage            float64
LotArea                  int64
MasVnrArea             float64
GarageYrBlt     datetime64[ns]
YearBuilt       datetime64[ns]
YearRemodAdd    datetime64[ns]
GarageYrBlt     datetime64[ns]
YrSold          datetime64[ns]
dtype: object
