# Ames Housing Data Cleaning

## 1. Setup & Constants

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Constants: dataset file locations, given relative to the working directory.
# DATA_PATH is the raw CSV read by load_raw_data().
DATA_PATH = Path("../dataset/train.csv")
# Presumably the destination for the cleaned CSV — it is not used in the
# visible code; confirm against the save step.
CLEANED_DATA_PATH = Path("../dataset/cleaned_train.csv")

## 📥 Load Raw Data

Loads the raw Ames Housing dataset from CSV.  
Ensures you start with a fresh copy for cleaning.

In [2]:
def load_raw_data(path=None):
    """Load the raw dataset from a CSV file.

    Args:
        path: Location of the CSV file to read. When omitted (``None``) the
            module-level ``DATA_PATH`` is used, so existing zero-argument
            calls behave exactly as before.

    Returns:
        pandas.DataFrame holding the contents of the file.
    """
    # Resolve the default lazily so the constant is only needed when used.
    source = DATA_PATH if path is None else path
    df = pd.read_csv(source)
    print(f"Loaded raw data: {df.shape}")
    return df

# Keep the freshly loaded frame untouched in df_raw and do all further work
# on an independent copy.
df_raw = load_raw_data()
df = df_raw.copy()    

Loaded raw data: (1460, 81)


## 🔍 Analyze Missing Values

Summarizes missing values in each column, showing count, percentage, and data type.  
Helps you understand where and how much data is missing.

In [3]:
def analyze_missing_values(df):
    """Summarise the columns of *df* that contain missing values.

    Returns a DataFrame indexed by column name with three columns:
    ``missing_count`` (number of nulls), ``missing_percent`` (share of rows
    that are null, 0-100) and ``missing_dtype`` (the column's dtype).
    Only columns with at least one null are listed, largest count first.
    """
    null_counts = df.isnull().sum()

    summary = pd.DataFrame({
        'missing_count': null_counts,
        'missing_percent': (null_counts / len(df)) * 100,
        'missing_dtype': df.dtypes,
    })

    # Drop fully populated columns, then rank by how much is missing.
    has_gaps = summary['missing_count'] > 0
    with_gaps = summary[has_gaps]
    return with_gaps.sort_values(by='missing_count', ascending=False)

# Build the per-column missing-value summary for the working frame and show it.
missing_values = analyze_missing_values(df)
print("Missing values analysis:")
print(missing_values)

Missing values analysis:
              missing_count  missing_percent missing_dtype
PoolQC                 1453        99.520548        object
MiscFeature            1406        96.301370        object
Alley                  1369        93.767123        object
Fence                  1179        80.753425        object
MasVnrType              872        59.726027        object
FireplaceQu             690        47.260274        object
LotFrontage             259        17.739726       float64
GarageType               81         5.547945        object
GarageYrBlt              81         5.547945       float64
GarageFinish             81         5.547945        object
GarageQual               81         5.547945        object
GarageCond               81         5.547945        object
BsmtExposure             38         2.602740        object
BsmtFinType2             38         2.602740        object
BsmtQual                 37         2.534247        object
BsmtCond                 37    

## 🧹 Handle Missing Values

Fills missing values using domain knowledge:
- LotFrontage: median per Neighborhood
- Numerical columns: median
- Absence features (a missing value means the feature is not present): 'None' for MasVnrType, 'NA' for the others
- Electrical: mode
- Other categoricals: mode

Ensures no missing values remain before further cleaning.

In [4]:
def _fill_with_mode(series):
    """Return *series* with nulls replaced by its most frequent value.

    The series is returned unchanged when it has no nulls, or when it has no
    observed value at all (in which case there is no mode to fill with).
    """
    if not series.isnull().any():
        return series
    modes = series.mode()
    if modes.empty:
        return series
    return series.fillna(modes.iloc[0])


def handle_missing_values(df):
    """Fill missing values and return the result as a new DataFrame.

    Steps, applied in this order:

    1. ``LotFrontage``: median of the row's ``Neighborhood`` group. Skipped
       when there is no ``Neighborhood`` column; step 2 then applies.
    2. Every numeric column that still has nulls: the column median.
    3. "Absence" categoricals: the literal ``'None'`` for ``MasVnrType`` and
       ``'NA'`` for the others.
    4. ``Electrical``: its most frequent value.
    5. All remaining object columns: their most frequent value.

    A categorical column with no observed value has no mode and is left
    as-is instead of raising.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A copy of *df* with the fills above applied.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    # LotFrontage → median per Neighborhood (needs both columns to exist).
    if "LotFrontage" in df.columns and "Neighborhood" in df.columns:
        df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(
            lambda x: x.fillna(x.median())
        )

    # Numerical → median. This also catches LotFrontage values whose whole
    # neighborhood group was empty, or when no Neighborhood column exists.
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())

    # Absence categories → fill with 'NA'/'None'
    absence_features = [
        "MasVnrType", "BsmtQual", "BsmtCond", "BsmtExposure",
        "BsmtFinType1", "BsmtFinType2", "GarageType", "GarageFinish",
        "GarageQual", "GarageCond", "FireplaceQu",
        "PoolQC", "Fence", "Alley", "MiscFeature",
    ]
    for col in absence_features:
        if col in df.columns:
            fill_value = 'None' if col == "MasVnrType" else "NA"
            df[col] = df[col].fillna(fill_value)

    # Electrical → mode
    if "Electrical" in df.columns:
        df["Electrical"] = _fill_with_mode(df["Electrical"])

    # Other categoricals → mode. Walk them in column order rather than via a
    # set so the processing order is deterministic.
    already_handled = set(absence_features) | {"Electrical"}
    for col in df.select_dtypes(include=["object"]).columns:
        if col not in already_handled:
            df[col] = _fill_with_mode(df[col])

    return df

## 🛠️ Fix Data Types

Converts columns to appropriate numeric or datetime types.  
Ensures calculations and transformations work as expected.

In [6]:
# Columns coerced to numeric, and columns holding a calendar year that are
# converted to datetime. GarageYrBlt appears in both lists on purpose of the
# original code: it is made numeric first and then turned into a datetime.
numeric_columns = ['LotFrontage', 'LotArea', 'MasVnrArea', 'GarageYrBlt']
year_columns = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']


def fix_datatypes(df):
    """Convert columns to appropriate datatypes.

    Columns in ``numeric_columns`` are coerced with ``pd.to_numeric``
    (unparseable entries become NaN). Columns in ``year_columns`` are
    converted to datetimes set to 1 January of the given year (unparseable
    or missing entries become NaT).

    Listed columns that are absent from *df* are skipped, and columns that
    are already datetime are left untouched, so calling the function twice
    is safe.

    Args:
        df: DataFrame to convert. NOTE: it is modified in place, as in the
            original implementation; pass a copy to keep the input intact.

    Returns:
        The same DataFrame object, for convenient chaining.
    """
    is_datetime = pd.api.types.is_datetime64_any_dtype

    # Convert numeric columns that were read as object
    for col in numeric_columns:
        # Skip datetimes: to_numeric would turn them into epoch nanoseconds.
        if col in df.columns and not is_datetime(df[col]):
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Convert year columns to datetime
    for col in year_columns:
        if col not in df.columns or is_datetime(df[col]):
            continue
        years = pd.to_numeric(df[col], errors="coerce")
        # Go through a nullable integer so a float year such as 2003.0 is
        # rendered as "2003"; "2003.0" does not match "%Y" and would be
        # coerced to NaT.
        year_text = years.round().astype("Int64").astype("string")
        df[col] = pd.to_datetime(year_text, format="%Y", errors="coerce")

    return df

## 🚨 Analyze Outliers

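Detects outliers in key numeric columns using a z-score threshold (by default, more than 3 standard deviations from the mean).  
For each column that contains outliers, reports their count and percentage, the column's min/max, and the low/high threshold values.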

In [8]:
OUTLIER_COLUMNS = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'SalePrice'
]

def analyze_outliers(df, OUTLIER_COLUMNS, n_std=3):

    outliers_info = {}

    for col in OUTLIER_COLUMNS:
        mean = df[col].mean()
        std = df[col].std()
        z_score = np.abs((df[col]-mean)/std)

        outliers = df[z_score > n_std]

        if len(outliers) > 0:
            outliers_info[col]={
                "count": len(outliers),
                "percent": (len(outliers) / len(df)) * 100,
                "min": df[col].min(),
                "max": df[col].max(),
                "threshold_low": mean - n_std * std,
                "threshold_high": mean + n_std * std
            }
    
    return pd.DataFrame(outliers_info).T

## ✂️ Handle Outliers

Clips extreme values in specified columns to within defined thresholds.  
Reduces the impact of outliers on modeling and analysis.

In [9]:
# Define custom thresholds for specific columns
# (number of standard deviations from the mean beyond which values are clipped)
COLUMN_THRESHOLDS = {
    'LotArea': 2.5,        # More strict for lot area
    'SalePrice': 2.5,      # More strict for price
    'GrLivArea': 2.5,      # More strict for living area
    'default': 3           # Default for other columns
}


def handling_outliers(df, columns, thresholds=COLUMN_THRESHOLDS, method='clip'):
    """Limit extreme values in the given columns and return a new DataFrame.

    For each column the bounds are ``mean ± n_std * std``, where ``n_std`` is
    looked up in *thresholds* by column name.

    Args:
        df: Input DataFrame. It is not modified.
        columns: Names of the columns to process.
        thresholds: Mapping of column name to ``n_std``. An optional
            ``'default'`` entry supplies the value for columns not listed;
            when it is absent, ``COLUMN_THRESHOLDS['default']`` is used.
        method: Only ``'clip'`` is implemented. Any other value leaves the
            data unchanged.

    Returns:
        A copy of *df* with the selected columns clipped.
    """
    df = df.copy()

    # Resolve the fallback once, tolerating custom dicts without 'default'.
    fallback_n_std = thresholds.get('default', COLUMN_THRESHOLDS['default'])

    for col in columns:
        n_std = thresholds.get(col, fallback_n_std)
        mean = df[col].mean()
        std = df[col].std()

        if method == 'clip':
            df[col] = df[col].clip(
                lower=mean - n_std * std,
                upper=mean + n_std * std
            )

    return df

## 📈 Transform Skewed Features

Identifies highly skewed numeric columns and applies log transformation (`log1p`).  
Helps normalize distributions for better model performance.

In [15]:
def transform_skewed(df, columns, skew_threshold=1):
    """
    Transform skewed numerical features using log1p.

    A column is transformed when its skewness is above *skew_threshold*.
    Columns containing values <= -1 are skipped, because log1p is undefined
    there and would silently introduce NaN/-inf into the data.

    Args:
        df: DataFrame. It is not modified.
        columns: List of columns to check for skewness
        skew_threshold: Above this value, apply transformation

    Returns:
        df: Transformed copy of the DataFrame
        transformed_cols: List of columns that were actually transformed
    """
    df = df.copy()

    # Nothing to check: return early with an empty list.
    if len(columns) == 0:
        return df, []

    # Calculate skewness for each column
    skewness = df[columns].apply(lambda x: x.skew())

    # Find columns with high skewness
    skewed_cols = skewness[skewness > skew_threshold].index.tolist()

    # Apply log1p transformation
    transformed_cols = []
    for col in skewed_cols:
        if df[col].min() <= -1:
            print(f"Skipped {col}: contains values <= -1, log1p is undefined there")
            continue
        df[col] = np.log1p(df[col])
        print(f"Transformed {col}: skewness before={skewness[col]:.2f}, after={df[col].skew():.2f}")
        transformed_cols.append(col)

    return df, transformed_cols

## 💾 Save Cleaned Data

Exports the fully cleaned dataset to CSV for modeling and analysis.