# Baseline Ames Housing Pipeline
**Adapted from** Aarthi93’s “End-to-End ML pipeline”  
**Original source & license:**  
https://www.kaggle.com/code/aarthi93/end-to-end-ml-pipeline  
Released under the Apache 2.0 License: https://www.apache.org/licenses/LICENSE-2.0

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input mount.
# The walk is flattened into one lazy stream of paths, printed one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# ======================
# 1. SETUP
# ======================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import os


In [None]:
# Load data from Kaggle input directory
def load_data():
    """Placeholder loader: the body holds nothing but this docstring.

    Calling it performs no work and returns ``None``. The same function name
    is defined again in the data-loading section further down, and that later
    definition is the one the notebook actually calls.

    NOTE(review): the directory walk below sits at module level rather than
    inside this function -- presumably an indentation slip; confirm before
    relying on this stub.
    """


# Runs when the cell executes: print every file path under the Kaggle input
# mount, one directory's worth at a time.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    listing = [os.path.join(folder, name) for name in names]
    if listing:
        print('\n'.join(listing))


In [None]:
# ======================
# 2. DATA LOADING
# ======================
def load_data(input_dir='/kaggle/input'):
    """Locate the Ames housing CSV under ``input_dir`` and load it.

    The tree is walked in sorted order and the first file whose name contains
    ``'Ames'`` and carries a CSV suffix (optionally compressed) is parsed with
    ``pd.read_csv``. A short summary -- shape, first three rows and a count of
    column dtypes -- is printed before the frame is returned.

    Args:
        input_dir: Root directory to search. Defaults to the Kaggle input
            mount, so existing ``load_data()`` calls behave as before.

    Returns:
        pd.DataFrame: The data exactly as parsed from the CSV.

    Raises:
        FileNotFoundError: If no matching CSV exists under ``input_dir``.
    """
    # `display` only exists when running under IPython/Jupyter; fall back to
    # plain printing so the function also works as a regular script.
    try:
        show = display
    except NameError:
        show = print

    # Suffixes pd.read_csv can open directly; this keeps stray files such as
    # "Ames_readme.txt" from being parsed as the dataset.
    csv_suffixes = ('.csv', '.csv.gz', '.csv.zip', '.csv.bz2', '.csv.xz')

    for dirname, dirnames, filenames in os.walk(input_dir):
        # Sorting in place makes os.walk descend in a deterministic order, so
        # the same file wins every run when several candidates exist.
        dirnames.sort()
        for filename in sorted(filenames):
            if 'Ames' not in filename:
                continue
            if not filename.lower().endswith(csv_suffixes):
                continue

            df = pd.read_csv(os.path.join(dirname, filename))
            print("\n=== RAW DATA ===")
            print(f"Shape: {df.shape}")
            print("First 3 rows:")
            show(df.head(3))
            print("\nData types:")
            print(df.dtypes.value_counts())
            return df

    raise FileNotFoundError("Dataset not found")

# Load the dataset once when the cell runs; the cleaning step reads `raw_data`.
raw_data = load_data()

In [None]:
# ======================
# 3. DATA CLEANING
# ======================
def clean_data(df):
    """Drop unused columns and fill every missing value, printing progress.

    Steps:
      1. Remove identifier, mostly-empty and excluded columns (see the
         ``cols_to_drop`` mapping for the stated reason per column).
      2. Fill gaps in numeric columns with that column's median.
      3. Fill gaps in every other column with the literal string ``'None'``.

    The input frame is not modified.

    NOTE(review): medians are computed over the whole frame passed in, which
    in this notebook is the data before the train/test split and includes the
    ``SalePrice`` column -- confirm this is acceptable for the evaluation.

    Args:
        df: Raw housing data containing every column named in
            ``cols_to_drop``.

    Returns:
        pd.DataFrame: A new frame without the dropped columns and with
        missing values filled as described above.

    Raises:
        KeyError: If any column listed in ``cols_to_drop`` is absent.
    """
    # `display` only exists under IPython/Jupyter; fall back to print.
    try:
        show = display
    except NameError:
        show = print

    print("\n=== CLEANING DATA ===")

    # Columns to drop with reasons
    cols_to_drop = {
        'Order': 'Index column',
        'PID': 'Property ID',
        'Alley': '93% missing',
        'Pool QC': '99% missing',
        'Fence': '80% missing',
        'Misc Feature': '96% missing',
        'Garage Yr Blt': 'Redundant with Year Built',
        'Mo Sold': 'Potential leakage',
        'Yr Sold': 'Potential leakage',
    }
    drop_list = list(cols_to_drop)

    # Print a plain list rather than the dict_keys(...) repr.
    print(f"\nDropping columns: {drop_list}")
    df_clean = df.drop(columns=drop_list)

    # Before filling missing
    print("\nMissing values BEFORE filling:")
    print(df_clean.isna().sum().sort_values(ascending=False).head(10))

    # Fill missing. Dispatch on "is numeric" instead of "is object" so that
    # other non-numeric dtypes (pandas string, category, ...) take the
    # placeholder path instead of hitting .median(), which cannot handle them.
    df_filled = df_clean.copy()
    for col in df_clean.columns:
        column = df_clean[col]
        if not column.isna().any():
            continue  # nothing to fill; keep the column untouched
        if pd.api.types.is_numeric_dtype(column):
            df_filled[col] = column.fillna(column.median())
        else:
            df_filled[col] = column.fillna('None')

    # After filling missing
    print("\nMissing values AFTER filling:")
    print(df_filled.isna().sum().sum(), "total missing values remaining")

    print("\nCleaned data shape:", df_filled.shape)
    print("\nSample of cleaned data:")
    show(df_filled.head(3))

    return df_filled

# Clean the raw frame once; feature engineering consumes `cleaned_data`.
cleaned_data = clean_data(raw_data)


In [None]:
# ======================
# 4. FEATURE ENGINEERING 
# ======================
def engineer_features(df, current_year=2025):
    """Derive aggregate features and drop the columns they were built from.

    New columns, appended in this order:
      * ``House_Age``           -- ``current_year - Year Built``
      * ``Years_Since_Remodel`` -- ``current_year - Year Remod/Add``
      * ``Total_SF``            -- basement + first floor + second floor area
      * ``Total_Bathrooms``     -- full baths count 1, half baths count 0.5,
        above ground and basement combined
      * ``Has_Pool``            -- 1 when ``Pool Area`` is positive, else 0

    Ages are measured against ``current_year``, not against the sale date.
    The input frame is not modified.

    Args:
        df: Cleaned housing data containing all source columns listed in
            ``source_cols`` below.
        current_year: Reference year for the two age features. Defaults to
            2025, the value previously hard-coded.

    Returns:
        pd.DataFrame: A new frame with the derived columns added and the
        source columns removed.

    Raises:
        KeyError: If one or more source columns are missing; the message
            lists all of them.
    """
    # `display` only exists under IPython/Jupyter; fall back to print.
    try:
        show = display
    except NameError:
        show = print

    print("\n=== FEATURE ENGINEERING ===")

    # Every column read below; the same set is dropped once the derived
    # features exist.
    source_cols = [
        'Year Built', 'Year Remod/Add', 'Total Bsmt SF',
        '1st Flr SF', '2nd Flr SF', 'Full Bath', 'Half Bath',
        'Bsmt Full Bath', 'Bsmt Half Bath', 'Pool Area',
    ]
    # Fail up front with the full list instead of on the first missing key.
    missing = [col for col in source_cols if col not in df.columns]
    if missing:
        raise KeyError(
            f"engineer_features is missing required columns: {missing}"
        )

    df_eng = df.copy()

    # New features
    df_eng['House_Age'] = current_year - df['Year Built']
    df_eng['Years_Since_Remodel'] = current_year - df['Year Remod/Add']
    df_eng['Total_SF'] = (
        df['Total Bsmt SF'] + df['1st Flr SF'] + df['2nd Flr SF']
    )
    df_eng['Total_Bathrooms'] = (
        df['Full Bath'] + 0.5 * df['Half Bath']
        + df['Bsmt Full Bath'] + 0.5 * df['Bsmt Half Bath']
    )
    df_eng['Has_Pool'] = (df['Pool Area'] > 0).astype(int)

    print("\nAdded new features:")
    print([col for col in df_eng.columns if col not in df.columns])

    print("\nDropping original columns:", source_cols)
    df_final = df_eng.drop(columns=source_cols)

    print("\nEngineered data shape:", df_final.shape)
    print("\nSample with new features:")
    show(df_final.head(3))

    return df_final

# Build the engineered feature table; `prepare_data` consumes it next.
engineered_data = engineer_features(cleaned_data)


In [None]:
# ======================
# 5. DATA PREPROCESSING
# ======================
def prepare_data(df, test_size=0.2, random_state=42):
    """Split features from the target and create a train/test partition.

    ``SalePrice`` is the target; every other column is a feature. Feature
    columns are grouped into numeric and categorical lists for use in a
    ``ColumnTransformer``. Every non-numeric column is treated as categorical
    so that no feature is left out of both lists (columns named in neither
    list would not reach the model under ColumnTransformer's default
    handling of remaining columns).

    Args:
        df: Engineered housing data including a ``SalePrice`` column.
        test_size: Fraction of rows held out for testing. Defaults to 0.2,
            the value previously hard-coded.
        random_state: Seed for the shuffle. Defaults to 42, the value
            previously hard-coded.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, numeric_cols,
        categorical_cols)`` where the last two are pandas ``Index`` objects
        of column names.

    Raises:
        KeyError: If ``df`` has no ``SalePrice`` column.
    """
    if 'SalePrice' not in df.columns:
        raise KeyError("prepare_data requires a 'SalePrice' target column")

    print("\n=== FINAL DATA PREP ===")

    X = df.drop(columns='SalePrice')
    y = df['SalePrice']

    numeric_cols = X.select_dtypes(include=np.number).columns
    # Complement of the numeric set: object, string, category, bool, ...
    categorical_cols = X.select_dtypes(exclude=np.number).columns

    print(f"\nNumeric features ({len(numeric_cols)}):")
    print(numeric_cols.tolist())

    print(f"\nCategorical features ({len(categorical_cols)}):")
    print(categorical_cols.tolist())

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("\nTrain/test split sizes:")
    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    return X_train, X_test, y_train, y_test, numeric_cols, categorical_cols

# Split once at module level; the model-building and evaluation cells below
# read these six names directly.
X_train, X_test, y_train, y_test, numeric_cols, categorical_cols = prepare_data(engineered_data)


In [None]:
# ======================
# 6. MODEL BUILDING & INSPECTION
# ======================
# Assemble the preprocessing + random-forest pipeline, show its structure and
# parameters, then fit it once so the expanded feature names can be listed.
print("\n=== MODEL BUILDING ===")

# Numeric branch: median-impute, then standardize.
# NOTE(review): the cleaning step earlier in this notebook already fills
# missing values, so both imputers are presumably no-ops here -- confirm.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the string 'None', then one-hot encode.
# handle_unknown='ignore' keeps prediction from failing on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column list to its branch; numeric output comes first, then
# categorical, which is the order the feature-name code below relies on.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Full model: preprocessing followed by a seeded random forest.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=150,
        max_depth=30,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    ))
])

# Print full model structure
print("\n=== FULL MODEL STRUCTURE ===")
from sklearn import set_config
set_config(display='diagram')  # Enable visual display
display(model)  # Notebook-only helper: renders the pipeline diagram

# Alternative text representation
print("\n=== DETAILED MODEL PARAMETERS ===")
from pprint import pprint
print("\nPreprocessor configuration:")
pprint(preprocessor.get_params())

print("\nRegressor configuration:")
pprint(model.named_steps['regressor'].get_params())

# Print feature names after transformation
print("\n=== TRANSFORMED FEATURE NAMES ===")
# Fitting is required first: the fitted sub-transformers (named_transformers_)
# and the encoder's learned categories only exist after fit.
model.fit(X_train, y_train)

# Get feature names from one-hot encoding
encoder = model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['encoder']
cat_features = encoder.get_feature_names_out(categorical_cols)
# Numeric names first, then the expanded one-hot names -- same order as the
# ColumnTransformer branches above.
all_features = np.concatenate([numeric_cols, cat_features])

print(f"\nTotal features after preprocessing: {len(all_features)}")
print("\nFirst 20 feature names:")
print(all_features[:20])

In [None]:
# ======================
# 7. TRAINING & EVALUATION 
# ======================
# Fit the pipeline on the training split, score it on the held-out split and
# produce diagnostic plots: predicted vs. actual, error histogram, feature
# importances, residuals and a cross-validated learning curve.
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

print("\n=== MODEL TRAINING ===")
# The model-building cell above already calls fit to obtain feature names;
# this call fits the same pipeline on the same data again.
model.fit(X_train, y_train)
print("Training completed!")

print("\n=== EVALUATION ===")
y_pred = model.predict(X_test)

# 1. Basic Metrics
# RMSE is the square root of the MSE, so it is in the same units as the target.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\n=== BASIC METRICS ===")
print(f"RMSE: ${rmse:,.2f}")
print(f"MAE: ${mae:,.2f}")
print(f"R² Score: {r2:.4f}")

# 2. Prediction Visualization
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Dashed diagonal = perfect prediction (predicted equals actual).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted House Prices')
plt.show()

# 3. Error Distribution
# Positive error = the model under-predicted (actual above predicted).
errors = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(errors, kde=True)
plt.axvline(x=0, color='r', linestyle='--')
plt.xlabel('Prediction Errors')
plt.title('Distribution of Prediction Errors')
plt.show()

# 4. Feature Importance (Detailed)
importances = model.named_steps['regressor'].feature_importances_
# Rebuild the transformed feature names in the order the preprocessor emits
# them (numeric columns, then one-hot columns) so they line up with
# `importances` position by position.
feature_names = np.concatenate([
    numeric_cols,
    model.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['encoder']
    .get_feature_names_out(categorical_cols)
])

feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
top_features = feature_importance.sort_values('Importance', ascending=False).head(20)

print("\n=== TOP 20 FEATURES ===")
display(top_features)  # Notebook-only helper

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=top_features)
plt.title('Top 20 Most Important Features')
plt.tight_layout()
plt.show()

# 5. Residual Analysis
# Same quantity as `errors` above, plotted against the prediction this time.
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Analysis')
plt.show()

# 6. Model Learning Curve (Optional)
from sklearn.model_selection import learning_curve

# 5-fold cross-validation on the training split only, at five training-set
# sizes from 10% to 100%.
# NOTE(review): n_jobs=-1 is set both here and on the forest itself; this may
# oversubscribe CPU cores -- confirm if runtime matters.
train_sizes, train_scores, test_scores = learning_curve(
    model, X_train, y_train, cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 5)
)

# Scores are negated MSE, so flip the sign before the square root. This is the
# root of the fold-averaged MSE, not the average of per-fold RMSEs.
train_scores_mean = np.sqrt(-train_scores.mean(axis=1))
test_scores_mean = np.sqrt(-test_scores.mean(axis=1))

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training')
plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Validation')
plt.xlabel('Training examples')
plt.ylabel('RMSE')
plt.title('Learning Curve')
plt.legend()
plt.show()