# Predictive Modeling of Ames Housing Data
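This notebook demonstrates data preprocessing, modeling, and interpretation on a randomly simulated dataset that mimics the column layout of the Ames Housing dataset. Because every feature and the sale price are drawn independently at random, the scores and coefficients shown below illustrate the workflow only and say nothing about real housing prices.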

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Hand-written five-row sample that uses Ames-style column names, one column
# per entry. 'Alley' is entirely missing in this sample.
# NOTE(review): `data` is not referenced by any of the later cells visible here.
data = {
    'MSSubClass': [60, 20, 70, 60, 50],
    'LotFrontage': [65.0, 80.0, 68.0, 60.0, 84.0],
    'LotArea': [8450, 9600, 11250, 9550, 14260],
    'OverallQual': [7, 6, 7, 7, 8],
    'OverallCond': [5, 8, 5, 5, 5],
    'YearBuilt': [2003, 1976, 2001, 1915, 2000],
    'YearRemodAdd': [2003, 1976, 2002, 1970, 2000],
    'TotalBsmtSF': [856, 1262, 920, 756, 1145],
    '1stFlrSF': [856, 1262, 920, 961, 1145],
    '2ndFlrSF': [854, 0, 866, 756, 1053],
    'GrLivArea': [1710, 1262, 1786, 1717, 2198],
    'FullBath': [2, 2, 2, 1, 2],
    'HalfBath': [1, 0, 1, 0, 1],
    'TotRmsAbvGrd': [8, 6, 6, 7, 9],
    'GarageCars': [2, 2, 2, 3, 3],
    'GarageArea': [548, 460, 608, 642, 836],
    'YrSold': [2008, 2007, 2008, 2006, 2008],
    'Neighborhood': ['CollgCr', 'Veenker', 'CollgCr', 'Crawfor', 'NridgHt'],
    'Alley': [np.nan] * 5,
    'SalePrice': [208500, 181500, 223500, 140000, 250000],
}

In [3]:
# Seed the global NumPy generator so the simulated frame is reproducible.
np.random.seed(42)
n_samples = 200

# Columns are drawn one after another from the seeded generator; the order of
# the entries below therefore determines the values each column receives.
full_data = {
    'OverallQual': np.random.randint(3, 11, size=n_samples),
    'GrLivArea': np.random.randint(1000, 2500, size=n_samples),
    'TotalBsmtSF': np.random.randint(500, 2000, size=n_samples),
    'GarageCars': np.random.randint(0, 4, size=n_samples),
    'YearBuilt': np.random.randint(1950, 2010, size=n_samples),
    'YrSold': np.random.randint(2006, 2011, size=n_samples),
    'LotFrontage': np.random.uniform(50, 100, size=n_samples),
    'Neighborhood': np.random.choice(
        ['CollgCr', 'Veenker', 'Crawfor', 'NridgHt', 'OldTown'], size=n_samples
    ),
    'SalePrice': np.random.randint(100000, 400000, size=n_samples),
}
df = pd.DataFrame(full_data)

# Blank out LotFrontage on every tenth row to simulate missing values.
every_tenth_row = df.index[::10]
df.loc[every_tenth_row, 'LotFrontage'] = np.nan

# Count missing values once and reuse the result for the filtered report.
missing_counts = df.isnull().sum()

print("--- Initial Data Summary ---")
print(f"Dataset shape: {df.shape}")
print("Columns with missing values:")
print(missing_counts[missing_counts > 0])

--- Initial Data Summary ---
Dataset shape: (200, 9)
Columns with missing values:
LotFrontage    20
dtype: int64


In [4]:
# Target transformation
# The raw prices are kept in 'SalePriceRaw' the first time this cell runs, and the
# log target is always derived from that copy. Re-running the cell therefore gives
# the same 'SalePrice' values instead of taking the log of an already-logged column.
# 'SalePriceRaw' is the untransformed target and must never be used as a feature.
if 'SalePriceRaw' not in df.columns:
    df['SalePriceRaw'] = df['SalePrice']
df['SalePrice'] = np.log1p(df['SalePriceRaw'])

# Impute LotFrontage by Neighborhood median and fallback to global median
# (the fallback only matters for a neighborhood whose LotFrontage is entirely missing).
# NOTE(review): both medians are computed over every row, before the cross-validation
# split used later, so held-out folds influence the imputed values. Consider moving
# imputation into the modeling pipeline if that matters for the evaluation.
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())

# Feature engineering: HouseAge (years between construction and sale)
# NOTE(review): the simulated YearBuilt goes up to 2009 while YrSold starts at 2006,
# so some rows get a negative age; left as-is to keep the simulated results unchanged.
df['HouseAge'] = df['YrSold'] - df['YearBuilt']

In [5]:
# Preprocessing pipeline
# Column groups are declared once; the full feature list is their concatenation
# (numeric columns first, then categorical).
numerical_features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'HouseAge', 'LotFrontage']
categorical_features = ['Neighborhood']
features = numerical_features + categorical_features

# Design matrix and target used by the modeling cells.
X = df[features]
y = df['SalePrice']

# Numeric columns are standardised; the categorical column is one-hot encoded,
# with categories unseen at fit time ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [6]:
# Model training and comparison
# Candidate linear models; `alpha` is the regularisation strength of Ridge and Lasso.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=10),
    'Lasso': Lasso(alpha=0.01)
}

# Both metrics are gathered in a single cross-validation pass per model, so every
# fold is fitted once rather than once per metric.
cv_metrics = ['r2', 'neg_root_mean_squared_error']

print("--- Model Comparison using Cross-Validation ---")
for name, model in models.items():
    # Preprocessing is part of the pipeline, so the scaler and encoder are
    # re-fitted on the training portion of each fold.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])
    cv_results = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
    r2_scores = cv_results['test_r2']
    # The scorer reports negated RMSE (higher is better); flip the sign back.
    rmse_scores = -cv_results['test_neg_root_mean_squared_error']
    print(f"Model: {name}")
    print(f"  Average R-squared: {np.mean(r2_scores):.4f} (+/- {np.std(r2_scores):.4f})")
    print(f"  Average RMSE: {np.mean(rmse_scores):.4f} (+/- {np.std(rmse_scores):.4f})")
    print("-" * 30)

--- Model Comparison using Cross-Validation ---
Model: Linear Regression
  Average R-squared: -0.2125 (+/- 0.0412)
  Average RMSE: 0.3923 (+/- 0.0317)
------------------------------
Model: Ridge
  Average R-squared: -0.1870 (+/- 0.0475)
  Average RMSE: 0.3881 (+/- 0.0315)
------------------------------
Model: Lasso
  Average R-squared: -0.1692 (+/- 0.0579)
  Average RMSE: 0.3851 (+/- 0.0316)
------------------------------


In [7]:
# Final model interpretation
# Fit a Ridge pipeline on all rows and rank its coefficients by magnitude.
print("--- Key Findings from Final Ridge Model ---")
final_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', Ridge(alpha=10))])
final_pipeline.fit(X, y)

fitted_preprocessor = final_pipeline.named_steps['preprocessor']
fitted_regressor = final_pipeline.named_steps['regressor']

# Coefficients follow the transformer order: scaled numeric columns first, then
# the one-hot columns produced by the 'cat' encoder.
cat_feature_names = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_feature_names = numerical_features + list(cat_feature_names)
coefficients = fitted_regressor.coef_

coef_df = pd.DataFrame({'Coefficient': coefficients}, index=all_feature_names)
coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
sorted_coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False)

# Only the signed coefficient is shown; the absolute value is just the sort key.
print("Top 10 Most Impactful Features:")
print(sorted_coef_df[['Coefficient']].head(10))

--- Key Findings from Final Ridge Model ---
Top 10 Most Impactful Features:
                      Coefficient
Neighborhood_Veenker     0.082260
Neighborhood_Crawfor    -0.073230
HouseAge                -0.041434
Neighborhood_NridgHt    -0.035750
Neighborhood_OldTown     0.033359
GrLivArea               -0.022691
OverallQual              0.011818
TotalBsmtSF             -0.009934
GarageCars               0.007621
Neighborhood_CollgCr    -0.006639
