In [4]:
import pandas as pd
import joblib
import os
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# 1. Load dataset
# The California housing data from scikit-learn stands in for a project
# dataset; its columns are relabelled below so the rest of the pipeline sees
# the six feature names the project expects.
# NOTE(review): the "fetch_" loader may need network access on first use --
# confirm before relying on this in an offline environment.
print("Loading data...")
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    SalePrice=data.target * 100000  # scale the target to look like house prices
)

# Source column -> project feature name. Columns not listed here keep their
# original names after the rename.
mapping = dict(
    MedInc='OverallQual',
    HouseAge='YearBuilt',
    AveRooms='GrLivArea',
    AveBedrms='TotalBsmtSF',
    Population='GarageCars',
    AveOccup='FullBath',
)
df = df.rename(columns=mapping)

# 2. Feature selection
# `features` fixes both which columns are used and their order in X, so keep
# the list stable: anything calling the fitted model must supply columns in
# this same order.
target = 'SalePrice'
features = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    'FullBath',
    'YearBuilt',
]
X, y = df[features], df[target]

# 3. Train model
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# (and therefore the reported metrics) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training model...")
# max_depth=15 caps how deep each tree can grow, which helps keep the
# serialized model small.
model = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42)
model.fit(X_train, y_train)

# Score the model on the held-out rows so a bad fit is visible before the
# artifact is saved. MAE is in the same units as SalePrice.
y_pred = model.predict(X_test)
print(
    f"Test MAE: {mean_absolute_error(y_test, y_pred):,.2f} | "
    f"Test R2: {r2_score(y_test, y_pred):.3f}"
)

# 4. Save with compression
# The output directory is derived from save_path so the two cannot drift
# apart, and exist_ok=True makes creation safe when the directory already
# exists (no separate existence check that could race with another process).
save_path = 'model/house_price_model.pkl'
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# compress=3 shrinks the pickle significantly, which keeps the file small
# enough to commit to GitHub.
# NOTE: the file is a pickle -- only load it back from a trusted source.
joblib.dump(model, save_path, compress=3)



Loading data...
Training model...


['model/house_price_model.pkl']