In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path("..").resolve()))

import pandas as pd
import numpy as np

from src.config import FEATURES, TARGET
from src.prep import clean_and_engineer

In [2]:
# Load the pre-filtered extract and run the project's cleaning /
# feature-engineering step on it.
df_raw = pd.read_csv("../data/processed/filtered_91766.csv")
df = clean_and_engineer(df_raw)

# Design matrix and target, selected by the column names from src.config.
X, y = df[FEATURES], df[TARGET]

# Quick sanity check: matrix dimensions and the target's min / max / mean.
print("X shape:", X.shape)
y_summary = (y.min(), y.max(), y.mean())
print("y stats:", *y_summary)


X shape: (380, 5)
y stats: 187108.0 1810916.0 691492.5421052632


In [5]:
from src.config import FEATURES, TARGET
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score
import xgboost as xgb
import pandas as pd

In [6]:
# Read the unprocessed extract through a project-relative path instead of a
# user-specific absolute one, so the notebook runs on other machines.
# NOTE(review): assumes the notebook's parent directory is the Phillips_ranch
# project root (cell 1 adds `..` to sys.path to import `src`) -- confirm.
# This rebinds `df`, replacing the cleaned frame from cell 2; X and y were
# already extracted from that frame and are not affected.
df = pd.read_csv("../data/filtered_91766.csv")

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Baseline: plain linear regression on the training split.
lr = LinearRegression().fit(X_train, y_train)

# Random forest: 300 unbounded-depth trees, using all available cores.
rf_params = dict(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Gradient-boosted trees with row and column subsampling.
# NOTE(review): the name `xgb` is also the alias of `import xgboost as xgb`
# in the import cell; this assignment rebinds it to the estimator.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

In [9]:
# Fitted candidates, keyed by the label shown in the results table.
models = {
    "Linear Regression": lr,
    "Random Forest": rf,
    "XGBoost": xgb,
}

results = []

# Score every model on the held-out split.
for name, model in models.items():
    preds = model.predict(X_test)
    results.append({
        "Model": name,
        "R2": r2_score(y_test, preds),
        # This is mean absolute *percentage* error (a fraction, e.g. 0.06),
        # so the column is labelled MAPE rather than MAE to avoid confusing
        # it with an absolute error in the target's units.
        "MAPE": mean_absolute_percentage_error(y_test, preds),
    })

# Best (lowest error) model first.
pd.DataFrame(results).sort_values("MAPE")

Unnamed: 0,Model,R2,MAE
2,XGBoost,0.835918,0.063327
1,Random Forest,0.818157,0.072795
0,Linear Regression,0.51188,0.127712


In [10]:
# 10-fold shuffled cross-validation over the full X / y, seeded for
# reproducibility.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Use a dedicated name for the CV estimator. Binding it to `xgb` would replace
# the fitted model from the training cell with an unfitted one, so re-running
# the hold-out comparison afterwards would call predict() on an unfitted model.
xgb_cv = XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

scores = cross_validate(
    xgb_cv, X, y,
    cv=cv,
    scoring={
        "mae": "neg_mean_absolute_error",
        "r2": "r2",
    },
    return_train_score=False,
)

# The scorer reports negated MAE (higher is better); flip the sign back.
mae = -scores["test_mae"]
r2 = scores["test_r2"]

print("XGB CV MAE mean:", mae.mean(), "std:", mae.std())
print("XGB CV R2  mean:", r2.mean(),  "std:", r2.std())

XGB CV MAE mean: 46532.091694078954 std: 10914.215169106114
XGB CV R2  mean: 0.7580557472010423 std: 0.2124769957303781


In [11]:
# Hyperparameters of the model that is subsequently saved with joblib.
final_xgb_params = dict(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
final_xgb = XGBRegressor(**final_xgb_params)

# NOTE(review): the final model is fitted on the training split only, not on
# the full X / y -- confirm this is intended for the saved artifact.
final_xgb.fit(X_train, y_train)

In [13]:
import joblib
# Keep the destination in one place so the log line always matches the file
# that was actually written.
model_path = "../app/xgb_final_model.pkl"
joblib.dump(final_xgb, model_path)
print(f"Saved model to {model_path}")

Saved model to ../app/xgb_final_model.pkl
