In [1]:
# STEP 1: read the cleaned listings CSV that the modelling steps below work from
import pandas as pd
import numpy as np
from pathlib import Path

fp = Path("../data/processed/goa_cleaned.csv")

# Fail early with a pointer to the notebook named in the message, instead of
# letting read_csv raise a less helpful error.
if fp.exists():
    df_clean = pd.read_csv(fp)
else:
    raise FileNotFoundError(f"Expected cleaned CSV at {fp}. If missing, run 01_preprocessing.ipynb first.")

print("Loaded cleaned data rows:", df_clean.shape[0])
df_clean.head(2)


Loaded cleaned data rows: 2414


Unnamed: 0,location,bhk,price,sq_feet,latitude,longitude,bhk_num,price_per_sqft
0,Mapusa,Office Space,100000,77.0,15.590853,73.810215,0.0,1298.701299
1,Taleigao,2 BHK,10000000,11905.0,15.470266,73.822567,2.0,839.9832


In [2]:
# STEP 2: build the modelling frame and add the log-scale target
# log1p(price) is the regression target; later cells undo it with expm1.
df_model = df_clean.assign(log_price=np.log1p(df_clean['price']))
print("df_model shape:", df_model.shape)

# Preview the raw price next to the derived columns.
df_model[
    ['price', 'sq_feet', 'price_per_sqft', 'bhk_num', 'log_price']
].head(3)


df_model shape: (2414, 9)


Unnamed: 0,price,sq_feet,price_per_sqft,bhk_num,log_price
0,100000,77.0,1298.701299,0.0,11.512935
1,10000000,11905.0,839.9832,2.0,16.118096
2,10000000,12500.0,800.0,2.0,16.118096


In [3]:
# STEP 3: hold out 20% of the rows for evaluation (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

# NOTE(review): in the rows displayed above, price_per_sqft equals
# price / sq_feet (e.g. 100000 / 77 = 1298.70). Combined with sq_feet it may
# leak the target into the features -- confirm before trusting the RMSE.
X, y = df_model[['location', 'sq_feet', 'bhk_num', 'price_per_sqft']], df_model['log_price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Train rows:", len(X_train), "Test rows:", len(X_test))


Train rows: 1931 Test rows: 483


In [4]:
# STEP 4: preprocessor (scikit-learn 1.4+ compatible)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_features = ['sq_feet','bhk_num','price_per_sqft']
categorical_features = ['location']

# Standardise the numeric columns and one-hot encode location.
# handle_unknown='ignore' makes locations unseen during fit encode as all
# zeros instead of raising at predict time; remainder='drop' discards any
# column not listed above.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='drop'
)

# Quick fit-transform on the training data to surface hidden errors early.
# The result is bound to a descriptive name rather than `_`: assigning to `_`
# in IPython/Jupyter shadows the automatic "last output" variable and stops
# it from being updated for the rest of the session.
X_train_prepared = preprocessor.fit_transform(X_train)
print("Preprocessor fitted. Output shape (sample):", X_train_prepared.shape)


Preprocessor fitted. Output shape (sample): (1931, 160)


In [5]:
# STEP 5: fit the preprocessing + RandomForest pipeline and score the hold-out set
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Fixed random_state keeps the forest reproducible; n_jobs=-1 uses all cores.
model_rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'regressor',
            RandomForestRegressor(
                n_estimators=300,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
)
model_rf.fit(X_train, y_train)

# The error is measured on log1p(price), not on the raw price scale.
y_pred = model_rf.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred))
print("RandomForest trained. RMSE (log):", float(rmse_rf))


RandomForest trained. RMSE (log): 0.07213265442604154


In [6]:
# STEP 6: persist the fitted pipeline (preprocessor + regressor) to disk
from joblib import dump
from pathlib import Path

# Create the target directory on first run; a no-op when it already exists.
Path("../models").mkdir(exist_ok=True, parents=True)
dump(value=model_rf, filename="../models/rf_pipeline.pkl")
print("✓ Model saved to ../models/rf_pipeline.pkl")


✓ Model saved to ../models/rf_pipeline.pkl


In [7]:
# STEP 7: reload the saved pipeline and sanity-check it on a few held-out rows
from joblib import load

# joblib.load unpickles the file, so only point it at artefacts you trust.
pipe = load("../models/rf_pipeline.pkl")

# First three rows of the test split serve as the sample.
sample = X_test.head(3).copy()
preds = pipe.predict(sample)
print("Loaded pipeline. Sample preds (log):", preds)

# Undo the log1p target transform to report whole rupees.
print("Sample preds (INR):", list(map(lambda p: int(round(np.expm1(p))), preds)))


Loaded pipeline. Sample preds (log): [17.12393513 16.07410643 15.82634765]
Sample preds (INR): [27342015, 9569642, 7469566]


In [8]:
# STEP 8: write helper script (src/models/predict_helper.py)
# The helper source is kept as a raw string and written to disk so the saved
# pipeline can be loaded and queried from a plain Python module.
helper_code = r'''
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

# Resolve first so the project root (two levels above this file's directory)
# is found even when __file__ is a relative path.
MODEL_PATH = Path(__file__).resolve().parents[2] / "models" / "rf_pipeline.pkl"

def load_model():
    """Load and return the pipeline stored at MODEL_PATH."""
    return joblib.load(MODEL_PATH)

def predict_price(model, location, sq_feet, bhk_num, price_per_sqft=None, default_pps=None):
    """Predict a price for a single listing.

    Parameters
    ----------
    model : object with a ``predict`` method accepting a one-row DataFrame.
    location : value for the "location" column.
    sq_feet, bhk_num : numeric values, converted with ``float``.
    price_per_sqft : optional; used when given.
    default_pps : optional; used only when ``price_per_sqft`` is None.

    Returns
    -------
    dict with "pred_log" (the raw model output) and "pred_price_inr"
    (``expm1`` of that output, rounded to an int).

    Raises
    ------
    ValueError
        If both ``price_per_sqft`` and ``default_pps`` are None. There is no
        built-in fallback value, so one of the two must be supplied.
    """
    pps = price_per_sqft if price_per_sqft is not None else default_pps
    if pps is None:
        raise ValueError(
            "predict_price needs either price_per_sqft or default_pps; both were None."
        )
    X = pd.DataFrame([{
        "location": location,
        "sq_feet": float(sq_feet),
        "bhk_num": float(bhk_num),
        "price_per_sqft": float(pps)
    }])
    log_pred = model.predict(X)[0]
    return {"pred_log": float(log_pred), "pred_price_inr": int(round(np.expm1(log_pred)))}

if __name__ == "__main__":
    m = load_model()
    print(predict_price(m, "Porvorim", 1200, 2, price_per_sqft=2000))
'''
Path("../src/models").mkdir(parents=True, exist_ok=True)
Path("../src/models/predict_helper.py").write_text(helper_code, encoding="utf-8")
print("Wrote src/models/predict_helper.py")


Wrote src/models/predict_helper.py


In [9]:
# STEP 9: import the generated helper directly from its file path and run one prediction
import importlib.util
import sys

# Build a module object named "ph" from the file, then execute its body so
# load_model / predict_price become available as attributes.
spec = importlib.util.spec_from_file_location(
    "ph",
    "../src/models/predict_helper.py",
)
ph = importlib.util.module_from_spec(spec)
spec.loader.exec_module(ph)

model_loaded = ph.load_model()
helper_prediction = ph.predict_price(model_loaded, "Porvorim", 1200, 2, price_per_sqft=2000)
print(helper_prediction)


{'pred_log': 15.051782873565772, 'pred_price_inr': 3442755}


In [10]:
# STEP 10: (re)write the project README with a short usage snippet (you can edit later)
# Note: write_text replaces the whole file each time this cell runs.
readme_text = """# Goa Rental Price Estimator

Model saved: models/rf_pipeline.pkl
Quick test: use src/models/predict_helper.py to load model and predict.

Commands:
- To run notebooks: open notebooks/01_preprocessing.ipynb, 02_EDA.ipynb, 03_Modeling.ipynb
- To run Streamlit demo later: streamlit run app/app.py

"""
readme_path = Path("../README.md")
readme_path.write_text(readme_text)
print("README.md created/updated.")


README.md created/updated.


In [11]:
# STEP 11: freeze minimal requirements
# Each package is pinned to the version installed in the current kernel so the
# environment that produced the saved pipeline can be recreated; bare names
# would let pip install any version.
import importlib.metadata

reqs = [
"pandas",
"numpy",
"scikit-learn",
"xgboost",
"joblib",
"shap",
"matplotlib",
"seaborn",
"folium",
"streamlit"
]


def pin_requirement(package):
    """Return ``"<package>==<installed version>"``.

    Falls back to the bare package name when the distribution is not
    installed in the current environment, so the cell never fails because
    an optional dependency is absent.
    """
    try:
        return f"{package}=={importlib.metadata.version(package)}"
    except importlib.metadata.PackageNotFoundError:
        return package


pinned_reqs = [pin_requirement(package) for package in reqs]
# Trailing newline keeps the file POSIX-friendly and diff-clean.
Path("../requirements.txt").write_text("\n".join(pinned_reqs) + "\n")
print("requirements.txt written.")


requirements.txt written.


In [12]:
# --- Recreate, retrain and save pipeline in current environment ---
# NOTE(review): this cell repeats STEPs 1-6 end to end and overwrites the same
# pickle; consider keeping a single retrain cell.
import pandas as pd
import numpy as np
from pathlib import Path

# 1) load cleaned data (adjust path if different)
df_clean = pd.read_csv("../data/processed/goa_cleaned.csv")

# 2) prepare features + target (log1p of price; invert with expm1)
df_model = df_clean.copy()
df_model['log_price'] = np.log1p(df_model['price'])

X = df_model[['location','sq_feet','bhk_num','price_per_sqft']]
y = df_model['log_price']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3) build preprocessor (use sparse_output=False for modern sklearn)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_features = ['sq_feet','bhk_num','price_per_sqft']
categorical_features = ['location']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='drop'
)

# No separate fit_transform "quick check" here: its result was discarded, the
# pipeline fit below fits the preprocessor anyway, and binding the result to
# `_` shadowed IPython's last-output variable.

# 4) train RandomForest pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1))
])

print("Training RandomForest... this may take a bit")
model_rf.fit(X_train, y_train)

# 5) eval quick (RMSE on the log1p(price) scale)
y_pred = model_rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE (log):", float(rmse))

# 6) save pipeline (overwrite previous file)
from joblib import dump
Path("../models").mkdir(parents=True, exist_ok=True)
dump(model_rf, "../models/rf_pipeline.pkl", compress=3)
print("Saved new pipeline to ../models/rf_pipeline.pkl")


Training RandomForest... this may take a bit
RMSE (log): 0.07213265442604126
Saved new pipeline to ../models/rf_pipeline.pkl


In [13]:
# RETRAIN & SAVE pipeline (run inside 03_Modeling.ipynb)
# Self-contained: reloads the CSV, rebuilds the split and preprocessor, refits
# the RandomForest pipeline and overwrites ../models/rf_pipeline.pkl.
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from joblib import dump

# 1) load cleaned data
df_clean = pd.read_csv("../data/processed/goa_cleaned.csv")

# 2) prepare features + target (target is log1p of price)
df_model = df_clean.copy()
df_model['log_price'] = np.log1p(df_model['price'])
X = df_model[['location','sq_feet','bhk_num','price_per_sqft']]
y = df_model['log_price']

# 3) split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4) preprocessor: scale numerics, one-hot encode location
numeric_features = ['sq_feet','bhk_num','price_per_sqft']
categorical_features = ['location']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='drop'
)
# The standalone fit_transform was removed: its output was never used, the
# pipeline fit below performs the same fit, and assigning it to `_` overrode
# IPython's last-output variable.

# 5) train RF pipeline
model_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1))
])

print("Training RandomForest... (may take a little)")
model_rf.fit(X_train, y_train)

# 6) eval & save (RMSE is on the log scale)
y_pred = model_rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE (log):", float(rmse))

Path("../models").mkdir(parents=True, exist_ok=True)
dump(model_rf, "../models/rf_pipeline.pkl", compress=3)
print("Saved new pipeline to ../models/rf_pipeline.pkl")


Training RandomForest... (may take a little)
RMSE (log): 0.07213265442604153
Saved new pipeline to ../models/rf_pipeline.pkl


In [14]:
# FINAL RETRAIN CELL — run ONLY in 03_Modeling.ipynb
# Reloads the cleaned data, refits the full pipeline and overwrites the saved pickle.
# NOTE(review): the estimator step is named "model" here but 'regressor' in the
# earlier cells -- confirm which name consumers of the pickle expect.

import pandas as pd
import numpy as np
from pathlib import Path
from joblib import dump

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Cleaned data plus the log1p(price) target column
df_clean = pd.read_csv("../data/processed/goa_cleaned.csv")
df_model = df_clean.assign(log_price=np.log1p(df_clean["price"]))

# Feature matrix and target vector
X = df_model[["location", "sq_feet", "bhk_num", "price_per_sqft"]]
y = df_model["log_price"]

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the numeric columns, one-hot encode the location
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["sq_feet", "bhk_num", "price_per_sqft"]),
        ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["location"]),
    ]
)

# Preprocessing followed by the forest, fitted as one unit
model_rf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=300)),
    ]
)

print("Training model...")
model_rf.fit(X_train, y_train)

# Hold-out RMSE on the log scale
holdout_mse = mean_squared_error(y_test, model_rf.predict(X_test))
rmse = np.sqrt(holdout_mse)
print("RMSE (log):", rmse)

# SAVE (this overwrites the broken pickle)
Path("../models").mkdir(exist_ok=True)
dump(model_rf, "../models/rf_pipeline.pkl")

print("✅ Model retrained and overwritten successfully")


Training model...
RMSE (log): 0.07213265442604128
✅ Model retrained and overwritten successfully
