# In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler # For feature scaling
import joblib

# Load the cleaned dataset
file_path = 'austinHousingData_cleaned.csv'
df = pd.read_csv(file_path)

# Define features and target: every column except the target is a feature
target = 'latestPrice'
X = df.drop(columns=[target])
y = df[target]

# Split into train and test sets BEFORE fitting any preprocessing step, so
# that no statistic computed from the held-out rows can reach the model.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling (optional for tree-based models, but kept here).
# The scaler learns its mean/variance from the training rows only and is then
# applied unchanged to the test rows; fitting it on the full matrix would leak
# test-set information into the training features.
# NOTE: `X` itself is left as the raw, unscaled DataFrame.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Create LGBM Datasets (native LightGBM containers). The test set references
# the training set so both share the same feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Candidate hyper-parameter values sampled by the randomized search.
# Tree shape: num_leaves / max_depth (-1 means no depth limit).
# Regularisation: min_gain_to_split / min_data_in_leaf.
# Sub-sampling: feature_fraction / bagging_fraction / bagging_freq.
param_grid = dict(
    num_leaves=[31, 50, 75],
    learning_rate=[0.01, 0.05, 0.1],
    feature_fraction=[0.8, 0.9, 1.0],
    bagging_fraction=[0.8, 0.9, 1.0],
    bagging_freq=[5, 10],
    max_depth=[5, 10, -1],
    min_gain_to_split=[0.0, 0.1, 0.5],
    min_data_in_leaf=[20, 30, 50],
)

# Base estimator: gradient-boosted decision trees optimising a regression
# objective, with a fixed seed for reproducibility.
lgbm = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    random_state=42,
)

# Randomized search: 50 sampled configurations, each scored by 5-fold
# cross-validated (negated) RMSE on the training split.
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)


# Get the best parameters and model.
# The search was created with the default refit=True, so `best_estimator_`
# has already been re-trained on the full training split with the best
# parameters; an additional explicit fit would only repeat that work.
best_params = random_search.best_params_
best_model = random_search.best_estimator_

print("Best Parameters:", best_params)

# Make predictions on the held-out test split and evaluate.
y_pred = best_model.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and absent from recent scikit-learn
# releases, whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"LightGBM Model RMSE: {rmse}")
print(f"LightGBM Model MAE: {mae}")
print(f"LightGBM Model R²: {r2}")

# Save the model.
# NOTE(review): the model was trained on StandardScaler-transformed features,
# but only the estimator is persisted here; the fitted `scaler` has to be
# saved and applied separately before calling predict() on new raw data.
model_file_path = 'lightgbm_tuned_model.pkl'
joblib.dump(best_model, model_file_path)
print(f"Model saved at {model_file_path}")