# In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Load and preprocess the dataset
df = pd.read_csv('austinHousingData.csv')

# Convert 'latest_saledate' to datetime and extract numerical date parts.
# The raw datetime column is dropped right away; the derived parts
# ('sale_year', 'sale_month', 'sale_day') are themselves removed by the
# drop lists below.
df['latest_saledate'] = pd.to_datetime(df['latest_saledate'])
df['sale_year'] = df['latest_saledate'].dt.year
df['sale_month'] = df['latest_saledate'].dt.month
df['sale_day'] = df['latest_saledate'].dt.day
df.drop(columns=['latest_saledate'], inplace=True)

# Drop irrelevant columns (0.00 importance)
irrelevant_columns = ['streetAddress', 'description', 'latestPriceSource', 'homeImage', 'hasGarage',
                      'sale_year', 'sale_month', 'city', 'homeType', 'zpid']
df.drop(columns=irrelevant_columns, inplace=True)

# Drop columns a user wouldn't likely be able to access
user_unavailable_columns = ['numPriceChanges', 'avgSchoolSize', 'avgSchoolRating', 'avgSchoolDistance',
                            'numOfPhotos', 'latest_saleyear', 'latest_salemonth', 'sale_day',
                            'MedianStudentsPerTeacher', 'numOfElementarySchools', 'numOfHighSchools',
                            'numOfMiddleSchools', 'numOfPrimarySchools']
df.drop(columns=user_unavailable_columns, inplace=True)

# Drop low-importance columns
low_importance_columns = ['numOfAccessibilityFeatures', 'numOfCommunityFeatures', 'hasCooling',
                          'hasHeating', 'numOfWindowFeatures', 'numOfSecurityFeatures', 'hasView',
                          'parkingSpaces', 'propertyTaxRate', 'hasSpa', 'numOfWaterfrontFeatures']
df.drop(columns=low_importance_columns, inplace=True)

# Add feature interaction columns.
# NOTE: a price-per-square-foot feature is deliberately NOT built here. It
# would be computed from the prediction target ('latestPrice'), and because
# 'livingAreaSqFt' is also kept as a feature the model could recover the
# price exactly (PricePerSqFt * livingAreaSqFt). That is target leakage: it
# inflates every reported metric and the feature cannot be computed for a
# home whose price is not yet known.
df['BathBedRatio'] = df['numOfBathrooms'] / df['numOfBedrooms']
df['LotLivingRatio'] = df['lotSizeSqFt'] / df['livingAreaSqFt']
df['GarageBedRatio'] = df['garageSpaces'] / df['numOfBedrooms']
df['LatLonInteraction'] = df['latitude'] * df['longitude']

# Handle potential division by zero: a zero denominator yields +/-inf (or
# NaN for 0/0). Infinities are turned into NaN first, then every NaN left in
# the frame (not only those produced by the ratios) is replaced with 0.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

# Separate the regression target from the predictor columns
target = 'latestPrice'
y = df[target]
X = df.drop(columns=target)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap both splits as LightGBM datasets; the test set is tied to the
# training set through `reference`
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

# Settings shared by every configuration: RMSE-scored GBDT regression with
# LightGBM's own logging silenced
base_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    verbosity=-1,
)

# Base settings plus the hyperparameters found by the Optuna search
optimized_params = dict(
    base_params,
    learning_rate=0.058793153916846676,
    num_leaves=63,
    max_depth=16,
    min_data_in_leaf=11,
    feature_fraction=0.8957209852243492,
    bagging_fraction=0.8868935737344777,
    bagging_freq=7,
    lambda_l1=0.5825558284248297,
    lambda_l2=0.005388605992909206,
)

# Hand-adjusted variant of the Optuna result: only the learning rate and the
# two regularisation terms are overridden, everything else is inherited
adjusted_params = dict(
    optimized_params,
    learning_rate=0.02,
    lambda_l1=0.5,
    lambda_l2=1.0,
)

# Modular functions for training and evaluation
def train_lightgbm(params, train_data, test_data, num_boost_round=1000, early_stopping_rounds=50):
    """Train a LightGBM model with early stopping and periodic metric logging.

    Args:
        params: Parameter dict handed unchanged to ``lgb.train``.
        train_data: Dataset to fit on. It is also listed as a validation set,
            so its metric is evaluated alongside ``test_data``.
        test_data: Second dataset evaluated during training.
        num_boost_round: Maximum number of boosting rounds.
        early_stopping_rounds: Value passed as ``stopping_rounds`` to the
            ``lgb.early_stopping`` callback.

    Returns:
        The object returned by ``lgb.train``.
    """
    stop_early = lgb.early_stopping(stopping_rounds=early_stopping_rounds)
    log_progress = lgb.log_evaluation(100)
    return lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[train_data, test_data],
        callbacks=[stop_early, log_progress],
    )

def evaluate_lightgbm(model, X_test, y_test):
    """Score a trained model on a labelled dataset.

    Predictions are made with the model's ``best_iteration``.

    Args:
        model: Trained model exposing ``predict`` and ``best_iteration``.
        X_test: Feature matrix to predict on.
        y_test: True target values aligned with ``X_test``.

    Returns:
        Tuple ``(rmse, mae, r2)``.
    """
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    # Take the square root of the MSE explicitly rather than passing
    # ``squared=False``: that keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, while this form gives the same value on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return rmse, mae, r2

# Hold-out run 1: Optuna parameters with the helper's default round budget
# and early-stopping patience.
# NOTE(review): the same test split is used both as the early-stopping set
# and for the reported metrics, so these scores may be optimistic - confirm
# whether a separate validation split is wanted.
print("Training with optimized params...")
model = train_lightgbm(
    params=optimized_params,
    train_data=train_data,
    test_data=test_data,
)
rmse, mae, r2 = evaluate_lightgbm(model=model, X_test=X_test, y_test=y_test)
print(f"Optimized RMSE: {rmse}, MAE: {mae}, R^2: {r2}")

# Hold-out run 2: adjusted parameters; the lower learning rate is paired
# with a larger round budget and longer early-stopping patience
print("Training with adjusted params...")
model = train_lightgbm(
    params=adjusted_params,
    train_data=train_data,
    test_data=test_data,
    num_boost_round=2000,
    early_stopping_rounds=100,
)
rmse, mae, r2 = evaluate_lightgbm(model=model, X_test=X_test, y_test=y_test)
print(f"Adjusted RMSE: {rmse}, MAE: {mae}, R^2: {r2}")

# Cross-validation to check stability: 5 shuffled folds with a fixed seed,
# each trained with the adjusted parameters.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []
for train_idx, val_idx in kf.split(X):
    # Fold-local names on purpose: reusing X_train / y_train / train_data /
    # model / rmse / mae / r2 here would silently replace the hold-out split
    # and the adjusted model's results with the last fold's objects for any
    # code that runs after this loop.
    X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

    fold_train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
    fold_val_data = lgb.Dataset(X_fold_val, label=y_fold_val, reference=fold_train_data)

    fold_model = train_lightgbm(
        adjusted_params,
        fold_train_data,
        fold_val_data,
        num_boost_round=2000,
        early_stopping_rounds=100,
    )
    # Each entry is one fold's (rmse, mae, r2) tuple
    cv_results.append(evaluate_lightgbm(fold_model, X_fold_val, y_fold_val))

# Aggregate cross-validation results: transpose the per-fold tuples into
# per-metric sequences and reduce each one to (mean, std)
cv_rmse, cv_mae, cv_r2 = (
    (np.mean(scores), np.std(scores)) for scores in zip(*cv_results)
)
print(f"Cross-Validation Results - RMSE: {cv_rmse[0]} ± {cv_rmse[1]}, MAE: {cv_mae[0]} ± {cv_mae[1]}, R^2: {cv_r2[0]} ± {cv_r2[1]}")
