In [1]:
#Code to create a final submission
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
warnings.filterwarnings("ignore")

# Calculate Winkler score for prediction intervals
def winkler_score(y_true, y_lower, y_upper, alpha=0.2):
    """Return the mean Winkler interval score of a set of prediction intervals.

    Each interval is scored as its width plus a penalty of ``2 / alpha``
    times the distance by which the true value falls outside the interval
    (zero when the value is covered). Lower is better.

    Args:
        y_true: Observed values (array-like, e.g. list, ndarray or Series).
        y_lower: Lower interval bounds, same length as ``y_true``.
        y_upper: Upper interval bounds, same length as ``y_true``.
        alpha: Miscoverage level of the interval (0.2 for an 80% interval).

    Returns:
        The average score over all observations.
    """
    alpha = float(alpha)
    # Coerce to arrays so plain Python sequences work and the arithmetic
    # below is always positional and element-wise.
    y_true = np.asarray(y_true, dtype=float)
    y_lower = np.asarray(y_lower, dtype=float)
    y_upper = np.asarray(y_upper, dtype=float)

    miss_below = np.maximum(0, y_lower - y_true)  # truth under the lower bound
    miss_above = np.maximum(0, y_true - y_upper)  # truth over the upper bound
    interval_length = y_upper - y_lower
    scores = interval_length + (2 / alpha) * (miss_below + miss_above)
    return np.mean(scores)

def train_lightgbm(X_train, y_train, X_test, is_quantile=False, quantile_alpha=None):
    """Train a LightGBM booster and predict on ``X_test``.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_test: Features to predict on.
        is_quantile: When True, fit a quantile regression instead of a
            plain regression evaluated with MAE.
        quantile_alpha: Quantile level in (0, 1); required when
            ``is_quantile`` is True and ignored otherwise.

    Returns:
        A ``(model, predictions)`` tuple: the trained booster and its
        predictions for ``X_test``.

    Raises:
        ValueError: If ``is_quantile`` is True but ``quantile_alpha`` is
            missing or not strictly between 0 and 1.
    """
    # Fail loudly instead of handing LightGBM an empty 'alpha', which would
    # train a quantile model at a level the caller never asked for.
    if is_quantile and (quantile_alpha is None or not 0 < quantile_alpha < 1):
        raise ValueError(
            "quantile_alpha must be a number strictly between 0 and 1 when is_quantile=True"
        )

    params = {
        'objective': 'quantile' if is_quantile else 'regression',
        'metric': 'quantile' if is_quantile else 'mae',
        'boosting_type': 'gbdt',
        'learning_rate': 0.1,
        'num_leaves': 31,
        'max_depth': -1,
        'min_data_in_leaf': 20,
        'verbose': -1,
        'random_state': 42
    }
    if is_quantile:
        # Only quantile runs carry 'alpha'; plain regression gets no such key.
        params['alpha'] = quantile_alpha

    train_data = lgb.Dataset(X_train, label=y_train)
    model = lgb.train(
        params,
        train_data,
        num_boost_round=500,
    )
    return model, model.predict(X_test)

def train_random_forest(X_train, y_train, X_test):
    """Fit a random forest regressor and predict on ``X_test``.

    Returns:
        A ``(model, predictions)`` tuple: the fitted forest and its
        predictions for ``X_test``.
    """
    forest = RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        max_features='log2',
        min_samples_split=2,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=39,
    )
    forest.fit(X_train, y_train)
    test_predictions = forest.predict(X_test)
    return forest, test_predictions

def train_xgboost(X_train, y_train, X_test, is_quantile=False, quantile_alpha=None):
    """Fit an XGBoost regressor and predict on ``X_test``.

    With ``is_quantile`` set, the objective is the quantile error at level
    ``quantile_alpha``; otherwise it is squared error.

    Returns:
        A ``(model, predictions)`` tuple: the fitted regressor and its
        predictions for ``X_test``.
    """
    if is_quantile:
        objective = 'reg:quantileerror'
    else:
        objective = 'reg:squarederror'

    settings = {
        'objective': objective,
        'eval_metric': 'rmse',
        'random_state': 42,
    }
    if is_quantile:
        settings['quantile_alpha'] = quantile_alpha

    # Tree and regularisation hyper-parameters shared by both objectives.
    settings.update(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
    )

    regressor = xgb.XGBRegressor(**settings)
    # No eval_set is passed: the final test set has no target to score on.
    regressor.fit(X_train, y_train, verbose=False)
    test_predictions = regressor.predict(X_test)
    return regressor, test_predictions

def train_catboost(X_train, y_train, X_test, cat_features, is_quantile=False, quantile_alpha=None):
    """Fit a CatBoost regressor and predict on ``X_test``.

    ``cat_features`` is handed both to the regressor and to the training
    pool. With ``is_quantile`` set, the loss is the quantile loss at level
    ``quantile_alpha``; otherwise it is MAE.

    Returns:
        A ``(model, predictions)`` tuple: the fitted regressor and its
        predictions for ``X_test``.
    """
    if is_quantile:
        loss_function = f'Quantile:alpha={quantile_alpha}'
        eval_metric = 'Quantile'
    else:
        loss_function = 'MAE'
        eval_metric = 'MAE'

    regressor = CatBoostRegressor(
        loss_function=loss_function,
        eval_metric=eval_metric,
        iterations=500,
        learning_rate=0.1,
        random_seed=42,
        verbose=0,
        cat_features=cat_features,
    )
    # No eval_set is passed to fit; training uses the pool alone.
    regressor.fit(Pool(data=X_train, label=y_train, cat_features=cat_features))
    test_predictions = regressor.predict(X_test)
    return regressor, test_predictions


def build_base_models(X_train_scaled, X_train_unscaled, y_train, X_test_scaled, X_test_unscaled, alpha, cat_features):
    """Train every base learner and collect its test-set predictions.

    LightGBM, XGBoost and CatBoost are each trained three times: once for
    the point prediction and once per interval bound, at quantile levels
    ``alpha / 2`` and ``1 - alpha / 2``. Random Forest is trained once for
    the point prediction only. CatBoost receives the unscaled frames; the
    other learners receive the scaled ones.

    Returns:
        A pair of tuples:
        ``(lightgbm_model, rf_model, xgboost_model, catboost_model)`` holding
        the point-prediction models, and
        ``(y_pred_lgb, y_lower_lgb, y_upper_lgb, y_pred_rf, y_pred_xgb,
        y_lower_xgb, y_upper_xgb, y_pred_cat, y_lower_cat, y_upper_cat)``.
    """
    lower_level = alpha / 2
    upper_level = 1 - alpha / 2

    # --- LightGBM ---
    print("Training LightGBM models...")
    lightgbm_model, y_pred_lgb = train_lightgbm(X_train_scaled, y_train, X_test_scaled)
    y_lower_lgb = train_lightgbm(
        X_train_scaled, y_train, X_test_scaled, is_quantile=True, quantile_alpha=lower_level
    )[1]
    y_upper_lgb = train_lightgbm(
        X_train_scaled, y_train, X_test_scaled, is_quantile=True, quantile_alpha=upper_level
    )[1]

    # --- Random Forest (point prediction only) ---
    print("Training Random Forest model...")
    rf_model, y_pred_rf = train_random_forest(X_train_scaled, y_train, X_test_scaled)

    # --- XGBoost ---
    print("Training XGBoost models...")
    xgboost_model, y_pred_xgb = train_xgboost(X_train_scaled, y_train, X_test_scaled)
    y_lower_xgb = train_xgboost(
        X_train_scaled, y_train, X_test_scaled, is_quantile=True, quantile_alpha=lower_level
    )[1]
    y_upper_xgb = train_xgboost(
        X_train_scaled, y_train, X_test_scaled, is_quantile=True, quantile_alpha=upper_level
    )[1]

    # --- CatBoost (unscaled features) ---
    print("Training CatBoost models...")
    catboost_model, y_pred_cat = train_catboost(X_train_unscaled, y_train, X_test_unscaled, cat_features)
    y_lower_cat = train_catboost(
        X_train_unscaled, y_train, X_test_unscaled, cat_features,
        is_quantile=True, quantile_alpha=lower_level
    )[1]
    y_upper_cat = train_catboost(
        X_train_unscaled, y_train, X_test_unscaled, cat_features,
        is_quantile=True, quantile_alpha=upper_level
    )[1]

    models = (lightgbm_model, rf_model, xgboost_model, catboost_model)
    predictions = (
        y_pred_lgb, y_lower_lgb, y_upper_lgb,
        y_pred_rf,
        y_pred_xgb, y_lower_xgb, y_upper_xgb,
        y_pred_cat, y_lower_cat, y_upper_cat,
    )
    return models, predictions


def _encode_with_fallback(encoder, values, unseen_code=-1):
    """Map ``values`` to the integer codes of a fitted LabelEncoder, giving unseen labels ``unseen_code``."""
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    return values.astype(str).map(code_by_label).fillna(unseen_code).astype(int).to_numpy()


def _clean_column_names(df):
    """Replace every non-alphanumeric character in the column names of ``df`` with an underscore (in place)."""
    df.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in df.columns]
    return df


def _weighted_average(weights, predictions):
    """Average the arrays in ``predictions`` (keyed by model name), normalised by the total weight of those models."""
    total_weight = sum(weights[name] for name in predictions)
    return sum(weights[name] * values for name, values in predictions.items()) / total_weight


def build_final_models(train_df, test_df, final_test_df, target='price', alpha=0.2):
    """Train the ensemble on train+test data and build the submission frame.

    ``train_df`` and ``test_df`` (both labelled) are stacked into a single
    training set. Categorical columns are label-encoded, missing values are
    filled with training medians and features are standardised; CatBoost
    receives the encoded but unfilled, unscaled frames. The four base
    learners are blended with equal weights for the point prediction, and
    the interval is the blended quantile width centred on that prediction.

    Args:
        train_df: Labelled data; must contain ``target``.
        test_df: Second labelled set, appended to ``train_df`` for training.
        final_test_df: Unlabelled rows to predict; must contain ``id``.
        target: Name of the target column.
        alpha: Miscoverage level; the interval bounds come from quantile
            models at ``alpha / 2`` and ``1 - alpha / 2``.

    Returns:
        DataFrame with integer-rounded columns ``ID``, ``LOWER``, ``UPPER``
        and ``PRED``, one row per row of ``final_test_df``.

    Raises:
        ValueError: If ``final_test_df`` lacks a feature column that the
            training data has.
    """
    print("Preparing data for final modeling...")
    non_feature_cols = ['id', target]
    combined_train_df = pd.concat(
        [
            train_df.drop(non_feature_cols, axis=1, errors='ignore'),
            test_df.drop(non_feature_cols, axis=1, errors='ignore'),
        ],
        axis=0,
    )
    combined_y_train = pd.concat([train_df[target], test_df[target]], axis=0)

    # Final test set: drop id, 'postcode' and the target (if present).
    X_final_test = final_test_df.drop(['id', 'postcode', target], axis=1, errors='ignore')
    id_final_test = final_test_df['id']

    # The scaler and the positional cat-feature indices both assume the
    # final test frame has the training columns in the same order.
    missing_cols = [c for c in combined_train_df.columns if c not in X_final_test.columns]
    if missing_cols:
        raise ValueError(f"final_test_df is missing feature columns: {missing_cols}")
    X_final_test = X_final_test[combined_train_df.columns]

    cat_columns = combined_train_df.select_dtypes(include=['object', 'category']).columns

    # Label encoding: fit on the training data only, then apply to the final
    # test set. Categories seen only in the final test set get code -1
    # instead of raising.
    print("Encoding categorical features...")
    for col in cat_columns:
        encoder = LabelEncoder()
        combined_train_df[col] = encoder.fit_transform(combined_train_df[col].astype(str))
        X_final_test[col] = _encode_with_fallback(encoder, X_final_test[col])

    # CatBoost gets the encoded frames before imputation and scaling.
    X_train_unscaled = combined_train_df.copy()
    X_test_unscaled = X_final_test.copy()
    cat_features_indices = [combined_train_df.columns.get_loc(col) for col in cat_columns]

    # Impute with training medians, computed once and reused for both frames.
    train_medians = combined_train_df.median()
    combined_train_df = combined_train_df.fillna(train_medians)
    X_final_test = X_final_test.fillna(train_medians)

    # Standardise: fit on the training data, apply to both frames.
    print("Scaling features...")
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(combined_train_df), columns=combined_train_df.columns
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_final_test), columns=X_final_test.columns
    )

    # Clean column names for compatibility
    X_train_scaled = _clean_column_names(X_train_scaled)
    X_test_scaled = _clean_column_names(X_test_scaled)
    X_train_unscaled = _clean_column_names(X_train_unscaled)
    X_test_unscaled = _clean_column_names(X_test_unscaled)

    # Equal weights for all models
    best_weights = {'lgb': 0.25, 'rf': 0.25, 'xgb': 0.25, 'cat': 0.25}
    best_k = 1e-05  # decay rate of the interval-width shrink factor below

    # Only the predictions are needed here; the fitted models are discarded.
    _, base_preds = build_base_models(
        X_train_scaled, X_train_unscaled, combined_y_train,
        X_test_scaled, X_test_unscaled,
        alpha, cat_features_indices,
    )
    (y_pred_lgb, y_lower_lgb, y_upper_lgb,
     y_pred_rf,
     y_pred_xgb, y_lower_xgb, y_upper_xgb,
     y_pred_cat, y_lower_cat, y_upper_cat) = base_preds

    y_pred_combined = _weighted_average(
        best_weights,
        {'lgb': y_pred_lgb, 'rf': y_pred_rf, 'xgb': y_pred_xgb, 'cat': y_pred_cat},
    )

    # Weighted mean absolute deviation of each model from the blend.
    combined_diff = _weighted_average(
        best_weights,
        {
            'rf': np.abs(y_pred_combined - y_pred_rf),
            'lgb': np.abs(y_pred_combined - y_pred_lgb),
            'cat': np.abs(y_pred_combined - y_pred_cat),
            'xgb': np.abs(y_pred_combined - y_pred_xgb),
        },
    )

    # Random Forest has no quantile predictions, so the bounds are blended
    # from the three quantile-capable models with their weights renormalised,
    # rather than applying the RF weight to LightGBM's quantiles.
    y_lower_combined = _weighted_average(
        best_weights, {'lgb': y_lower_lgb, 'xgb': y_lower_xgb, 'cat': y_lower_cat}
    )
    y_upper_combined = _weighted_average(
        best_weights, {'lgb': y_upper_lgb, 'xgb': y_upper_xgb, 'cat': y_upper_cat}
    )

    # Shrink factor exp(-k * relative disagreement); the epsilon guards
    # against division by zero when the mean prediction is zero.
    avg_prediction = np.mean(np.abs(y_pred_combined))
    scaled_diff = combined_diff / (avg_prediction + 1e-8)
    scaling_factor = np.exp(-best_k * scaled_diff)

    # Centre the (scaled) blended quantile width on the point prediction.
    initial_interval_width = y_upper_combined - y_lower_combined
    adjusted_interval_width = initial_interval_width * scaling_factor

    y_pred_final = y_pred_combined
    # Bounds are floored at 0.
    y_lower_final = np.maximum(0, y_pred_final - adjusted_interval_width / 2)
    y_upper_final = np.maximum(0, y_pred_final + adjusted_interval_width / 2)

    # Ensure the point prediction lies within the bounds.
    y_lower_final = np.minimum(y_lower_final, y_pred_final)
    y_upper_final = np.maximum(y_upper_final, y_pred_final)

    # Round and convert to integers for the submission.
    submission_df = pd.DataFrame({
        'ID': id_final_test,
        'LOWER': y_lower_final.round().astype(int),
        'UPPER': y_upper_final.round().astype(int),
        'PRED': y_pred_final.round().astype(int)
    })

    return submission_df



# Main execution
if __name__ == "__main__":
    print("Starting price prediction model...")
    # All three CSVs are read the same way: the first column becomes the index.
    train_df, test_df, final_test_df = (
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (
            "../adv_analytics_business/data/train_data_0322.csv",
            "../adv_analytics_business/data/test_data_0322.csv",
            "../adv_analytics_business/data/orig_test_data_0322.csv",
        )
    )
    alpha_value = 0.2

    # Train the ensemble and get the submission DataFrame.
    submission_df = build_final_models(train_df, test_df, final_test_df, alpha=alpha_value)

    # The submission is written without a header row or index column.
    submission_df.to_csv("submission2.csv", header=False, index=False, quoting=0)

    print("Submission file created successfully!")

Starting price prediction model...
Preparing data for final modeling...
Encoding categorical features...
Scaling features...
Training LightGBM models...
Training Random Forest model...
Training XGBoost models...
Training CatBoost models...
Submission file created successfully!
