In [1]:
import pandas as pd

In [2]:
no_missing_merged_loc = pd.read_csv('clean_for_training.csv')
no_missing_merged_loc.drop(columns='Unnamed: 0', inplace=True)
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
import pickle

# Define features (X) and target (y)
X = no_missing_merged_loc.drop(columns=['amount'])  # Replace 'amount' with your target column if different
y = no_missing_merged_loc['amount']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training data for meta-learner (optional)
X_train_base, X_val_meta, y_train_base, y_val_meta = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Initialize QuantileTransformers
qt_amount = QuantileTransformer(output_distribution='normal', random_state=42)
qt_size = QuantileTransformer(output_distribution='normal', random_state=42)

# Fit transformers on training data
# Fit qt_size on transaction and property sizes
qt_size.fit(X_train[['transaction_size_sqm', 'property_size_sqm']])

# Fit qt_amount on the entire `y_train` dataset
qt_amount.fit(y_train.values.reshape(-1, 1))

# Transform sizes in the training, validation, and test sets
X_train_base[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_train_base[['transaction_size_sqm', 'property_size_sqm']]
)
X_val_meta[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_val_meta[['transaction_size_sqm', 'property_size_sqm']]
)
X_test[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_test[['transaction_size_sqm', 'property_size_sqm']]
)

# Transform target variable in the training, validation, and test sets
y_train_base = qt_amount.transform(y_train_base.values.reshape(-1, 1)).flatten()
y_val_meta = qt_amount.transform(y_val_meta.values.reshape(-1, 1)).flatten()
y_test = qt_amount.transform(y_test.values.reshape(-1, 1)).flatten()

# Save the transformers for later use
with open("qt_amount.pkl", "wb") as f:
    pickle.dump(qt_amount, f)
with open("qt_size.pkl", "wb") as f:
    pickle.dump(qt_size, f)

# Print dataset sizes
print(f"Training set (Base models): {X_train_base.shape}")
print(f"Validation set (Meta-learner): {X_val_meta.shape}")
print(f"Test set: {X_test.shape}")


Training set (Base models): (97683, 51)
Validation set (Meta-learner): (32561, 51)
Test set: (32562, 51)


In [4]:
xgb_params =  {'n_estimators': 361, 'max_depth': 10, 
             'learning_rate': 0.24829236737227453, 'subsample': 0.875390416499723}
rf_params = {'n_estimators': 111, 'max_depth': 20, 'min_samples_split': 11}

In [6]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Define models with the best parameters
xgb_model = XGBRegressor(
    n_estimators=xgb_params['n_estimators'],
    max_depth=xgb_params['max_depth'],
    learning_rate=xgb_params['learning_rate'],
    subsample=xgb_params['subsample'],
    random_state=42
)

rf_model = RandomForestRegressor(
    n_estimators=rf_params['n_estimators'],
    max_depth=rf_params['max_depth'],
    min_samples_split=rf_params['min_samples_split'],
    random_state=42
)

# Train XGBoost model
print("Training XGBoost model...")
xgb_model.fit(X_train_base, y_train_base)
print("XGBoost training completed.")

# Train Random Forest model
print("Training Random Forest model...")
rf_model.fit(X_train_base, y_train_base)
print("Random Forest training completed.")

# Evaluate models on the validation set
xgb_val_score = xgb_model.score(X_val_meta, y_val_meta)
rf_val_score = rf_model.score(X_val_meta, y_val_meta)

print(f"XGBoost Validation Score: {xgb_val_score}")
print(f"Random Forest Validation Score: {rf_val_score}")


Training XGBoost model...
XGBoost training completed.
Training Random Forest model...
Random Forest training completed.
XGBoost Validation Score: 0.9182581323288751
Random Forest Validation Score: 0.918175433734719


# Check if normalizing could improve the performance of optimized rf and xgb

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split


no_missing_merged_loc = pd.read_csv('clean_for_training.csv')
no_missing_merged_loc.drop(columns='Unnamed: 0', inplace=True)
# Updated list of columns to normalize
columns_to_normalize = [
    "rooms_en_imputed", "project_count", "landmark_count", "metro_count", 
    "mall_count", "Al Makhtoum International Airport", "Burj Al Arab", 
    "Burj Khalifa", "City Centre Mirdif", "Downtown Dubai", 
    "Dubai International Airport", "Dubai Mall", "Dubai Parks and Resorts", 
    "Expo 2020 Site", "Global Village", "Hamdan Sports Complex", 
    "IMG World Adventures", "Ibn-e-Battuta Mall", "Jabel Ali", 
    "Mall of the Emirates", "Marina Mall", "Motor City", "center", 
    "east", "north", "south", "west", 
    "transaction_datetime_month", "transaction_datetime_day", 
    "transaction_datetime_weekday", "transaction_datetime_dayofyear", 
    "req_from_month", "req_from_weekday", "req_from_dayofyear", 
    "req_to_month", "req_to_day", "req_to_weekday", "req_to_dayofyear",
    "parking_count"
]

# Initialize the scaler
scaler = StandardScaler()

# Apply normalization to the specified columns
no_missing_merged_loc[columns_to_normalize] = scaler.fit_transform(no_missing_merged_loc[columns_to_normalize])
# Define features (X) and target (y)
X = no_missing_merged_loc.drop(columns=['amount'])  # Replace 'amount' with your target column if different
y = no_missing_merged_loc['amount']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training data for meta-learner (optional)
X_train_base, X_val_meta, y_train_base, y_val_meta = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Initialize QuantileTransformers
qt_amount = QuantileTransformer(output_distribution='normal', random_state=42)
qt_size = QuantileTransformer(output_distribution='normal', random_state=42)

# Fit transformers on training data
# Fit qt_size on transaction and property sizes
qt_size.fit(X_train[['transaction_size_sqm', 'property_size_sqm']])

# Fit qt_amount on the entire `y_train` dataset
qt_amount.fit(y_train.values.reshape(-1, 1))

# Transform sizes in the training, validation, and test sets
X_train_base[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_train_base[['transaction_size_sqm', 'property_size_sqm']]
)
X_val_meta[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_val_meta[['transaction_size_sqm', 'property_size_sqm']]
)
X_test[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_test[['transaction_size_sqm', 'property_size_sqm']]
)

# Transform target variable in the training, validation, and test sets
y_train_base = qt_amount.transform(y_train_base.values.reshape(-1, 1)).flatten()
y_val_meta = qt_amount.transform(y_val_meta.values.reshape(-1, 1)).flatten()
y_test = qt_amount.transform(y_test.values.reshape(-1, 1)).flatten()


# Print dataset sizes
print(f"Training set (Base models): {X_train_base.shape}")
print(f"Validation set (Meta-learner): {X_val_meta.shape}")
print(f"Test set: {X_test.shape}")

Training set (Base models): (97683, 51)
Validation set (Meta-learner): (32561, 51)
Test set: (32562, 51)


In [11]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
import numpy as np
import cupy 

def optimize_xgboost(trial, X_train, y_train):
    # Define the hyperparameters to tune
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "tree_method": "hist",  # Use "hist" for GPU
        "device": "cuda"  # Specify CUDA device for GPU training
    }
    
    model = XGBRegressor(**param)
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring="r2")
    return np.mean(scores)

def optimize_model(optimize_func, X_train, y_train, n_trials=20):
    # Convert CuPy arrays to NumPy
    X_train = X_train.get()
    y_train = y_train.get()
    
    def objective(trial):
        return optimize_func(trial, X_train, y_train)
    
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

# Usage
X_train = cupy.array(X_train_base)
y_train = cupy.array(y_train_base)

best_params_xgb = optimize_model(optimize_xgboost, X_train, y_train)
print("Best parameters for XGBoost:", best_params_xgb)


[I 2024-11-26 02:19:06,639] A new study created in memory with name: no-name-8fb4eef5-1134-4d88-b499-63ea5f895d98
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2024-11-26 02:19:07,927] Trial 0 finished with value: 0.8148735696333368 and parameters: {'n_estimators': 104, 'max_depth': 6, 'learning_rate': 0.016306712871257366, 'subsample': 0.9500015214084825}. Best is trial 0 with value: 0.8148735696333368.
[I 2024-11-26 02:19:10,008] Trial 1 finished with value: 0.8661193599197827 and parameters: {'n_estimators': 463, 'max_depth': 4, 'learning_rate': 0.019468165498085197, 'subsample': 0.796731400630611}. Best is trial 1 with value: 0.8661193599197827.
[I 2024-11-26 02:19:11,521] Trial 2 finished with value: 0.8896094540727048 and parameters: {'n_estimators': 423, 'max_depth': 3, 'learning_rate': 0.21295949721066856, 'subsample': 0.9097634240412253}. Best is trial 2 with value: 

Best parameters for XGBoost: {'n_estimators': 197, 'max_depth': 12, 'learning_rate': 0.035707771528736075, 'subsample': 0.9090060296096989}


In [14]:
# Define Random Forest optimization with cross-validation
def optimize_random_forest(trial, X_train, y_train):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "n_jobs": -1  # Enable parallel processing
    }
    model = RandomForestRegressor(**params, random_state=42)

    # 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring="r2", n_jobs=-1)
    return np.mean(scores)  # Return the average R2 score

print("Optimizing Random Forest...")
best_params_rf = optimize_model(optimize_random_forest, X_train_base, y_train_base)
print("Best parameters for Random Forest:", best_params_rf)

Optimizing Random Forest...


TypeError: NDFrame.get() missing 1 required positional argument: 'key'