In [1]:
import pandas as pd

In [2]:
no_missing_merged_loc = pd.read_csv('clean_for_training.csv')
no_missing_merged_loc.drop(columns='Unnamed: 0', inplace=True)

In [3]:
from sklearn.preprocessing import StandardScaler

# Updated list of columns to normalize
columns_to_normalize = [
    "rooms_en_imputed", "project_count", "landmark_count", "metro_count", 
    "mall_count", "Al Makhtoum International Airport", "Burj Al Arab", 
    "Burj Khalifa", "City Centre Mirdif", "Downtown Dubai", 
    "Dubai International Airport", "Dubai Mall", "Dubai Parks and Resorts", 
    "Expo 2020 Site", "Global Village", "Hamdan Sports Complex", 
    "IMG World Adventures", "Ibn-e-Battuta Mall", "Jabel Ali", 
    "Mall of the Emirates", "Marina Mall", "Motor City", "center", 
    "east", "north", "south", "west", 
    "transaction_datetime_month", "transaction_datetime_day", 
    "transaction_datetime_weekday", "transaction_datetime_dayofyear", 
    "req_from_month", "req_from_weekday", "req_from_dayofyear", 
    "req_to_month", "req_to_day", "req_to_weekday", "req_to_dayofyear",
    "parking_count"
]

# Initialize the scaler
scaler = StandardScaler()

# Apply normalization to the specified columns
no_missing_merged_loc[columns_to_normalize] = scaler.fit_transform(no_missing_merged_loc[columns_to_normalize])

import pickle
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Confirm the transformation
print(no_missing_merged_loc[columns_to_normalize].head())


   rooms_en_imputed  project_count  landmark_count  metro_count  mall_count  \
0         -0.815554       2.563710        2.536212     2.525836    2.525847   
1         -0.815554       2.563710        2.536212     2.525836    2.525847   
2          0.029711      -0.477067       -0.258172    -0.716061   -0.675612   
3          0.029711      -0.234877       -0.599818    -0.546503   -0.546408   
4         -0.815554      -0.473619       -0.772892    -0.716061   -0.715961   

   Al Makhtoum International Airport  Burj Al Arab  Burj Khalifa  \
0                          -0.709603     -0.601963      0.293371   
1                          -0.709603     -0.601963      0.293371   
2                           0.638727      0.719958     -0.133155   
3                           1.591177      0.221209     -0.811187   
4                           0.130422     -0.891759     -0.651622   

   City Centre Mirdif  Downtown Dubai  ...  transaction_datetime_weekday  \
0            0.516927        0.298581  .

# Normalization for SVM

In [4]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
import pickle

# Define features (X) and target (y)
X = no_missing_merged_loc.drop(columns=['amount'])  # Replace 'amount' with your target column if different
y = no_missing_merged_loc['amount']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training data for meta-learner (optional)
X_train_base, X_val_meta, y_train_base, y_val_meta = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Initialize QuantileTransformers
qt_amount = QuantileTransformer(output_distribution='normal', random_state=42)
qt_size = QuantileTransformer(output_distribution='normal', random_state=42)

# Fit transformers on training data
# Fit qt_size on transaction and property sizes
qt_size.fit(X_train[['transaction_size_sqm', 'property_size_sqm']])

# Fit qt_amount on the entire `y_train` dataset
qt_amount.fit(y_train.values.reshape(-1, 1))

# Transform sizes in the training, validation, and test sets
X_train_base[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_train_base[['transaction_size_sqm', 'property_size_sqm']]
)
X_val_meta[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_val_meta[['transaction_size_sqm', 'property_size_sqm']]
)
X_test[['transaction_size_sqm', 'property_size_sqm']] = qt_size.transform(
    X_test[['transaction_size_sqm', 'property_size_sqm']]
)

# Transform target variable in the training, validation, and test sets
y_train_base = qt_amount.transform(y_train_base.values.reshape(-1, 1)).flatten()
y_val_meta = qt_amount.transform(y_val_meta.values.reshape(-1, 1)).flatten()
y_test = qt_amount.transform(y_test.values.reshape(-1, 1)).flatten()

# Save the transformers for later use
with open("qt_amount.pkl", "wb") as f:
    pickle.dump(qt_amount, f)
with open("qt_size.pkl", "wb") as f:
    pickle.dump(qt_size, f)

# Print dataset sizes
print(f"Training set (Base models): {X_train_base.shape}")
print(f"Validation set (Meta-learner): {X_val_meta.shape}")
print(f"Test set: {X_test.shape}")


Training set (Base models): (97683, 51)
Validation set (Meta-learner): (32561, 51)
Test set: (32562, 51)


In [5]:
import pandas as pd

# Set pandas to display all columns
pd.set_option('display.max_columns', None)

# Now this will display all columns
X_train_base.tail()


Unnamed: 0,transaction_type_id,property_usage_id,total_buyer,total_seller,transaction_size_sqm,property_size_sqm,transaction_datetime_month,transaction_datetime_day,transaction_datetime_weekday,transaction_datetime_dayofyear,req_from_month,req_from_weekday,req_from_dayofyear,req_to_month,req_to_day,req_to_weekday,req_to_dayofyear,parking_count,is_freehold_encoded,rooms_en_imputed,project_count,landmark_count,metro_count,mall_count,Al Makhtoum International Airport,Burj Al Arab,Burj Khalifa,City Centre Mirdif,Downtown Dubai,Dubai International Airport,Dubai Mall,Dubai Parks and Resorts,Expo 2020 Site,Global Village,Hamdan Sports Complex,IMG World Adventures,Ibn-e-Battuta Mall,Jabel Ali,Mall of the Emirates,Marina Mall,Motor City,center,east,north,south,west,registration_type_encoded,buyer_to_seller_ratio,property_subtype_encoded,is_offplan_encoded,property_type_encoded
141028,1,1,1,1,-1.998418,-2.046415,1.485958,-0.66253,-0.684045,1.423417,1.48275,1.63005,1.491162,1.48275,-2.623532,-0.881384,1.440953,0.196287,1,-0.815554,-0.061922,-0.772892,-0.716061,-0.715961,-2.379401,2.070961,2.372154,2.30445,2.38434,2.27718,2.379134,-1.78462,-1.116896,2.001184,1.683181,2.240561,-0.00406,-0.34245,2.021091,0.632233,1.02344,2.272201,2.177977,2.280982,-1.745919,0.506024,0,0.5,7,1,2
147111,1,1,1,1,-0.948891,-0.957204,1.485958,-0.776135,-1.34024,1.410373,1.48275,1.63005,1.491162,1.48275,-2.623532,-0.881384,1.440953,0.196287,1,-0.815554,0.183717,-0.772892,-0.716061,-0.715961,-0.176721,1.344997,0.884703,0.154829,0.857985,0.483559,0.855892,0.429725,0.173657,-0.651662,-1.11421,-0.598279,0.800265,0.570741,1.177847,0.970485,0.168146,0.810873,-0.0493,0.961861,-1.03369,1.503273,0,0.5,7,1,2
156563,1,1,2,1,0.488986,0.479238,1.485958,1.041544,-0.027851,1.619089,1.48275,1.63005,1.491162,1.48275,-2.623532,-0.881384,1.440953,0.196287,1,0.029711,-0.4646,-0.335369,-0.287424,-0.287336,-0.426161,-0.592261,-0.019224,0.103009,-0.025579,0.156502,-0.014334,-0.339818,-0.532766,-1.077817,-0.980744,-0.878365,-0.473429,-0.516,-0.824248,-0.512126,-1.260595,0.159585,0.189699,0.210867,-0.529093,-0.26757,1,1.0,7,0,2
49804,1,1,2,1,-1.461085,-1.480611,-0.493826,-1.11695,1.940733,-0.62461,-0.500388,-1.245971,-0.501204,-0.500388,0.038198,-1.458353,-0.505166,0.196287,1,-0.815554,-0.395365,-0.222612,-0.176958,-0.176873,-0.954634,-0.293838,0.585286,0.781317,0.592782,0.76217,0.599768,-0.989892,-1.184763,-0.111627,-0.162643,0.121208,-1.072066,-1.208157,-0.445043,-0.902662,-0.921672,0.692449,0.821679,0.672325,-0.828372,-0.665997,0,1.0,7,1,2
114560,1,1,2,1,0.677079,0.66778,0.694044,0.246309,0.628344,0.719,0.689495,-1.245971,0.683798,0.689495,0.570544,-0.881384,0.704583,0.196287,1,-0.815554,0.087955,0.471032,0.502599,0.502666,-0.47858,-0.815651,0.483075,0.92982,0.50988,0.767642,0.514084,-1.033903,-0.705715,0.668876,0.703984,0.812803,-1.414572,-1.073809,-0.724198,-1.661992,-0.154123,0.534332,0.953947,0.335106,-0.033336,-1.651403,1,1.0,0,0,2


# Optimize SVM

## Notes on SVR Optimization with Optuna
Since the last time I optimized the SVM with optuna, the baysian optimization got slow and then got stuck, 
here are improvements to the optimization process. 
### Improvements Made to the Optimization Code
1. **Added Cross-Validation for Robustness:**
   - Instead of evaluating the model on the training set, cross-validation is used to compute a more robust performance metric.
   - This prevents overfitting to a single split and ensures generalizability.

2. **Implemented Early Stopping for Optuna:**
   - Optuna's pruning feature stops trials early if their intermediate results suggest they won't outperform the current best trial.
   - This saves computation time and resources.

3. **Used Mean Squared Error (MSE) as the Metric:**
   - The `neg_mean_squared_error` scoring metric is used for optimization, which is a standard metric for regression tasks.
   - Using a proper metric makes the optimization process more meaningful and interpretable.

4. **Ensured Data Scaling:**
   - SVR is sensitive to the magnitude of input features. Ensure that data is normalized or standardized before running the optimization.

5. **Set a Consistent Random State:**
   - Setting a `random_state` for both cross-validation and Optuna ensures reproducible results.

6. **Logged Optuna Progress:**
   - Optuna logging is enabled to track optimization progress and debug potential issues.


In [12]:
import optuna
from cuml.svm import SVR as cuSVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import cupy as cp
import numpy as np

# Define GPU-accelerated SVR optimization
def optimize_cuml_svr(trial, X_train, y_train):
    params = {
        "C": trial.suggest_float("C", 0.1, 100, log=True),
        "epsilon": trial.suggest_float("epsilon", 0.01, 1, log=True),
        "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly"]),
    }

    # Convert data to GPU arrays
    X_train = cp.array(X_train)  # Ensure it's a GPU array
    y_train = cp.array(y_train)

    # Perform 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, test_idx in kf.split(X_train):
        # Use indices for slicing NumPy arrays
        X_train_fold, X_test_fold = X_train[train_idx], X_train[test_idx]
        y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]

        # Fit model and predict
        model = cuSVR(**params)
        model.fit(X_train_fold, y_train_fold)
        predictions = model.predict(X_test_fold)

        # Evaluate predictions
        mse = mean_squared_error(cp.asnumpy(y_test_fold), cp.asnumpy(predictions))
        scores.append(mse)

    # Return mean of scores (lower is better for MSE)
    return np.mean(scores)

# Define optimization function
def optimize_model(optimize_func, X_train, y_train, n_trials=20):
    def objective(trial):
        return optimize_func(trial, X_train, y_train)

    study = optuna.create_study(direction="minimize")  # Minimize mean squared error
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

# Ensure X_train_base and y_train_base are numpy arrays
X_train_base_gpu = X_train_base.to_numpy()  # Convert DataFrame to NumPy array if needed

# Ensure y_train_base is already a NumPy array
y_train_base_gpu = y_train_base  # No need for .to_numpy()

# Optimize cuML SVR
best_params_cuml_svr = optimize_model(optimize_cuml_svr, X_train_base_gpu, y_train_base_gpu)
print("Best parameters for GPU-Accelerated SVR:", best_params_cuml_svr)




[I 2024-11-26 23:47:25,167] A new study created in memory with name: no-name-a82fa8c0-7104-438a-a3b2-7c59cbfca083
[I 2024-11-26 23:47:39,940] Trial 0 finished with value: 0.16064710953889677 and parameters: {'C': 0.6300825669524476, 'epsilon': 0.2889038768080176, 'kernel': 'rbf'}. Best is trial 0 with value: 0.16064710953889677.
[I 2024-11-26 23:48:11,010] Trial 1 finished with value: 0.16661860002815596 and parameters: {'C': 0.3799277051109876, 'epsilon': 0.037498413286564114, 'kernel': 'rbf'}. Best is trial 0 with value: 0.16064710953889677.
[I 2024-11-27 00:15:39,958] Trial 2 finished with value: 0.28977575839898917 and parameters: {'C': 7.212376563046342, 'epsilon': 0.01127573441588465, 'kernel': 'poly'}. Best is trial 0 with value: 0.16064710953889677.
[I 2024-11-27 00:22:19,628] Trial 3 finished with value: 0.19554829244939526 and parameters: {'C': 1.1866895206970782, 'epsilon': 0.011760138174227797, 'kernel': 'poly'}. Best is trial 0 with value: 0.16064710953889677.


[W] [00:22:19.648466] SVR with the linear kernel can be much faster using the specialized solver provided by LinearSVR. Consider switching to LinearSVR if tranining takes too long.


[I 2024-11-27 00:36:29,874] Trial 4 finished with value: 0.23706936920408556 and parameters: {'C': 1.045908000857348, 'epsilon': 0.011791262098011982, 'kernel': 'linear'}. Best is trial 0 with value: 0.16064710953889677.
[I 2024-11-27 12:36:29,962] Trial 5 finished with value: 0.2357549548815515 and parameters: {'C': 88.07519577045318, 'epsilon': 0.09533434261811294, 'kernel': 'linear'}. Best is trial 0 with value: 0.16064710953889677.
[I 2024-11-27 12:53:48,817] Trial 6 finished with value: 0.23655217352051552 and parameters: {'C': 1.4102713891871723, 'epsilon': 0.06014479540568654, 'kernel': 'linear'}. Best is trial 0 with value: 0.16064710953889677.
[I 2024-11-27 12:54:38,822] Trial 7 finished with value: 0.21884526127983217 and parameters: {'C': 0.10160396186804392, 'epsilon': 0.16067904543131878, 'kernel': 'poly'}. Best is trial 0 with value: 0.16064710953889677.
[I 2024-11-27 13:25:39,892] Trial 8 finished with value: 0.38449410000805706 and parameters: {'C': 11.951830101220315, 

Best parameters for GPU-Accelerated SVR: {'C': 84.76283883453274, 'epsilon': 0.13537940431984122, 'kernel': 'rbf'}


In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 130244 entries, 18063 to 121958
Data columns (total 51 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   transaction_type_id                130244 non-null  int64  
 1   property_usage_id                  130244 non-null  int64  
 2   total_buyer                        130244 non-null  int64  
 3   total_seller                       130244 non-null  int64  
 4   transaction_size_sqm               130244 non-null  float64
 5   property_size_sqm                  130244 non-null  float64
 6   transaction_datetime_month         130244 non-null  float64
 7   transaction_datetime_day           130244 non-null  float64
 8   transaction_datetime_weekday       130244 non-null  float64
 9   transaction_datetime_dayofyear     130244 non-null  float64
 10  req_from_month                     130244 non-null  float64
 11  req_from_weekday                   13024

In [22]:
X_train[X_train.columns.to_list()[2]]

18063     1
158113    0
137647    1
22752     3
94155     1
         ..
119879    0
103694    2
131932    1
146867    1
121958    1
Name: total_buyer, Length: 130244, dtype: int64

In [None]:
# Updated list of columns to normalize
columns_to_normalize = [
    "rooms_en_imputed", "project_count", "landmark_count", "metro_count", 
    "mall_count", "Al Makhtoum International Airport", "Burj Al Arab", 
    "Burj Khalifa", "City Centre Mirdif", "Downtown Dubai", 
    "Dubai International Airport", "Dubai Mall", "Dubai Parks and Resorts", 
    "Expo 2020 Site", "Global Village", "Hamdan Sports Complex", 
    "IMG World Adventures", "Ibn-e-Battuta Mall", "Jabel Ali", 
    "Mall of the Emirates", "Marina Mall", "Motor City", "center", 
    "east", "north", "south", "west", 
    "transaction_datetime_month", "transaction_datetime_day", 
    "transaction_datetime_weekday", "transaction_datetime_dayofyear", 
    "req_from_month", "req_from_weekday", "req_from_dayofyear", 
    "req_to_month", "req_to_day", "req_to_weekday", "req_to_dayofyear",
    "parking_count", "transaction_type_id" ,"property_usage_id" ,"total_buyer", "total_seller",
    "transaction_size_sqm", "property_size_sqm", "transaction_type_id", "property_usage_id"
]

# Initialize the scaler
scaler = StandardScaler()

# Apply normalization to the specified columns
no_missing_merged_loc[columns_to_normalize] = scaler.fit_transform(no_missing_merged_loc[columns_to_normalize])

import pickle
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Confirm the transformation
print(no_missing_merged_loc[columns_to_normalize].head())