# Import the packages

In [None]:
%%time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Read the parquet file and split it into features/labels and train/test datasets

In [None]:
%%time

path2file = './all_oa.parquet'

# Columns the model learns from, and the columns it has to predict.
# (An earlier variant of this cell also used 'house_price_index' as a feature and took
# every column other than the three index columns, 'geo_code' and 'geometry' as labels.)
feature_columns = ['air_quality_index', 'jobs_accessibility_index']
label_columns = [
    'population_estimate',
    'A, B, D, E. Agriculture, energy and water',
    'C. Manufacturing',
    'F. Construction',
    'G, I. Distribution, hotels and restaurants',
]

# Load the table and shuffle its rows in one go. The fixed seed keeps the row order
# (and therefore the split below) reproducible from run to run.
all_oa_df = pd.read_parquet(path2file).sample(frac=1, random_state=42)

X = all_oa_df[feature_columns].to_numpy(dtype=np.float32)   # features
Y = all_oa_df[label_columns].to_numpy(dtype=np.float32)     # labels

# Hold out 20% of the rows for the final evaluation; seeded as well.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Quick sanity check of the array dimensions.
print(X.shape, Y.shape, sep='\n')

# Perform a grid search using k-fold cross-validation to find the best hyper-parameters

In [None]:
# Search space handed to the grid search: three candidate values per parameter,
# i.e. 27 combinations in total.
# NOTE(review): earlier notes in this cell listed the defaults as max_depth=3,
# n_estimators=100 and learning_rate=0.1. Those look like scikit-learn's gradient
# boosting defaults rather than XGBoost's -- confirm against the XGBoost docs for
# the installed version before relying on them.
param_grid = dict(
    max_depth=[2, 3, 4],
    n_estimators=[80, 100, 120],
    learning_rate=[0.05, 0.1, 0.2],
)


def find_best_hyperparams(X_train, Y_train, param_grid):
    """Grid-search XGBoost regression hyper-parameters with 5-fold cross-validation.

    Every combination in ``param_grid`` is evaluated on five unshuffled folds of
    the training data and scored with negative root-mean-squared error.

    The grid keys are ``XGBRegressor`` constructor arguments, for example:
      n_estimators   number of gradient boosted trees (equivalent to the number
                     of boosting rounds)
      learning_rate  boosting learning rate (also known as "eta")
      max_depth      maximum depth of a tree; increasing it makes the model more
                     complex and more likely to overfit

    Returns whatever ``GridSearchCV(...).fit(X_train, Y_train)`` returns, i.e. the
    fitted search object (exposing ``best_params_``).
    """
    # XGBoost gets all cores (n_jobs=-1) while the grid search itself runs the
    # candidates one after another (n_jobs=1 below).
    base_model = xgb.XGBRegressor(eval_metric='rmse', n_jobs=-1, verbosity=1)

    # k-fold split iterator; rows are not shuffled again here.
    folds = KFold(n_splits=5, shuffle=False)

    # Exhaustively try every combination of the grid values.
    search = GridSearchCV(
        base_model,
        param_grid,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        verbose=3,
        n_jobs=1,
    )
    return search.fit(X_train, Y_train)

# Run the search on the training split only; the test split stays untouched until
# the final evaluation.
gscv_search = find_best_hyperparams(X_train, Y_train, param_grid)
print("The best hyperparameters are ", gscv_search.best_params_)

# A commented-out manual fold loop used to follow here. It could not run as written:
# it referenced `kf`, which only exists inside find_best_hyperparams, and it called
# `.iloc` on X_train / Y_train, which are NumPy arrays. The grid search above already
# performs the k-fold splitting.

# Re-train using the best hyper-parameters and evaluate

In [None]:
def train_with_best_hyperparams(X_train, Y_train, gscv_search):
    """Fit a fresh XGBoost regressor using the winning grid-search parameters.

    Parameters
    ----------
    X_train : feature matrix, passed unchanged to ``XGBRegressor.fit``.
    Y_train : targets, passed unchanged to ``XGBRegressor.fit``.
    gscv_search : object exposing ``best_params_`` (a dict of ``XGBRegressor``
        constructor keyword arguments), e.g. the fitted search returned by
        ``find_best_hyperparams``.

    Returns
    -------
    The fitted ``xgb.XGBRegressor``.
    """
    # Fixed settings first, tuned values second, so that every parameter the search
    # actually tuned is applied. Indexing learning_rate / n_estimators / max_depth by
    # hand raised KeyError as soon as one of them was dropped from the grid and
    # silently ignored any parameter that was added to it.
    model_kwargs = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'n_jobs': -1,
        'verbosity': 1,
        **gscv_search.best_params_,
    }
    model = xgb.XGBRegressor(**model_kwargs)

    # fit model to training data using the best parameters of the grid search
    model.fit(X_train, Y_train, verbose=True)

    return model

# Re-fit on the full training split with the winning hyper-parameters.
model = train_with_best_hyperparams(X_train, Y_train, gscv_search)

# Score the fitted model on the data it was trained on and on the held-out test
# split; a large gap between the two numbers points to overfitting.
# (For scikit-learn style regressors `score` is documented as R^2.)
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)

print("Training score: ", train_score)
print("Test score: ", test_score)

In [None]:
# Spot check: actual vs. predicted labels for the first row of the test split ...
Y_pred = model.predict(X_test)
print(Y_test[0], Y_pred[0], sep="\n")


# ... and for the first row of the training split.
# Note: Y_pred is overwritten here, so after this cell it holds the predictions
# for the training data, not for the test data.
Y_pred = model.predict(X_train)
print(Y_train[0], Y_pred[0], sep="\n")

In [None]:
# save the model to JSON
# `fitted_model_path` was not defined anywhere in this notebook, so this cell failed
# with NameError on a fresh kernel (Restart & Run All). Define the destination here;
# the .json extension matches the JSON format named above.
fitted_model_path = './fitted_xgb_model.json'
model.save_model(fitted_model_path)