In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from carbonplan_trace.v1.glas_allometric_eq import REALM_GROUPINGS
from carbonplan_trace.v1 import load
import carbonplan_trace.v1.model as m
import pandas as pd
from carbonplan_trace.v1.landsat_preprocess import access_credentials
import numpy as np

In [3]:
# Credentials returned by access_credentials(); they are passed to load.training()
# in the training loop further down.
access_key_id, secret_access_key = access_credentials()

### P0
* try random split [done, doesn't help val score]
* save the concatenated file [done]
* try out afrotropics with random split to replicate the hansen landsat modeling work [done, we can achieve good performance with random splits]
* check for infs and nulls [done]

* do HPO [in progress]
    * learning rate 
    * max depth 
    * n_estimators 

* confirm findings with different realm and different val year [done with default params]
* try normalizing all features [no benefit as expected]
* normalize all bands with NIR_V [no benefit :(]
* try first split [done, using first year as validation yields a much better validation r2 score. let's use the average of first/last as our validation score]

### P1
* do cross validation based on different years
* performance stratified by biomass value

### P2 
* examine the correlation between our continuous features

In [4]:
# Every realm key defined in REALM_GROUPINGS.
realms = list(REALM_GROUPINGS.keys())
# NOTE: this second assignment replaces the full list above, so only these
# four realms are processed by the cells that iterate over `realms`.
realms = ['afrotropic', 'australia', 'nearctic', 'neotropic']
# realms = ['afrotropic', 'nearctic']

In [5]:
# HPO 
import itertools

def product_dict(**kwargs):
    """Yield one dict per combination of the supplied option lists.

    Each keyword maps a name to an iterable of candidate values. Every element
    of the Cartesian product of those iterables is yielded as a dict keyed by
    the keyword names; the right-most keyword varies fastest.
    """
    names = list(kwargs)
    for combo in itertools.product(*kwargs.values()):
        yield dict(zip(names, combo))
        
# Candidate values for every tunable hyperparameter (three options each).
param_set = {
    'learning_rate': [0.07, 0.05, 0.03],
    'max_depth': [10, 12, 14],
    'colsample_bytree': [0.5, 0.7, 0.9],
    'subsample': [0.5, 0.7, 0.9],
    'min_child_weight': [2, 4, 6],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.5, 1],
    'gamma': [0, 0.5, 1],
}

# Parameters inside one group move in lockstep: option i of the group picks
# the i-th value of each member. Only the groups are crossed with each other.
groupings = [
    ['learning_rate'],
    ['max_depth'],
    ['colsample_bytree', 'subsample', 'min_child_weight'],
    ['lambda', 'alpha', 'gamma']
]

# One index axis per group, sized by the group's first parameter.
dims = [list(range(len(param_set[group[0]]))) for group in groupings]

# One parameter dict per combination of group indices.
param_set_list = [
    {
        name: param_set[name][idx]
        for idx, group in zip(choice, groupings)
        for name in group
    }
    for choice in itertools.product(*dims)
]

In [6]:
def get_all_prediction_result(model, df_train, df_test, df_val):
    """Stack model predictions for the train, test and validation frames.

    Parameters
    ----------
    model : object
        Anything exposing ``_predict(df)`` that returns one prediction per row
        of ``df``.
    df_train, df_test, df_val : pandas.DataFrame
        Frames that contain at least the ``lat``, ``lon`` and ``year`` columns.
        They are NOT modified: earlier versions wrote a ``biomass_pred`` column
        into the caller's frames as a side effect.

    Returns
    -------
    pandas.DataFrame
        Rows of the three inputs concatenated in train, test, val order with
        columns ``lat``, ``lon``, ``year`` and ``biomass`` (the prediction).
    """
    key_cols = ['lat', 'lon', 'year']
    # .assign works on the column subset (a new frame), so the predictions
    # never leak back into the frames owned by the caller.
    frames = [
        part[key_cols].assign(biomass=model._predict(part))
        for part in (df_train, df_test, df_val)
    ]
    return pd.concat(frames)


def calculate_temporal_variability(df, y1=2007, y2=2008, precision=3):
    """Compare biomass at matching locations between two years.

    Rows of ``df`` for year ``y1`` and year ``y2`` are paired when their
    ``lat`` and ``lon`` agree after rounding to ``precision`` decimals.

    Returns
    -------
    dict
        ``'mae'``: mean absolute biomass difference (y2 - y1) over the pairs;
        ``'me'``: mean signed biomass difference (y2 - y1) over the pairs.
    """

    def _rounded_year_slice(year):
        # Rows of one year plus the rounded coordinates used as the join key.
        subset = df.loc[df.year == year, ['lat', 'lon', 'biomass']]
        return subset.assign(
            lat_round=subset.lat.round(precision),
            lon_round=subset.lon.round(precision),
        )

    paired = _rounded_year_slice(y1).merge(
        _rounded_year_slice(y2),
        on=['lat_round', 'lon_round'],
        suffixes=['_year1', '_year2'],
    )
    delta = paired.biomass_year2 - paired.biomass_year1

    return {'mae': delta.abs().mean(), 'me': delta.mean()}

In [None]:
# Train and evaluate one model per (realm, validation strategy, split mode) and
# collect one score dict per evaluated data split (train / test / val).
scores = []
# temporal_variability = []
for realm in realms:
    print(f'Building model for {realm} realm')
    
    # load data, add year information
    df = load.training(realm=realm, reload=False, access_key_id=access_key_id, secret_access_key=secret_access_key)
    # NOTE(review): DataFrame.size is the number of elements (rows x columns),
    # not bytes, so the "Gb" label is only a rough proxy for memory use.
    print(f'    size of entire df is {round(df.size / 1e9, 2)}Gb')
    
#     r = calculate_temporal_variability(df)
#     r['realm'] = realm
#     r['model_name'] = 'ground_truth'
#     r['random_split'] = None
#     temporal_variability.append(r)
    
    # strategy selects which year is held out for validation ('first' or 'last');
    # random_split toggles random vs. non-random train/test partitioning.
    for strategy in ['first', 'last']: 
        for random_split in [True, False]:
            # split into train/test based on year 
            df_train, df_test, df_val = m.train_test_split_based_on_year(
                df, val_strategy=strategy, random_train_test=random_split)
            print(f'    training sample size = {len(df_train)}')
            print(f'    testing sample size = {len(df_test)}')
            print(f'    eval sample size = {len(df_val)}')

            # build 2 models: 1) baseline/mean, 2) xgboost 
            # TODO: build linear model as another baseline model 
            # m.baseline_model, m.gradient_boost_model, m.random_forest_model
            # NOTE: only m.xgb_model is in the list below; the other model
            # classes named in the comments above are not run by this cell.
            for model_class in [m.xgb_model]:
                # A single empty dict: no hyperparameter overrides are passed.
                for params in [{}]:

                    # NOTE(review): the recorded output shows names such as
                    # 'xgb_afrotropic_first', which do not encode random_split.
                    # With overwrite=True the two random_split runs presumably
                    # write to the same location — confirm.
                    model = model_class(
                        realm=realm, 
                        df_train=df_train,
                        df_test=df_test,
                        output_folder='s3://carbonplan-climatetrace/v1/models/',
                        overwrite=True,
                        validation_year=strategy,
                        params=params
                    )

                    # Evaluate on every data split and tag the metrics with the
                    # run configuration so rows can be filtered later.
                    for split, sub in zip(('train', 'test', 'val'), (df_train, df_test, df_val)):
                        model_score = model.evaluate(sub)
                        model_score['model_name'] = model.name
                        model_score['split'] = split
                        model_score['realm'] = realm
                        model_score['validation_year'] = strategy
                        model_score['random_split'] = random_split
                        model_score['sample_size'] = len(sub)
                        model_score.update(params)
                        scores.append(model_score)
                        
#                     preds = get_all_prediction_result(model, df_train, df_test, df_val)
#                     r = calculate_temporal_variability(preds)
#                     r['realm'] = realm
#                     r['model_name'] = model.name
#                     r['random_split'] = random_split
#                     temporal_variability.append(r)

# Turn the list of score dicts into a table; `scores` changes type here
# (list -> DataFrame), so re-running later cells requires this cell to have finished.
scores = pd.DataFrame(scores)
# scores.to_csv(f'{realm}_{strategy}.csv')

Building model for afrotropic realm
    size of entire df is 0.6Gb
    training sample size = 8399642
    testing sample size = 2099911
    eval sample size = 2271212
{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'tree_method': 'hist', 'n_estimators': 999, 'random_state': 42, 'learning_rate': 0.05, 'max_depth': 10, 'colsample_bytree': 0.8, 'subsample': 0.8, 'min_child_weight': 6}
    Building xgb_afrotropic_first model
[0]	validation_0-rmse:138.18591
[1]	validation_0-rmse:132.82529
[2]	validation_0-rmse:127.79534
[3]	validation_0-rmse:123.13554
[4]	validation_0-rmse:118.75377
[5]	validation_0-rmse:114.58966
[6]	validation_0-rmse:110.94740
[7]	validation_0-rmse:107.28252
[8]	validation_0-rmse:103.86309
[9]	validation_0-rmse:100.71201
[10]	validation_0-rmse:97.73372
[11]	validation_0-rmse:94.95342
[12]	validation_0-rmse:92.35983
[13]	validation_0-rmse:89.96066
[14]	validation_0-rmse:87.73549
[15]	validation_0-rmse:85.66430
[16]	validation_0-rmse:83.75243
[17]	validation_0-rmse

In [None]:
# check weighted average result 
# 0.9, 0.9, 3

# Sample-size-weighted mean r2 for each data split, reported separately for
# the runs with and without a random train/test split.
for random_split in [True, False]:
    print(random_split)
    matches_mode = scores.random_split == random_split
    for split_name, prefix in (
        ('val', 'validation score = '),
        ('test', 'testing score    = '),
        ('train', 'training score   = '),
    ):
        sub = scores.loc[(scores.split == split_name) & matches_mode]
        weighted_r2 = (sub.r2 * sub.sample_size).sum() / sub.sample_size.sum()
        print(f'{prefix}{weighted_r2}')

In [82]:
# Validation-split rows from the runs where random_split is False.
scores.loc[(scores.split == 'val') & (scores.random_split == False)]

Unnamed: 0,bias,mae,r2,model_name,split,realm,validation_year,random_split,sample_size,model_type
11,16.221206,37.730452,0.605638,xgb_afrotropic_last,val,afrotropic,last,False,1633454,xgb
14,16.280223,37.829253,0.603236,gb_afrotropic_last,val,afrotropic,last,False,1633454,gb
17,15.688924,39.327787,0.58492,rf_afrotropic_last,val,afrotropic,last,False,1633454,rf
29,1.024533,5.058862,0.793991,xgb_australia_last,val,australia,last,False,300193,xgb
32,1.440663,5.130379,0.785938,gb_australia_last,val,australia,last,False,300193,gb
35,1.562326,5.140293,0.790218,rf_australia_last,val,australia,last,False,300193,rf
47,0.654617,17.734039,0.601638,xgb_nearctic_last,val,nearctic,last,False,676925,xgb
50,0.692911,17.714689,0.603899,gb_nearctic_last,val,nearctic,last,False,676925,gb
53,0.873349,18.677573,0.581715,rf_nearctic_last,val,nearctic,last,False,676925,rf
65,21.428531,48.232903,0.579549,xgb_neotropic_last,val,neotropic,last,False,1581558,xgb


In [24]:
# weighted average performance 

# Model family is the prefix of the model name (e.g. 'xgb' from 'xgb_afrotropic_last').
scores['model_type'] = scores.model_name.apply(lambda x: x.split('_')[0])
val_rows = scores.loc[(scores.split == 'val') & (scores.random_split == False)]
# The loop variable is `model_type` rather than `model`, so it no longer
# overwrites the trained `model` object left in the kernel by the training cell.
for model_type, g in val_rows.groupby('model_type'):
    # r2 weighted by the number of samples behind each score row.
    print(model_type.ljust(3),  (g.r2 * g.sample_size).sum() / g.sample_size.sum())

gb  0.6070251900006757
rf  0.590250604711402
xgb 0.6086370353027087


In [80]:
# weighted average performance 

# Model family is the prefix of the model name (e.g. 'xgb' from 'xgb_afrotropic_last').
scores['model_type'] = scores.model_name.apply(lambda x: x.split('_')[0])
val_rows = scores.loc[(scores.split == 'val') & (scores.random_split == True)]
# The loop variable is `model_type` rather than `model`, so it no longer
# overwrites the trained `model` object left in the kernel by the training cell.
for model_type, g in val_rows.groupby('model_type'):
    # r2 weighted by the number of samples behind each score row.
    print(model_type.ljust(3),  (g.r2 * g.sample_size).sum() / g.sample_size.sum())

gb  0.6062574251978661
rf  0.5897037319571558
xgb 0.6073554210681192


In [79]:
# Persist the current scores table (index omitted) to the working directory.
scores.to_csv('default_rf_gb_scores.csv', index=False)

In [25]:
# simple average performance

# Unweighted mean validation r2 per model type for the random_split == False runs:
# every score row counts equally regardless of its sample_size.
# Requires the `model_type` column added by the weighted-average cells.
scores.loc[(scores.split == 'val') & (scores.random_split == False)].groupby('model_type').r2.mean()

model_type
gb     0.642848
rf     0.629577
xgb    0.645204
Name: r2, dtype: float64

In [118]:
# NOTE(review): `temporal_variability` is not built as a DataFrame anywhere in the
# visible notebook (the code that collected it is commented out in the training
# cell), so this cell relies on state from an earlier kernel session.
temporal_variability.to_csv('temporal_variability.csv', index=False)

In [112]:
# Reload the temporal-variability results previously written to CSV.
temporal_variability = pd.read_csv('temporal_variability.csv')

In [113]:
# Derive realm and model family from the model name, assuming names of the form
# '<type>_<realm>_...' (e.g. 'xgb_afrotropic_last' -> 'xgb', 'afrotropic').
# NOTE(review): a name such as 'ground_truth' (used in the commented-out code of
# the training cell) would yield realm='truth' and model_type='ground' — confirm
# the model_name values stored in the CSV.
temporal_variability['realm'] = temporal_variability.model_name.apply(lambda x: x.split('_')[1])
temporal_variability['model_type'] = temporal_variability.model_name.apply(lambda x: x.split('_')[0])

# Per-realm sample_size total over the xgb rows of the random_split == True runs
# (all data splits summed); used as the weight in the weighted-average cell.
sample_size = scores.loc[
    (scores.random_split == True) & (scores.model_name.str.startswith('xgb'))
].groupby('realm').sample_size.sum()

In [120]:
# weighted average 
# Keep every row whose random_split is not True (False and missing values both
# pass this filter), then attach the per-realm sample size used as the weight.
merged = temporal_variability.loc[temporal_variability.random_split != True].merge(sample_size, how='left', on='realm')
# Display names for the model_type codes; an unknown code raises KeyError.
name_dict = {
    'gb': 'gradient boosting',
    'ground': 'lidar derived',
    'rf': 'random forest',
    'xgb': 'xgboost'
}
merged['model_type'] = merged.model_type.apply(lambda x: name_dict[x])

print('Biomass MAE between years 2007 and 2008 of the same location using different model architecture')
print('')
# The loop variable is `model_label` rather than `model`, so it no longer
# overwrites the trained `model` object left in the kernel by the training cell.
for model_label, g in merged.groupby('model_type'):
    # MAE weighted by sample size, rounded to 4 decimals.
    print(model_label.ljust(20),  np.round((g.mae * g.sample_size).sum() / g.sample_size.sum(), 4))

Biomass MAE between years 2007 and 2008 of the same location using different model architecture

gradient boosting    17.7812
lidar derived        33.0125
random forest        15.9057
xgboost              17.9184


In [55]:
# simple average 
# Unweighted mean MAE per model type over rows whose random_split is not True.
# The merged sample_size column is not used by this plain mean.
temporal_variability.loc[temporal_variability.random_split != True].merge(sample_size, how='left', on='realm').groupby('model_type').mae.mean()

model_type
gb        14.136758
ground    25.892038
rf        12.603001
xgb       14.137349
Name: mae, dtype: float64

In [None]:
# Presumably a completion marker: prints once the cells above have finished.
print('done')

In [7]:

# The commented values below presumably record the hyperparameters of the run
# whose result is shown — confirm.
#             'learning_rate': 0.05,
#             'max_depth': 10,
#             'colsample_bytree': 0.7,
#             'subsample': 0.7,
#             'min_child_weight': 4,
# Unweighted mean r2 over all validation rows currently in `scores`.
scores.loc[scores.split == 'val'].r2.mean()

0.623497651124284

In [16]:
# Presumably a completion marker: prints once the cells above have finished.
print('done')

done


In [20]:
# Read the per-(realm, strategy) score CSVs and stack them into one frame.
# `df` starts as a list and is rebound to the concatenated DataFrame at the end;
# this also replaces any training-data `df` left by the training cell.
df = []
for realm in realms:
    for strategy in ['first', 'last']:
        df.append(pd.read_csv(f'{realm}_{strategy}.csv'))
        
# NOTE(review): `realm` and `strategy` stay bound to their last values after the
# loop; a later cell passes `realm` to the model constructor and may depend on it.
df = pd.concat(df)

In [81]:
# Replace the in-memory `scores` with the results previously saved to HPO_1.csv.
scores = pd.read_csv('HPO_1.csv')

In [26]:
# Mean test-split metrics per hyperparameter combination, sorted by r2 ascending
# (best combination last). numeric_only=True restricts the aggregation to numeric
# columns: pandas >= 2.0 raises a TypeError when mean() meets string columns such
# as `split`, which older versions dropped silently.
df.loc[df.split == 'test'].groupby(['learning_rate', 'max_depth', 'colsample_bytree', 'lambda']).mean(numeric_only=True).sort_values(by='r2')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 0,bias,mae,r2,sample_size,subsample,min_child_weight,alpha,gamma
learning_rate,max_depth,colsample_bytree,lambda,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0.1,8,0.5,1.5,4.0,2.308906,26.39783,0.68116,1584083.625,0.5,2.0,0.5,0.5
0.1,10,0.5,2.0,34.0,2.349971,26.181475,0.681359,1584083.625,0.5,2.0,1.0,1.0
0.1,10,0.5,1.0,28.0,2.270306,26.203374,0.681397,1584083.625,0.5,2.0,0.0,0.0
0.1,8,0.5,1.0,1.0,2.353026,26.385209,0.681407,1584083.625,0.5,2.0,0.0,0.0
0.1,8,0.5,2.0,7.0,2.322086,26.377354,0.68143,1584083.625,0.5,2.0,1.0,1.0
0.1,10,0.5,1.5,31.0,2.306159,26.179458,0.681647,1584083.625,0.5,2.0,0.5,0.5
0.05,8,0.5,1.0,55.0,2.205334,26.242036,0.683585,1584083.625,0.5,2.0,0.0,0.0
0.05,8,0.5,1.5,58.0,2.23375,26.236306,0.68365,1584083.625,0.5,2.0,0.5,0.5
0.1,8,0.7,1.0,10.0,2.221195,26.02409,0.683702,1584083.625,0.7,4.0,0.0,0.0
0.1,8,0.7,1.5,13.0,2.148057,26.027765,0.683956,1584083.625,0.7,4.0,0.5,0.5


In [21]:
# Hold out the last year for validation, without a random train/test split,
# then report the size of each partition.
df_train, df_test, df_val = m.train_test_split_based_on_year(
    df,
    val_strategy='last',
    random_train_test=False,
)
for prefix, part in (
    ('    training sample size = ', df_train),
    ('    testing sample size = ', df_test),
    ('    eval sample size = ', df_val),
):
    print(f'{prefix}{len(part)}')

    training sample size = 1876208
    testing sample size = 359029
    eval sample size = 324938


In [39]:
# Small hyperparameter sweep for the xgboost model on the current split.
# Relies on `realm`, `df_train`, `df_test` and `df_val` from earlier cells.
scores = []
model_class = m.xgb_model
# NOTE(review): the loop header was missing from this cell, which made the
# indented body a syntax error. The grid below is reconstructed from the
# parameter dicts echoed in the recorded output (learning_rate x max_depth x
# n_estimators, right-most varying fastest) — confirm against the intended sweep.
for params in product_dict(
    learning_rate=[0.1, 0.05, 0.02],
    max_depth=[10, 14],
    n_estimators=[1000, 2000, 3000],
):
    model = model_class(
        realm=realm, 
        df_train=df_train,
        df_test=df_test,
        output_folder='s3://carbonplan-climatetrace/v1/models/',
        params=params,
        overwrite=True
    )

    # The loop variable is `sub` (as in the main training cell) rather than `df`,
    # so the data frame bound to `df` is not clobbered by the evaluation loop.
    for split, sub in zip(('train', 'test', 'val'), (df_train, df_test, df_val)):
        model_score = model.evaluate(sub)
        model_score['model_name'] = model.name
        model_score['split'] = split
        # Record the swept hyperparameters alongside the metrics.
        model_score.update(params)
        scores.append(model_score)

{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 1000, 'random_state': 42, 'learning_rate': 0.1, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:92.50160
[1]	validation_0-rmse:87.49003
[2]	validation_0-rmse:83.16765
[3]	validation_0-rmse:79.42314
[4]	validation_0-rmse:76.22594
[5]	validation_0-rmse:73.46867
[6]	validation_0-rmse:71.06100
[7]	validation_0-rmse:69.00629
[8]	validation_0-rmse:67.22349
[9]	validation_0-rmse:65.71007
[10]	validation_0-rmse:64.45616
[11]	validation_0-rmse:63.38517
[12]	validation_0-rmse:62.46336
[13]	validation_0-rmse:61.69775
[14]	validation_0-rmse:61.06208
[15]	validation_0-rmse:60.48203
[16]	validation_0-rmse:59.99840
[17]	validation_0-rmse:59.63692
[18]	validation_0-rmse:59.27531
[19]	validation_0-rmse:58.96634
[20]	validation_0-rmse:58.67696
[21]	validation_0-rmse:58.46461
[22]	validation_0-rmse:58.26601
[23]	validation_0-rmse:58.09242
[24]	validation_0-rmse:57.89921
[25]	validation_0-rmse:57.83263
[26]	val



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 2000, 'random_state': 42, 'learning_rate': 0.1, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:92.50160
[1]	validation_0-rmse:87.49004
[2]	validation_0-rmse:83.16765
[3]	validation_0-rmse:79.42314
[4]	validation_0-rmse:76.22594
[5]	validation_0-rmse:73.46867
[6]	validation_0-rmse:71.06100
[7]	validation_0-rmse:69.00628
[8]	validation_0-rmse:67.22348
[9]	validation_0-rmse:65.71008
[10]	validation_0-rmse:64.45616
[11]	validation_0-rmse:63.38517
[12]	validation_0-rmse:62.46336
[13]	validation_0-rmse:61.69776
[14]	validation_0-rmse:61.06208
[15]	validation_0-rmse:60.48203
[16]	validation_0-rmse:59.99840
[17]	validation_0-rmse:59.63692
[18]	validation_0-rmse:59.27531
[19]	validation_0-rmse:58.96634
[20]	validation_0-rmse:58.67696
[21]	validation_0-rmse:58.46461
[22]	validation_0-rmse:58.26601
[23]	validation_0-rmse:58.09242
[24]	validation_0-rmse:57.89922
[25]	validation_0-rmse:57.83263
[26]	val



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 3000, 'random_state': 42, 'learning_rate': 0.1, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:92.50160
[1]	validation_0-rmse:87.49004
[2]	validation_0-rmse:83.16765
[3]	validation_0-rmse:79.42314
[4]	validation_0-rmse:76.22594
[5]	validation_0-rmse:73.46866
[6]	validation_0-rmse:71.06100
[7]	validation_0-rmse:69.00628
[8]	validation_0-rmse:67.22348
[9]	validation_0-rmse:65.71007
[10]	validation_0-rmse:64.45616
[11]	validation_0-rmse:63.38517
[12]	validation_0-rmse:62.46336
[13]	validation_0-rmse:61.69776
[14]	validation_0-rmse:61.06208
[15]	validation_0-rmse:60.48203
[16]	validation_0-rmse:59.99840
[17]	validation_0-rmse:59.63691
[18]	validation_0-rmse:59.27531
[19]	validation_0-rmse:58.96634
[20]	validation_0-rmse:58.67696
[21]	validation_0-rmse:58.46461
[22]	validation_0-rmse:58.26601
[23]	validation_0-rmse:58.09242
[24]	validation_0-rmse:57.89922
[25]	validation_0-rmse:57.83263
[26]	val



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 1000, 'random_state': 42, 'learning_rate': 0.1, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:92.33988
[1]	validation_0-rmse:87.20858
[2]	validation_0-rmse:82.74522
[3]	validation_0-rmse:78.86431
[4]	validation_0-rmse:75.56595
[5]	validation_0-rmse:72.72360
[6]	validation_0-rmse:70.32182
[7]	validation_0-rmse:68.28014
[8]	validation_0-rmse:66.44085
[9]	validation_0-rmse:64.96318
[10]	validation_0-rmse:63.64576
[11]	validation_0-rmse:62.60608
[12]	validation_0-rmse:61.69844
[13]	validation_0-rmse:60.99026
[14]	validation_0-rmse:60.29642
[15]	validation_0-rmse:59.73000
[16]	validation_0-rmse:59.23189
[17]	validation_0-rmse:58.86429
[18]	validation_0-rmse:58.57434
[19]	validation_0-rmse:58.28964
[20]	validation_0-rmse:58.07332
[21]	validation_0-rmse:57.83380
[22]	validation_0-rmse:57.68293
[23]	validation_0-rmse:57.48673
[24]	validation_0-rmse:57.36017
[25]	validation_0-rmse:57.25346
[26]	val



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 2000, 'random_state': 42, 'learning_rate': 0.1, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:92.33988
[1]	validation_0-rmse:87.20857
[2]	validation_0-rmse:82.74521
[3]	validation_0-rmse:78.86431
[4]	validation_0-rmse:75.56595
[5]	validation_0-rmse:72.72359
[6]	validation_0-rmse:70.32182
[7]	validation_0-rmse:68.28014
[8]	validation_0-rmse:66.44085
[9]	validation_0-rmse:64.96318
[10]	validation_0-rmse:63.64576
[11]	validation_0-rmse:62.60608
[12]	validation_0-rmse:61.69844
[13]	validation_0-rmse:60.99027
[14]	validation_0-rmse:60.29642
[15]	validation_0-rmse:59.73001
[16]	validation_0-rmse:59.23189
[17]	validation_0-rmse:58.86429
[18]	validation_0-rmse:58.57434
[19]	validation_0-rmse:58.28964
[20]	validation_0-rmse:58.07332
[21]	validation_0-rmse:57.83380
[22]	validation_0-rmse:57.68293
[23]	validation_0-rmse:57.48674
[24]	validation_0-rmse:57.36017
[25]	validation_0-rmse:57.25346
[26]	val



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 3000, 'random_state': 42, 'learning_rate': 0.1, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:92.33988
[1]	validation_0-rmse:87.20858
[2]	validation_0-rmse:82.74522
[3]	validation_0-rmse:78.86431
[4]	validation_0-rmse:75.56595
[5]	validation_0-rmse:72.72360
[6]	validation_0-rmse:70.32182
[7]	validation_0-rmse:68.28014
[8]	validation_0-rmse:66.44085
[9]	validation_0-rmse:64.96318
[10]	validation_0-rmse:63.64577
[11]	validation_0-rmse:62.60608
[12]	validation_0-rmse:61.69844
[13]	validation_0-rmse:60.99027
[14]	validation_0-rmse:60.29642
[15]	validation_0-rmse:59.73000
[16]	validation_0-rmse:59.23188
[17]	validation_0-rmse:58.86429
[18]	validation_0-rmse:58.57434
[19]	validation_0-rmse:58.28964
[20]	validation_0-rmse:58.07332
[21]	validation_0-rmse:57.83380
[22]	validation_0-rmse:57.68293
[23]	validation_0-rmse:57.48674
[24]	validation_0-rmse:57.36017
[25]	validation_0-rmse:57.25346
[26]	val



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 1000, 'random_state': 42, 'learning_rate': 0.05, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:95.36784
[1]	validation_0-rmse:92.65028
[2]	validation_0-rmse:90.10306
[3]	validation_0-rmse:87.74157
[4]	validation_0-rmse:85.52615
[5]	validation_0-rmse:83.47678
[6]	validation_0-rmse:81.56130
[7]	validation_0-rmse:79.78303
[8]	validation_0-rmse:78.10056
[9]	validation_0-rmse:76.57394
[10]	validation_0-rmse:75.16675
[11]	validation_0-rmse:73.81429
[12]	validation_0-rmse:72.55772
[13]	validation_0-rmse:71.42284
[14]	validation_0-rmse:70.31935
[15]	validation_0-rmse:69.32960
[16]	validation_0-rmse:68.39977
[17]	validation_0-rmse:67.54231
[18]	validation_0-rmse:66.76698
[19]	validation_0-rmse:66.01170
[20]	validation_0-rmse:65.33620
[21]	validation_0-rmse:64.73604
[22]	validation_0-rmse:64.15445
[23]	validation_0-rmse:63.61543
[24]	validation_0-rmse:63.11733
[25]	validation_0-rmse:62.67443
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 2000, 'random_state': 42, 'learning_rate': 0.05, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:95.36785
[1]	validation_0-rmse:92.65028
[2]	validation_0-rmse:90.10306
[3]	validation_0-rmse:87.74157
[4]	validation_0-rmse:85.52616
[5]	validation_0-rmse:83.47678
[6]	validation_0-rmse:81.56130
[7]	validation_0-rmse:79.78304
[8]	validation_0-rmse:78.10056
[9]	validation_0-rmse:76.57394
[10]	validation_0-rmse:75.16674
[11]	validation_0-rmse:73.81429
[12]	validation_0-rmse:72.55772
[13]	validation_0-rmse:71.42284
[14]	validation_0-rmse:70.31935
[15]	validation_0-rmse:69.32959
[16]	validation_0-rmse:68.39977
[17]	validation_0-rmse:67.54231
[18]	validation_0-rmse:66.76698
[19]	validation_0-rmse:66.01170
[20]	validation_0-rmse:65.33620
[21]	validation_0-rmse:64.73605
[22]	validation_0-rmse:64.15444
[23]	validation_0-rmse:63.61543
[24]	validation_0-rmse:63.11733
[25]	validation_0-rmse:62.67442
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 3000, 'random_state': 42, 'learning_rate': 0.05, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:95.36785
[1]	validation_0-rmse:92.65028
[2]	validation_0-rmse:90.10306
[3]	validation_0-rmse:87.74157
[4]	validation_0-rmse:85.52616
[5]	validation_0-rmse:83.47678
[6]	validation_0-rmse:81.56130
[7]	validation_0-rmse:79.78304
[8]	validation_0-rmse:78.10056
[9]	validation_0-rmse:76.57394
[10]	validation_0-rmse:75.16675
[11]	validation_0-rmse:73.81429
[12]	validation_0-rmse:72.55772
[13]	validation_0-rmse:71.42284
[14]	validation_0-rmse:70.31936
[15]	validation_0-rmse:69.32960
[16]	validation_0-rmse:68.39977
[17]	validation_0-rmse:67.54231
[18]	validation_0-rmse:66.76698
[19]	validation_0-rmse:66.01170
[20]	validation_0-rmse:65.33620
[21]	validation_0-rmse:64.73604
[22]	validation_0-rmse:64.15445
[23]	validation_0-rmse:63.61543
[24]	validation_0-rmse:63.11733
[25]	validation_0-rmse:62.67442
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 1000, 'random_state': 42, 'learning_rate': 0.05, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:95.28216
[1]	validation_0-rmse:92.48724
[2]	validation_0-rmse:89.88618
[3]	validation_0-rmse:87.42549
[4]	validation_0-rmse:85.16161
[5]	validation_0-rmse:83.02767
[6]	validation_0-rmse:81.06526
[7]	validation_0-rmse:79.20642
[8]	validation_0-rmse:77.51969
[9]	validation_0-rmse:75.92672
[10]	validation_0-rmse:74.45503
[11]	validation_0-rmse:73.10433
[12]	validation_0-rmse:71.85703
[13]	validation_0-rmse:70.68955
[14]	validation_0-rmse:69.63094
[15]	validation_0-rmse:68.65096
[16]	validation_0-rmse:67.71901
[17]	validation_0-rmse:66.89576
[18]	validation_0-rmse:66.11048
[19]	validation_0-rmse:65.39984
[20]	validation_0-rmse:64.72848
[21]	validation_0-rmse:64.10138
[22]	validation_0-rmse:63.52499
[23]	validation_0-rmse:62.99129
[24]	validation_0-rmse:62.48473
[25]	validation_0-rmse:62.02803
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 2000, 'random_state': 42, 'learning_rate': 0.05, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:95.28216
[1]	validation_0-rmse:92.48724
[2]	validation_0-rmse:89.88618
[3]	validation_0-rmse:87.42550
[4]	validation_0-rmse:85.16161
[5]	validation_0-rmse:83.02767
[6]	validation_0-rmse:81.06526
[7]	validation_0-rmse:79.20642
[8]	validation_0-rmse:77.51969
[9]	validation_0-rmse:75.92672
[10]	validation_0-rmse:74.45503
[11]	validation_0-rmse:73.10433
[12]	validation_0-rmse:71.85703
[13]	validation_0-rmse:70.68955
[14]	validation_0-rmse:69.63094
[15]	validation_0-rmse:68.65097
[16]	validation_0-rmse:67.71901
[17]	validation_0-rmse:66.89576
[18]	validation_0-rmse:66.11048
[19]	validation_0-rmse:65.39985
[20]	validation_0-rmse:64.72847
[21]	validation_0-rmse:64.10138
[22]	validation_0-rmse:63.52498
[23]	validation_0-rmse:62.99129
[24]	validation_0-rmse:62.48473
[25]	validation_0-rmse:62.02803
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 3000, 'random_state': 42, 'learning_rate': 0.05, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:95.28215
[1]	validation_0-rmse:92.48724
[2]	validation_0-rmse:89.88618
[3]	validation_0-rmse:87.42549
[4]	validation_0-rmse:85.16161
[5]	validation_0-rmse:83.02767
[6]	validation_0-rmse:81.06526
[7]	validation_0-rmse:79.20642
[8]	validation_0-rmse:77.51968
[9]	validation_0-rmse:75.92672
[10]	validation_0-rmse:74.45503
[11]	validation_0-rmse:73.10434
[12]	validation_0-rmse:71.85703
[13]	validation_0-rmse:70.68955
[14]	validation_0-rmse:69.63094
[15]	validation_0-rmse:68.65096
[16]	validation_0-rmse:67.71901
[17]	validation_0-rmse:66.89577
[18]	validation_0-rmse:66.11048
[19]	validation_0-rmse:65.39984
[20]	validation_0-rmse:64.72847
[21]	validation_0-rmse:64.10137
[22]	validation_0-rmse:63.52498
[23]	validation_0-rmse:62.99128
[24]	validation_0-rmse:62.48473
[25]	validation_0-rmse:62.02803
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 1000, 'random_state': 42, 'learning_rate': 0.02, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:97.11549
[1]	validation_0-rmse:95.97162
[2]	validation_0-rmse:94.86339
[3]	validation_0-rmse:93.78058
[4]	validation_0-rmse:92.73274
[5]	validation_0-rmse:91.70919
[6]	validation_0-rmse:90.71558
[7]	validation_0-rmse:89.74292
[8]	validation_0-rmse:88.80193
[9]	validation_0-rmse:87.88336
[10]	validation_0-rmse:86.98832
[11]	validation_0-rmse:86.12174
[12]	validation_0-rmse:85.27377
[13]	validation_0-rmse:84.45088
[14]	validation_0-rmse:83.65039
[15]	validation_0-rmse:82.87532
[16]	validation_0-rmse:82.12468
[17]	validation_0-rmse:81.39687
[18]	validation_0-rmse:80.68542
[19]	validation_0-rmse:79.99628
[20]	validation_0-rmse:79.32521
[21]	validation_0-rmse:78.66901
[22]	validation_0-rmse:78.03095
[23]	validation_0-rmse:77.40805
[24]	validation_0-rmse:76.80457
[25]	validation_0-rmse:76.22141
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 2000, 'random_state': 42, 'learning_rate': 0.02, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:97.11549
[1]	validation_0-rmse:95.97162
[2]	validation_0-rmse:94.86339
[3]	validation_0-rmse:93.78057
[4]	validation_0-rmse:92.73274
[5]	validation_0-rmse:91.70920
[6]	validation_0-rmse:90.71558
[7]	validation_0-rmse:89.74290
[8]	validation_0-rmse:88.80193
[9]	validation_0-rmse:87.88336
[10]	validation_0-rmse:86.98832
[11]	validation_0-rmse:86.12174
[12]	validation_0-rmse:85.27377
[13]	validation_0-rmse:84.45087
[14]	validation_0-rmse:83.65039
[15]	validation_0-rmse:82.87531
[16]	validation_0-rmse:82.12468
[17]	validation_0-rmse:81.39687
[18]	validation_0-rmse:80.68542
[19]	validation_0-rmse:79.99628
[20]	validation_0-rmse:79.32521
[21]	validation_0-rmse:78.66901
[22]	validation_0-rmse:78.03095
[23]	validation_0-rmse:77.40805
[24]	validation_0-rmse:76.80457
[25]	validation_0-rmse:76.22141
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 3000, 'random_state': 42, 'learning_rate': 0.02, 'max_depth': 10}
    Building xgb_nearctic model
[0]	validation_0-rmse:97.11549
[1]	validation_0-rmse:95.97163
[2]	validation_0-rmse:94.86338
[3]	validation_0-rmse:93.78057
[4]	validation_0-rmse:92.73274
[5]	validation_0-rmse:91.70921
[6]	validation_0-rmse:90.71558
[7]	validation_0-rmse:89.74292
[8]	validation_0-rmse:88.80193
[9]	validation_0-rmse:87.88336
[10]	validation_0-rmse:86.98832
[11]	validation_0-rmse:86.12174
[12]	validation_0-rmse:85.27377
[13]	validation_0-rmse:84.45088
[14]	validation_0-rmse:83.65039
[15]	validation_0-rmse:82.87531
[16]	validation_0-rmse:82.12468
[17]	validation_0-rmse:81.39687
[18]	validation_0-rmse:80.68543
[19]	validation_0-rmse:79.99628
[20]	validation_0-rmse:79.32521
[21]	validation_0-rmse:78.66901
[22]	validation_0-rmse:78.03095
[23]	validation_0-rmse:77.40805
[24]	validation_0-rmse:76.80456
[25]	validation_0-rmse:76.22141
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 1000, 'random_state': 42, 'learning_rate': 0.02, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:97.08012
[1]	validation_0-rmse:95.90262
[2]	validation_0-rmse:94.75969
[3]	validation_0-rmse:93.64359
[4]	validation_0-rmse:92.56133
[5]	validation_0-rmse:91.49530
[6]	validation_0-rmse:90.47224
[7]	validation_0-rmse:89.47387
[8]	validation_0-rmse:88.49539
[9]	validation_0-rmse:87.54831
[10]	validation_0-rmse:86.62385
[11]	validation_0-rmse:85.72733
[12]	validation_0-rmse:84.85838
[13]	validation_0-rmse:84.00948
[14]	validation_0-rmse:83.19078
[15]	validation_0-rmse:82.38028
[16]	validation_0-rmse:81.60738
[17]	validation_0-rmse:80.85918
[18]	validation_0-rmse:80.11949
[19]	validation_0-rmse:79.41094
[20]	validation_0-rmse:78.71761
[21]	validation_0-rmse:78.04800
[22]	validation_0-rmse:77.37924
[23]	validation_0-rmse:76.75507
[24]	validation_0-rmse:76.14045
[25]	validation_0-rmse:75.54469
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 2000, 'random_state': 42, 'learning_rate': 0.02, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:97.08012
[1]	validation_0-rmse:95.90262
[2]	validation_0-rmse:94.75969
[3]	validation_0-rmse:93.64358
[4]	validation_0-rmse:92.56133
[5]	validation_0-rmse:91.49529
[6]	validation_0-rmse:90.47224
[7]	validation_0-rmse:89.47387
[8]	validation_0-rmse:88.49540
[9]	validation_0-rmse:87.54831
[10]	validation_0-rmse:86.62385
[11]	validation_0-rmse:85.72732
[12]	validation_0-rmse:84.85838
[13]	validation_0-rmse:84.00948
[14]	validation_0-rmse:83.19078
[15]	validation_0-rmse:82.38027
[16]	validation_0-rmse:81.60738
[17]	validation_0-rmse:80.85918
[18]	validation_0-rmse:80.11949
[19]	validation_0-rmse:79.41093
[20]	validation_0-rmse:78.71761
[21]	validation_0-rmse:78.04800
[22]	validation_0-rmse:77.37925
[23]	validation_0-rmse:76.75506
[24]	validation_0-rmse:76.14045
[25]	validation_0-rmse:75.54469
[26]	va



{'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 3000, 'random_state': 42, 'learning_rate': 0.02, 'max_depth': 14}
    Building xgb_nearctic model
[0]	validation_0-rmse:97.08012
[1]	validation_0-rmse:95.90262
[2]	validation_0-rmse:94.75969
[3]	validation_0-rmse:93.64357
[4]	validation_0-rmse:92.56133
[5]	validation_0-rmse:91.49530
[6]	validation_0-rmse:90.47224
[7]	validation_0-rmse:89.47387
[8]	validation_0-rmse:88.49540
[9]	validation_0-rmse:87.54832
[10]	validation_0-rmse:86.62386
[11]	validation_0-rmse:85.72732
[12]	validation_0-rmse:84.85838
[13]	validation_0-rmse:84.00948
[14]	validation_0-rmse:83.19078
[15]	validation_0-rmse:82.38028
[16]	validation_0-rmse:81.60738
[17]	validation_0-rmse:80.85918
[18]	validation_0-rmse:80.11949
[19]	validation_0-rmse:79.41093
[20]	validation_0-rmse:78.71761
[21]	validation_0-rmse:78.04800
[22]	validation_0-rmse:77.37925
[23]	validation_0-rmse:76.75506
[24]	validation_0-rmse:76.14045
[25]	validation_0-rmse:75.54469
[26]	va



In [40]:
# Tabulate the collected score dicts and rank validation rows by r2 ascending
# (best configuration last).
r = pd.DataFrame(scores)
r.loc[r.split == 'val'].sort_values(by='r2')

Unnamed: 0,bias,mae,r2,model_name,split,learning_rate,max_depth,n_estimators
2,-0.852048,23.668822,0.507676,xgb_nearctic,val,0.1,10,1000
5,-0.852048,23.668822,0.507676,xgb_nearctic,val,0.1,10,2000
8,-0.852048,23.668822,0.507676,xgb_nearctic,val,0.1,10,3000
11,-0.934256,23.265791,0.508721,xgb_nearctic,val,0.1,14,1000
14,-0.934256,23.265791,0.508721,xgb_nearctic,val,0.1,14,2000
17,-0.934256,23.265791,0.508721,xgb_nearctic,val,0.1,14,3000
35,-0.947551,23.138256,0.512086,xgb_nearctic,val,0.05,14,3000
32,-0.947551,23.138256,0.512086,xgb_nearctic,val,0.05,14,2000
29,-0.947551,23.138256,0.512086,xgb_nearctic,val,0.05,14,1000
26,-0.804719,23.598356,0.5123,xgb_nearctic,val,0.05,10,3000


In [41]:
# Tabulate the collected score dicts and rank test rows by r2 ascending
# (best configuration last).
r = pd.DataFrame(scores)
r.loc[r.split == 'test'].sort_values(by='r2')

Unnamed: 0,bias,mae,r2,model_name,split,learning_rate,max_depth,n_estimators
1,-2.561562,22.603937,0.529733,xgb_nearctic,test,0.1,10,1000
4,-2.561562,22.603937,0.529733,xgb_nearctic,test,0.1,10,2000
7,-2.561562,22.603937,0.529733,xgb_nearctic,test,0.1,10,3000
10,-2.747026,22.235418,0.532029,xgb_nearctic,test,0.1,14,1000
13,-2.747026,22.235418,0.532029,xgb_nearctic,test,0.1,14,2000
16,-2.747026,22.235418,0.532029,xgb_nearctic,test,0.1,14,3000
28,-2.79269,22.112319,0.534002,xgb_nearctic,test,0.05,14,1000
31,-2.79269,22.112319,0.534002,xgb_nearctic,test,0.05,14,2000
34,-2.79269,22.112319,0.534002,xgb_nearctic,test,0.05,14,3000
43,-2.527872,22.509412,0.536906,xgb_nearctic,test,0.02,10,3000


In [36]:
# Same ranking of validation rows by r2 ascending; the recorded output comes
# from a sweep over learning_rate and max_depth only.
r = pd.DataFrame(scores)
r.loc[r.split == 'val'].sort_values(by='r2')

Unnamed: 0,bias,mae,r2,model_name,split,learning_rate,max_depth
2,-1.037318,24.771967,0.484516,xgb_nearctic,val,0.2,6
29,-0.995037,24.560283,0.493952,xgb_nearctic,val,0.075,6
38,-0.927733,24.503957,0.494016,xgb_nearctic,val,0.05,6
11,-0.858568,24.66803,0.494875,xgb_nearctic,val,0.15,6
8,-1.259345,23.443574,0.495102,xgb_nearctic,val,0.2,14
5,-0.980468,23.853668,0.49739,xgb_nearctic,val,0.2,10
20,-0.828704,24.492755,0.501537,xgb_nearctic,val,0.1,6
14,-0.814873,23.743686,0.504668,xgb_nearctic,val,0.15,10
17,-0.983394,23.333829,0.50528,xgb_nearctic,val,0.15,14
32,-1.125266,23.683123,0.50654,xgb_nearctic,val,0.075,10


In [9]:
# The commented values below presumably record the hyperparameters behind the
# displayed scores — confirm.
#     'learning_rate': 0.05,
#     'max_depth': 10,
#     'colsample_bytree': 0.6,
#     'subsample': 0.6,
#     'min_child_weight': 4,
# random split in train/test
# Display whatever is currently bound to `scores`.
scores

Unnamed: 0,bias,mae,r2,model_name,split,realm
0,-1.713595e-12,51.62927,-2.220446e-16,baseline_nearctic,train,nearctic
1,-0.0004116822,51.626876,-2.666312e-11,baseline_nearctic,test,nearctic
2,-1.035291,50.380847,-0.0001869265,baseline_nearctic,val,nearctic
3,-0.005482044,16.542344,0.8307906,xgb_nearctic,train,nearctic
4,-0.01688718,19.162086,0.6639486,xgb_nearctic,test,nearctic
5,1.095924,20.959087,0.5445431,xgb_nearctic,val,nearctic


In [44]:
# The commented values below presumably record the hyperparameters of the
# evaluated model — confirm.
#     'learning_rate': 0.05,
#     'max_depth': 10,
#     'colsample_bytree': 0.6,
#     'subsample': 0.6,
#     'min_child_weight': 4,
# year split in train test 
# NOTE(review): `xgb` is not defined in any visible cell; this line relies on a
# model object left in the kernel by a cell that is no longer in the notebook.
xgb.evaluate(df_val)



{'bias': 0.8843416267143598,
 'mae': 22.294730590031673,
 'r2': 0.5440653435394311}

In [15]:
# from sklearn.preprocessing import OneHotEncoder
# igbp_encoder = OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore').fit(df_train[['igbp']])
#     # one hot encoding for igbp 
#     encoded_igbp = igbp_encoder.transform(X[['igbp']])
#     X = X.drop(['igbp'], axis=1)
#     for i in range(encoded_igbp.shape[1]):
#         X[f'igbp_cat_{str(i+1)}'] = encoded_igbp[:, i]
