In [1]:
import numpy as np
import os
import pandas as pd
import pickle
import random
from functools import reduce
from lightgbm import LGBMRegressor
from math import ceil
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error
from typing import List, Tuple

In [2]:
pickle_file_path = "../data/ocean_journeys.pickle"
sample_number_of_records = 1000000  # Number of records for a trial-run dataset (this is arbitrarily small relative to the total size (~9 million)
rng_seed_subset_data = 54321        # for reproducibility
rng_seed_split_data = 7890          # for reproducibility
train_portion = 0.7                 # This portion is further split during cross-validaiton; the remainder is hold-out (test) data

### Load data

In [3]:
def load_data(file_path: str,
              n: int,
              rng_seed: int = None) -> pd.DataFrame:
    """
    Load dataset and (optionally) reduce to a sample
    """
    print("Loading complete ocean journeys dataset...")
    with open(file_path, 'rb') as pickle_file:
        df: pd.DataFrame = pickle.load(pickle_file)
        
        print(f"Sampling {n} records without replacement...")
        df = (
            df.sample(n=n, replace=False, random_state=rng_seed)
            .sort_values(by=['IMO', 'unique_route_id'])
            .reset_index(drop=True)
        )
        print(f"Loaded ocean_journeys dataset having {len(df)} records. Info: {df.info()}")
        return df


ocean_journeys: pd.DataFrame = load_data(
    file_path=pickle_file_path,
    n=sample_number_of_records,
    rng_seed=rng_seed_subset_data
)

Loading complete ocean journeys dataset...
Sampling 1000000 records without replacement...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 38 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   IMO                     1000000 non-null  int64         
 1   ship_type               994001 non-null   float64       
 2   ship_class              994001 non-null   object        
 3   width                   994001 non-null   float64       
 4   length                  994001 non-null   float64       
 5   vessel_age_years        992142 non-null   float64       
 6   draught                 994001 non-null   float64       
 7   OD                      1000000 non-null  object        
 8   origin_port             1000000 non-null  object        
 9   destination_port        1000000 non-null  object        
 10  origin_region           1000000 non-null  object

selected_routes_df = pd.DataFrame({
    'OD': ['CNSHG-USLGB']
})
ocean_journeys=ocean_journeys.merge(
    selected_routes_df, on='OD', how='inner'
)

In [4]:
# Dispaly data
ocean_journeys

Unnamed: 0,IMO,ship_type,ship_class,width,length,vessel_age_years,draught,OD,origin_port,destination_port,...,kiel,magellan,malacca,northeast,northwest,panama,suez,remaining_lead_time,journey_obs,is_invalid_jump
0,7907984,70.0,FEEDER,32.23,262.18,40.0,10.6,USLGB-CNSGH,USLGB,CNSGH,...,False,False,False,False,False,False,False,15.457106,17,False
1,7907984,70.0,FEEDER,32.23,262.18,40.0,10.6,USLAX-CNSGH,USLAX,CNSGH,...,False,False,False,False,False,False,False,10.088495,35,False
2,7907984,70.0,FEEDER,32.23,262.18,40.0,10.6,USLAX-CNSGH,USLAX,CNSGH,...,False,False,False,False,False,False,False,5.789838,50,False
3,7907984,70.0,FEEDER,32.23,262.18,40.0,10.6,USLGB-CNSHA,USLGB,CNSHA,...,False,False,False,False,False,False,False,13.127442,22,False
4,7907984,70.0,FEEDER,32.23,262.18,40.0,10.6,USLAX-CNSHA,USLAX,CNSHA,...,False,False,False,False,False,False,False,18.963484,2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,9893589,90.0,FEEDER,32.00,196.16,2.0,9.5,SGSIN-CNSZX,SGSIN,CNSZX,...,False,False,True,False,False,False,False,8.786586,4,False
999996,9893591,70.0,FEEDER,32.00,196.16,2.0,9.1,CNSHG-IDJKT,CNSHG,IDJKT,...,False,False,False,False,False,False,False,0.563438,15,False
999997,9893591,70.0,FEEDER,32.00,196.16,2.0,9.1,CNSHG-IDJKT,CNSHG,IDJKT,...,False,False,False,False,False,False,False,1.004965,13,False
999998,9893591,70.0,FEEDER,32.00,196.16,2.0,9.1,KRPUS-CNSGH,KRPUS,CNSGH,...,False,False,False,False,False,False,False,0.166713,3,False


### Prepare data (drop duplicates and manage data types)

In [5]:
# Keep only complete cases
ocean_journeys.dropna(axis = 0, how = 'any', inplace = True)

# Derive month number (this is our encoding for seasonality)
ocean_journeys['month'] = (
    ocean_journeys['time_position']
    .apply(lambda x: x.month)
    .astype('category')
)

# Encode the other categoricals (for now we will not further bucket the categories)
ocean_journeys['ship_class'] = ocean_journeys['ship_class'].astype('category')
ocean_journeys['origin_region'] = ocean_journeys['origin_region'].astype('category')
ocean_journeys['destination_region'] = ocean_journeys['destination_region'].astype('category')

# Encode binary flags
ocean_journeys['within_region_journey'] = ocean_journeys['within_region_journey'].astype('bool')

# Align numerical types
ocean_journeys['vessel_age_years'] = ocean_journeys['vessel_age_years'].astype('int')

# Calculate total_remaining_distance
ocean_journeys['total_remaining_distance'] = (
    ocean_journeys['ocean_distance'] +
    ocean_journeys['source_to_network_dist'] +
    ocean_journeys['network_to_dest_dist']
)

In [6]:
ocean_journeys.to_csv("ocean_journeys.csv")

### Define candidate features and response variable (prediction target)

In [7]:
journey_dimensions = ['IMO', 'unique_route_id']

candidate_features = [
    "ship_class",
    # "width",
    # "length",
    # "vessel_age_years",
    # "draught",
    "origin_region",
    "destination_region",
    "within_region_journey",
    # "elapsed_time",
    "month",
    # "ocean_distance",
    # "source_to_network_dist",
    # "network_to_dest_dist",
    "total_remaining_distance",
    # "babelmandeb",
    # "bering",
    # "corinth",
    # "dover",
    "gibraltar",
    # "kiel",
    # "magellan",
    # "malacca",
    # "northeast",
    # "northwest",
    "panama",
    "suez"
]

response = 'remaining_lead_time'

In [8]:
ocean_journeys[journey_dimensions + candidate_features + [response]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2817 entries, 0 to 2816
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   IMO                       2817 non-null   int64   
 1   unique_route_id           2817 non-null   int64   
 2   ship_class                2817 non-null   category
 3   origin_region             2817 non-null   category
 4   destination_region        2817 non-null   category
 5   within_region_journey     2817 non-null   bool    
 6   month                     2817 non-null   category
 7   total_remaining_distance  2817 non-null   float64 
 8   gibraltar                 2817 non-null   bool    
 9   panama                    2817 non-null   bool    
 10  suez                      2817 non-null   bool    
 11  remaining_lead_time       2817 non-null   float64 
dtypes: bool(4), category(4), float64(2), int64(2)
memory usage: 132.9 KB


### Split data (we will assign entire journeys to train or test)

In [9]:
unique_journeys = (
    ocean_journeys[['OD', 'unique_route_id']]
    .sort_values(['OD', 'unique_route_id'])
    .drop_duplicates()
    .reset_index(drop=True)
)
unique_journeys['journey_id'] = unique_journeys.index
unique_journeys

Unnamed: 0,OD,unique_route_id,journey_id
0,CNSHG-USLGB,1,0
1,CNSHG-USLGB,2,1
2,CNSHG-USLGB,3,2
3,CNSHG-USLGB,4,3
4,CNSHG-USLGB,5,4
...,...,...,...
439,CNSHG-USLGB,440,439
440,CNSHG-USLGB,441,440
441,CNSHG-USLGB,442,441
442,CNSHG-USLGB,443,442


In [10]:
# Each journey is randomly assigned to train or test
random.seed(rng_seed_split_data)
unique_journeys['is_train'] = unique_journeys.apply(
    lambda idx: True if random.uniform(0, 1) <= train_portion else False,
    axis=1
)
# assert round(unique_journeys['is_train'].mean(), 3) == train_portion
unique_journeys

Unnamed: 0,OD,unique_route_id,journey_id,is_train
0,CNSHG-USLGB,1,0,True
1,CNSHG-USLGB,2,1,False
2,CNSHG-USLGB,3,2,False
3,CNSHG-USLGB,4,3,True
4,CNSHG-USLGB,5,4,True
...,...,...,...,...
439,CNSHG-USLGB,440,439,True
440,CNSHG-USLGB,441,440,True
441,CNSHG-USLGB,442,441,True
442,CNSHG-USLGB,443,442,True


In [11]:
# Merge with dataset to assign each record as train or test
ocean_journeys = ocean_journeys.merge(
    unique_journeys,
    how='inner',
    on=['OD', 'unique_route_id'],
)
ocean_journeys

Unnamed: 0,IMO,ship_type,ship_class,width,length,vessel_age_years,draught,OD,origin_port,destination_port,...,northwest,panama,suez,remaining_lead_time,journey_obs,is_invalid_jump,month,total_remaining_distance,journey_id,is_train
0,7907984,70.0,FEEDER,32.23,262.18,40,10.6,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,1.936968,39,False,3,1785.038122,18,True
1,7907984,70.0,FEEDER,32.23,262.18,40,10.6,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,0.983981,43,False,3,925.486227,18,True
2,7907984,70.0,FEEDER,32.23,262.18,40,10.6,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,7.684329,14,False,2,8071.898198,18,True
3,7907984,70.0,FEEDER,32.23,262.18,40,10.6,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,6.524757,19,False,3,6401.444460,18,True
4,7907984,70.0,FEEDER,32.23,262.18,40,10.6,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,7.667859,19,False,5,6802.182891,97,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2812,9850525,74.0,PANAMAX,48.00,334.00,2,12.7,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,15.498079,2,False,6,10720.951180,164,True
2813,9850525,74.0,PANAMAX,48.00,334.00,2,12.7,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,13.993310,5,False,6,10858.009980,164,True
2814,9850525,74.0,PANAMAX,48.00,334.00,2,12.7,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,10.013843,13,False,7,9624.266557,164,True
2815,9850525,74.0,PANAMAX,48.00,334.00,2,12.7,CNSHG-USLGB,CNSHG,USLGB,...,False,False,False,15.707593,1,False,6,10812.961971,164,True


Further split the `is_train` partition into **k** folds. With each fold, we want to assign entire journies to train or validation.

In [12]:
# Subset unique_journeys to training journeys only,
# then shuffle those rows by sampling all of them without replacement
unique_train_journeys = (
    unique_journeys
    [unique_journeys['is_train']]
    .sample(frac=1)
    .reset_index(drop=True)
)
unique_train_journeys

Unnamed: 0,OD,unique_route_id,journey_id,is_train
0,CNSHG-USLGB,87,86,True
1,CNSHG-USLGB,100,99,True
2,CNSHG-USLGB,59,58,True
3,CNSHG-USLGB,188,187,True
4,CNSHG-USLGB,202,201,True
...,...,...,...,...
312,CNSHG-USLGB,288,287,True
313,CNSHG-USLGB,308,307,True
314,CNSHG-USLGB,122,121,True
315,CNSHG-USLGB,176,175,True


Now create the train/validation folds

In [13]:
def assign_journeys_to_cv_folds(
    unique_journeys_df: pd.DataFrame,
    k: int
) -> List[Tuple[np.array, np.array]]:
    """
    Assign train/validation splits based on group partitions (OD, unique_route_id)
    """
    # Subset all partitions (journeys) to the training cohort only.
    unique_train_journeys = (
        unique_journeys_df
        [unique_journeys_df['is_train']]
        .sample(frac=1)
        .reset_index(drop=True)
    )
    # Build the cross validation folds
    journey_fold_assignments = []
    partition_start_idx = 0
    for _ in range(k):
        partition_stop_idx = (
            partition_start_idx +
            ceil(len(unique_train_journeys.index)/k)
        )
        validation_indices = np.array(
            unique_train_journeys.index[
                partition_start_idx:partition_stop_idx
            ]
        )
        train_slice_1 = np.array(unique_train_journeys.index[0:max(0, partition_start_idx-1)])
        train_slice_2 = np.array(unique_train_journeys.index[validation_indices[-1] + 1:])
        train_indices = np.concatenate([train_slice_1,train_slice_2])
        journey_fold_assignments.append((train_indices, validation_indices))
        partition_start_idx += len(validation_indices) + 1
    
    return journey_fold_assignments

In [14]:
cv_folds = assign_journeys_to_cv_folds(unique_journeys, 10)

for fold in cv_folds:
    print(fold)
    overlapping = set(fold[0]).intersection(set(fold[1]))
    print(f"[OVERLAPPING] {overlapping}\n")

(array([ 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
        45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,
        58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,
        71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,
        97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
       110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
       123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
       136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161,
       162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174,
       175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
       188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
       201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 2

In [15]:
def generate_cv_folds(ocean_journeys_df: pd.DataFrame,
                      unique_journeys_df: pd.DataFrame,
                      k: int):
    """
    This generator function will return an iterator of train/validation splits
    in terms of the ocean_journeys table index (not the journey_id)
    """
    def expand_indices(journey_ids: np.ndarray):
        """
        Take in an array of jouneys assigned to train or validate for a fold, and
        convert an array of index values from ocean_journeys_df
        """
        return np.concatenate([
            np.array(ocean_journeys_df.index[ocean_journeys_df['journey_id'] == jid])
            for jid in journey_ids
        ])

    # TODO: this itself could be defined as a generator function for better performance
    for fold in assign_journeys_to_cv_folds(unique_journeys_df, k):
        yield (
            expand_indices(fold[0]),  # Train indices for this cv fold
            expand_indices(fold[1])   # Validation indices for this cv fold
        )
        

In [16]:
# Verify partitioning
print("Verifying the cross-validation folds:\n")
for train_indices, validation_indices in generate_cv_folds(ocean_journeys, unique_journeys, 10):
    print(f"[TRAIN_IDX] {train_indices}")
    print(f"[VALIDATION_IDX] {validation_indices}")
    train_indices_valid = all(map(lambda idx: idx in ocean_journeys.index, train_indices))
    validation_indices_valid = all(map(lambda idx: idx in ocean_journeys.index, validation_indices))
    print(f"[TRAIN_IDX_VALID] {train_indices_valid}\n[VALIDATION_IDX_VALID] {validation_indices_valid}")
    overlapping = set(train_indices).intersection(set(validation_indices))
    print(f"[OVERLAPPING] {overlapping}\n")

Verifying the cross-validation folds:

[TRAIN_IDX] [735 736 737 ... 897 898 899]
[VALIDATION_IDX] [2278 2279 2280 2281 2282 2283 2284 2285 2286 2287  911  912  913  914
  915  916  917  918  919   69   70   71   72   73 2142 2143 2144 2145
 2146 2147  145  146  147  148 1171 1172 1173 1174 1175 1176 1054 1055
 1056 1057 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2684
 2685 2686  756  757  758  759  760  761  401  402  403  678  679  680
  681  682  683  684 2524 2525 2526 2527 2528 2529   28   29   30   31
   32   33   34   35 1073 1074 1075 1076  718  719  720  721  722  723
  724  725  726 1971 1972 1973 1974 1975 1976 1977 1978 1979 1254 1255
 1256 1257 1258 1259    0    1    2    3 2567 2568 2569 2570 2571 2572
 2573 2574 2100 2101 2102 2103 2488 2489 2490 2491 1086 1087 1088 1089
 1090  580  581  582  583  584  585  586  587  588 1162 1163 1164 1165
 1166 1167 1168 1169 1170  201  202  203 1517 1518 1519 1520 1521  290
  291  292  293  294  850  851  852  853  854  855

In [17]:
X = ocean_journeys[candidate_features]
y = ocean_journeys[response]

In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2817 entries, 0 to 2816
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   ship_class                2817 non-null   category
 1   origin_region             2817 non-null   category
 2   destination_region        2817 non-null   category
 3   within_region_journey     2817 non-null   bool    
 4   month                     2817 non-null   category
 5   total_remaining_distance  2817 non-null   float64 
 6   gibraltar                 2817 non-null   bool    
 7   panama                    2817 non-null   bool    
 8   suez                      2817 non-null   bool    
dtypes: bool(4), category(4), float64(1)
memory usage: 131.4 KB


#### Tuning grid

In [19]:
param_ranges = {
    "n_estimators": [100 + 100*x for x in range(5)],  # Multiples of 100
    "max_depth": [2, 4, 6, 8],
    "min_child_samples": [10, 25, 50, 100],
    "learning_rate": [0.01, 0.03, 0.05],
}
grid_size = reduce(
    lambda a, b: a*b,                             # Reduce by iterative multiplication
    map(lambda x: len(x), param_ranges.values())  # Across: size of each dimension
)
print(f"Total size of the search space: {grid_size}")
print(f"\nSearch space:")
for key, values in param_ranges.items():
    print(f"\t{key}: {values}")

Total size of the search space: 240

Search space:
	n_estimators: [100, 200, 300, 400, 500]
	max_depth: [2, 4, 6, 8]
	min_child_samples: [10, 25, 50, 100]
	learning_rate: [0.01, 0.03, 0.05]


### Gradient Boosting Machines (regression)

In [20]:
cv_folds = generate_cv_folds(ocean_journeys, unique_journeys, 5)
cv_folds

<generator object generate_cv_folds at 0x1358ef430>

In [21]:
grid_search_cv = GridSearchCV(
    estimator=LGBMRegressor(),
    param_grid=param_ranges,
    scoring='neg_mean_absolute_percentage_error',
    cv=cv_folds,
    verbose=2
)
grid_search_cv

GridSearchCV(cv=<generator object generate_cv_folds at 0x1358ef430>,
             estimator=LGBMRegressor(),
             param_grid={'learning_rate': [0.01, 0.03, 0.05],
                         'max_depth': [2, 4, 6, 8],
                         'min_child_samples': [10, 25, 50, 100],
                         'n_estimators': [100, 200, 300, 400, 500]},
             scoring='neg_mean_absolute_percentage_error', verbose=2)

In [22]:
grid_search_cv.fit(X, y)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=200; total time=   0.5s
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=200; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=200; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=2, min_child_samples=10, n_estimators=200; total time=   0.2s
[CV] END learning_rate=0.01

GridSearchCV(cv=<generator object generate_cv_folds at 0x1358ef430>,
             estimator=LGBMRegressor(),
             param_grid={'learning_rate': [0.01, 0.03, 0.05],
                         'max_depth': [2, 4, 6, 8],
                         'min_child_samples': [10, 25, 50, 100],
                         'n_estimators': [100, 200, 300, 400, 500]},
             scoring='neg_mean_absolute_percentage_error', verbose=2)

In [23]:
# Rendered dataframe of top 10 model fits (based on scoring metric MAPE)
(
    pd.DataFrame(grid_search_cv.cv_results_)
    .sort_values(by='rank_test_score')
    [:10]
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_min_child_samples,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
122,0.982375,0.029842,0.009021,0.000387,0.03,6,10,300,"{'learning_rate': 0.03, 'max_depth': 6, 'min_c...",-0.127205,-0.114855,-0.135314,-0.117879,-0.173289,-0.133708,0.02106,1
123,1.224557,0.043619,0.009841,0.000117,0.03,6,10,400,"{'learning_rate': 0.03, 'max_depth': 6, 'min_c...",-0.12561,-0.114399,-0.135882,-0.118764,-0.178311,-0.134593,0.02303,2
124,1.88199,0.312635,0.011329,0.000759,0.03,6,10,500,"{'learning_rate': 0.03, 'max_depth': 6, 'min_c...",-0.125508,-0.112293,-0.136554,-0.119123,-0.181602,-0.135016,0.024625,3
203,1.075247,0.035464,0.00938,0.000289,0.05,6,10,400,"{'learning_rate': 0.05, 'max_depth': 6, 'min_c...",-0.124704,-0.115783,-0.137945,-0.12079,-0.177366,-0.135317,0.022272,4
202,0.877823,0.032026,0.00871,0.000306,0.05,6,10,300,"{'learning_rate': 0.05, 'max_depth': 6, 'min_c...",-0.126691,-0.115402,-0.140016,-0.120458,-0.174746,-0.135462,0.0213,5
201,0.659731,0.016051,0.00807,0.000278,0.05,6,10,200,"{'learning_rate': 0.05, 'max_depth': 6, 'min_c...",-0.125859,-0.118817,-0.139796,-0.119617,-0.174386,-0.135695,0.020755,6
204,1.305491,0.022125,0.010084,0.000685,0.05,6,10,500,"{'learning_rate': 0.05, 'max_depth': 6, 'min_c...",-0.122465,-0.115465,-0.136984,-0.121122,-0.18468,-0.136143,0.025286,7
222,1.94774,0.754575,0.009427,0.000494,0.05,8,10,300,"{'learning_rate': 0.05, 'max_depth': 8, 'min_c...",-0.123902,-0.116115,-0.150581,-0.119247,-0.180016,-0.137972,0.024302,8
223,3.179568,1.311585,0.012707,0.001565,0.05,8,10,400,"{'learning_rate': 0.05, 'max_depth': 8, 'min_c...",-0.121761,-0.113814,-0.156397,-0.119864,-0.180602,-0.138488,0.025806,9
121,0.743955,0.015955,0.007745,0.000146,0.03,6,10,200,"{'learning_rate': 0.03, 'max_depth': 6, 'min_c...",-0.136704,-0.123926,-0.141646,-0.117801,-0.173715,-0.138758,0.019461,10


In [24]:
grid_search_cv.best_estimator_

LGBMRegressor(learning_rate=0.03, max_depth=6, min_child_samples=25,
              n_estimators=200)

In [None]:
mean_absolute_percentage_error(
    y_true=y_test,
    y_pred=test_pred
)

In [None]:
test_df = pd.DataFrame({'pred': test_pred, 'obs': y_test})
test_df['error'] = test_df['pred'] - test_df['obs']
test_df['pct_error'] = test_df['error'] / test_df['obs']
test_df.to_csv("./test_scores.csv", index=False)

In [None]:
import pickle
with open('./save_grid_search_cv.pickle', 'wb') as pickle_file:
    pickle.dump(grid_search_cv, pickle_file)

In [None]:
!pwd