In [1]:
import platform; print(platform.platform())
import sys; print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
import numpy as np
import pandas as pd

import os 
import joblib
import time

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

import optuna
from sklearn.ensemble import RandomForestRegressor

In [3]:
class cfg:
    modelname = 'randomforest'
    debug = False
    optim = True
    seed = 42
    nfolds = 5
    njobs = 4

In [4]:
# load data
train = pd.read_csv('../data/final/train.csv')
test = pd.read_csv('../data/final/test.csv')

train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,IsSynthetic,...,Monterey County,Napa County,Orange County,Other,Riverside County,San Francisco County,Santa Barbara County,Santa Clara County,Ventura County,Yolo County
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [5]:
# quick info
TARGET = 'MedHouseVal'
FEATURES = [c for c in train.columns if c not in [TARGET]]

print(f'Target: {TARGET}\nFeatures: {FEATURES}')
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)

x = train[FEATURES]
y = train[TARGET]

Target: MedHouseVal
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'IsSynthetic', 'AnomalyScore', 'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'pca_lon', 'pca_lat', 'Alameda County', 'Contra Costa County', 'Fresno County', 'Kern County', 'Los Angeles County', 'Monterey County', 'Napa County', 'Orange County', 'Other', 'Riverside County', 'San Francisco County', 'Santa Barbara County', 'Santa Clara County', 'Ventura County', 'Yolo County']
Train set shape: (57777, 33)
Test set shape: (24759, 32)


In [6]:
# setup
params_path = './training_files/params/'
os.makedirs(params_path, exist_ok=True)

cv = KFold(n_splits=cfg.nfolds, shuffle=True, random_state=cfg.seed)

In [7]:
# fixed params
fixed_params = {
        'n_jobs': cfg.njobs,
        'verbose': 0,
        'bootstrap': True,
        'min_samples_split': 50
    }

# objective function for optimization
def objective(trial):
    
    # trial parameters
    tuning_params = {
        'n_estimators': int(trial.suggest_float('n_estimators', 50, 500, step=25)),
        'criterion': trial.suggest_categorical('criterion', ['squared_error', 'friedman_mse']),
        'max_features': trial.suggest_float('max_features', 0.1, 0.8, step = 0.05),
        'max_samples': trial.suggest_float('max_samples', 0.1, 0.6, step = 0.05),
    }

    params = {**fixed_params, **tuning_params}

    # train and score with cv
    scores = []
    for train_idx, test_idx in cv.split(x, y):
        
        # split data
        x_train, x_val = x.iloc[train_idx], x.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        
        # fit model
        model = RandomForestRegressor(**params)
        model.fit(x_train, y_train)
        
        # obtain score
        rmse = np.sqrt(mean_squared_error(y_val, model.predict(x_val)))
        scores.append(rmse)

    # return mean cv score 
    return np.mean(scores)

In [8]:
if cfg.optim:

    # create study
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    max_trials = 5 if cfg.debug else 50
    time_limit = 3600 * 0.5

    study = optuna.create_study(
        sampler=sampler,
        study_name= f'{cfg.modelname}_optimization',
        direction='minimize')

    # perform optimization
    print(f'Starting {cfg.modelname} optimization...')
    study.optimize(
        objective,
        n_trials = max_trials,
        timeout = time_limit,
    )

[32m[I 2023-01-08 15:08:07,538][0m A new study created in memory with name: randomforest_optimization[0m


Starting randomforest optimization...


[32m[I 2023-01-08 15:08:42,870][0m Trial 0 finished with value: 0.5554693580747571 and parameters: {'n_estimators': 225.0, 'criterion': 'squared_error', 'max_features': 0.5, 'max_samples': 0.15000000000000002}. Best is trial 0 with value: 0.5554693580747571.[0m
[32m[I 2023-01-08 15:09:29,860][0m Trial 1 finished with value: 0.5408917005033234 and parameters: {'n_estimators': 100.0, 'criterion': 'friedman_mse', 'max_features': 0.55, 'max_samples': 0.45000000000000007}. Best is trial 1 with value: 0.5408917005033234.[0m
[32m[I 2023-01-08 15:09:36,228][0m Trial 2 finished with value: 0.5487336696490199 and parameters: {'n_estimators': 50.0, 'criterion': 'squared_error', 'max_features': 0.25, 'max_samples': 0.2}. Best is trial 1 with value: 0.5408917005033234.[0m
[32m[I 2023-01-08 15:10:01,804][0m Trial 3 finished with value: 0.5435509092683629 and parameters: {'n_estimators': 125.0, 'criterion': 'friedman_mse', 'max_features': 0.4, 'max_samples': 0.25}. Best is trial 1 with val

In [9]:
if cfg.optim:
    
    # optimization results
    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    best_params = {**fixed_params, **study.best_trial.params}
    print("Best trial parameters:")
    for k, v in best_params.items():
        print(f"\t{k}: {v}")

    # save best params
    best_params_path = f'{params_path}{cfg.modelname}_bestparams.joblib'
    with open(best_params_path, "wb") as file:
        joblib.dump(best_params, file)

Number of finished trials: 24
Best score: 0.5315626385009307
Best trial parameters:
	n_jobs: 4
	verbose: 0
	bootstrap: True
	min_samples_split: 50
	n_estimators: 400.0
	criterion: friedman_mse
	max_features: 0.30000000000000004
	max_samples: 0.55


In [10]:
# load best params
best_params_path = f'{params_path}{cfg.modelname}_bestparams.joblib'
with open(best_params_path, 'rb') as file:
    best_params = joblib.load(file)

print("Final parameters:")
best_params['n_estimators'] = int(best_params['n_estimators'])
for k, v in best_params.items():
    print(f"\t{k}: {v}")

Final parameters:
	n_jobs: 4
	verbose: 0
	bootstrap: True
	min_samples_split: 50
	n_estimators: 400
	criterion: friedman_mse
	max_features: 0.30000000000000004
	max_samples: 0.55


In [11]:
# cv training and predict
def train_model(train, test, params):

    start_time = time.time()
    print(30*'*', f'Model: {cfg.modelname}', 30*'*', '\n')

    # get data
    x = train[FEATURES]
    y = train[TARGET]
    xtest = test[FEATURES]

    # cv loop
    cv = KFold(n_splits=cfg.nfolds, shuffle=True, random_state=cfg.seed)
    preds, cv_scores = [], []
    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):

        print(f'fold {fold+1}/{cfg.nfolds}...')
        fold_start_time = time.time()

        # split data
        x_train, x_val = x.iloc[train_idx], x.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        # define and fit model
        model = RandomForestRegressor(**params)
        model.fit(x_train, y_train)

        # validation score
        cv_score = np.sqrt(mean_squared_error(y_val, model.predict(x_val)))
        cv_scores.append(cv_score)

        # predict test data
        preds.append(model.predict(xtest))

        fold_run_time = time.time() - fold_start_time
        print(f'rmse: {cv_score:.4f}, run time: {fold_run_time:.2f}\n')

    # print results
    run_time = time.time() - start_time
    print(f'\nTraining completed. Total run time: {run_time:2f}')
    print(f'CV score:\n\t mean: {np.mean(cv_scores):0.6f}\n\t std: {np.std(cv_scores):0.6f}')

    return (cv_scores, preds)

In [12]:
# train model
scores, preds = train_model(train, test, best_params)

****************************** Model: randomforest ****************************** 

fold 1/5...
rmse: 0.5194, run time: 25.09

fold 2/5...
rmse: 0.5353, run time: 23.47

fold 3/5...
rmse: 0.5292, run time: 24.09

fold 4/5...
rmse: 0.5453, run time: 24.38

fold 5/5...
rmse: 0.5293, run time: 25.21


Training completed. Total run time: 122.264595
CV score:
	 mean: 0.531709
	 std: 0.008516


In [13]:
# get cv score and final predictions
score = np.mean(scores)
final_preds = np.mean(np.array(preds), axis=0)

In [14]:
# save predictions
sub = pd.read_csv('../data/raw/sample_submission.csv', index_col=0)
sub[TARGET] = final_preds
sub.head()

out_path = '../submissions/'
os.makedirs(out_path, exist_ok=True)
sub.to_csv(out_path + f'{cfg.modelname}.csv')