# Single Output Regression

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from pathlib import Path

# Notebook-wide configuration.
SEED = 42        # seed handed to H2OAutoML
MAX_TIME = 1800  # AutoML time budget in seconds, used for each target

# Raw data lives in data/raw under the parent of the current working directory.
HOME = Path.cwd().parent
RAW_PATH = HOME.joinpath('data/raw')
print(RAW_PATH)

/home/rco/DS/kdd22/data/raw


In [2]:
# Load the public table (one row per image Filename); merge_pixels_dist uses
# it as the left-hand side of every join.
pub = pd.read_csv(RAW_PATH / 'public.csv')
print(pub.shape)
pub.head()

(146262, 6)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0


In [3]:
def merge_pixels_dist(dataset='train', suffixes=['', '_a', '_a_est', '_h', '_h_est', '_p', '_p_est']):
    """Join the public table with every pixel-distance file of one dataset.

    For each suffix the file ``pixels_dist_<dataset><suffix>.csv`` is read from
    ``RAW_PATH``, its ``Images`` column is renamed to ``Filename``, and the
    result is inner-joined onto a copy of the module-level ``pub`` frame.
    Columns that already exist on the left keep their name; the incoming
    duplicates receive the current suffix. The shape after each join is
    printed.

    Args:
        dataset: dataset tag embedded in the file names (e.g. 'train', 'test').
        suffixes: file-name suffixes to merge, in order.

    Returns:
        The merged DataFrame; ``pub`` itself is left untouched.
    """
    merged = pub.copy()
    for suffix in suffixes:
        csv_name = ''.join(('pixels_dist_', dataset, suffix, '.csv'))
        pixels = pd.read_csv(RAW_PATH / csv_name).rename(columns={'Images': 'Filename'})
        merged = merged.merge(pixels, on='Filename', how='inner', suffixes=(None, suffix))
        print(merged.shape)

    return merged

# Build the training table: public rows inner-joined with every
# pixels_dist_train*.csv file.
train = merge_pixels_dist('train')
train.head()

(91231, 8)
(91231, 10)
(91231, 12)
(91231, 14)
(91231, 16)
(91231, 18)
(91231, 20)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0,-1.098183,2.828369,-1.296189,2.868935,-1.220466,2.826518,-1.098183,2.828369,-1.230324,2.83861,-1.296189,2.868935,-1.220466,2.826518
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2,1.452448,0.207358,0.847966,0.356077,0.7925,0.231472,1.452448,0.207358,0.753004,0.217258,0.847966,0.356077,0.7925,0.231472
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3,-3.544215,-1.494362,-3.517452,-1.592591,-3.823705,-1.658855,-3.544215,-1.494362,-3.818374,-1.668158,-3.517452,-1.592591,-3.823705,-1.658855
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1,-6.054061,-4.08215,-6.091535,-3.099275,-6.065357,-2.354155,-6.054061,-4.08215,-6.145322,-2.303187,-6.091535,-3.099275,-6.065357,-2.354155
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0,-0.69025,8.232869,-0.449659,8.125691,-0.564889,8.012415,-0.69025,8.232869,-0.555564,7.999658,-0.449659,8.125691,-0.564889,8.012415


In [4]:
# Build the test table the same way, then discard the target columns so the
# test frame only carries identifiers and features.
test = merge_pixels_dist('test').drop(columns=['North', 'East'])
test.head()

(55031, 8)
(55031, 10)
(55031, 12)
(55031, 14)
(55031, 16)
(55031, 18)
(55031, 20)


Unnamed: 0,Filename,Altitude,Delta,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est
0,000053b1e684c9e7ea73727b2238ce18.jpg,167.943069,0.010269,0,3.946422,-3.815853,0.356597,0.257315,-0.654044,-1.042109,3.946422,-3.815853,5.856787,-8.512549,0.356597,0.257315,-0.654044,-1.042109
1,00029153d12ae1c9abe59c17ff2e0895.jpg,195.853088,0.089218,2,7.668228,-4.031865,9.366933,-2.493681,4.654472,-5.100857,7.668228,-4.031865,4.936895,-4.14951,9.366933,-2.493681,4.654472,-5.100857
2,0006246bee639c7a7b11a08e34dd3cc6.jpg,146.943466,-0.018326,1,-11.549439,-0.880645,-12.51837,-0.683746,-12.124315,-0.030847,-11.549439,-0.880645,-12.188266,0.273252,-12.51837,-0.683746,-12.124315,-0.030847
3,00063cb5da1826febf178b669eea3250.jpg,213.184418,-0.108704,1,-28.991636,1.930891,-25.94293,-0.682081,-21.449981,4.21257,-28.991636,1.930891,-21.237231,4.168302,-25.94293,-0.682081,-21.449981,4.21257
4,00063ece2e68a8847f228e8fd922f851.jpg,184.757767,0.0177,0,0.184479,-6.925782,-0.165511,-6.852798,0.008325,-6.20988,0.184479,-6.925782,0.003814,-6.17922,-0.165511,-6.852798,0.008325,-6.20988


In [5]:
# H2O is imported here, right before the cluster is brought up.
import h2o
from h2o.automl import H2OAutoML

# Connect to an H2O instance (a local server is started when none is running),
# capped at 8 threads and 32G of memory.
h2o.init(nthreads=8, max_mem_size='32G')

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_342"; OpenJDK Runtime Environment (build 1.8.0_342-8u342-b07-0ubuntu1~22.04-b07); OpenJDK 64-Bit Server VM (build 25.342-b07, mixed mode)
  Starting server from /home/rco/anaconda3/envs/kdd22/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpenv6jpiz
  JVM stdout: /tmp/tmpenv6jpiz/h2o_rco_started_from_python.out
  JVM stderr: /tmp/tmpenv6jpiz/h2o_rco_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,00 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,3 days
H2O_cluster_name:,H2O_from_python_rco_rmbw2z
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,28.42 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,8


In [6]:
# Upload the pandas tables to the H2O cluster. The names `train` and `test`
# are rebound from pandas DataFrames to H2OFrames, so the conversion is guarded:
# without the check, re-running this cell would pass an H2OFrame back into the
# H2OFrame constructor, which is documented to take native Python / numpy /
# pandas data.
if not isinstance(train, h2o.H2OFrame):
    train = h2o.H2OFrame(train)
if not isinstance(test, h2o.H2OFrame):
    test = h2o.H2OFrame(test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [7]:
# Predictor columns: everything in the training frame except the identifier,
# the two targets and the fold label.
features = [
    column
    for column in train.columns
    if column not in ('Filename', 'North', 'East', 'kfold')
]

def train_automl(train, test, target, max_runtime_secs=3600):
    """Fit one H2O AutoML run for a single target and score both frames.

    The predictor columns come from the module-level ``features`` list and the
    seed from the module-level ``SEED``. After training, the AutoML leader
    model predicts on ``train`` and on ``test``; in each prediction frame the
    ``predict`` column is renamed to ``<target>_prediction``.

    NOTE(review): the train predictions are produced on the same frame the
    models were fitted on, so they are in-sample values.

    Args:
        train: frame used as ``training_frame`` and scored afterwards.
        test: frame scored with the leader model.
        target: name of the response column in ``train``.
        max_runtime_secs: time budget forwarded to ``H2OAutoML``.

    Returns:
        Tuple ``(aml, train_preds, test_preds)``.
    """
    automl_options = {
        'seed': SEED,
        'max_runtime_secs': max_runtime_secs,
        'stopping_metric': 'RMSE',
        'sort_metric': 'RMSE',
        'exploitation_ratio': 0.1,
        'verbosity': 'info',
    }
    aml = H2OAutoML(**automl_options)
    aml.train(x=features, y=target, training_frame=train)

    prediction_column = target + '_prediction'
    scored = []
    for frame in (train, test):
        frame_preds = aml.leader.predict(frame)
        # rename() is called for its effect on frame_preds; its return value
        # is deliberately not used.
        frame_preds.rename({'predict': prediction_column})
        scored.append(frame_preds)
    train_preds, test_preds = scored

    return aml, train_preds, test_preds
    

In [8]:
# AutoML run for the North target, limited to MAX_TIME seconds.
north_aml, north_train_preds, north_test_preds = train_automl(
    train, test, target='North', max_runtime_secs=MAX_TIME
)
north_aml.leaderboard.head(20)

AutoML progress: |
15:47:25.869: Project: AutoML_1_20221031_154725
15:47:25.871: 5-fold cross-validation will be used.
15:47:25.871: Setting stopping tolerance adaptively based on the training frame: 0.0033107682550761745
15:47:25.871: Build control seed: 42
15:47:25.871: training frame: Frame key: AutoML_1_20221031_154725_training_Key_Frame__upload_be13ba34ec5f514f20030f37b9f64815.hex    cols: 20    rows: 91231  chunks: 9    size: 16981251  checksum: -8173609672253272758
15:47:25.872: validation frame: NULL
15:47:25.872: leaderboard frame: NULL
15:47:25.872: blending frame: NULL
15:47:25.872: response column: North
15:47:25.872: fold column: null
15:47:25.872: weights column: null
15:47:25.883: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), gri

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_4_AutoML_1_20221031_154725,0.320151,0.102497,0.118318,,0.102497
StackedEnsemble_AllModels_3_AutoML_1_20221031_154725,0.320326,0.102608,0.118454,,0.102608
StackedEnsemble_BestOfFamily_4_AutoML_1_20221031_154725,0.320535,0.102743,0.120517,,0.102743
StackedEnsemble_BestOfFamily_5_AutoML_1_20221031_154725,0.323723,0.104796,0.122122,,0.104796
GBM_grid_1_AutoML_1_20221031_154725_model_53,0.324,0.104976,0.124874,,0.104976
GBM_grid_1_AutoML_1_20221031_154725_model_34,0.326721,0.106746,0.131505,,0.106746
GBM_grid_1_AutoML_1_20221031_154725_model_24,0.331505,0.109895,0.125586,,0.109895
XGBoost_grid_1_AutoML_1_20221031_154725_model_14,0.332413,0.110499,0.129901,,0.110499
StackedEnsemble_AllModels_1_AutoML_1_20221031_154725,0.334342,0.111785,0.128398,,0.111785
StackedEnsemble_BestOfFamily_2_AutoML_1_20221031_154725,0.335208,0.112364,0.127131,,0.112364


In [9]:
# AutoML run for the East target, limited to MAX_TIME seconds.
east_aml, east_train_preds, east_test_preds = train_automl(
    train, test, target='East', max_runtime_secs=MAX_TIME
)
east_aml.leaderboard.head(20)

AutoML progress: |
16:17:30.459: Project: AutoML_2_20221031_161730
16:17:30.459: 5-fold cross-validation will be used.
16:17:30.459: Setting stopping tolerance adaptively based on the training frame: 0.0033107682550761745
16:17:30.459: Build control seed: 42
16:17:30.459: training frame: Frame key: AutoML_2_20221031_161730_training_Key_Frame__upload_be13ba34ec5f514f20030f37b9f64815.hex    cols: 20    rows: 91231  chunks: 9    size: 16981251  checksum: -8173609672253272758
16:17:30.459: validation frame: NULL
16:17:30.459: leaderboard frame: NULL
16:17:30.459: blending frame: NULL
16:17:30.459: response column: East
16:17:30.459: fold column: null
16:17:30.459: weights column: null
16:17:30.459: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_4_AutoML_2_20221031_161730,0.290585,0.0844394,0.114772,,0.0844394
StackedEnsemble_AllModels_3_AutoML_2_20221031_161730,0.2919,0.0852056,0.114951,,0.0852056
StackedEnsemble_BestOfFamily_6_AutoML_2_20221031_161730,0.292131,0.0853406,0.11976,,0.0853406
StackedEnsemble_BestOfFamily_5_AutoML_2_20221031_161730,0.292273,0.0854234,0.119548,,0.0854234
StackedEnsemble_BestOfFamily_4_AutoML_2_20221031_161730,0.292372,0.0854812,0.11958,,0.0854812
XGBoost_grid_1_AutoML_2_20221031_161730_model_14,0.297324,0.0884015,0.12563,,0.0884015
StackedEnsemble_AllModels_5_AutoML_2_20221031_161730,0.298697,0.0892197,0.140651,,0.0892197
GBM_grid_1_AutoML_2_20221031_161730_model_34,0.303237,0.0919526,0.125875,,0.0919526
GBM_grid_1_AutoML_2_20221031_161730_model_53,0.303863,0.0923329,0.122416,,0.0923329
StackedEnsemble_AllModels_2_AutoML_2_20221031_161730,0.305224,0.0931615,0.125155,,0.0931615


In [10]:
# Append the North and East prediction columns to the training frame and
# export the result to the working directory.
# NOTE(review): the CSV is written with pandas' default index column; confirm
# that downstream readers expect it.
train_final = train.cbind(north_train_preds).cbind(east_train_preds)
train_final.as_data_frame().to_csv('train_final.csv')

train_final.head(5)

Filename,Altitude,Delta,North,East,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est,North_prediction,East_prediction
00003e3b9e5336685200ae85d21b4f5e.jpg,178.83,-0.0652313,-0.386045,0.929772,0,-1.09818,2.82837,-1.29619,2.86893,-1.22047,2.82652,-1.09818,2.82837,-1.23032,2.83861,-1.29619,2.86893,-1.22047,2.82652,-0.370673,0.918724
0001261e2060303a06ba6c64d676d639.jpg,207.921,-0.0806885,0.635584,0.152819,2,1.45245,0.207358,0.847966,0.356077,0.7925,0.231472,1.45245,0.207358,0.753004,0.217258,0.847966,0.356077,0.7925,0.231472,0.640065,0.163524
0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048,0.0215759,-1.22823,-0.499388,3,-3.54421,-1.49436,-3.51745,-1.59259,-3.8237,-1.65885,-3.54421,-1.49436,-3.81837,-1.66816,-3.51745,-1.59259,-3.8237,-1.65885,-1.15011,-0.47306
0004289ee1c7b8b08c77e19878106ae3.jpg,201.085,0.505981,-1.73971,-0.699928,1,-6.05406,-4.08215,-6.09153,-3.09927,-6.06536,-2.35415,-6.05406,-4.08215,-6.14532,-2.30319,-6.09153,-3.09927,-6.06536,-2.35415,-1.8073,-0.692987
0004d0b59e19461ff126e3a08a814c33.jpg,187.55,-0.328156,-0.169798,2.82875,0,-0.69025,8.23287,-0.449659,8.12569,-0.564889,8.01241,-0.69025,8.23287,-0.555564,7.99966,-0.449659,8.12569,-0.564889,8.01241,-0.170678,2.80593


In [11]:
# Append the North and East prediction columns to the test frame and export
# the result to the working directory.
# NOTE(review): the CSV is written with pandas' default index column; confirm
# that downstream readers expect it.
test_final = test.cbind(north_test_preds).cbind(east_test_preds)
test_final.as_data_frame().to_csv('test_final.csv')

test_final

Filename,Altitude,Delta,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est,North_prediction,East_prediction
000053b1e684c9e7ea73727b2238ce18.jpg,167.943,0.0102692,0,3.94642,-3.81585,0.356597,0.257315,-0.654044,-1.04211,3.94642,-3.81585,5.85679,-8.51255,0.356597,0.257315,-0.654044,-1.04211,-0.162767,-0.0421389
00029153d12ae1c9abe59c17ff2e0895.jpg,195.853,0.0892181,2,7.66823,-4.03186,9.36693,-2.49368,4.65447,-5.10086,7.66823,-4.03186,4.9369,-4.14951,9.36693,-2.49368,4.65447,-5.10086,1.01156,-0.482322
0006246bee639c7a7b11a08e34dd3cc6.jpg,146.943,-0.0183258,1,-11.5494,-0.880645,-12.5184,-0.683746,-12.1243,-0.0308469,-11.5494,-0.880645,-12.1883,0.273252,-12.5184,-0.683746,-12.1243,-0.0308469,-1.42134,0.143293
00063cb5da1826febf178b669eea3250.jpg,213.184,-0.108704,1,-28.9916,1.93089,-25.9429,-0.682081,-21.45,4.21257,-28.9916,1.93089,-21.2372,4.1683,-25.9429,-0.682081,-21.45,4.21257,-0.90714,0.0534793
00063ece2e68a8847f228e8fd922f851.jpg,184.758,0.0177002,0,0.184479,-6.92578,-0.165511,-6.8528,0.00832452,-6.20988,0.184479,-6.92578,0.00381415,-6.17922,-0.165511,-6.8528,0.00832452,-6.20988,0.149268,-1.31934
000838c1249fec206b77360ff0adc110.jpg,209.136,-0.544525,1,2.54224,-2.43533,2.78873,-2.46239,3.46306,-2.61436,2.54224,-2.43533,3.48015,-2.62869,2.78873,-2.46239,3.46306,-2.61436,1.29458,-1.54507
000a8e84b013655b832041a6f362e5c9.jpg,177.72,-0.23024,4,-0.601386,-3.88556,-0.061612,-4.1541,-0.321557,-4.38535,-0.601386,-3.88556,-0.280671,-4.39763,-0.061612,-4.1541,-0.321557,-4.38535,-0.118214,-1.24217
000d0c74074191add6f22e0004db8f76.jpg,200.488,0.0812378,3,3.12172,-6.69092,2.78086,-5.82139,2.14271,-5.28396,3.12172,-6.69092,2.19983,-5.08691,2.78086,-5.82139,2.14271,-5.28396,0.839787,-2.0338
000dd3543ac84d906eae52e7c779bb2a.jpg,155.313,0.062027,4,-0.954504,-9.59106,-3.27656,-8.70192,-4.1014,-8.30099,-0.954504,-9.59106,-4.38984,-8.47304,-3.27656,-8.70192,-4.1014,-8.30099,-0.436464,-0.898591
00129b07887a18a7331909231c28816e.jpg,187.643,-0.133362,4,0.559739,4.16881,0.2546,4.49667,0.456218,4.28046,0.559739,4.16881,0.446538,4.31604,0.2546,4.49667,0.456218,4.28046,0.165328,1.65831


# Submission

In [12]:
# Reshape the test predictions into the long submission layout: one row per
# (image, target) pair, with Id = '<Filename>:<target>' and the predicted value.
submission = (
    test_final.as_data_frame()[['Filename', 'North_prediction', 'East_prediction']]
    .rename(columns={'North_prediction': 'North', 'East_prediction': 'East'})
    .melt(id_vars='Filename', value_name='Predicted')
    .assign(Id=lambda long_df: long_df['Filename'] + ':' + long_df['variable'])
    [['Id', 'Predicted']]
)
submission.head()

Unnamed: 0,Id,Predicted
0,000053b1e684c9e7ea73727b2238ce18.jpg:North,-0.162767
1,00029153d12ae1c9abe59c17ff2e0895.jpg:North,1.011559
2,0006246bee639c7a7b11a08e34dd3cc6.jpg:North,-1.421341
3,00063cb5da1826febf178b669eea3250.jpg:North,-0.90714
4,00063ece2e68a8847f228e8fd922f851.jpg:North,0.149268


In [13]:
# Write the two-column submission (Id, Predicted) to the working directory,
# without the DataFrame index.
submission.to_csv('submission_h2o_full.csv', index=False)

# North Explanation

In [14]:
# north_aml.explain(train)

# East Explanation

In [15]:
# east_aml.explain(train)

In [16]:
# Stop the H2O cluster started by this notebook. The module-level
# h2o.shutdown() is deprecated; the cluster handle exposes the supported call.
h2o.cluster().shutdown()

H2O session _sid_b763 closed.


  h2o.shutdown()
