# Single Output Regression

In [3]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from pathlib import Path

# --- Run configuration ------------------------------------------------------
SEED = 42        # seed forwarded to H2OAutoML
MAX_TIME = 360   # per-target AutoML budget, passed as max_runtime_secs

# --- Data locations ---------------------------------------------------------
# The project root is taken to be the parent of the current working directory.
HOME = Path.cwd().parent
RAW_PATH = HOME / 'data' / 'raw'
print(RAW_PATH)

/home/rco/DS/kdd22/data/raw


In [4]:
# Base table: one row per image (Filename) with Altitude, Delta, the North/East
# targets and a kfold column, as shown in the preview below.
public_csv = RAW_PATH / 'public.csv'
pub = pd.read_csv(public_csv)
print(pub.shape)
pub.head()

(146262, 6)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0


In [5]:
def merge_pixels_dist(dataset='train', suffixes=('', '_a', '_a_est', '_h', '_h_est', '_p', '_p_est')):
    """Join the pixel-distance CSVs of one split onto a copy of ``pub``.

    For every suffix ``s`` the file ``RAW_PATH / 'pixels_dist_<dataset><s>.csv'``
    is read, its ``Images`` column is renamed to ``Filename`` and the table is
    inner-joined onto the running result on ``Filename``. Columns of the
    incoming file whose names already exist in the result get ``s`` appended;
    the columns already present keep their names.

    The shape of the running result is printed after each join, so rows lost
    to the inner join are visible.

    Parameters
    ----------
    dataset : str
        Split name embedded in the file names (the notebook uses ``'train'``
        and ``'test'``).
    suffixes : iterable of str
        File-name suffixes to merge, in order. The default is an immutable
        tuple, so it cannot be altered between calls.

    Returns
    -------
    pandas.DataFrame
        A new frame; the module-level ``pub`` is not modified.
    """
    merged = pub.copy()
    for suffix in suffixes:
        csv_path = RAW_PATH / f'pixels_dist_{dataset}{suffix}.csv'
        # Align the key column name with `pub` without mutating in place.
        pixels = pd.read_csv(csv_path).rename(columns={'Images': 'Filename'})
        merged = merged.merge(pixels, on='Filename', how='inner', suffixes=(None, suffix))
        print(merged.shape)

    return merged

# Training split: `pub` joined with every pixels_dist_train*.csv table.
train = merge_pixels_dist(dataset='train')
train.head()

(91231, 8)
(91231, 10)
(91231, 12)
(91231, 14)
(91231, 16)
(91231, 18)
(91231, 20)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0,-1.098183,2.828369,-1.296189,2.868935,-1.220466,2.826518,-1.098183,2.828369,-1.230324,2.83861,-1.296189,2.868935,-1.220466,2.826518
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2,1.452448,0.207358,0.847966,0.356077,0.7925,0.231472,1.452448,0.207358,0.753004,0.217258,0.847966,0.356077,0.7925,0.231472
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3,-3.544215,-1.494362,-3.517452,-1.592591,-3.823705,-1.658855,-3.544215,-1.494362,-3.818374,-1.668158,-3.517452,-1.592591,-3.823705,-1.658855
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1,-6.054061,-4.08215,-6.091535,-3.099275,-6.065357,-2.354155,-6.054061,-4.08215,-6.145322,-2.303187,-6.091535,-3.099275,-6.065357,-2.354155
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0,-0.69025,8.232869,-0.449659,8.125691,-0.564889,8.012415,-0.69025,8.232869,-0.555564,7.999658,-0.449659,8.125691,-0.564889,8.012415


In [6]:
# Test split: same joins as for train, then the North/East target columns
# carried over from `pub` are removed.
test = merge_pixels_dist(dataset='test').drop(columns=['North', 'East'])
test.head()

(55031, 8)
(55031, 10)
(55031, 12)
(55031, 14)
(55031, 16)
(55031, 18)
(55031, 20)


Unnamed: 0,Filename,Altitude,Delta,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est
0,000053b1e684c9e7ea73727b2238ce18.jpg,167.943069,0.010269,0,3.946422,-3.815853,0.356597,0.257315,-0.654044,-1.042109,3.946422,-3.815853,5.856787,-8.512549,0.356597,0.257315,-0.654044,-1.042109
1,00029153d12ae1c9abe59c17ff2e0895.jpg,195.853088,0.089218,2,7.668228,-4.031865,9.366933,-2.493681,4.654472,-5.100857,7.668228,-4.031865,4.936895,-4.14951,9.366933,-2.493681,4.654472,-5.100857
2,0006246bee639c7a7b11a08e34dd3cc6.jpg,146.943466,-0.018326,1,-11.549439,-0.880645,-12.51837,-0.683746,-12.124315,-0.030847,-11.549439,-0.880645,-12.188266,0.273252,-12.51837,-0.683746,-12.124315,-0.030847
3,00063cb5da1826febf178b669eea3250.jpg,213.184418,-0.108704,1,-28.991636,1.930891,-25.94293,-0.682081,-21.449981,4.21257,-28.991636,1.930891,-21.237231,4.168302,-25.94293,-0.682081,-21.449981,4.21257
4,00063ece2e68a8847f228e8fd922f851.jpg,184.757767,0.0177,0,0.184479,-6.925782,-0.165511,-6.852798,0.008325,-6.20988,0.184479,-6.925782,0.003814,-6.17922,-0.165511,-6.852798,0.008325,-6.20988


In [7]:
import h2o
from h2o.automl import H2OAutoML

# Connect to a local H2O instance, starting one if none is running (see the
# log below). The captured cluster summary reports 16 total cores and
# 32 allowed cores for this setting.
h2o.init(
    max_mem_size='48G',
    nthreads=32,
)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_342"; OpenJDK Runtime Environment (build 1.8.0_342-8u342-b07-0ubuntu1~22.04-b07); OpenJDK 64-Bit Server VM (build 25.342-b07, mixed mode)
  Starting server from /home/rco/anaconda3/envs/kdd22/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpbeoazdnf
  JVM stdout: /tmp/tmpbeoazdnf/h2o_rco_started_from_python.out
  JVM stderr: /tmp/tmpbeoazdnf/h2o_rco_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,3 days
H2O_cluster_name:,H2O_from_python_rco_piobqc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,42.64 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,32


In [8]:
# Upload both pandas frames to H2O. The names `train` and `test` are rebound
# to H2OFrame objects here; every later cell uses the H2OFrame versions.
train, test = h2o.H2OFrame(train), h2o.H2OFrame(test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [9]:
# Predictor columns: everything in `train` except the image id, the two
# regression targets and the fold assignment.
features = [
    column
    for column in train.columns
    if column not in ['Filename', 'North', 'East', 'kfold']
]

def train_automl(train, test, target, max_runtime_secs=3600):
    """Run H2O AutoML for one target and score both frames with the leader.

    The predictor list comes from the module-level ``features`` and the seed
    from the module-level ``SEED``. Models are ranked and early-stopped on
    RMSE.

    Parameters
    ----------
    train : frame passed to AutoML as ``training_frame`` and then scored.
    test : frame scored with the leader model.
    target : str
        Name of the response column; also used to build the name of the
        prediction column, ``<target>_prediction``.
    max_runtime_secs : int
        Time budget handed to ``H2OAutoML``.

    Returns
    -------
    tuple
        ``(aml, train_preds, test_preds)`` where both prediction frames have
        their ``predict`` column renamed to ``<target>_prediction``.
    """
    prediction_column = target + '_prediction'

    automl_options = dict(
        seed=SEED,
        max_runtime_secs=max_runtime_secs,
        stopping_metric='RMSE',
        sort_metric='RMSE',
        exploitation_ratio=0.1,
        verbosity='info',
    )
    aml = H2OAutoML(**automl_options)
    aml.train(x=features, y=target, training_frame=train)

    leader = aml.leader

    # `rename` is called for its effect on the frame itself; its return value
    # is not needed (the saved outputs show the renamed columns).
    train_preds = leader.predict(train)
    train_preds.rename({'predict': prediction_column})

    test_preds = leader.predict(test)
    test_preds.rename({'predict': prediction_column})

    return aml, train_preds, test_preds
    

In [10]:
# AutoML run for the North target, limited to MAX_TIME seconds.
north_aml, north_train_preds, north_test_preds = train_automl(
    train,
    test,
    target='North',
    max_runtime_secs=MAX_TIME,
)
north_aml.leaderboard.head(20)

AutoML progress: |
13:35:37.792: Project: AutoML_1_20221031_133537
13:35:37.795: 5-fold cross-validation will be used.
13:35:37.795: Setting stopping tolerance adaptively based on the training frame: 0.0033107682550761745
13:35:37.795: Build control seed: 42
13:35:37.795: training frame: Frame key: AutoML_1_20221031_133537_training_Key_Frame__upload_950daa9849f7f1e6ad80efa57aa94327.hex    cols: 20    rows: 91231  chunks: 9    size: 16981251  checksum: -8173609672253272758
13:35:37.795: validation frame: NULL
13:35:37.795: leaderboard frame: NULL
13:35:37.795: blending frame: NULL
13:35:37.796: response column: North
13:35:37.796: fold column: null
13:35:37.796: weights column: null
13:35:37.808: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), gri

In [None]:
# AutoML run for the East target, limited to MAX_TIME seconds.
east_aml, east_train_preds, east_test_preds = train_automl(
    train,
    test,
    target='East',
    max_runtime_secs=MAX_TIME,
)
east_aml.leaderboard.head(20)

AutoML progress: |
00:54:34.588: Project: AutoML_2_20221031_05434
00:54:34.588: Blending will be used.
00:54:34.589: Setting stopping tolerance adaptively based on the training frame: 0.0033107682550761745
00:54:34.589: Build control seed: 42
00:54:34.589: Since cross-validation is disabled, and validation, blending frame(s) were not provided, automatically split the training data into training, validation, blending frame(s) in the ratio 70/10/20.
00:54:34.785: training frame: Frame key: AutoML_2_20221031_05434_training_Key_Frame__upload_9f6d439ad8662552d77f613b00f042ec.hex    cols: 20    rows: 63883  chunks: 9    size: 11894523  checksum: -4175895588245746392
00:54:34.794: validation frame: Frame key: AutoML_2_20221031_05434_validation_Key_Frame__upload_9f6d439ad8662552d77f613b00f042ec.hex    cols: 20    rows: 9162  chunks: 9    size: 1716417  checksum: 7857583928764237808
00:54:34.794: leaderboard frame: Frame key: AutoML_2_20221031_05434_validation_Key_Frame__upload_9f6d439ad8662552

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_BestOfFamily_2_AutoML_2_20221031_05434,0.320294,0.102588,0.14419,,0.102588
StackedEnsemble_AllModels_1_AutoML_2_20221031_05434,0.320678,0.102835,0.144415,,0.102835
StackedEnsemble_BestOfFamily_1_AutoML_2_20221031_05434,0.326219,0.106419,0.150529,,0.106419
XGBoost_1_AutoML_2_20221031_05434,0.337221,0.113718,0.154495,,0.113718
GBM_1_AutoML_2_20221031_05434,0.340298,0.115803,0.162736,,0.115803
DRF_1_AutoML_2_20221031_05434,0.356087,0.126798,0.150717,,0.126798
XGBoost_2_AutoML_2_20221031_05434,0.357663,0.127923,0.173528,,0.127923
GBM_4_AutoML_2_20221031_05434,0.367756,0.135244,0.203864,,0.135244
GBM_3_AutoML_2_20221031_05434,0.373875,0.139783,0.202913,,0.139783
GBM_2_AutoML_2_20221031_05434,0.393964,0.155207,0.224959,,0.155207


In [None]:
# Append both leaders' predictions to the training frame. These are
# predictions on the same frame the models were trained on.
train_final = train.cbind(north_train_preds).cbind(east_train_preds)

# NOTE(review): no `index` argument is passed, so pandas also writes the row
# index as an unnamed first column, unlike submission.csv -- confirm intended.
train_final.as_data_frame().to_csv('train_final.csv')

train_final.head(5)

Filename,Altitude,Delta,North,East,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est,North_prediction,East_prediction
00003e3b9e5336685200ae85d21b4f5e.jpg,178.83,-0.0652313,-0.386045,0.929772,0,-1.09818,2.82837,-1.29619,2.86893,-1.22047,2.82652,-1.09818,2.82837,-1.23032,2.83861,-1.29619,2.86893,-1.22047,2.82652,-0.382476,0.888842
0001261e2060303a06ba6c64d676d639.jpg,207.921,-0.0806885,0.635584,0.152819,2,1.45245,0.207358,0.847966,0.356077,0.7925,0.231472,1.45245,0.207358,0.753004,0.217258,0.847966,0.356077,0.7925,0.231472,0.633915,0.160478
0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048,0.0215759,-1.22823,-0.499388,3,-3.54421,-1.49436,-3.51745,-1.59259,-3.8237,-1.65885,-3.54421,-1.49436,-3.81837,-1.66816,-3.51745,-1.59259,-3.8237,-1.65885,-1.13904,-0.48216
0004289ee1c7b8b08c77e19878106ae3.jpg,201.085,0.505981,-1.73971,-0.699928,1,-6.05406,-4.08215,-6.09153,-3.09927,-6.06536,-2.35415,-6.05406,-4.08215,-6.14532,-2.30319,-6.09153,-3.09927,-6.06536,-2.35415,-1.95296,-0.837682
0004d0b59e19461ff126e3a08a814c33.jpg,187.55,-0.328156,-0.169798,2.82875,0,-0.69025,8.23287,-0.449659,8.12569,-0.564889,8.01241,-0.69025,8.23287,-0.555564,7.99966,-0.449659,8.12569,-0.564889,8.01241,-0.181308,2.71981


In [None]:
# Append both leaders' predictions to the test frame.
test_final = test.cbind(north_test_preds).cbind(east_test_preds)

# NOTE(review): no `index` argument is passed, so pandas also writes the row
# index as an unnamed first column, unlike submission.csv -- confirm intended.
test_final.as_data_frame().to_csv('test_final.csv')

test_final

Filename,Altitude,Delta,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est,North_prediction,East_prediction
000053b1e684c9e7ea73727b2238ce18.jpg,167.943,0.0102692,0,3.94642,-3.81585,0.356597,0.257315,-0.654044,-1.04211,3.94642,-3.81585,5.85679,-8.51255,0.356597,0.257315,-0.654044,-1.04211,-0.0813788,-0.354913
00029153d12ae1c9abe59c17ff2e0895.jpg,195.853,0.0892181,2,7.66823,-4.03186,9.36693,-2.49368,4.65447,-5.10086,7.66823,-4.03186,4.9369,-4.14951,9.36693,-2.49368,4.65447,-5.10086,1.03116,-0.640209
0006246bee639c7a7b11a08e34dd3cc6.jpg,146.943,-0.0183258,1,-11.5494,-0.880645,-12.5184,-0.683746,-12.1243,-0.0308469,-11.5494,-0.880645,-12.1883,0.273252,-12.5184,-0.683746,-12.1243,-0.0308469,-1.6744,0.0134703
00063cb5da1826febf178b669eea3250.jpg,213.184,-0.108704,1,-28.9916,1.93089,-25.9429,-0.682081,-21.45,4.21257,-28.9916,1.93089,-21.2372,4.1683,-25.9429,-0.682081,-21.45,4.21257,-1.29895,0.397777
00063ece2e68a8847f228e8fd922f851.jpg,184.758,0.0177002,0,0.184479,-6.92578,-0.165511,-6.8528,0.00832452,-6.20988,0.184479,-6.92578,0.00381415,-6.17922,-0.165511,-6.8528,0.00832452,-6.20988,0.0444692,-1.48739
000838c1249fec206b77360ff0adc110.jpg,209.136,-0.544525,1,2.54224,-2.43533,2.78873,-2.46239,3.46306,-2.61436,2.54224,-2.43533,3.48015,-2.62869,2.78873,-2.46239,3.46306,-2.61436,1.59287,-1.49153
000a8e84b013655b832041a6f362e5c9.jpg,177.72,-0.23024,4,-0.601386,-3.88556,-0.061612,-4.1541,-0.321557,-4.38535,-0.601386,-3.88556,-0.280671,-4.39763,-0.061612,-4.1541,-0.321557,-4.38535,-0.134649,-1.37513
000d0c74074191add6f22e0004db8f76.jpg,200.488,0.0812378,3,3.12172,-6.69092,2.78086,-5.82139,2.14271,-5.28396,3.12172,-6.69092,2.19983,-5.08691,2.78086,-5.82139,2.14271,-5.28396,0.785977,-2.03307
000dd3543ac84d906eae52e7c779bb2a.jpg,155.313,0.062027,4,-0.954504,-9.59106,-3.27656,-8.70192,-4.1014,-8.30099,-0.954504,-9.59106,-4.38984,-8.47304,-3.27656,-8.70192,-4.1014,-8.30099,-0.463512,-1.17267
00129b07887a18a7331909231c28816e.jpg,187.643,-0.133362,4,0.559739,4.16881,0.2546,4.49667,0.456218,4.28046,0.559739,4.16881,0.446538,4.31604,0.2546,4.49667,0.456218,4.28046,0.146631,1.34891


In [None]:
# Reshape the wide predictions into the long submission layout: one row per
# (image, target) pair, keyed by "<Filename>:<North|East>".
submission = (
    test_final.as_data_frame()[['Filename', 'North_prediction', 'East_prediction']]
    .rename(columns={'North_prediction': 'North', 'East_prediction': 'East'})
    .melt(id_vars='Filename', value_name='Predicted')
    .assign(Id=lambda long_df: long_df['Filename'] + ':' + long_df['variable'])
    .loc[:, ['Id', 'Predicted']]
)
submission.head()

Unnamed: 0,Id,Predicted
0,000053b1e684c9e7ea73727b2238ce18.jpg:North,-0.081379
1,00029153d12ae1c9abe59c17ff2e0895.jpg:North,1.03116
2,0006246bee639c7a7b11a08e34dd3cc6.jpg:North,-1.674396
3,00063cb5da1826febf178b669eea3250.jpg:North,-1.298955
4,00063ece2e68a8847f228e8fd922f851.jpg:North,0.044469


In [None]:
# Persist the submission without the pandas row index.
submission.to_csv(
    'submission.csv',
    index=False,
)

# North Explanation

In [None]:
# Explanation output for the North AutoML run, computed against the
# training frame (`train` is the H2OFrame at this point).
north_aml.explain(train)

# East Explanation

In [None]:
# Explanation output for the East AutoML run, computed against the
# training frame (`train` is the H2OFrame at this point).
east_aml.explain(train)