# Single Output Regression

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from kaggle_secrets import UserSecretsClient
from pathlib import Path


# Disabled Kaggle-secrets lookup for a wandb key; nothing visible in this
# notebook reads `wandb_key`.
# user_secrets = UserSecretsClient()
# wandb_key = user_secrets.get_secret("wandb_key")

# Seed passed to H2OAutoML(seed=...).
SEED = 42
# Input directory that the pixel-estimate CSVs are read from.
PATH = Path('../input/kdd-team/')
# [train, test] pixel-estimate files, in the order make_train_test receives them.
PIX_FILES = ['pixels_dist_train_a_est.csv', 'pixels_dist_test_a_est.csv']
# Passed as `max_runtime_secs` to each AutoML run.
MAX_TIME = 360

In [2]:
# Label/metadata table that make_train_test merges onto the pixel files by
# 'Filename'. Built from the shared PATH constant instead of a second
# hard-coded copy of the input directory, so relocating the data only needs
# one edit.
pub = pd.read_csv(PATH / 'public.csv')
print(pub.shape)
pub.head()

(146262, 6)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0


In [3]:
# Show column dtypes and non-null counts for the label table.
pub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146262 entries, 0 to 146261
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Filename  146262 non-null  object 
 1   Altitude  146262 non-null  float64
 2   Delta     146262 non-null  float64
 3   North     91231 non-null   float64
 4   East      91231 non-null   float64
 5   kfold     146262 non-null  int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 6.7+ MB


In [4]:
def make_train_test(train_pixels_file, test_pixels_file=None, H2O=False,
                    labels=None, data_dir=None):
    """Load pixel-estimate CSVs and join them with the label table.

    Each pixel file has its 'Images' column renamed to 'Filename' and is then
    inner-merged with the label table on 'Filename'. The test frame
    additionally has its 'North' and 'East' columns dropped.

    Parameters
    ----------
    train_pixels_file : str
        File name of the training pixel CSV, relative to ``data_dir``.
    test_pixels_file : str, optional
        File name of the test pixel CSV. When omitted, the returned test
        value is ``None``.
    H2O : bool, default False
        If true, convert the resulting pandas frames to ``h2o.H2OFrame``.
    labels : pandas.DataFrame, optional
        Table to merge on 'Filename'. Defaults to the notebook-level ``pub``.
    data_dir : path-like, optional
        Directory containing the pixel files. Defaults to the notebook-level
        ``PATH``.

    Returns
    -------
    tuple
        ``(train, test)``; ``test`` is ``None`` when no test file was given.
    """
    # Fall back to the notebook globals only when the caller did not override
    # them, so existing three-argument calls behave as before.
    labels = pub if labels is None else labels
    data_dir = PATH if data_dir is None else Path(data_dir)

    def _load_and_merge(pixels_file, split_name):
        # Read one pixel file, report its raw shape, and attach the labels.
        frame = pd.read_csv(data_dir / pixels_file, low_memory=False)
        print(f'{split_name} Shape:', frame.shape)
        frame = frame.rename(columns={'Images': 'Filename'})
        return frame.merge(labels, on='Filename', how='inner').reset_index(drop=True)

    train = _load_and_merge(train_pixels_file, 'Train')
    print('Train Merged Shape:', train.shape)

    # Always bind `test`; the previous version assigned `X_test` here and then
    # raised NameError on return when no test file was supplied.
    test = None
    if test_pixels_file is not None:
        test = _load_and_merge(test_pixels_file, 'Test')
        test = test.drop(['North', 'East'], axis=1)
        print('Test Merged Shape:', test.shape)

    if H2O:
        # Imported here so the function does not depend on a later cell having
        # already imported h2o.
        import h2o
        train = h2o.H2OFrame(train)
        if test is not None:
            test = h2o.H2OFrame(test)

    return train, test

In [5]:
import h2o
from h2o.automl import H2OAutoML

# Connect to an H2O instance; per the log below, a local server is started
# when none is already running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.16" 2022-07-19; OpenJDK Runtime Environment (build 11.0.16+8-post-Ubuntu-0ubuntu120.04); OpenJDK 64-Bit Server VM (build 11.0.16+8-post-Ubuntu-0ubuntu120.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp0m4jzkne
  JVM stdout: /tmp/tmp0m4jzkne/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp0m4jzkne/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.1
H2O_cluster_version_age:,1 month and 11 days
H2O_cluster_name:,H2O_from_python_unknownUser_vyx5rh
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.500 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [6]:
# Build the merged train/test tables and convert both to H2OFrames.
train, test = make_train_test(
    train_pixels_file=PIX_FILES[0],
    test_pixels_file=PIX_FILES[1],
    H2O=True,
)

Train Shape: (91231, 3)
Train Merged Shape: (91231, 8)
Test Shape: (55031, 3)
Test Merged Shape: (55031, 6)
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [7]:
# Predictor columns passed to AutoML as `x`; train_automl reads this global.
features = ['North_pixel', 'East_pixel', 'Altitude', 'Delta']

def train_automl(train, test, target, max_runtime_secs=3600):
    """Run H2O AutoML for one target column and predict with the leader.

    The model is fit on ``train`` using the global ``features`` list as
    predictors and ``target`` as the response. The leader model then scores
    both ``train`` and ``test``; each prediction frame has its 'predict'
    column renamed to ``'<target>_prediction'``.

    Parameters
    ----------
    train, test : H2OFrame
        Frames to fit on / score.
    target : str
        Name of the response column in ``train``.
    max_runtime_secs : int, default 3600
        Forwarded to ``H2OAutoML(max_runtime_secs=...)``.

    Returns
    -------
    tuple
        ``(aml, train_preds, test_preds)``.
    """
    prediction_column = target + '_prediction'

    automl_settings = {
        'seed': SEED,
        'max_runtime_secs': max_runtime_secs,
        'stopping_metric': 'RMSE',
        'sort_metric': 'RMSE',
        'exploitation_ratio': 0.1,
        'verbosity': 'info',
    }
    aml = H2OAutoML(**automl_settings)
    aml.train(training_frame=train, x=features, y=target)

    def _score(frame):
        # rename() updates the prediction frame itself, so its return value
        # is not needed.
        scored = aml.leader.predict(frame)
        scored.rename({'predict': prediction_column})
        return scored

    train_preds = _score(train)
    test_preds = _score(test)

    return aml, train_preds, test_preds
    

In [8]:
# Fit AutoML for the North target, then show the top of its leaderboard.
north_aml, north_train_preds, north_test_preds = train_automl(
    train=train,
    test=test,
    target='North',
    max_runtime_secs=MAX_TIME,
)
north_aml.leaderboard.head(5)

AutoML progress: |
22:00:59.10: Project: AutoML_1_20221030_220058
22:00:59.12: 5-fold cross-validation will be used.
22:00:59.13: Setting stopping tolerance adaptively based on the training frame: 0.0033107682550761745
22:00:59.13: Build control seed: 42
22:00:59.14: training frame: Frame key: AutoML_1_20221030_220058_training_Key_Frame__upload_b10035c17da862f2b089829acda44db7.hex    cols: 8    rows: 91231  chunks: 4    size: 8212986  checksum: 7075760434043015152
22:00:59.14: validation frame: NULL
22:00:59.14: leaderboard frame: NULL
22:00:59.14: blending frame: NULL
22:00:59.14: response column: North
22:00:59.15: fold column: null
22:00:59.15: weights column: null
22:00:59.42: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), 

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_2_AutoML_1_20221030_220058,0.335382,0.112481,0.129516,,0.112481
StackedEnsemble_AllModels_3_AutoML_1_20221030_220058,0.335668,0.112673,0.130267,,0.112673
StackedEnsemble_BestOfFamily_3_AutoML_1_20221030_220058,0.335729,0.112714,0.128862,,0.112714
StackedEnsemble_AllModels_1_AutoML_1_20221030_220058,0.338157,0.11435,0.13007,,0.11435
StackedEnsemble_BestOfFamily_2_AutoML_1_20221030_220058,0.339272,0.115105,0.129542,,0.115105


In [9]:
# Fit AutoML for the East target, then show the top of its leaderboard.
east_aml, east_train_preds, east_test_preds = train_automl(
    train=train,
    test=test,
    target='East',
    max_runtime_secs=MAX_TIME,
)
east_aml.leaderboard.head(5)

AutoML progress: |
22:07:03.741: Project: AutoML_2_20221030_220703
22:07:03.741: 5-fold cross-validation will be used.
22:07:03.742: Setting stopping tolerance adaptively based on the training frame: 0.0033107682550761745
22:07:03.742: Build control seed: 42
22:07:03.743: training frame: Frame key: AutoML_2_20221030_220703_training_Key_Frame__upload_b10035c17da862f2b089829acda44db7.hex    cols: 8    rows: 91231  chunks: 4    size: 8212986  checksum: 7075760434043015152
22:07:03.743: validation frame: NULL
22:07:03.743: leaderboard frame: NULL
22:07:03.743: blending frame: NULL
22:07:03.743: response column: East
22:07:03.743: fold column: null
22:07:03.743: weights column: null
22:07:03.743: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_BestOfFamily_3_AutoML_2_20221030_220703,0.308011,0.0948709,0.124022,,0.0948709
StackedEnsemble_AllModels_2_AutoML_2_20221030_220703,0.308046,0.0948921,0.124019,,0.0948921
StackedEnsemble_AllModels_3_AutoML_2_20221030_220703,0.308064,0.0949037,0.124037,,0.0949037
StackedEnsemble_AllModels_1_AutoML_2_20221030_220703,0.309652,0.0958841,0.12542,,0.0958841
StackedEnsemble_BestOfFamily_2_AutoML_2_20221030_220703,0.309672,0.0958966,0.125414,,0.0958966


In [10]:
# Display the merged training H2OFrame.
train

Filename,North_pixel,East_pixel,Altitude,Delta,North,East,kfold
00003e3b9e5336685200ae85d21b4f5e.jpg,-1.22047,2.82652,178.83,-0.0652313,-0.386045,0.929772,0
0001261e2060303a06ba6c64d676d639.jpg,0.7925,0.231472,207.921,-0.0806885,0.635584,0.152819,2
0002ac0d783338cfeab0b2bdbd872cda.jpg,-3.8237,-1.65885,178.048,0.0215759,-1.22823,-0.499388,3
0004289ee1c7b8b08c77e19878106ae3.jpg,-6.06536,-2.35415,201.085,0.505981,-1.73971,-0.699928,1
0004d0b59e19461ff126e3a08a814c33.jpg,-0.564889,8.01241,187.55,-0.328156,-0.169798,2.82875,0
00053f5e11d1fe4e49a221165b39abc9.jpg,-1.26505,-0.239883,168.996,-0.0148621,-0.321776,-0.0755221,4
00056c20eb5a029583db75506953f1d9.jpg,1.78264,-4.7307,179.963,0.109741,0.646206,-1.68678,2
0006aabe0ba47a35c0b0bf6596f85159.jpg,-6.36111,0.247669,180.632,0.212234,-0.178107,0.132653,2
0006dd05ea1e999ddaa041a7091b7b36.jpg,-7.89952,8.27357,168.886,0.0512695,-1.82588,1.95608,2
0007789b118e4710fc0e7c8758a6532a.jpg,6.36956,-2.11581,194.349,0.387817,2.46552,-0.88375,3


In [11]:
# Append both prediction columns to the training frame and persist the result.
# NOTE: to_csv is called without index=False, so the pandas row index is
# written as an extra leading column.
train_final = train.cbind(north_train_preds).cbind(east_train_preds)
train_final.as_data_frame().to_csv('train_final.csv')

train_final.head(5)

Filename,North_pixel,East_pixel,Altitude,Delta,North,East,kfold,North_prediction,East_prediction
00003e3b9e5336685200ae85d21b4f5e.jpg,-1.22047,2.82652,178.83,-0.0652313,-0.386045,0.929772,0,-0.395535,0.920265
0001261e2060303a06ba6c64d676d639.jpg,0.7925,0.231472,207.921,-0.0806885,0.635584,0.152819,2,0.610366,0.186492
0002ac0d783338cfeab0b2bdbd872cda.jpg,-3.8237,-1.65885,178.048,0.0215759,-1.22823,-0.499388,3,-1.23191,-0.481458
0004289ee1c7b8b08c77e19878106ae3.jpg,-6.06536,-2.35415,201.085,0.505981,-1.73971,-0.699928,1,-1.76411,-0.729352
0004d0b59e19461ff126e3a08a814c33.jpg,-0.564889,8.01241,187.55,-0.328156,-0.169798,2.82875,0,-0.106102,2.57709


In [12]:
# Append both prediction columns to the test frame and persist the result.
# NOTE: to_csv is called without index=False, so the pandas row index is
# written as an extra leading column.
test_final = test.cbind(north_test_preds).cbind(east_test_preds)
test_final.as_data_frame().to_csv('test_final.csv')

test_final

Filename,North_pixel,East_pixel,Altitude,Delta,kfold,North_prediction,East_prediction
000053b1e684c9e7ea73727b2238ce18.jpg,-0.654044,-1.04211,167.943,0.0102692,0,-0.23325,-0.153566
00029153d12ae1c9abe59c17ff2e0895.jpg,4.65447,-5.10086,195.853,0.0892181,2,1.61387,-1.64877
0006246bee639c7a7b11a08e34dd3cc6.jpg,-12.1243,-0.0308469,146.943,-0.0183258,1,-1.2992,0.178793
00063cb5da1826febf178b669eea3250.jpg,-21.45,4.21257,213.184,-0.108704,1,-1.07919,0.39061
00063ece2e68a8847f228e8fd922f851.jpg,0.00832452,-6.20988,184.758,0.0177002,0,0.141974,-0.97295
000838c1249fec206b77360ff0adc110.jpg,3.46306,-2.61436,209.136,-0.544525,1,1.17048,-1.30534
000a8e84b013655b832041a6f362e5c9.jpg,-0.321557,-4.38535,177.72,-0.23024,4,-0.121386,-1.08199
000d0c74074191add6f22e0004db8f76.jpg,2.14271,-5.28396,200.488,0.0812378,3,0.61841,-1.76348
000dd3543ac84d906eae52e7c779bb2a.jpg,-4.1014,-8.30099,155.313,0.062027,4,-0.525346,-1.09816
00129b07887a18a7331909231c28816e.jpg,0.456218,4.28046,187.643,-0.133362,4,0.219356,1.75238


In [13]:
# Reshape the wide predictions into the long submission layout: one row per
# (file, target) pair, identified by '<Filename>:<North|East>'.
submission = (
    test_final.as_data_frame()[['Filename', 'North_prediction', 'East_prediction']]
    .rename(columns={'North_prediction': 'North', 'East_prediction': 'East'})
    .melt(id_vars='Filename', value_name='Predicted')
    .assign(Id=lambda long_df: long_df['Filename'] + ':' + long_df['variable'])
    .loc[:, ['Id', 'Predicted']]
)
submission.head()

Unnamed: 0,Id,Predicted
0,000053b1e684c9e7ea73727b2238ce18.jpg:North,-0.23325
1,00029153d12ae1c9abe59c17ff2e0895.jpg:North,1.613872
2,0006246bee639c7a7b11a08e34dd3cc6.jpg:North,-1.299204
3,00063cb5da1826febf178b669eea3250.jpg:North,-1.079194
4,00063ece2e68a8847f228e8fd922f851.jpg:North,0.141974


In [14]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)