# Single Output Regression

In [3]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from pathlib import Path

# Seed handed to H2OAutoML so repeated runs are comparable.
SEED = 42
# Runtime budget, in seconds, passed to each AutoML run.
MAX_TIME = 3600

# The project root is taken to be the parent of the notebook's working directory.
HOME = Path.cwd().parent
RAW_PATH = HOME / 'data' / 'raw'
print(RAW_PATH)

/home/rco/DS/kdd22/data/raw


In [4]:
# Base table that every pixel-distance file is later joined onto (keyed by Filename).
pub = pd.read_csv(RAW_PATH.joinpath('public.csv'))
print(pub.shape)
pub.head()

(146262, 6)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0


In [5]:
# File-name suffixes of the pixel-distance CSVs to merge, in merge order.
# The first group is written out by hand ('' is the unsuffixed base file); the second
# group is every <prefix>_<method> combination, generated in the same order as a
# hand-written list grouped by prefix.
suffixes = [
    '', '_a', '_a_est', '_h', '_h_est', '_p', '_p_est',
] + [
    '_' + prefix + '_' + method
    for prefix in ('AKAZE_BRUTE', 'BRISK_BRUTE', 'SIFT_BRUTE', 'SIFT_FLANN')
    for method in ('LMEDS', 'NAIVE', 'RANSAC', 'RHO')
]

def merge_pixels_dist(dataset='train', suffixes=suffixes):
    """Inner-join the pixel-distance CSV files onto a copy of ``pub``.

    For each suffix, ``RAW_PATH / 'pixels_dist_<dataset><suffix>.csv'`` is read,
    its ``Images`` column is renamed to ``Filename``, and the result is
    inner-merged on ``Filename`` into the running frame. Columns already present
    keep their name; clashing columns from the newly read file get the suffix
    appended. The running shape is printed after every merge.

    Parameters
    ----------
    dataset : str
        Name fragment placed after ``pixels_dist_`` in each file name.
    suffixes : list of str
        File-name suffixes to merge, in order. Defaults to the module-level list.

    Returns
    -------
    pandas.DataFrame
        ``pub`` restricted to the filenames present in every merged file, with
        the columns of each file added.
    """
    merged = pub.copy()
    for suffix in suffixes:
        csv_path = RAW_PATH / ('pixels_dist_' + dataset + suffix + '.csv')
        pixels = pd.read_csv(csv_path).rename(columns={'Images': 'Filename'})
        # (None, suffix): leave existing column names alone, tag only the new ones.
        merged = merged.merge(pixels, on='Filename', how='inner', suffixes=(None, suffix))
        print(merged.shape)

    return merged

# Build the training table from every 'pixels_dist_train*' file.
train = merge_pixels_dist(dataset='train')
train.head()

(91231, 8)
(91231, 10)
(91231, 12)
(91231, 14)
(91231, 16)
(91231, 18)
(91231, 20)
(91231, 22)
(91231, 24)
(91231, 26)
(91231, 28)
(91231, 30)
(91231, 32)
(91231, 34)
(91231, 36)
(91231, 38)
(91231, 40)
(91231, 42)
(91231, 44)
(91231, 46)
(91231, 48)
(91231, 50)
(91231, 52)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,...,North_pixel_SIFT_BRUTE_RHO,East_pixel_SIFT_BRUTE_RHO,North_pixel_SIFT_FLANN_LMEDS,East_pixel_SIFT_FLANN_LMEDS,North_pixel_SIFT_FLANN_NAIVE,East_pixel_SIFT_FLANN_NAIVE,North_pixel_SIFT_FLANN_RANSAC,East_pixel_SIFT_FLANN_RANSAC,North_pixel_SIFT_FLANN_RHO,East_pixel_SIFT_FLANN_RHO
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0,-1.098183,2.828369,-1.296189,2.868935,...,,,,,,,,,,
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2,1.452448,0.207358,0.847966,0.356077,...,,,,,23.903748,-18.196305,,,,
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3,-3.544215,-1.494362,-3.517452,-1.592591,...,,,,,,,,,,
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1,-6.054061,-4.08215,-6.091535,-3.099275,...,-6.663898,-3.789955,-6.607596,-3.825182,-6.040889,-3.597447,-6.656188,-3.796844,-6.663898,-3.789955
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0,-0.69025,8.232869,-0.449659,8.125691,...,-6.196263,7.616971,7.517969,-3.088306,-6.1968,7.616973,-170.353988,134.741365,-6.196263,7.616971


In [6]:
# Missing values per column of the merged training table.
train.isna().sum(axis=0)

Filename                              0
Altitude                              0
Delta                                 0
North                                 0
East                                  0
kfold                                 0
North_pixel                           0
East_pixel                            0
North_pixel_a                         0
East_pixel_a                          0
North_pixel_a_est                     0
East_pixel_a_est                      0
North_pixel_h                         0
East_pixel_h                          0
North_pixel_h_est                     0
East_pixel_h_est                      0
North_pixel_p                         0
East_pixel_p                          0
North_pixel_p_est                     0
East_pixel_p_est                      0
North_pixel_AKAZE_BRUTE_LMEDS     71961
East_pixel_AKAZE_BRUTE_LMEDS      71961
North_pixel_AKAZE_BRUTE_NAIVE     71939
East_pixel_AKAZE_BRUTE_NAIVE      71939
North_pixel_AKAZE_BRUTE_RANSAC    71960


In [7]:
# Build the test table the same way, then remove the two target columns.
test = merge_pixels_dist(dataset='test').drop(columns=['North', 'East'])
test.head()

(55031, 8)
(55031, 10)
(55031, 12)
(55031, 14)
(55031, 16)
(55031, 18)
(55031, 20)


Unnamed: 0,Filename,Altitude,Delta,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est
0,000053b1e684c9e7ea73727b2238ce18.jpg,167.943069,0.010269,0,3.946422,-3.815853,0.356597,0.257315,-0.654044,-1.042109,3.946422,-3.815853,5.856787,-8.512549,0.356597,0.257315,-0.654044,-1.042109
1,00029153d12ae1c9abe59c17ff2e0895.jpg,195.853088,0.089218,2,7.668228,-4.031865,9.366933,-2.493681,4.654472,-5.100857,7.668228,-4.031865,4.936895,-4.14951,9.366933,-2.493681,4.654472,-5.100857
2,0006246bee639c7a7b11a08e34dd3cc6.jpg,146.943466,-0.018326,1,-11.549439,-0.880645,-12.51837,-0.683746,-12.124315,-0.030847,-11.549439,-0.880645,-12.188266,0.273252,-12.51837,-0.683746,-12.124315,-0.030847
3,00063cb5da1826febf178b669eea3250.jpg,213.184418,-0.108704,1,-28.991636,1.930891,-25.94293,-0.682081,-21.449981,4.21257,-28.991636,1.930891,-21.237231,4.168302,-25.94293,-0.682081,-21.449981,4.21257
4,00063ece2e68a8847f228e8fd922f851.jpg,184.757767,0.0177,0,0.184479,-6.925782,-0.165511,-6.852798,0.008325,-6.20988,0.184479,-6.925782,0.003814,-6.17922,-0.165511,-6.852798,0.008325,-6.20988


In [7]:
import h2o
from h2o.automl import H2OAutoML

# Connect to (or start) the H2O instance with a 32G memory cap and 16 threads.
h2o.init(max_mem_size='32G', nthreads=16)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,16 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,24 days
H2O_cluster_name:,H2O_from_python_rco_1iom6y
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,28.42 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [8]:
# Upload the pandas tables to H2O under the same names the later cells use.
# The isinstance guards make this cell safe to re-run: once `train`/`test` already
# are H2OFrames, passing them to h2o.H2OFrame() again would be rejected, because the
# constructor only accepts native Python / numpy / pandas inputs.
if not isinstance(train, h2o.H2OFrame):
    train = h2o.H2OFrame(train)
if not isinstance(test, h2o.H2OFrame):
    test = h2o.H2OFrame(test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


NameError: name 'test' is not defined

In [11]:
# Predictor columns: everything except the file id, the two targets and the fold id.
features = [col for col in train.columns if col not in ('Filename', 'North', 'East', 'kfold')]

def train_automl(train, test, target, max_runtime_secs=3600):
    """Run H2O AutoML for one target and score both frames with the leader model.

    The predictors are the module-level ``features`` list and the seed is the
    module-level ``SEED``. AutoML is configured with 5 folds, RMSE as both the
    stopping metric and the leaderboard sort metric, and an exploitation ratio
    of 0.2.

    Parameters
    ----------
    train : frame passed as ``training_frame`` and also scored afterwards
        (so the returned training predictions are made on the fitted data).
    test : frame scored with the leader model.
    target : str
        Response column name; also used to name the prediction column
        ``<target>_prediction``.
    max_runtime_secs : int
        Time budget forwarded to ``H2OAutoML``.

    Returns
    -------
    tuple
        ``(aml, train_preds, test_preds)``.
    """
    automl_settings = dict(
        seed=SEED,
        nfolds=5,
        max_runtime_secs=max_runtime_secs,
        stopping_metric='RMSE',
        sort_metric='RMSE',
        exploitation_ratio=0.2,
    )
    aml = H2OAutoML(**automl_settings)
    aml.train(x=features, y=target, training_frame=train)

    prediction_col = target + '_prediction'
    scored = []
    for frame in (train, test):
        preds = aml.leader.predict(frame)
        # The return value is ignored, so this relies on rename() updating the
        # frame itself -- NOTE(review): confirm against the installed h2o version.
        preds.rename({'predict': prediction_col})
        scored.append(preds)

    train_preds, test_preds = scored
    return aml, train_preds, test_preds
    

In [12]:
# AutoML run for the North target, limited to MAX_TIME seconds.
north_aml, north_train_preds, north_test_preds = train_automl(
    train, test, target='North', max_runtime_secs=MAX_TIME
)
north_aml.leaderboard.head(20)

NameError: name 'test' is not defined

In [13]:
# AutoML run for the East target, limited to MAX_TIME seconds.
east_aml, east_train_preds, east_test_preds = train_automl(
    train, test, target='East', max_runtime_secs=MAX_TIME
)
east_aml.leaderboard.head(20)

NameError: name 'test' is not defined

In [14]:
# Attach the North and East prediction columns to the training frame and save it as CSV.
train_final = train.cbind(north_train_preds).cbind(east_train_preds)
train_final.as_data_frame().to_csv('train_final.csv')

train_final.head(5)

NameError: name 'north_train_preds' is not defined

In [15]:
# Attach the North and East prediction columns to the test frame and save it as CSV.
test_final = test.cbind(north_test_preds).cbind(east_test_preds)
test_final.as_data_frame().to_csv('test_final.csv')

test_final

NameError: name 'test' is not defined

# Submission

In [None]:
# Reshape the predictions to one row per (file, target): the two prediction columns
# are renamed to the target names, melted to long form, and the row id is built as
# '<Filename>:<target>'.
submission = (
    test_final.as_data_frame()[['Filename', 'North_prediction', 'East_prediction']]
    .rename(columns={'North_prediction': 'North', 'East_prediction': 'East'})
    .melt(id_vars='Filename', value_name='Predicted')
    .assign(Id=lambda long_df: long_df['Filename'] + ':' + long_df['variable'])
    [['Id', 'Predicted']]
)
submission.head()

Unnamed: 0,Id,Predicted
0,000053b1e684c9e7ea73727b2238ce18.jpg:North,-0.081379
1,00029153d12ae1c9abe59c17ff2e0895.jpg:North,1.03116
2,0006246bee639c7a7b11a08e34dd3cc6.jpg:North,-1.674396
3,00063cb5da1826febf178b669eea3250.jpg:North,-1.298955
4,00063ece2e68a8847f228e8fd922f851.jpg:North,0.044469


In [None]:
# Write the submission without the positional index column.
submission.to_csv(path_or_buf='submission_h2o_full_classic.csv', index=False)

# North Explanation

In [1]:
# north_aml.explain(train)

# East Explanation

In [2]:
# east_aml.explain(train)

In [9]:
# Stop the H2O cluster this notebook is connected to; every frame and model held
# there is discarded. The module-level h2o.shutdown() is deprecated, so the call
# goes through the cluster object instead.
h2o.cluster().shutdown()

H2O session _sid_89b3 closed.


  h2o.shutdown()
