# Single Output Regression

In [7]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime

# Notebook-wide settings.
# SEED is passed as random_state to the stochastic estimators further down;
# MAX_TIME is only referenced by the commented-out AutoML cells.
SEED, MAX_TIME = 42, 360

# Raw CSVs are expected in <parent of the working directory>/data/raw.
HOME = Path.cwd().parent
RAW_PATH = HOME.joinpath('data', 'raw')
print(RAW_PATH)

/home/rco/DS/kdd22/data/raw


In [8]:
# Base table that every pixel-distance file is joined onto (keyed by Filename).
pub = pd.read_csv(RAW_PATH.joinpath('public.csv'))
print(pub.shape)
pub.head()

(146262, 6)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0


In [9]:
def merge_pixels_dist(dataset='train', suffixes=('', '_a', '_a_est', '_h', '_h_est', '_p', '_p_est')):
    """Join every ``pixels_dist_<dataset><suffix>.csv`` file onto a copy of ``pub``.

    For each suffix the file ``RAW_PATH / 'pixels_dist_<dataset><suffix>.csv'``
    is read, its ``Images`` column is renamed to ``Filename`` and the frame is
    inner-joined on ``Filename``. Columns of the new file whose names already
    exist in the accumulated frame get ``suffix`` appended; the existing
    columns keep their names.

    Parameters
    ----------
    dataset : str
        Middle part of the file name (e.g. ``'train'`` or ``'test'``).
    suffixes : iterable of str
        File-name suffixes, processed in order. Each one doubles as the
        column suffix for that file's colliding columns. Any iterable works;
        the default is a tuple so the default value cannot be mutated
        between calls.

    Returns
    -------
    pandas.DataFrame
        A new frame; the module-level ``pub`` is not modified. Because the
        join is ``inner``, only rows whose ``Filename`` occurs in every file
        survive.

    Notes
    -----
    The shape of the accumulated frame is printed after every merge so a
    sudden change in row count is visible immediately.
    """
    df_final = pub.copy()
    for s in suffixes:
        filename = f'pixels_dist_{dataset}{s}.csv'
        # Non-inplace rename: the freshly read frame is only used for this merge.
        df_right = pd.read_csv(RAW_PATH / filename).rename(columns={'Images': 'Filename'})
        # Left suffix None keeps already-merged column names untouched.
        df_final = df_final.merge(df_right, on='Filename', how='inner', suffixes=(None, s))
        print(df_final.shape)

    return df_final

In [10]:
# Assemble the training frame from the pixels_dist_train*.csv files.
train = merge_pixels_dist(dataset='train')
train.head()

(91231, 8)
(91231, 10)
(91231, 12)
(91231, 14)
(91231, 16)
(91231, 18)
(91231, 20)


Unnamed: 0,Filename,Altitude,Delta,North,East,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est
0,00003e3b9e5336685200ae85d21b4f5e.jpg,178.829834,-0.065231,-0.386045,0.929772,0,-1.098183,2.828369,-1.296189,2.868935,-1.220466,2.826518,-1.098183,2.828369,-1.230324,2.83861,-1.296189,2.868935,-1.220466,2.826518
1,0001261e2060303a06ba6c64d676d639.jpg,207.921478,-0.080688,0.635584,0.152819,2,1.452448,0.207358,0.847966,0.356077,0.7925,0.231472,1.452448,0.207358,0.753004,0.217258,0.847966,0.356077,0.7925,0.231472
2,0002ac0d783338cfeab0b2bdbd872cda.jpg,178.048431,0.021576,-1.228229,-0.499388,3,-3.544215,-1.494362,-3.517452,-1.592591,-3.823705,-1.658855,-3.544215,-1.494362,-3.818374,-1.668158,-3.517452,-1.592591,-3.823705,-1.658855
3,0004289ee1c7b8b08c77e19878106ae3.jpg,201.084625,0.505981,-1.739709,-0.699928,1,-6.054061,-4.08215,-6.091535,-3.099275,-6.065357,-2.354155,-6.054061,-4.08215,-6.145322,-2.303187,-6.091535,-3.099275,-6.065357,-2.354155
4,0004d0b59e19461ff126e3a08a814c33.jpg,187.550201,-0.328156,-0.169798,2.828752,0,-0.69025,8.232869,-0.449659,8.125691,-0.564889,8.012415,-0.69025,8.232869,-0.555564,7.999658,-0.449659,8.125691,-0.564889,8.012415


In [11]:
# Assemble the test frame from the pixels_dist_test*.csv files and strip the
# target columns inherited from `pub`, so they cannot be used as inputs.
test = merge_pixels_dist(dataset='test').drop(columns=['North', 'East'])
test.head()

(55031, 8)
(55031, 10)
(55031, 12)
(55031, 14)
(55031, 16)
(55031, 18)
(55031, 20)


Unnamed: 0,Filename,Altitude,Delta,kfold,North_pixel,East_pixel,North_pixel_a,East_pixel_a,North_pixel_a_est,East_pixel_a_est,North_pixel_h,East_pixel_h,North_pixel_h_est,East_pixel_h_est,North_pixel_p,East_pixel_p,North_pixel_p_est,East_pixel_p_est
0,000053b1e684c9e7ea73727b2238ce18.jpg,167.943069,0.010269,0,3.946422,-3.815853,0.356597,0.257315,-0.654044,-1.042109,3.946422,-3.815853,5.856787,-8.512549,0.356597,0.257315,-0.654044,-1.042109
1,00029153d12ae1c9abe59c17ff2e0895.jpg,195.853088,0.089218,2,7.668228,-4.031865,9.366933,-2.493681,4.654472,-5.100857,7.668228,-4.031865,4.936895,-4.14951,9.366933,-2.493681,4.654472,-5.100857
2,0006246bee639c7a7b11a08e34dd3cc6.jpg,146.943466,-0.018326,1,-11.549439,-0.880645,-12.51837,-0.683746,-12.124315,-0.030847,-11.549439,-0.880645,-12.188266,0.273252,-12.51837,-0.683746,-12.124315,-0.030847
3,00063cb5da1826febf178b669eea3250.jpg,213.184418,-0.108704,1,-28.991636,1.930891,-25.94293,-0.682081,-21.449981,4.21257,-28.991636,1.930891,-21.237231,4.168302,-25.94293,-0.682081,-21.449981,4.21257
4,00063ece2e68a8847f228e8fd922f851.jpg,184.757767,0.0177,0,0.184479,-6.925782,-0.165511,-6.852798,0.008325,-6.20988,0.184479,-6.925782,0.003814,-6.17922,-0.165511,-6.852798,0.008325,-6.20988


In [12]:
from sklearn.model_selection import KFold, cross_val_score, PredefinedSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import RANSACRegressor, ElasticNet, PassiveAggressiveRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor

# Both targets are predicted jointly.
targets = ['North', 'East']

# Every column that is not an identifier, a target or the fold label is a feature.
features = [col for col in train.columns
            if col not in ('Filename', 'North', 'East', 'kfold')]

# Reuse the fold assignment stored in the data rather than re-splitting.
cv = PredefinedSplit(test_fold=train['kfold'])

def cross_val_model(model):
    """Cross-validate ``model`` on the training frame and summarise its RMSE.

    The estimator is wrapped in a pipeline that standardises the ``Altitude``
    column and passes every other feature through unchanged. Scoring uses the
    module-level ``train``, ``features``, ``targets`` and the predefined
    folds in ``cv``; folds are evaluated in parallel (``n_jobs=-1``).

    Parameters
    ----------
    model : estimator
        Regressor able to fit ``train[targets]`` (both target columns).

    Returns
    -------
    tuple of (float, float)
        Mean and standard deviation of the per-fold RMSE. Both values are
        non-negative.
    """
    scaler = StandardScaler()
    ct = ColumnTransformer([('Scaler', scaler, ['Altitude'])], remainder='passthrough')
    pipe = Pipeline([('Column_Transformer', ct), ('Model', model)])

    # 'neg_root_mean_squared_error' yields the *negated* RMSE per fold
    # (scikit-learn scorers follow a greater-is-better convention).
    neg_rmse = cross_val_score(estimator=pipe,
                               X=train[features],
                               y=train[targets],
                               scoring='neg_root_mean_squared_error',
                               cv=cv,
                               n_jobs=-1)

    # Flip the sign once to get real RMSE values. The spread is then reported
    # as-is: negating a standard deviation (as the previous code did) produced
    # a meaningless negative number.
    rmse = -neg_rmse
    return rmse.mean(), rmse.std()

# Candidate regressors, keyed by the label used in the printed log and in the
# summary table. Stochastic estimators share SEED as their random_state.
models = {
    'kNN': KNeighborsRegressor(),
    'RANSAC': RANSACRegressor(random_state=SEED),
    'ElasticNet': ElasticNet(random_state=SEED),
    'PAR': MultiOutputRegressor(PassiveAggressiveRegressor(random_state=SEED)),
    'LinearSVR': MultiOutputRegressor(LinearSVR(random_state=SEED)),
    # NOTE(review): the recorded output stops after LinearSVR; kernel SVR is
    # known to scale poorly with sample count - confirm this loop finishes.
    'SVR': MultiOutputRegressor(SVR()),
    'XTree': ExtraTreesRegressor(random_state=SEED),
    'RandomForest': RandomForestRegressor(random_state=SEED),
    'GBR': MultiOutputRegressor(GradientBoostingRegressor(random_state=SEED)),
}

# Score every candidate, logging a timestamped line as each one finishes.
rmse_mean, rmse_std = [], []
for name, estimator in models.items():
    fold_mean, fold_std = cross_val_model(estimator)
    rmse_mean.append(fold_mean)
    rmse_std.append(fold_std)
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, name, fold_mean.round(3))

# One row per model, in the order the models were evaluated.
scores_df = pd.DataFrame({
    'Model': models.keys(),
    'RMSE_mean': rmse_mean,
    'RMSE_std': rmse_std,
})
scores_df.head(10)

13:33:26 kNN 0.372
13:33:27 RANSAC 540.365
13:33:28 ElasticNet 1.001
13:33:28 PAR 259.084




13:33:52 LinearSVR 68.13


In [None]:
# model.fit(train[features], train[targets])
# predictions = pd.DataFrame(model.predict(X_test[features]), columns=['North', 'East'])
# predictions.head()


In [None]:
# north_aml, north_train_preds, north_test_preds = train_automl(train, test, 'North', MAX_TIME)
# north_aml.leaderboard.head(20)

In [None]:
# east_aml, east_train_preds, east_test_preds = train_automl(train, test, 'East', MAX_TIME)
# east_aml.leaderboard.head(20)

In [None]:
# train_final = train.cbind(north_train_preds)
# train_final = train_final.cbind(east_train_preds)
# train_final.as_data_frame().to_csv('train_final.csv')

# train_final.head(5)

In [None]:
# test_final = test.cbind(north_test_preds)
# test_final = test_final.cbind(east_test_preds)
# test_final.as_data_frame().to_csv('test_final.csv')

# test_final

In [None]:
# submission = test_final.as_data_frame()[['Filename', 'North_prediction', 'East_prediction']]
# submission.rename({'North_prediction':'North', 'East_prediction':'East'}, axis=1, inplace=True)
# submission = submission.melt(id_vars='Filename', value_name='Predicted')
# submission['Id'] = submission.Filename + ':' + submission.variable
# submission = submission[['Id', 'Predicted']]
# submission.head()

In [None]:
# submission.to_csv('submission.csv', index=False)