# Sample Submission

In [None]:
import pandas as pd
# Load the provided sample submission to see the expected layout (id, Lng, Lat columns).
example = pd.read_csv('sampleSubmission.csv')
example.head(10)

Unnamed: 0,id,Lng,Lat
0,516,106.733964,-6.557158
1,517,106.733964,-6.557158
2,518,106.733964,-6.557158
3,519,106.733964,-6.557158
4,520,106.733964,-6.557158
5,521,106.733964,-6.557158
6,522,106.733964,-6.557158
7,523,106.733964,-6.557158
8,524,106.733964,-6.557158
9,525,106.733964,-6.557158


# Load Data

In [None]:
from pandas_profiling import ProfileReport as Report

# Read the training rows and the rows that must be predicted later on.
QuadData = pd.read_csv('QuadData_train.csv')
to_pred  = pd.read_csv('QuadData_test.csv')

# Report both shapes with the same captions as before.
for caption, frame in (('train data shape:', QuadData), ('test data shape :', to_pred)):
    print(caption, frame.shape)

train data shape: (6440, 47)
test data shape : (2351, 47)


In [None]:
# Report(QuadData)

In [None]:
# Encode the string labels in DeviceID as integers, identically for train and test.
dev_dict = dict(F330=0, F450=1)
QuadData, to_pred = (
    frame.replace({'DeviceID': dev_dict}) for frame in (QuadData, to_pred)
)
QuadData.head(2)

Unnamed: 0,id,ExpID,PatternID,DeviceID,LineNo_ATT,DesRoll,Roll,DesPitch,Pitch,DesYaw,...,AccZ_2,MagX,MagY,MagZ,MagZ_2,C1,C2,C3,Lat,Lng
0,0,6,2,1,14206,-2.52,-0.74,-2.3,-1.55,359.96,...,-10.88622,382,-12,-200,-275,1536,1578,1719,-6.557158,106.733964
1,1,6,2,1,14216,-2.64,0.28,-2.95,-1.02,359.96,...,-9.850414,385,4,-193,-277,1581,1553,1697,-6.557158,106.733964


In [None]:
# Remove the 'id' column from both frames in place; the submission cell
# re-reads the ids from the test CSV when it needs them.
QuadData.drop(columns=['id'], inplace=True)
to_pred.drop(columns=['id'], inplace=True)

# Extract the Data

In [None]:
# The two coordinate columns are the regression targets; everything else is a feature.
target_cols = ['Lng', 'Lat']
y = QuadData[target_cols]
x = QuadData.drop(columns=target_cols)

# The test frame also carries these columns, so strip them (in place) to match x.
to_pred.drop(columns=target_cols, inplace=True)
print(x.shape, y.shape, to_pred.shape)

(6440, 44) (6440, 2) (2351, 44)


# Scaling

In [None]:
# Checkpoint: keep references to the unscaled frames so the next cell can restore them.
# These are plain name bindings, not copies.
x_before_scaling, y_before_scaling, to_pred_before_scaling = x, y, to_pred

In [None]:
# Restore point: rebind x, y and to_pred to the unscaled frames saved in the previous cell.
x, y, to_pred = x_before_scaling, y_before_scaling, to_pred_before_scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max ranges from the training features only, then apply them to both sets.
scaler = MinMaxScaler().fit(x)

X = scaler.transform(x)
To_pred = scaler.transform(to_pred)

# Wrap the scaled arrays back into frames so the column names are kept.
x = pd.DataFrame(X, columns=x.columns)
to_pred = pd.DataFrame(To_pred, columns=to_pred.columns)
x.head(2)

Unnamed: 0,ExpID,PatternID,DeviceID,LineNo_ATT,DesRoll,Roll,DesPitch,Pitch,DesYaw,Yaw,...,AccY,AccZ,AccZ_2,MagX,MagY,MagZ,MagZ_2,C1,C2,C3
0,0.454545,1.0,1.0,0.48796,0.229194,0.227626,0.4098,0.316136,0.999917,0.005862,...,0.87373,0.509687,0.602476,0.954545,0.549531,0.378182,0.223796,0.627164,0.689516,0.870839
1,0.454545,1.0,1.0,0.488314,0.227213,0.242566,0.400903,0.323257,0.999917,0.999222,...,0.861961,0.551377,0.649525,0.957576,0.566215,0.390909,0.21813,0.687084,0.655914,0.841545


# Define Custom Scorer

In [None]:
from sklearn.metrics.scorer import make_scorer
from haversine import haversine

def haversine_score(y_true, y_pred):
  """Mean haversine distance between true and predicted positions.

  Parameters
  ----------
  y_true, y_pred : array-like of shape (n_samples, 2)
      One (Lng, Lat) pair per row, i.e. the column order of the notebook's
      target frame ``y = QuadData[['Lng', 'Lat']]``.

  Returns
  -------
  float
      Average of ``haversine(true_point, predicted_point)`` over all rows, in
      whatever unit ``haversine`` returns by default (kilometres according to
      the package documentation).

  Notes
  -----
  ``haversine`` expects each point as (lat, lon). The rows here are stored as
  (Lng, Lat), so every pair is reversed before the call; passing them through
  unchanged would feed longitudes in as latitudes.
  """
  # Local import: this cell runs before the notebook-level numpy import.
  import numpy as np

  # Coerce to arrays so rows can be iterated positionally even if a DataFrame is passed.
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)
  assert len(y_true) == len(y_pred)

  n = len(y_true)
  total = 0
  for (true_lng, true_lat), (pred_lng, pred_lat) in zip(y_true, y_pred):
    # Reorder (Lng, Lat) -> (lat, lon) as required by haversine().
    total += haversine((true_lat, true_lng), (pred_lat, pred_lng))
  return total/n

# Wrap the distance as an sklearn scorer. greater_is_better=False marks it as a loss,
# which is why the CV / grid-search scores printed further down are negative.
haversine_scorer = make_scorer(haversine_score, greater_is_better = False)

# The Regressors before Feature Engineering

In [None]:
import numpy as np
from math import sqrt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge, Lasso, LassoLars, ElasticNet, OrthogonalMatchingPursuit
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import ExtraTreeRegressor, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor

from mlxtend.regressor import StackingRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Shared settings so every seeded / ensemble model below is configured the same way.
SEED = 123
N_TREES = 100

# Instance-based and ordinary least-squares baselines
knn = KNeighborsRegressor()
linreg = LinearRegression()

# Support vector regressors
linearSVR = LinearSVR(random_state=SEED)
kernelSVR = SVR()

# Single trees
extratree = ExtraTreeRegressor(random_state=SEED)
decisiontree = DecisionTreeRegressor(random_state=SEED)

# Random forests (the second one uses max_features="log2")
randomforest = RandomForestRegressor(n_estimators=N_TREES, random_state=SEED)
randomforest2 = RandomForestRegressor(n_estimators=N_TREES, max_features="log2", random_state=SEED)

# Other linear models, all with default hyper-parameters except the Ridge seed
ridge = Ridge(random_state=SEED)
bayesridge = BayesianRidge()
lasso = Lasso()
lars = LassoLars()
net = ElasticNet()
omp = OrthogonalMatchingPursuit()

# Bagging and boosting ensembles
bagging = BaggingRegressor(n_estimators=N_TREES, random_state=SEED)
boosting = GradientBoostingRegressor(n_estimators=N_TREES, random_state=SEED)
adaboost = AdaBoostRegressor(n_estimators=N_TREES, random_state=SEED)

# Feature Selection

## Tuning Parameter for Lasso

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.feature_selection import SelectFromModel

In [None]:
# Display Lasso's default hyper-parameters before building the search grid.
Lasso()

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Candidate Lasso settings; every combination is scored with the haversine scorer in 3-fold CV.
params_grid = dict(
    alpha=[0.01, 0.1, 1, 10],
    max_iter=[1000, 10000],
    normalize=[False, True],
)
tune_model = GridSearchCV(
    Lasso(),
    param_grid=params_grid,
    scoring=haversine_scorer,
    cv=3,
)

In [None]:
# Run the grid search on all scaled features. y is passed as a plain array because
# haversine_score indexes its inputs by row position.
tune_model.fit(x, np.array(y))

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 10000],
                         'normalize': [False, True]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(haversine_score, greater_is_better=False),
             verbose=0)

In [None]:
# tune_model searches over Lasso settings, so label the output accordingly
# (the previous label referred to "DT").
print('Best Lasso Parameters: ', tune_model.best_params_)

AFTER DT Parameters:  {'normalize': False, 'max_iter': 1000, 'alpha': 0.01}


## Feature Selection with Lasso

In [None]:
# Checkpoint: keep references to the scaled frames so the next cell can restore them.
# These are plain name bindings, not copies.
x_before_selection, y_before_selection, to_pred_before_selection = x, y, to_pred

In [None]:
# Restore point: rebind x, y and to_pred to the frames saved in the previous cell.
x, y, to_pred = x_before_selection, y_before_selection, to_pred_before_selection

In [None]:
def print_score(clf, label):
    """Cross-validate `clf` on the notebook-level `x` / `y` and print the result.

    Runs 7-fold `cross_val_score` with `haversine_scorer`, then prints the
    absolute mean and the standard deviation of the fold scores, tagged with
    `label`. Nothing is returned.
    """
    targets = np.array(y)
    fold_scores = cross_val_score(clf, x, targets, cv=7, scoring=haversine_scorer)

    mean_distance = abs(fold_scores.mean())
    spread = abs(fold_scores.std())
    template = "Mean Haversine Distance: %0.4f (+/- %0.4f) [%s]"
    print(template % (mean_distance, spread, label))

In [None]:
# Baseline before feature selection: plain linear regression on every scaled feature.
linreg = LinearRegression()
print_score(linreg, 'LinearRegression')

Mean Haversine Distance: 0.0180 (+/- 0.0089) [LinearRegression]


In [None]:
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV

In [None]:
# Select predictors of longitude: LassoCV chooses alpha by 7-fold CV and
# SelectFromModel keeps the features that meet the threshold.
lasso1 = LassoCV(cv=7, max_iter=10000)
featureSelection = SelectFromModel(lasso1, threshold=10**(-6))
featureSelection.fit(x, y['Lng'])
# Transform the same frame the selector was fitted on. The previous code used the
# leftover array `X` from the scaling cell, which goes stale whenever `x` is rebound.
selectedFeatures = featureSelection.transform(x)
x.columns[featureSelection.get_support()]

Index(['PatternID', 'Yaw', 'ErrYaw', 'TAlt', 'Status', 'GMS', 'Spd', 'MagX',
       'MagY'],
      dtype='object')

In [None]:
# Select predictors of latitude: LassoCV chooses alpha by 7-fold CV and
# SelectFromModel keeps the features that meet the threshold.
lasso2 = LassoCV(cv=7, max_iter=10000)
featureSelection2 = SelectFromModel(lasso2, threshold=10**(-6))
featureSelection2.fit(x, y['Lat'])
# Transform the same frame the selector was fitted on. The previous code used the
# leftover array `X` from the scaling cell, which goes stale whenever `x` is rebound.
selectedFeatures2 = featureSelection2.transform(x)
x.columns[featureSelection2.get_support()]

Index(['LineNo_ATT', 'ThH', 'TAlt', 'Spd', 'GCrs', 'MagX', 'MagY', 'MagZ_2'], dtype='object')

In [None]:
# Keep a column when either the longitude or the latitude selector picked it.
keep_mask = featureSelection.get_support() | featureSelection2.get_support()
x_selected = x.loc[:, keep_mask]

In [None]:
# Show which features survived the combined selection.
x_selected.columns

Index(['PatternID', 'LineNo_ATT', 'Yaw', 'ErrYaw', 'ThH', 'TAlt', 'Status',
       'GMS', 'Spd', 'GCrs', 'MagX', 'MagY', 'MagZ_2'],
      dtype='object')

In [None]:
print('7-fold cross validation:\n')

# (label, estimator) pairs scored on the Lasso-selected features.
candidates = [
    ('K-Neighbors Regressor', knn),
    ('Linear Regression', linreg),
    ('Extra Tree Regressor', extratree),
    ('Decision Tree Regressor', decisiontree),
    ('Random Forest Regressor 1', randomforest),
    ('Random Forest Regressor 2', randomforest2),
    ('Lasso', lasso),
    ('Lasso Lars', lars),
    ('Orthogonal Matching Pursuit', omp),
    ('Bagging Regressor', bagging),
]

for label, clf in candidates:
    try:
        scores = cross_val_score(clf, x_selected, np.array(y), cv=7, scoring=haversine_scorer)
        mean_distance, spread = abs(scores.mean()), abs(scores.std())
        print("Mean Haversine Distance: %0.4f (+/- %0.4f) [%s]"
              % (mean_distance, spread, label))
    except Exception as e:
        # Best effort: report which estimator failed (and why), then try the next one.
        print(type(e).__name__ + " [%s]"%(label))
        message = str(e)
        if message.strip() != '':
            print('\t' + message)

7-fold cross validation:

Mean Haversine Distance: 0.0090 (+/- 0.0058) [K-Neighbors Regressor]
Mean Haversine Distance: 0.0128 (+/- 0.0007) [Linear Regression]
Mean Haversine Distance: 0.0181 (+/- 0.0033) [Extra Tree Regressor]
Mean Haversine Distance: 0.0181 (+/- 0.0033) [Decision Tree Regressor]
Mean Haversine Distance: 0.0181 (+/- 0.0033) [Random Forest Regressor 1]
Mean Haversine Distance: 0.0181 (+/- 0.0033) [Random Forest Regressor 2]
Mean Haversine Distance: 0.0181 (+/- 0.0033) [Lasso]
Mean Haversine Distance: 0.0181 (+/- 0.0033) [Lasso Lars]
Mean Haversine Distance: 0.0163 (+/- 0.0017) [Orthogonal Matching Pursuit]
Mean Haversine Distance: 0.0181 (+/- 0.0033) [Bagging Regressor]


In [None]:
# Pairwise correlations between the selected features.
x_selected.corr()

Unnamed: 0,PatternID,LineNo_ATT,Yaw,ErrYaw,ThH,TAlt,Status,GMS,Spd,GCrs,MagX,MagY,MagZ_2
PatternID,1.0,0.29106,-0.083016,-0.430281,-0.692587,0.000886,-0.379523,0.276741,0.088025,-0.110923,-0.024298,0.034795,0.146346
LineNo_ATT,0.29106,1.0,-0.142028,-0.002994,0.011788,0.153542,0.495022,0.183302,0.077042,-0.136287,-0.13243,-0.031997,-0.135625
Yaw,-0.083016,-0.142028,1.0,0.132562,0.08926,-0.193811,-0.112226,-0.147095,-0.022357,0.396576,-0.044982,0.628115,-0.271521
ErrYaw,-0.430281,-0.002994,0.132562,1.0,0.33467,0.170189,0.213032,-0.489251,0.148774,0.115172,-0.227989,0.191391,-0.126427
ThH,-0.692587,0.011788,0.08926,0.33467,1.0,-0.060477,0.390548,-0.084143,-0.139424,0.084824,0.018789,-0.008988,-0.309093
TAlt,0.000886,0.153542,-0.193811,0.170189,-0.060477,1.0,0.117038,0.069876,0.331166,-0.04173,-0.200524,-0.04658,-0.197296
Status,-0.379523,0.495022,-0.112226,0.213032,0.390548,0.117038,1.0,0.187861,0.009051,-0.045299,-0.060388,-0.045685,-0.194457
GMS,0.276741,0.183302,-0.147095,-0.489251,-0.084143,0.069876,0.187861,1.0,0.068087,0.000911,0.041756,0.046105,-0.271124
Spd,0.088025,0.077042,-0.022357,0.148774,-0.139424,0.331166,0.009051,0.068087,1.0,-0.109883,-0.533132,0.016657,-0.050847
GCrs,-0.110923,-0.136287,0.396576,0.115172,0.084824,-0.04173,-0.045299,0.000911,-0.109883,1.0,0.035589,0.513723,-0.237596


# Grid Search Optimization

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Display the default KNN hyper-parameters before choosing what to tune.
KNeighborsRegressor()

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [None]:
# Search neighbourhood sizes 1..20 with both weighting schemes, scored by haversine distance.
knn_param = dict(
    n_neighbors=list(range(1, 21)),
    weights=['uniform', 'distance'],
)

gridknn = GridSearchCV(knn, knn_param, cv=7, scoring=haversine_scorer)
gridknn.fit(x_selected, np.array(y));

best_summary = (gridknn.best_score_, gridknn.best_params_)
print("(best score: %s, best parameters: %s)"%best_summary)

(best score: -0.008865404768801233, best parameters: {'n_neighbors': 3, 'weights': 'distance'})


In [None]:
def print_score_sel(clf, label):
    """Cross-validate `clf` on the notebook-level `x_selected` / `y` and print the result.

    Same report as `print_score`, but on the Lasso-selected feature frame:
    7-fold `cross_val_score` with `haversine_scorer`, printing the absolute
    mean and the standard deviation of the fold scores, tagged with `label`.
    Nothing is returned.
    """
    targets = np.array(y)
    fold_scores = cross_val_score(clf, x_selected, targets, cv=7, scoring=haversine_scorer)

    mean_distance = abs(fold_scores.mean())
    spread = abs(fold_scores.std())
    template = "Mean Haversine Distance: %0.4f (+/- %0.4f) [%s]"
    print(template % (mean_distance, spread, label))

In [None]:
# Rebuild KNN with the best parameters reported by the grid search and re-score it.
knn_tuned = KNeighborsRegressor(n_neighbors=3, weights='distance')
print_score_sel(knn_tuned, "KNNT")

Mean Haversine Distance: 0.0089 (+/- 0.0057) [KNNT]


# The Regressors

In [None]:
from sklearn.linear_model import *
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor

from xgboost import XGBRegressor

from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import VotingRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Base learners reused by the ensembles below: the tuned KNN and a plain linear regression.
# Note that `linreg` rebinds a name already assigned in earlier cells.
knnt = KNeighborsRegressor(n_neighbors=3, weights='distance')
linreg = LinearRegression()

In [None]:
# Wrap the tuned KNN in MultiOutputRegressor and score it on the selected features.
multi_knnt = MultiOutputRegressor(knnt)
print_score_sel(multi_knnt, 'MultiKNN')

Mean Haversine Distance: 0.0089 (+/- 0.0057) [MultiKNN]


In [None]:
# Histogram gradient boosting with default settings, wrapped in MultiOutputRegressor
# so it can be scored against the two-column target.
hist = MultiOutputRegressor(HistGradientBoostingRegressor())
print_score_sel(hist, 'HistGradientBoostingRegressor')

Mean Haversine Distance: 0.0068 (+/- 0.0024) [HistGradientBoostingRegressor]


In [None]:
# List the tunable parameter names; wrapped-estimator parameters carry the 'estimator__' prefix.
hist.get_params().keys()

dict_keys(['estimator__learning_rate', 'n_jobs', 'estimator__random_state', 'estimator__tol', 'estimator__scoring', 'estimator__max_depth', 'estimator__loss', 'estimator__n_iter_no_change', 'estimator__min_samples_leaf', 'estimator__max_iter', 'estimator__l2_regularization', 'estimator', 'estimator__max_leaf_nodes', 'estimator__validation_fraction', 'estimator__verbose', 'estimator__max_bins'])

In [None]:
# Grid over the wrapped HistGradientBoostingRegressor; the 'estimator__' prefix routes
# each setting through the MultiOutputRegressor wrapper.
hist_param = dict(
    estimator__l2_regularization=[0, 0.5],
    estimator__max_depth=[2, 3, 4, 5],
    estimator__tol=[10**(-5)],
)

gridhist = GridSearchCV(hist, hist_param, cv=7, scoring=haversine_scorer)
gridhist.fit(x_selected, np.array(y));

best_summary = (gridhist.best_score_, gridhist.best_params_)
print("(best score: %s, best parameters: %s)"%best_summary)

(best score: -0.0069249579167031255, best parameters: {'estimator__max_depth': 5, 'estimator__tol': 1e-05, 'estimator__l2_regularization': 0})


In [None]:
# Re-score with the best depth from the grid search. Only max_depth is set explicitly here;
# the other searched parameters are left at the estimator's defaults.
hist_tuned = MultiOutputRegressor(HistGradientBoostingRegressor(max_depth=5))
print_score_sel(hist_tuned, "HistGradientBoostingRegressor_Tuned")

Mean Haversine Distance: 0.0069 (+/- 0.0020) [HistGradientBoostingRegressor_Tuned]


In [None]:
# Voting ensemble of linear regression and the tuned KNN, wrapped for the two-column target.
vote1 = MultiOutputRegressor(
    VotingRegressor([('linreg', linreg), ('knn', knnt)])
)

# Two stacks with the roles swapped: first KNN on top of linreg, then linreg on top of KNN.
stackr1 = StackingRegressor(regressors=[linreg], meta_regressor=knnt)
stackr2 = StackingRegressor(regressors=[knnt], meta_regressor=linreg)

In [None]:
print('7-fold cross validation:\n')

# (label, estimator) pairs scored on the Lasso-selected features.
contenders = [
    ('KNN Regressor', knnt),
    ('LinearRegression', linreg),
    ('Voting Regressor 1', vote1),
    ('KNN with Linreg', stackr1),
    ('Linreg with KNN', stackr2),
]

for label, reg in contenders:
    targets = np.array(y)
    try:
        scores = cross_val_score(reg, x_selected, targets, cv=7, scoring=haversine_scorer)
        print('Mean Haversine Distance: %0.4f (+/- %0.4f) [%s]'%(abs(scores.mean()), abs(scores.std()), label))
    except ValueError:
        # The estimator rejected the input as given; retry it wrapped in MultiOutputRegressor.
        wrapped = MultiOutputRegressor(reg)
        scores = cross_val_score(wrapped, x_selected, targets, cv=7, scoring=haversine_scorer)
        print('Mean Haversine Distance: %0.4f (+/- %0.4f) [Multi-Output %s]'%(abs(scores.mean()), abs(scores.std()), label))
    except Exception as e:
        # Any other failure is reported and the loop moves on to the next estimator.
        print(type(e).__name__ + ' [%s]'%(label))
        message = str(e)
        if message.strip() != '':
            print('\t' + message)

7-fold cross validation:

Mean Haversine Distance: 0.0089 (+/- 0.0057) [KNN Regressor]
Mean Haversine Distance: 0.0128 (+/- 0.0007) [LinearRegression]
Mean Haversine Distance: 0.0099 (+/- 0.0029) [Voting Regressor 1]
Mean Haversine Distance: 0.0129 (+/- 0.0008) [KNN with Linreg]
Mean Haversine Distance: 0.0089 (+/- 0.0057) [Linreg with KNN]


In [None]:
# Voting ensemble of KNN and histogram gradient boosting, scored on the selected features.
# NOTE(review): the estimator named 'knnt' omits weights='distance', and 'hist_tuned' uses
# max_depth=7 while the grid search above reported max_depth=5 - confirm this is intentional.
vote2 = MultiOutputRegressor(VotingRegressor([('knnt', KNeighborsRegressor(n_neighbors=3)),
                                              ('hist_tuned', HistGradientBoostingRegressor(max_depth=7))
                                              ]))
print_score_sel(vote2, "Voting KNN Hist")

Mean Haversine Distance: 0.0069 (+/- 0.0031) [Voting KNN Hist]


# Prediction

In [None]:
data = x_selected
target = y

# Final ensemble: the tuned KNN voted together with the tuned histogram gradient
# boosting model, wrapped for the two-column (Lng, Lat) target.
# The KNN member now carries weights='distance', matching the grid-search result and
# the `knnt` / `knn_tuned` definitions; it previously fell back to the default weighting
# even though it is named 'knnt'.
model = MultiOutputRegressor(VotingRegressor([
    ('knnt', KNeighborsRegressor(n_neighbors=3, weights='distance')),
    ('hist_tuned', HistGradientBoostingRegressor(max_depth=5)),
]))
model.fit(data, target)

# cross_val_score fits its own copies per fold, so this reports out-of-fold error
# for the same configuration that was just fitted on all rows.
print_score_sel(model, "Voting KNN Hist")

Mean Haversine Distance: 0.0070 (+/- 0.0030) [Voting KNN Hist]


In [None]:
# Apply the same combined column mask (chosen by either selector) to the test features.
test_keep_mask = featureSelection.get_support() | featureSelection2.get_support()
to_pred_selected = to_pred.loc[:, test_keep_mask]

In [None]:
# Predict both coordinates for the test rows; each prediction row is (Lng, Lat),
# following the column order of the training target.
Preds = model.predict(to_pred_selected)

In [None]:
# 'id' was dropped from to_pred earlier, so read it back from the raw test file.
Out = pd.DataFrame({'id': pd.read_csv('QuadData_test.csv')['id']})

# Take the prediction columns directly instead of looping over a hard-coded row count
# (previously range(2351)); a length mismatch with the id column now raises immediately.
Out['Lng'] = np.asarray(Preds)[:, 0]
Out['Lat'] = np.asarray(Preds)[:, 1]
Out.head()

Unnamed: 0,id,Lng,Lat
0,516,106.733932,-6.55708
1,517,106.733936,-6.55708
2,518,106.733936,-6.55708
3,519,106.733936,-6.55708
4,520,106.733936,-6.55708


In [None]:
# Write the submission file without the frame index, so it contains only id, Lng and Lat.
Out.to_csv('Submission 10_Voting_Tuned_KNN_w_Tuned_Hist_w_LassoFeatureSelection_w_MinMaxScaler.csv', index = False)
# Optional copy to Google Drive, left disabled.
#!cp 'Submission 10_Voting_Tuned_KNN_w_Tuned_Hist_w_LassoFeatureSelection_w_MinMaxScaler.csv' 'drive/My Drive/ITToday/Submissions'

In [None]:
# Read the written file back as a sanity check of what will actually be submitted.
Submission = pd.read_csv('Submission 10_Voting_Tuned_KNN_w_Tuned_Hist_w_LassoFeatureSelection_w_MinMaxScaler.csv')
Submission.head()

Unnamed: 0,id,Lng,Lat
0,516,106.734238,-6.557372
1,517,106.734241,-6.557371
2,518,106.734241,-6.557373
3,519,106.734231,-6.557375
4,520,106.734232,-6.55733


In [None]:
# result_test = pd.concat([to_pred_before_scaling, Out['Lat'], Out['Lng']],axis=1)
# result_test.to_excel('Hasil.xlsx', index=False)