In [139]:
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

%matplotlib notebook
plt.style.use('seaborn-pastel')

SEED = 1488
hold_out_portion = 0.2

In [140]:
train_df = pd.read_csv('train.csv')
holdout_indices = random.sample(range(len(train_df)), int(len(train_df) * hold_out_portion))

holdout_df = train_df.iloc[holdout_indices, :]
train_df = train_df.drop(holdout_indices)

# fix index
train_df = train_df.reset_index(drop=True)
holdout_df = holdout_df.reset_index(drop=True)

target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
target_df = train_df.loc[:, target_cols]
target_df['id'] = train_df['id']
train_df = train_df.drop(target_cols, axis=1)

# train_df.head()

In [141]:
test_df = pd.read_csv('./Track_1/test.csv')

# test_df.head()

In [142]:
sat_ids = train_df['sat_id'].unique()
print(len(sat_ids)) # Number of inuqie satelites
print(np.all(sat_ids == np.arange(0, 600))) # Check their numbers are 0, 1, ..., 600

600
True


In [143]:
# if you need to use some filter
filt = train_df['sat_id'] == 0
req = train_df[filt]
req = req.set_index('id').join(target_df.set_index('id'), on='id').reset_index()
# display(req.head())

## Metric

In [144]:
def SMAPE(y_pred, y_true): 
    return np.mean(np.abs((y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))))

## Model and Validation

In [145]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [146]:
X_train = train_df.drop(['id', 'epoch', 'sat_id'], axis=1).values
X_val = holdout_df.drop(['id', 'epoch', 'sat_id'] + target_cols, axis=1).values

Tune all regressors independently

In [156]:
config = dict()
config["n_estimators"] = 40
config["booster"] = 'gbtree'
config["objective"] = 'reg:squarederror'
config["random_state"] = SEED
config["learning_rate"] = 0.1
config["gamma"] = 0.01  # Minimum loss reduction to split
config["scoring"] = SMAPE


param_grid_1 = {
 'max_depth':range(4,10,3),
 'min_child_weight':range(1,3,1)
}

In [160]:
for column in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
    model = XGBRegressor(**config)
    grid_search_1 = GridSearchCV(estimator=model, param_grid=param_grid_1, n_jobs=10, iid=False, cv=5, verbose=True)
    grid_search_1.fit(X_train, target_df[column].values)
    
    print("For column: {}".format(column))
    print(grid_search_1.best_params_)
    print(grid_search_1.best_score_)
    print("-------------------------\n")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:  1.1min finished


For column: x
{'max_depth': 4, 'min_child_weight': 1}
0.9218533117646072
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:  1.1min finished


For column: y
{'max_depth': 4, 'min_child_weight': 1}
0.9236192561726314
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:  1.0min finished


For column: z
{'max_depth': 4, 'min_child_weight': 1}
0.9199725609808335
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:  1.0min finished


For column: Vx
{'max_depth': 4, 'min_child_weight': 1}
0.7469426135545694
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:  1.1min finished


For column: Vy
{'max_depth': 4, 'min_child_weight': 1}
0.71681519755171
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  20 out of  20 | elapsed:   59.9s finished


For column: Vz
{'max_depth': 4, 'min_child_weight': 2}
0.7080882365054846
-------------------------



In [165]:
param_grid_2 = {
 'max_depth': range(3,6,1),
 'min_child_weight': [0.75, 1.0, 1.25, 1.75, 2.0, 2.25]
}

for column in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
    model = XGBRegressor(**config)
    grid_search_1 = GridSearchCV(estimator=model, param_grid=param_grid_2, n_jobs=-1, iid=False, cv=5, verbose=True)
    grid_search_1.fit(X_train, target_df[column].values)
    
    print("For column: {}".format(column))
    print(grid_search_1.best_params_)
    print(grid_search_1.best_score_)
    print("-------------------------\n")

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.8min finished


For column: x
{'max_depth': 5, 'min_child_weight': 1.25}
0.9220216215518487
-------------------------

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.9min finished


For column: y
{'max_depth': 5, 'min_child_weight': 2.25}
0.9238235026881749
-------------------------

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.7min finished


For column: z
{'max_depth': 4, 'min_child_weight': 2.25}
0.919999318043212
-------------------------

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.7min finished


For column: Vx
{'max_depth': 5, 'min_child_weight': 0.75}
0.7471534386625829
-------------------------

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.7min finished


For column: Vy
{'max_depth': 3, 'min_child_weight': 0.75}
0.7203652970058814
-------------------------

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   59.9s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.7min finished


For column: Vz
{'max_depth': 3, 'min_child_weight': 0.75}
0.7088187734276608
-------------------------



I had to save params automatically, but I forgor :/

In [176]:
columns_config = {
    "x" : {**config, **{'max_depth': 5, 'min_child_weight': 1.25}},
    "y" : {**config, **{'max_depth': 5, 'min_child_weight': 2.25}},
    "z" : {**config, **{'max_depth': 4, 'min_child_weight': 2.25}},
    "Vx" : {**config, **{'max_depth': 5, 'min_child_weight': 0.75}},
    "Vy" : {**config, **{'max_depth': 3, 'min_child_weight': 0.75}},
    "Vz" : {**config, **{'max_depth': 3, 'min_child_weight': 0.75}},
}

In [177]:
param_test3 = {
 'gamma': [i/10.0 for i in range(0,5)]
}
# Remove initial gamma
for column in columns_config:
    value = columns_config[column]
    value.pop('gamma')

for column in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
    model = XGBRegressor(**columns_config[column])
    
    grid_search_1 = GridSearchCV(estimator=model, param_grid=param_test3, n_jobs=-1, iid=False, cv=5, verbose=True)
    grid_search_1.fit(X_train, target_df[column].values)
    
    print("For column: {}".format(column))
    print(grid_search_1.best_params_)
    print(grid_search_1.best_score_)
    print("-------------------------\n")

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elapsed:  1.3min remaining:   19.8s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.3min finished


For column: x
{'gamma': 0.0}
0.9220216215518487
-------------------------

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elapsed:  1.3min remaining:   19.1s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.3min finished


For column: y
{'gamma': 0.0}
0.9238235026881749
-------------------------

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elapsed:   58.8s remaining:   14.7s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   59.4s finished


For column: z
{'gamma': 0.0}
0.919999318043212
-------------------------

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elapsed:  1.3min remaining:   18.8s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.3min finished


For column: Vx
{'gamma': 0.3}
0.7471534420749751
-------------------------

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elapsed:   46.2s remaining:   11.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   46.5s finished


For column: Vy
{'gamma': 0.0}
0.7203652970058814
-------------------------

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elapsed:   47.0s remaining:   11.8s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   47.1s finished


For column: Vz
{'gamma': 0.0}
0.7088187734276608
-------------------------



Okay, gamma = 0.0

In [178]:
columns_config = {
    "x" : {**config, **{'gamma': 0.0}},
    "y" : {**config, **{'gamma': 0.0}},
    "z" : {**config, **{'gamma': 0.0}},
    "Vx" : {**config, **{'gamma': 0.0}},
    "Vy" : {**config, **{'gamma': 0.0}},
    "Vz" : {**config, **{'gamma': 0.0}},
}

In [181]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1]
}

for column in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
    model = XGBRegressor(**columns_config[column])
    
    grid_search_1 = GridSearchCV(estimator=model, param_grid=param_test6, n_jobs=-1, iid=False, cv=5, verbose=True)
    grid_search_1.fit(X_train, target_df[column].values)
    
    print("For column: {}".format(column))
    print(grid_search_1.best_params_)
    print(grid_search_1.best_score_)
    print("-------------------------\n")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:   31.8s remaining:   31.8s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   42.0s finished


For column: x
{'reg_alpha': 1e-05}
0.9202821761494906
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:   28.7s remaining:   28.7s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   38.6s finished


For column: y
{'reg_alpha': 0.1}
0.921969447087252
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:   28.2s remaining:   28.2s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   39.5s finished


For column: z
{'reg_alpha': 1}
0.9181795758054717
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:   29.6s remaining:   29.6s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   39.8s finished


For column: Vx
{'reg_alpha': 1e-05}
0.7468763532463811
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:   30.4s remaining:   30.4s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   40.5s finished


For column: Vy
{'reg_alpha': 1}
0.7207050739967398
-------------------------

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:   30.3s remaining:   30.3s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   40.6s finished


For column: Vz
{'reg_alpha': 0.1}
0.7088191228166302
-------------------------



In [182]:
columns_config = {
    "x" : {**config, **{'reg_alpha': 1e-05}},
    "y" : {**config, **{'reg_alpha': 0.1}},
    "z" : {**config, **{'reg_alpha': 1.0}},
    "Vx" : {**config, **{'reg_alpha': 1e-5}},
    "Vy" : {**config, **{'reg_alpha': 1.0}},
    "Vz" : {**config, **{'reg_alpha': 0.1}},
}

TODO : Tune subsample and colsample_bytree

Reduce learning rate, add more trees and submit

In [192]:
for column in columns_config:
    config = columns_config[column]
    config["n_estimators"] = 150
    config["learning_rate"] = 0.01
    config["n_jobs"] = -1

In [193]:
# n_est = 120
# booster = 'gblinear' # gbtree, gblinear

class Model():
    def __init__(self, config):
        self.config = config
        
        self.x_model = XGBRegressor(**config['x'])
        self.y_model = XGBRegressor(**config['y'])
        self.z_model = XGBRegressor(**config['z'])

        self.Vx_model = XGBRegressor(**config['Vx'])
        self.Vy_model = XGBRegressor(**config['Vy'])
        self.Vz_model = XGBRegressor(**config['Vz'])

    def fit(self, X_train, target_df):
        self.x_model.fit(X_train, target_df['x'].values)
        self.y_model.fit(X_train, target_df['y'].values)
        self.z_model.fit(X_train, target_df['z'].values)

        self.Vx_model.fit(X_train, target_df['Vx'].values)
        self.Vy_model.fit(X_train, target_df['Vy'].values)
        self.Vz_model.fit(X_train, target_df['Vz'].values)
        
    def predict(self, X):
        pred = pd.DataFrame()
        pred['x'] = self.x_model.predict(X)
        pred['y'] = self.y_model.predict(X)
        pred['z'] = self.z_model.predict(X)

        pred['Vx'] = self.Vx_model.predict(X)
        pred['Vy'] = self.Vy_model.predict(X)
        pred['Vz'] = self.Vz_model.predict(X)
        return pred
    
    def score(self, y_pred, y_true):
        return SMAPE(y_pred, y_true)

In [194]:
model = Model(columns_config)
model.fit(X_train, target_df)

In [195]:
# Holdout score
pred = model.predict(X_val)
100 * (1 - SMAPE(pred.values, holdout_df.loc[:, target_cols].values))

70.13276042361387

## Make prediction

In [196]:
def beautify_submit(df):
    submit_cols = ['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']
    return df.loc[:, submit_cols]

In [197]:
train_df = pd.read_csv('train.csv')

target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
target_df = train_df.loc[:, target_cols]
target_df['id'] = train_df['id']
train_df = train_df.drop(target_cols, axis=1)

In [198]:
X_train = train_df.drop(['id', 'epoch', 'sat_id'], axis=1).values
X_test = test_df.drop(['id', 'epoch', 'sat_id'], axis=1).values

In [200]:
model = Model(columns_config)
model.fit(X_train, target_df)

In [201]:
pred = model.predict(X_test)
pred['id'] = test_df['id']

pred = beautify_submit(pred)

In [203]:
pred.to_csv('19_01_2020.csv', index=None)
pred.head()

Unnamed: 0,id,x,y,z,Vx,Vy,Vz
0,3927,-9479.583008,-9838.43457,4397.086426,3.140021,-0.500944,0.671815
1,3928,-4842.244629,-9838.43457,4616.777344,3.17979,0.381785,0.364836
2,3929,-357.389923,-8838.695312,4616.777344,3.17979,1.511498,-0.280927
3,3930,3897.159912,-5422.936035,3416.396484,3.136972,3.111646,-1.212746
4,3931,6728.619141,-1415.591187,1621.241821,1.277891,3.13217,-2.020882


Итог:
---
59 - public score<br>
70 - holdout score