In [25]:
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

%matplotlib notebook
plt.style.use('seaborn-pastel')

SEED = 42
hold_out_portion = 0.2

In [3]:
train_df = pd.read_csv('train.csv')
# Draw the holdout rows from a dedicated generator seeded with SEED so the
# split (and therefore the holdout score) is identical on every run and on
# every re-execution of this cell.
holdout_indices = random.Random(SEED).sample(range(len(train_df)), int(len(train_df) * hold_out_portion))

# read_csv yields a default RangeIndex, so the sampled positions used by
# .iloc are also the labels that .drop expects.
holdout_df = train_df.iloc[holdout_indices, :]
train_df = train_df.drop(holdout_indices)

# fix index
train_df = train_df.reset_index(drop=True)
holdout_df = holdout_df.reset_index(drop=True)

# Separate the targets from the features; 'id' is kept in target_df so the two
# frames can be joined back together later.
target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
# Explicit copy: a column is added right below, which must not touch train_df.
target_df = train_df.loc[:, target_cols].copy()
target_df['id'] = train_df['id']
train_df = train_df.drop(target_cols, axis=1)

train_df.head()

Unnamed: 0,id,epoch,sat_id,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
0,2,2014-01-01T01:33:26.001,0,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768
1,3,2014-01-01T02:20:09.001,0,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306
2,7,2014-01-01T05:27:01.003,0,2831.900642,-39595.997138,7364.088245,1.160316,-0.131566,2.689303
3,8,2014-01-01T06:13:44.004,0,5996.014434,-39065.326088,14679.572942,1.090515,0.495341,2.514879
4,9,2014-01-01T07:00:27.004,0,8911.9528,-36900.814799,21387.028371,0.984956,1.035218,2.259425


In [6]:
# Load the test set; its rows are the ones predicted for the submission below.
# NOTE(review): this is read from './Track_1/' while train.csv is read from the
# working directory — confirm both paths are correct.
test_df = pd.read_csv('./Track_1/test.csv')

test_df.head()

Unnamed: 0,id,sat_id,epoch,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
0,3927,1,2014-02-01T00:01:45.162,-13366.891347,-14236.753503,6386.774555,4.333815,-0.692764,0.810774
1,3928,1,2014-02-01T00:22:57.007,-7370.434039,-14498.77152,7130.411325,5.077413,0.360609,0.313402
2,3929,1,2014-02-01T00:44:08.852,-572.068654,-13065.289498,7033.794876,5.519106,2.01283,-0.539412
3,3930,1,2014-02-01T01:05:20.697,6208.945257,-9076.852425,5548.2969,4.849212,4.338955,-1.8696
4,3931,1,2014-02-01T01:26:32.542,10768.200284,-2199.706707,2272.014862,1.940505,6.192887,-3.167724


In [8]:
sat_ids = train_df['sat_id'].unique()
print(len(sat_ids)) # Number of unique satellites
# np.array_equal also handles a length mismatch cleanly (returns False)
# instead of relying on an element-wise comparison of unequal shapes.
print(np.array_equal(sat_ids, np.arange(0, 600))) # Check the ids are exactly 0, 1, ..., 599

600
True


In [9]:
# if you need to use some filter
filt = train_df['sat_id'] == 0
req = train_df[filt]
req = req.set_index('id').join(target_df.set_index('id'), on='id').reset_index()
display(req.head())

Unnamed: 0,id,epoch,sat_id,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim,x,y,z,Vx,Vy,Vz
0,2,2014-01-01T01:33:26.001,0,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155
1,3,2014-01-01T02:20:09.001,0,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306,-9148.251857,-20651.43746,-20720.381279,0.7156,-3.373762,1.722115
2,7,2014-01-01T05:27:01.003,0,2831.900642,-39595.997138,7364.088245,1.160316,-0.131566,2.689303,2863.147037,-39594.503233,7420.53828,1.162076,-0.128606,2.687907
3,8,2014-01-01T06:13:44.004,0,5996.014434,-39065.326088,14679.572942,1.090515,0.495341,2.514879,6031.593902,-39056.319613,14731.102545,1.091816,0.497608,2.512783
4,9,2014-01-01T07:00:27.004,0,8911.9528,-36900.814799,21387.028371,0.984956,1.035218,2.259425,8950.655291,-36886.362968,21432.111677,0.985854,1.036692,2.25693


## Metric

In [14]:
def SMAPE(y_pred, y_true):
    """Mean of |y_pred - y_true| / (|y_pred| + |y_true|) over all elements.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and reference values; they are converted to float arrays
        and must be broadcastable against each other.

    Returns
    -------
    float
        The mean ratio. It lies in [0, 1] for finite inputs.

    Notes
    -----
    Where both the prediction and the reference are zero the ratio would be
    0/0; such elements are counted as zero error instead of turning the whole
    mean into NaN.
    """
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)

    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_pred) + np.abs(y_true)

    # Divide only where the denominator is non-zero; the remaining elements
    # keep the 0.0 they were initialised with.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio)

## Model and Validation

In [15]:
from xgboost import XGBRegressor

In [61]:
# Columns that identify a row but are not fed to the model.
non_feature_cols = ['id', 'epoch', 'sat_id']

# Feature matrices as plain arrays; the holdout frame still carries the
# target columns, so those are dropped as well.
X_train = train_df.drop(columns=non_feature_cols).values
X_val = holdout_df.drop(columns=non_feature_cols + target_cols).values

In [64]:
n_est = 120
booster = 'gblinear' # gbtree, gblinear
class Model():
    """Six independent XGBoost regressors, one per target column.

    Each regressor is trained on the same feature matrix and predicts a
    single target ('x', 'y', 'z', 'Vx', 'Vy' or 'Vz').

    Parameters
    ----------
    n_est : int
        Value passed as ``n_estimators`` to every regressor.
    booster : str
        Value passed as ``booster`` to every regressor. Defaults to the
        ``booster`` variable defined next to this class.

    Attributes
    ----------
    models : dict
        Maps each target name to its regressor, in ``TARGETS`` order.
    x_model, y_model, z_model, Vx_model, Vy_model, Vz_model
        The same regressors, exposed under their per-target attribute names.
    """

    # Target columns, in the order used for the prediction frame.
    TARGETS = ('x', 'y', 'z', 'Vx', 'Vy', 'Vz')

    def __init__(self, n_est=n_est, booster=booster):
        self.models = {}
        for name in self.TARGETS:
            regressor = XGBRegressor(booster=booster, n_estimators=n_est, objective='reg:squarederror', random_state=SEED)
            self.models[name] = regressor
            # Keep the per-target attributes (x_model, ..., Vz_model) available.
            setattr(self, name + '_model', regressor)

    def fit(self, X_train, target_df):
        """Fit every regressor on ``X_train`` against its column of ``target_df``.

        ``target_df`` must contain one column per name in ``TARGETS``.
        """
        for name in self.TARGETS:
            self.models[name].fit(X_train, target_df[name].values)

    def predict(self, X):
        """Return a DataFrame with one predicted column per target, in ``TARGETS`` order."""
        columns = list(self.TARGETS)
        return pd.DataFrame({name: self.models[name].predict(X) for name in columns}, columns=columns)

In [65]:
# Train the model on the reduced training set and report the wall-clock time.
t = time.time()
model = Model()
model.fit(X_train, target_df)
elapsed = time.time() - t
print(f"Time for fit {elapsed}")

Time for fit 65.26273918151855


In [66]:
# Predict all target columns for the holdout feature matrix.
pred = model.predict(X_val)

In [67]:
# Holdout score: 100 * (1 - SMAPE) between the predictions and the true
# target columns of the holdout rows.
holdout_truth = holdout_df[target_cols].values
100 * (1 - SMAPE(pred.values, holdout_truth))

74.76688715681863

## Make prediction

In [29]:
def beautify_submit(df):
    """Return the columns of ``df`` needed for a submission, in submission order.

    The result contains 'id' followed by the six target columns; any other
    column of ``df`` is left out. ``df`` itself is not modified.
    """
    ordered = ['id'] + ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
    return df[ordered]

In [30]:
# Reload the complete training set (no holdout rows removed this time).
train_df = pd.read_csv('train.csv')

target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
# Targets plus 'id' in one frame; the feature frame keeps everything else.
target_df = train_df[target_cols].assign(id=train_df['id'])
train_df = train_df.drop(columns=target_cols)

In [31]:
# Columns that identify a row but are not fed to the model.
non_feature_cols = ['id', 'epoch', 'sat_id']

# Feature matrices for the final fit and for the test-set prediction.
X_train = train_df.drop(columns=non_feature_cols).values
X_test = test_df.drop(columns=non_feature_cols).values

In [32]:
# Fit a fresh model on the reloaded training data for the submission.
model = Model()
model.fit(X_train, target_df)

In [33]:
# Predict the test rows, attach their ids (aligned on the row index) and
# put the columns into submission order.
test_predictions = model.predict(X_test).assign(id=test_df['id'])

pred = beautify_submit(test_predictions)

In [34]:
# Write the submission without the row index; `index` is documented as a
# boolean, so pass False explicitly instead of relying on None being falsy.
pred.to_csv('baseline.csv', index=False)
pred.head()

Unnamed: 0,id,x,y,z,Vx,Vy,Vz
0,3927,-12808.18457,-13226.972656,5384.361816,3.91985,-0.815696,0.85583
1,3928,-5955.625488,-13582.921875,5973.576172,4.417094,0.293631,0.411783
2,3929,399.847198,-12268.516602,6368.050781,4.350489,1.845988,-0.349111
3,3930,4686.44873,-7578.091797,4868.881348,3.831818,3.964919,-1.180427
4,3931,8119.049316,-1318.029053,3377.385986,0.610685,4.082651,-2.175351


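Results:
---
65.32 - public score<br>
78.06 - holdout score (does not match the 74.77 printed by the holdout cell above; presumably recorded from a different run — re-check)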