In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

%matplotlib notebook
plt.style.use('seaborn-pastel')

SEED = 42

In [2]:
train_df = pd.read_csv('train.csv')
holdout_df = train_df.iloc[random.sample(range(len(train_df)), int(len(train_df) * 0.1)), :]

# fix index
train_df = train_df.reset_index(drop=True)
holdout_df = holdout_df.reset_index(drop=True)

target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
target_df = train_df.loc[:, target_cols]
target_df['id'] = train_df['id']
train_df = train_df.drop(target_cols, axis=1)

train_df.head()

Unnamed: 0,id,epoch,sat_id,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
0,0,2014-01-01T00:00:00.000,0,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133
1,1,2014-01-01T00:46:43.000,0,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468
2,2,2014-01-01T01:33:26.001,0,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768
3,3,2014-01-01T02:20:09.001,0,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306
4,4,2014-01-01T03:06:52.002,0,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237


In [3]:
test_df = pd.read_csv('Track_1/test.csv')

test_df.head()

Unnamed: 0,id,sat_id,epoch,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
0,3927,1,2014-02-01T00:01:45.162,-13366.891347,-14236.753503,6386.774555,4.333815,-0.692764,0.810774
1,3928,1,2014-02-01T00:22:57.007,-7370.434039,-14498.77152,7130.411325,5.077413,0.360609,0.313402
2,3929,1,2014-02-01T00:44:08.852,-572.068654,-13065.289498,7033.794876,5.519106,2.01283,-0.539412
3,3930,1,2014-02-01T01:05:20.697,6208.945257,-9076.852425,5548.2969,4.849212,4.338955,-1.8696
4,3931,1,2014-02-01T01:26:32.542,10768.200284,-2199.706707,2272.014862,1.940505,6.192887,-3.167724


In [4]:
sat_ids = train_df['sat_id'].unique()
# Number of inuqie satelites
print(len(sat_ids))
# Their number are 0, 1, ..., 600
print(np.all(sat_ids == np.arange(0, 600)))

600
True


In [5]:
# if you need to use some filter
filt = train_df['sat_id'] == 0
req = train_df[filt]
req = req.set_index('id').join(target_df.set_index('id'), on='id').reset_index()
display(req.head())

Unnamed: 0,id,epoch,sat_id,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim,x,y,z,Vx,Vy,Vz
0,0,2014-01-01T00:00:00.000,0,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083
1,1,2014-01-01T00:46:43.000,0,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796
2,2,2014-01-01T01:33:26.001,0,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155
3,3,2014-01-01T02:20:09.001,0,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306,-9148.251857,-20651.43746,-20720.381279,0.7156,-3.373762,1.722115
4,4,2014-01-01T03:06:52.002,0,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703


## Metric

In [6]:
def SMAPE(y_pred, y_true): 
    return np.mean(np.abs((y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))))

## Model and Validation

In [7]:
from xgboost import XGBRegressor

In [8]:
X_train = train_df.drop(['id', 'epoch', 'sat_id'], axis=1).values
X_val = holdout_df.drop(['id', 'epoch', 'sat_id'] + target_cols, axis=1).values

In [12]:
class Model():
    def __init__(self, n_est=20):
        self.x_model = XGBRegressor(n_estimators=n_est, objective='reg:squarederror', random_state=SEED)
        self.y_model = XGBRegressor(n_estimators=n_est, objective='reg:squarederror', random_state=SEED)
        self.z_model = XGBRegressor(n_estimators=n_est, objective='reg:squarederror', random_state=SEED)

        self.Vx_model = XGBRegressor(n_estimators=n_est, objective='reg:squarederror', random_state=SEED)
        self.Vy_model = XGBRegressor(n_estimators=n_est, objective='reg:squarederror', random_state=SEED)
        self.Vz_model = XGBRegressor(n_estimators=n_est, objective='reg:squarederror', random_state=SEED)
    def fit(self, X_train, target_df):
        self.x_model.fit(X_train, target_df['x'].values)
        self.y_model.fit(X_train, target_df['y'].values)
        self.z_model.fit(X_train, target_df['z'].values)

        self.Vx_model.fit(X_train, target_df['Vx'].values)
        self.Vy_model.fit(X_train, target_df['Vy'].values)
        self.Vz_model.fit(X_train, target_df['Vz'].values)
    def predict(self, X):
        pred = pd.DataFrame()
        pred['x'] = self.x_model.predict(X)
        pred['y'] = self.y_model.predict(X)
        pred['z'] = self.z_model.predict(X)

        pred['Vx'] = self.Vx_model.predict(X)
        pred['Vy'] = self.Vy_model.predict(X)
        pred['Vz'] = self.Vz_model.predict(X)
        return pred

In [13]:
model = Model()
model.fit(X_train, target_df.drop(['id'], axis=1))

In [14]:
pred = model.predict(X_val)

In [15]:
# Holdout score
SMAPE(pred.values, holdout_df.loc[:, target_cols].values)

0.25198066829210075

## Make prediction

In [16]:
def beautify_submit(df):
    submit_cols = ['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']
    return df.loc[:, submit_cols]

In [18]:
X_test = test_df.drop(['id', 'epoch', 'sat_id'], axis=1).values

In [19]:
pred = model.predict(X_test)
pred['id'] = test_df['id']

pred = beautify_submit(pred)

In [20]:
pred.to_csv('baseline.csv', index=None)
pred.head()

Unnamed: 0,id,x,y,z,Vx,Vy,Vz
0,3927,-10488.30957,-11056.802734,5118.977051,3.464758,-0.674862,0.696167
1,3928,-5358.646973,-11056.802734,5370.294434,3.53478,0.360124,0.323093
2,3929,-406.584198,-9881.235352,5370.294434,3.53478,1.655924,-0.382357
3,3930,4321.761719,-6568.908691,4009.753418,3.417272,3.443033,-1.278439
4,3931,7833.654785,-1546.940918,1854.802124,1.2268,3.471484,-2.360902


Итог:
---
62.89 - public score