In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
import gc
from tqdm import tqdm

%matplotlib inline

PATH = '/kaggle/input/idao2020-track1/'

In [2]:
%%time
train = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'test.csv')
submission = pd.read_csv(PATH + 'submission.csv')

CPU times: user 3.55 s, sys: 453 ms, total: 4 s
Wall time: 4.04 s


In [3]:
h = np.cross(train[['x_sim', 'y_sim', 'z_sim']], train[['Vx_sim', 'Vy_sim', 'Vz_sim']])
nhat = np.cross([0, 0, 1], h)

In [4]:
h.shape, nhat.shape

((649912, 3), (649912, 3))

In [5]:
h

array([[-105513.90450385,     923.88872107,   45570.77973893],
       [-105512.32679845,     925.4353031 ,   45570.93948196],
       [-105512.85067838,     927.05011847,   45570.9856485 ],
       ...,
       [ -17862.17732634,   -7048.54275073,  -92280.67608504],
       [ -17861.8982763 ,   -7049.69178907,  -92280.0242109 ],
       [ -17861.95055836,   -7050.77471969,  -92279.33465936]])

In [6]:
train[['h_x', 'h_y', 'h_z']] = pd.DataFrame(h, index=train.index)
test[['h_x', 'h_y', 'h_z']] = pd.DataFrame(np.cross(test[['x_sim', 'y_sim', 'z_sim']], test[['Vx_sim', 'Vy_sim', 'Vz_sim']]), index=test.index)

In [7]:
train.head()

Unnamed: 0,id,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim,h_x,h_y,h_z
0,0,2014-01-01T00:00:00.000,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133,-105513.904504,923.888721,45570.779739
1,1,2014-01-01T00:46:43.000,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468,-105512.326798,925.435303,45570.939482
2,2,2014-01-01T01:33:26.001,0,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768,-105512.850678,927.050118,45570.985648
3,3,2014-01-01T02:20:09.001,0,-9148.251857,-20651.43746,-20720.381279,0.7156,-3.373762,1.722115,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306,-105514.511002,928.086863,45570.980643
4,4,2014-01-01T03:06:52.002,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237,-105516.080435,928.540321,45570.99908


### LB Score Calc

Для подсчета скора просто используйте функцию getLBScore

In [8]:
def smape(satellite_predicted_values, satellite_true_values): 
    # the division, addition and subtraction are pointwise 
    return np.mean(np.abs(satellite_predicted_values - satellite_true_values) / (np.abs(satellite_predicted_values) + np.abs(satellite_true_values)))

### Model training

In [9]:
training_columns = ['epoch', 'sat_id', 'x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim', 'h_x', 'h_y', 'h_z']
prediction_columns = ['x','y','z','Vx', 'Vy', 'Vz']
X = train.loc[:, training_columns]
y = train.loc[:, prediction_columns]
Xtest = test.loc[:, training_columns]

In [10]:
X.drop(columns = 'epoch', inplace = True)
Xtest.drop(columns = 'epoch', inplace = True)

In [11]:
X['r'] = np.sqrt(X.x_sim**2 + X.y_sim**2 + X.z_sim**2)
Xtest['r'] = np.sqrt(Xtest.x_sim**2 + Xtest.y_sim**2 + Xtest.z_sim**2)

X['phi'] = np.arctan(X.y_sim/X.x_sim)
Xtest['phi'] = np.arctan(Xtest.y_sim/Xtest.x_sim)

X['theta'] = np.arccos(X.z_sim/X.r)
Xtest['theta'] = np.arccos(Xtest.z_sim/Xtest.r)



X['Vr'] = np.sqrt(X.Vx_sim**2 + X.Vy_sim**2 + X.Vz_sim**2)
Xtest['Vr'] = np.sqrt(Xtest.Vx_sim**2 + Xtest.Vy_sim**2 + Xtest.Vz_sim**2)

X['Vphi'] = np.arctan(X.Vy_sim/X.Vx_sim)
Xtest['Vphi'] = np.arctan(Xtest.Vy_sim/Xtest.Vx_sim)

X['Vtheta'] = np.arccos(X.Vz_sim/X.Vr)
Xtest['Vtheta'] = np.arccos(Xtest.Vz_sim/Xtest.Vr)

In [12]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
model = Ridge()
trainSize = 0.8
submissionValid = train.copy()[['id', 'sat_id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']]
smp = []
SMP = []
nTargets = 6
for sat_id in tqdm(X['sat_id'].unique()):
    if sat_id >= 0:
        smp = []
        satX = X[X['sat_id'] == sat_id]
        satX = pd.concat([satX, satX.shift(1).fillna(0), 
                          satX.shift(2).fillna(0), satX.shift(3).fillna(0),
                         satX.shift(4).fillna(0)], axis = 1)
        satY = y[X['sat_id'] == sat_id]
        
        size = int(satX.shape[0] * trainSize)
        Xtr, Xval = satX.iloc[:size, :], satX.iloc[size:, :]
        ytr, yval = satY.iloc[:size, :], satY.iloc[size:, :]
        for i in range(nTargets):
            model.fit(Xtr, ytr.iloc[:, i])
            ypred = model.predict(Xval)
            smp.append(smape(ypred, yval.iloc[:, i]))
            SMP.append(smape(ypred, yval.iloc[:, i]))
print('Final', 100*(1- np.mean(SMP)))

100%|██████████| 600/600 [01:17<00:00,  7.70it/s]

Final 91.14454750610535





In [13]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import AdaBoostRegressor
model = Ridge()
nTargets = 6
for sat_id in tqdm(Xtest['sat_id'].unique()):
    satX = X[X['sat_id'] == sat_id].drop(columns = ['sat_id'])
    satX = pd.concat([satX, satX.shift(1).fillna(0), 
                          satX.shift(2).fillna(0), satX.shift(3).fillna(0),
                         satX.shift(4).fillna(0)], axis = 1)
    satY = y[X['sat_id'] == sat_id]
    satXtest = Xtest[Xtest['sat_id'] == sat_id].drop(columns = ['sat_id'])
    satXtest = pd.concat([satXtest, satXtest.shift(1).fillna(0),satXtest.shift(2).fillna(0), satXtest.shift(3).fillna(0),
                         satXtest.shift(4).fillna(0)], axis = 1)
    
    for i in range(nTargets):
        model.fit(satX, satY.iloc[:, i])
        ypred = model.predict(satXtest)
        submission.loc[satXtest.index, submission.columns[i+1]] = ypred

100%|██████████| 300/300 [00:44<00:00,  6.79it/s]


In [14]:
submission.to_csv('submission.csv', index = None)