In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
import gc
from tqdm import tqdm

%matplotlib inline

# Base directory of the competition data; the read_csv calls append file names to it.
PATH = '../'

In [2]:
%%time
# Read the training set, the Track 1 test set and the Track 1 sample submission,
# all located relative to PATH.
train, test, submission = (
    pd.read_csv(f'{PATH}{relative_name}')
    for relative_name in ('train.csv', 'Track 1/test.csv', 'Track 1/submission.csv')
)

Wall time: 2.13 s


### LB Score Calc

Для подсчёта скора вызовите функцию `getLBScore(real, predicted)`: она считает SMAPE по каждому спутнику и выводит средний SMAPE и LB score.

In [3]:
def smape(satellite_predicted_values, satellite_true_values):
    """Return the mean symmetric absolute percentage error of a prediction.

    The value is ``mean(|pred - true| / (|pred| + |true|))`` computed
    element-wise, so it lies in ``[0, 1]`` (no factor of 2, no percent scaling).

    Parameters
    ----------
    satellite_predicted_values, satellite_true_values : array-like
        Numeric values with matching (broadcastable) shapes. Plain Python
        sequences are accepted as well as NumPy arrays.

    Returns
    -------
    float
        The mean of the element-wise ratios. Elements where both the
        prediction and the true value are exactly zero count as zero error
        instead of producing ``0 / 0 = NaN``.
    """
    predicted = np.asarray(satellite_predicted_values, dtype=float)
    actual = np.asarray(satellite_true_values, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = np.abs(predicted) + np.abs(actual)

    # Divide only where the denominator is non-zero; the pre-filled zeros are
    # kept for the 0/0 case (a perfect prediction of a zero value).
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio)

In [4]:
def getLBScore(real, predicted):
    """Print and return the leaderboard-style score of a prediction.

    For every ``sat_id`` present in ``real`` the SMAPE is computed over the
    flattened ``x, y, z, Vx, Vy, Vz`` values of that satellite; the score is
    ``100 * (1 - mean of the per-satellite SMAPEs)``.

    Parameters
    ----------
    real : pandas.DataFrame
        Ground truth with a ``sat_id`` column and the six value columns.
    predicted : pandas.DataFrame
        Predictions with the same columns. Rows of each satellite must be in
        the same order as in ``real`` because values are compared by position.

    Returns
    -------
    float
        The score that is also printed (previously it was only printed).

    Raises
    ------
    ValueError
        If a satellite has a different number of rows in the two frames.
    """
    value_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
    per_satellite_smape = []
    for sat_id in real['sat_id'].unique():
        true_values = real.loc[real['sat_id'] == sat_id, value_columns].values
        predicted_values = predicted.loc[predicted['sat_id'] == sat_id, value_columns].values
        # Values are paired positionally, so a row-count mismatch would
        # otherwise surface as an obscure broadcasting error inside smape.
        if predicted_values.shape != true_values.shape:
            raise ValueError(
                'sat_id {}: {} real rows but {} predicted rows'.format(
                    sat_id, len(true_values), len(predicted_values)))
        per_satellite_smape.append(smape(predicted_values.ravel(), true_values.ravel()))
    meanSmape = np.mean(per_satellite_smape)
    lbScore = 100 * (1 - meanSmape)
    print('Mean SMAPE ', meanSmape, '; LB score ', lbScore)
    return lbScore

In [5]:
%%time
# LB ~ 66
# Score on train when the simulated values are used directly as predictions.
# Columns are addressed by position: columns 3..8 are overwritten with columns 9 onward.
trainSym = train.copy()
trainSym.iloc[:, 3:9] = train.iloc[:, 9:].values
getLBScore(train, trainSym)

Mean SMAPE  0.15090231392579082 ; LB score  84.90976860742092
Wall time: 2.22 s


In [6]:
%%time
# Score on train when the simulated values are used as predictions,
# restricted to the satellites that also appear in test.
# Vectorised membership tests replace the per-element Python `in` checks
# (which also recomputed test['sat_id'].unique() for every satellite).
trainTestSatellites = train['sat_id'].unique()
trainTestSatellites = trainTestSatellites[np.isin(trainTestSatellites, test['sat_id'].unique())]
trainTestData = train.loc[train['sat_id'].isin(trainTestSatellites), :]

trainSym = trainTestData.copy()
# Positional overwrite: columns 3..8 receive the values of columns 9 onward.
trainSym.iloc[:, 3:9] = trainSym.iloc[:, 9:].values

getLBScore(trainTestData, trainSym)

Mean SMAPE  0.14887327646453175 ; LB score  85.11267235354683
Wall time: 4.05 s


In [7]:
%%time
# Score for a perfect prediction (the ground truth is passed as its own prediction).
getLBScore(train, train)

Mean SMAPE  0.0 ; LB score  100.0
Wall time: 1.83 s


In [8]:
%%time
# Score for a random prediction (uniform values in [0, 1) in columns 3..8).
# A dedicated seeded generator makes the reported score reproducible
# without touching NumPy's global random state.
rng = np.random.RandomState(42)
trainRnd = train.copy()
trainRnd.iloc[:, 3:9] = rng.rand(*train.iloc[:, 3:9].shape)
getLBScore(train, trainRnd)

Mean SMAPE  0.8808432094280598 ; LB score  11.915679057194017
Wall time: 2.38 s


In [9]:
# LB ~ 32
# Zero out part of the predictions: rows before position 157810 keep the
# simulated values (columns 9 onward copied into columns 3..8), the rest are 0.
# NOTE(review): the original comment says "half" of the values are zeroed;
# confirm that 157810 is the intended cut-off for this frame.
trainSym = train.copy()
trainSym.iloc[:157810, 3:9] = train.iloc[:157810, 9:].values
trainSym.iloc[157810:, 3:9] = 0
getLBScore(train, trainSym)

Mean SMAPE  0.7728933735292876 ; LB score  22.710662647071246


### Model training

In [10]:
# Model inputs: timestamp, satellite id and the simulated state vector.
training_columns = ['epoch', 'sat_id', 'x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']
# Model targets: the real state vector.
prediction_columns = ['x','y','z','Vx', 'Vy', 'Vz']
# Explicit copies: X and Xtest are modified in place later (new columns, dropped
# 'epoch'), which on a bare selection can trigger SettingWithCopyWarning.
X = train.loc[:, training_columns].copy()
y = train.loc[:, prediction_columns].copy()
Xtest = test.loc[:, training_columns].copy()

In [11]:
def extractTimeFeats(data):
    """Split the time-of-day part of ``data['epoch']`` into hour/minute/second.

    Each epoch is converted with ``str`` and must look like
    ``<date>T<HH>:<MM>:<SS>[.fraction]``. A space between date and time is
    accepted as well, which is how ``str(pandas.Timestamp)`` renders, so the
    function also works when the column has already been parsed to datetimes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``epoch`` column.

    Returns
    -------
    tuple of (list, list, list)
        Hours, minutes and seconds as lists of strings (fractional seconds are
        discarded), one entry per row, in row order.
    """
    h, m, s = [], [], []
    for epoch in data['epoch']:
        text = str(epoch)
        # Prefer the ISO 'T' separator; fall back to the space used by str(Timestamp).
        separator = 'T' if 'T' in text else ' '
        clock = text.split(separator)[1].split('.')[0].split(':')
        h.append(clock[0])
        m.append(clock[1])
        s.append(clock[2])
    return h, m, s

In [12]:
# Replace the raw 'epoch' column with integer hour/minute/second features,
# applying the same steps to the train and test feature frames.
for frame in (X, Xtest):
    # Once 'epoch' has been dropped the frame is already processed; skipping it
    # lets this cell be re-run without raising a KeyError.
    if 'epoch' not in frame.columns:
        continue
    h, m, s = extractTimeFeats(frame)
    frame['h'] = list(map(int, h))
    frame['m'] = list(map(int, m))
    frame['s'] = list(map(int, s))
    # In-place drop on purpose: rebinding `frame` would not update X / Xtest.
    frame.drop(columns='epoch', inplace=True)

In [13]:
# Satellites of the training features that also occur in the test features,
# and the training rows belonging to them. Vectorised `isin` replaces the
# per-element Python membership checks.
trainTestSatellites = X['sat_id'].unique()
trainTestSatellites = trainTestSatellites[np.isin(trainTestSatellites, Xtest['sat_id'].unique())]
trainTestData = X.loc[X['sat_id'].isin(trainTestSatellites), :]

In [13]:
# Split by satellite: the first 150 shared satellites form the validation set,
# the remaining shared satellites the hold-out set, and every other row the
# training set.
val_data = X.loc[X['sat_id'].isin(trainTestSatellites[:150]), :]
hold_data = X.loc[X['sat_id'].isin(trainTestSatellites[150:]), :]
# Label-based mask instead of a Python loop over all index labels fed to iloc.
heldout_rows = X.index.isin(val_data.index) | X.index.isin(hold_data.index)
train_data = X.loc[~heldout_rows, :]
# Fractions of rows in the train / validation / hold-out parts.
len(train_data) / len(X), len(val_data) / len(X), len(hold_data) / len(X)

(0.5150374204507687, 0.2315805832174202, 0.2533819963318111)

In [26]:
# Display the training split (pandas truncates the rendered rows).
train_data

Unnamed: 0,sat_id,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim,h,m,s
0,0,-8843.131454,13138.221690,-20741.615306,-0.907527,-3.804930,-2.024133,0,0,0
1,0,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468,0,46,43
2,0,-10571.858472,-10145.939908,-24271.169776,0.274880,-4.046788,0.718768,1,33,26
3,0,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306,2,20,9
4,0,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237,3,6,52
...,...,...,...,...,...,...,...,...,...,...
649907,599,-20717.958996,-16245.240500,5250.939232,-1.653931,3.157321,0.079069,22,0,22
649908,599,-22673.444496,-11192.339393,5243.608790,-0.945328,3.603371,-0.092202,22,25,13
649909,599,-23461.830699,-5570.167175,4966.813869,-0.087089,3.912550,-0.281989,22,50,3
649910,599,-22858.679929,373.249102,4396.055679,0.920162,4.021955,-0.485364,23,14,54


In [29]:
# Row counts of the train / validation / hold-out splits. Uses the split frames
# directly: the Xtr / Xval / Xhold aliases are only assigned in a later cell, so
# referring to them here fails on a fresh kernel.
len(train_data), len(val_data), len(hold_data)

(334729, 150507, 164676)

In [30]:
%%time

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Candidate regressors compared on the validation and hold-out splits.
models = {
    'logreg': LinearRegression(),  # note: despite the key, this is ordinary linear regression
    'lasso': Lasso(),
    'ridge': Ridge(),
    # Other candidates, currently disabled:
    # 'mlp': MLPRegressor(),
    # 'knn': KNeighborsRegressor(),
    # 'dtree': DecisionTreeRegressor(),
    # 'rfc': RandomForestRegressor(),
    # 'gbm': GradientBoostingRegressor(),
}

target_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
submission_columns = ['id', 'sat_id'] + target_columns

# Everything that depends only on the split is identical for every model, so it
# is built once. Rows are selected by index label (.loc) because the split
# frames carry the labels of the original frame.
Xtr, Xval, Xhold = train_data, val_data, hold_data
Ytr, Yval, Yhold = y.loc[Xtr.index, :], y.loc[Xval.index, :], y.loc[Xhold.index, :]
trueValid = train.loc[Xval.index, :]
trueHold = train.loc[Xhold.index, :]

for name, model in models.items():
    print(f'Running model {name}...')
    # Fresh prediction frames per model; copying only the needed rows/columns
    # avoids duplicating the whole training frame on every iteration.
    submissionValid = trueValid[submission_columns].copy()
    submissionHold = trueHold[submission_columns].copy()
    # One independent fit per target column, reusing the same estimator object.
    for predValue in target_columns:
        ytr, yval, yhold = Ytr[predValue].values, Yval[predValue].values, Yhold[predValue].values
        model.fit(Xtr, ytr)
        yval_pred = model.predict(Xval)
        yhold_pred = model.predict(Xhold)
        print('SMAPE on {} | Val: {} | Hold: {}'.format(predValue, smape(yval_pred, yval), smape(yhold_pred, yhold)))
        # Single-label assignment: `frame[[col]] = <1-D array>` is rejected by
        # recent pandas ("Columns must be same length as key").
        submissionValid[predValue] = yval_pred
        submissionHold[predValue] = yhold_pred
    print('VALIDATION')
    getLBScore(trueValid, submissionValid)
    print('HOLDOUT')
    getLBScore(trueHold, submissionHold)
    print('\n-----------------------------------\n')

Running model logreg...
SMAPE on x | Val: 0.1677998685422036 | Hold: 0.20528472514365748
SMAPE on y | Val: 0.1781878444484531 | Hold: 0.2108765890537499
SMAPE on z | Val: 0.22013289418361887 | Hold: 0.2448377298905368
SMAPE on Vx | Val: 0.20383277576094788 | Hold: 0.22815085584184827
SMAPE on Vy | Val: 0.22048238737542886 | Hold: 0.2507097782878269
SMAPE on Vz | Val: 0.26305323899477834 | Hold: 0.2779522384663766
VALIDATION
Mean SMAPE  0.19556663789442727 ; LB score  80.44333621055728
HOLDOUT
Mean SMAPE  0.2040746757738119 ; LB score  79.59253242261882

-----------------------------------

Running model lasso...
SMAPE on x | Val: 0.16779504051814803 | Hold: 0.20527315182014963
SMAPE on y | Val: 0.17817245951966043 | Hold: 0.2108627045002185
SMAPE on z | Val: 0.2201035416516983 | Hold: 0.2447957625165284
SMAPE on Vx | Val: 0.26170580359078666 | Hold: 0.2859220280791816
SMAPE on Vy | Val: 0.2801750801175843 | Hold: 0.3065620329563943
SMAPE on Vz | Val: 0.3522396476814794 | Hold: 0.374272

In [40]:
%%time

# Fit the chosen model on the full training features, one target column at a
# time, and write the test predictions into the submission frame.
models = {
    'ridge': Ridge()
}

for name, model in models.items():
    print(f'Running model {name}...')
    for predValue in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
        model.fit(X, y[predValue].values)
        # Single-label assignment: `frame[[col]] = <1-D array>` is rejected by
        # recent pandas ("Columns must be same length as key").
        # Predictions are matched to submission rows by position.
        submission[predValue] = model.predict(Xtest)
    print('\n-----------------------------------\n')

Running model ridge...

-----------------------------------

Wall time: 871 ms


In [41]:
# Actual leaderboard score: 64.63; score on the train/test split: 56-59.66.
# Write the submission without the row index.
submission.to_csv('baseline_50_50.csv', index=False)