In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
import gc
from tqdm import tqdm

%matplotlib inline

PATH = ''

In [2]:
%%time
train = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'Track 1/test.csv')
submission = pd.read_csv(PATH + 'Track 1/submission.csv')

Wall time: 2.57 s


### LB Score Calc

Для подсчета скора просто используйте функцию getLBScore

In [71]:
# там в формуле IDAO забыли двоечку походу приписать, поэтому при их формуле LB не совпадает с тем, что тут получается
def smape(satellite_predicted_values, satellite_true_values): 
    # the division, addition and subtraction are pointwise 
    return np.mean(2*np.abs(satellite_predicted_values - satellite_true_values) 
        / (np.abs(satellite_predicted_values) + np.abs(satellite_true_values)))

In [54]:
def getLBScore(real, predicted):
    smp = []
    for sat_id in real['sat_id'].unique():
        rv = real[real['sat_id'] == sat_id][['x', 'y', 'z', 'Vx', 'Vy', 'Vz']].values
        sym = predicted[predicted['sat_id'] == sat_id][['x', 'y', 'z', 'Vx', 'Vy', 'Vz']].values
        smp.append(smape(sym, rv))
    meanSmape = np.mean(smp)
    lbScore = 100 * (1 - meanSmape)
    print('Mean SMAPE ', meanSmape, '; LB score ', lbScore)

In [56]:
%%time
# Скор при использовании симулирующих значений на трейне как предсказаний
trainSym = train.copy()
trainSym.iloc[:, 3:9] = trainSym.iloc[:, 9:].values
getLBScore(train, trainSym)

Mean SMAPE  0.30180462785158163 ; LB score  69.81953721484184
Wall time: 2.68 s


In [58]:
%%time
# Скор при использовании симулирующих значений на трейне как предсказаний только для айдишников из теста
trainTestSatellites = train['sat_id'].unique()[[True if sat_id in test['sat_id'].unique() else False for sat_id in train['sat_id'].unique()]]
trainTestData = train.loc[[True if sat_id in trainTestSatellites else False for sat_id in train['sat_id'].values], :]

trainSym = trainTestData.copy()
trainSym.iloc[:, 3:9] = trainSym.iloc[:, 9:].values

getLBScore(trainTestData, trainSym)

Mean SMAPE  0.2977465529290635 ; LB score  70.22534470709365
Wall time: 4.17 s


In [132]:
%%time
getLBScore(train, train)

Mean SMAPE  0.0 ; LB score  100.0
Wall time: 2.55 s


In [60]:
%%time
# Скор при рандомном предсказании
trainRnd = train.copy()
trainRnd.iloc[:, 3:9] = np.random.rand(*train.iloc[:, 3:9].shape)
getLBScore(trainRnd, trainRnd)

Mean SMAPE  0.0 ; LB score  100.0
Wall time: 3.03 s


### Model training

In [61]:
training_columns = ['epoch', 'sat_id', 'x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']
prediction_columns = ['x','y','z','Vx', 'Vy', 'Vz']
X = train.loc[:, training_columns]
y = train.loc[:, prediction_columns]
Xtest = test.loc[:, training_columns]

In [62]:
def extractTimeFeats(data):
    timeFeats = data['epoch'].apply(lambda x: str(x).split('T')[1].split('.')[0].split(':')).values
    h,m,s = [],[],[]
    for feat in timeFeats:
        h.append(feat[0]); m.append(feat[1]); s.append(feat[2])
    return h,m,s

In [63]:
h,m,s = extractTimeFeats(X)
X.loc[:, 'h'] = list(map(int, h)); X.loc[:, 'm'] = list(map(int, m)); X.loc[:, 's'] =list(map(int, s))
X.drop(columns = 'epoch', inplace = True)

h,m,s = extractTimeFeats(Xtest)
Xtest.loc[:, 'h'] = list(map(int, h)); Xtest.loc[:, 'm'] = list(map(int, m)); Xtest.loc[:, 's'] = list(map(int, s))
Xtest.drop(columns = 'epoch', inplace = True)

In [65]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split

models = {
      'logreg': LinearRegression(), 
      'lasso': Lasso(),
      'ridge': Ridge(),
#       'mlp': MLPRegressor(), 
      #'knn': KNeighborsRegressor(), 
      #'dtree': DecisionTreeRegressor(), 
      #'rfc': RandomForestRegressor(), 
      #'gbm': GradientBoostingRegressor()
     }

trainSize = 0.8
for name, model in models.items():
    print(f'Running model {name}...')
    submissionValid = train.iloc[int(X.shape[0] * 0.8):, :].copy()[['id', 'sat_id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']]
    Xtr, Xval = X.iloc[:int(X.shape[0] * 0.8), :], X.iloc[int(X.shape[0] * 0.8):, :]
    Ytr, Yval = y.iloc[:int(X.shape[0] * 0.8), :], y.iloc[int(X.shape[0] * 0.8):, :]
    for predValue in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
        ytr,yval = Ytr[predValue].values, Yval[predValue].values
        model.fit(Xtr, ytr)
        ypred = model.predict(Xval)
        print(f'SMAPE on {predValue} ', smape(ypred, yval))
        submissionValid[[predValue]] = ypred
    getLBScore(train.iloc[int(X.shape[0] * 0.8):, :],submissionValid)
    print('\n-----------------------------------\n')

Running model logreg...
SMAPE on x  0.44593138666276755
SMAPE on y  0.4569021260033375
SMAPE on z  0.4892066897362788
SMAPE on Vx  0.5039980169178703
SMAPE on Vy  0.5282332941699648
SMAPE on Vz  0.5487530191478822
Mean SMAPE  0.43425689587355565 ; LB score  56.57431041264444

-----------------------------------

Running model lasso...
SMAPE on x  0.4459088702433908
SMAPE on y  0.45688227797431985
SMAPE on z  0.4891867574322932
SMAPE on Vx  0.607537166095202
SMAPE on Vy  0.6376480619404448
SMAPE on Vz  0.7353245025424536
Mean SMAPE  0.5050390943702264 ; LB score  49.49609056297736

-----------------------------------

Running model ridge...
SMAPE on x  0.44593138248709063
SMAPE on y  0.4569021224603669
SMAPE on z  0.48920667132115486
SMAPE on Vx  0.5039981534183725
SMAPE on Vy  0.5282334392497413
SMAPE on Vz  0.5487532578447482
Mean SMAPE  0.4342569861592736 ; LB score  56.57430138407265

-----------------------------------



Реальный скор того что внизу - 64.63

In [72]:
models = {
    'ridge': Ridge()
}

trainSize = 0.9
for name, model in models.items():
    print(f'Running model {name}...')
    Xtr, Xval = train_test_split(X, train_size = trainSize)
    Ytr, Yval = y.iloc[Xtr.index], y.iloc[Xval.index]
    submissionValid = train.iloc[Xval.index, :].copy()[['id', 'sat_id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']]
    for predValue in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
        ytr,yval = Ytr[predValue].values, Yval[predValue].values
        model.fit(Xtr, ytr)
        ypred = model.predict(Xval)
        print(f'SMAPE on {predValue} ', smape(ypred, yval))
        submissionValid[[predValue]] = ypred
        submission[[predValue]] = model.predict(Xtest)
    getLBScore(train.iloc[Xval.index, :],submissionValid)
    print('\n-----------------------------------\n')

Running model ridge...




SMAPE on x  0.3955118351043253
SMAPE on y  0.4015755672843913
SMAPE on z  0.4566425834936432
SMAPE on Vx  0.4706716199832256
SMAPE on Vy  0.4719360944633116
SMAPE on Vz  0.5440604856005392
Mean SMAPE  0.4044577157068018 ; LB score  59.55422842931982

-----------------------------------



In [74]:
# Реальный скор: 64.63, скор при трейн тест сплите 56-59.66
submission.to_csv('baseline.csv', index = None)