In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
import gc
from tqdm import tqdm

%matplotlib inline

# Prefix prepended to every CSV path read by the loading cell ('' = paths relative to the working directory).
PATH = ''
# Only rows of omni_data.csv whose 'year' equals this value are kept.
dataYear = 2014

In [2]:
%%time
# Load the competition tables and the external hourly "earth" (OMNI) table.
train = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'Track 1/test.csv')
submission = pd.read_csv(PATH + 'Track 1/submission.csv')
earth = pd.read_csv(PATH + 'omni_data.csv').drop(columns = 'Unnamed: 0')

# Keep only the configured year. The explicit .copy() detaches the filtered frame from
# the full table, so the column assignments below neither warn (SettingWithCopyWarning)
# nor depend on whether pandas returned a view.
earth = earth[earth['year'] == dataYear].copy()
# 'day' and 'hour' are used later as integer merge keys.
earth['day'] = earth['day'].map(int)
earth['hour'] = earth['hour'].map(int)
# Missing measurements are replaced by 0 (reassignment instead of inplace=True).
earth = earth.fillna(0)
gc.collect()

Wall time: 7.83 s


### LB Score Calc

Для подсчёта скора используйте функцию `getLBScore`: она печатает средний SMAPE по спутникам и итоговый LB score.

In [3]:
def smape(satellite_predicted_values, satellite_true_values):
    """Mean of |pred - true| / (|pred| + |true|), computed pointwise.

    Parameters
    ----------
    satellite_predicted_values, satellite_true_values : array-like
        Numeric values of broadcast-compatible shapes.

    Returns
    -------
    float
        Mean pointwise error; each term lies in [0, 1] for finite inputs.

    Notes
    -----
    Points where prediction and truth are both exactly 0 have a 0/0 ratio.
    They are counted as zero error (a perfect prediction) instead of turning
    the whole mean into NaN.
    """
    predicted = np.asarray(satellite_predicted_values, dtype = float)
    actual = np.asarray(satellite_true_values, dtype = float)
    numerator = np.abs(predicted - actual)
    denominator = np.abs(predicted) + np.abs(actual)
    # Divide only where the denominator is non-zero; the remaining slots keep the 0 from `out`.
    ratio = np.divide(numerator, denominator, out = np.zeros_like(numerator), where = denominator != 0)
    return np.mean(ratio)

In [4]:
def getLBScore(real, predicted):
    """Print and return the leaderboard-style score of `predicted` against `real`.

    For every satellite id found in `real`, the six value columns
    (x, y, z, Vx, Vy, Vz) of both frames are flattened and compared with
    `smape`; the per-satellite results are averaged and converted to
    100 * (1 - mean SMAPE).

    Parameters
    ----------
    real, predicted : pandas.DataFrame
        Must contain 'sat_id' and the six value columns. Rows of one satellite
        are paired by position, so both frames must list them in the same order.

    Returns
    -------
    float
        The score that is also printed (the original version returned None).

    Raises
    ------
    ValueError
        If a satellite has a different number of rows in the two frames.
    """
    valueColumns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
    smp = []
    for sat_id in real['sat_id'].unique():
        rv = real.loc[real['sat_id'] == sat_id, valueColumns].values
        sym = predicted.loc[predicted['sat_id'] == sat_id, valueColumns].values
        if rv.shape != sym.shape:
            # Fail with a readable message instead of an opaque broadcasting error.
            raise ValueError(
                f'sat_id {sat_id}: real has {rv.shape[0]} rows but predicted has {sym.shape[0]}'
            )
        smp.append(smape(sym.ravel(), rv.ravel()))
    meanSmape = np.mean(smp)
    lbScore = 100 * (1 - meanSmape)
    print('Mean SMAPE ', meanSmape, '; LB score ', lbScore)
    return lbScore

In [53]:
%%time
# LB ~ 66
# Score obtained when the simulated values in train are used directly as the predictions.
# Columns 3..8 are overwritten with columns 9.. (presumably x..Vz and x_sim..Vz_sim;
# verify against the column order of train.csv).
trainSym = train.copy()
trainSym.iloc[:, 3:9] = train.iloc[:, 9:].values
getLBScore(train, trainSym)

Mean SMAPE  0.15090231392579082 ; LB score  84.90976860742092
Wall time: 2.64 s


In [54]:
%%time
# Score obtained when the simulated values are used as predictions,
# restricted to the satellites that also appear in test.
trainTestSatellites = train['sat_id'].unique()
# Vectorised membership test: the test ids are computed once instead of once per train id.
trainTestSatellites = trainTestSatellites[np.isin(trainTestSatellites, test['sat_id'].unique())]
# Boolean row mask built by pandas instead of a Python-level scan per row.
trainTestData = train.loc[train['sat_id'].isin(trainTestSatellites), :]

trainSym = trainTestData.copy()
# Positional copy of columns 9.. into columns 3..8 (presumably *_sim -> x..Vz; verify column order).
trainSym.iloc[:, 3:9] = trainSym.iloc[:, 9:].values

getLBScore(trainTestData, trainSym)

Mean SMAPE  0.14887327646453175 ; LB score  85.11267235354683
Wall time: 4.19 s


In [55]:
%%time
# Score for a fully correct prediction: train is scored against itself.
getLBScore(real = train, predicted = train)

Mean SMAPE  0.0 ; LB score  100.0
Wall time: 2.26 s


In [56]:
%%time
# Score for a random prediction (uniform values in [0, 1) in columns 3..8).
# A dedicated, seeded generator keeps this reference number reproducible across runs
# without touching NumPy's global random state.
trainRnd = train.copy()
trainRnd.iloc[:, 3:9] = np.random.RandomState(42).rand(*trainRnd.iloc[:, 3:9].shape)
getLBScore(train, trainRnd)

Mean SMAPE  0.8809915007551554 ; LB score  11.900849924484458
Wall time: 2.84 s


In [57]:
# LB ~ 32
# Simulated values used as predictions, then part of the rows is zeroed out.
# NOTE(review): 157810 is a hard-coded row offset; the original note says "half of the
# values" -- confirm it matches the current length of train.
trainSym = train.copy()
trainSym.iloc[:, 3:9] = train.iloc[:, 9:].values
trainSym.iloc[157810:, 3:9] = 0
getLBScore(train, trainSym)

Mean SMAPE  0.7728933735292876 ; LB score  22.710662647071246


### Model training

In [5]:
# Targets to predict, and the inputs: timestamp, satellite id and the simulated
# counterpart ('<target>_sim') of every target.
prediction_columns = ['x','y','z','Vx', 'Vy', 'Vz']
training_columns = ['epoch', 'sat_id'] + [column + '_sim' for column in prediction_columns]

# Explicit copies: later cells add and drop columns on X / Xtest in place, which must
# never alias (or raise SettingWithCopyWarning against) the original train / test frames.
X = train.loc[:, training_columns].copy()
y = train.loc[:, prediction_columns].copy()
Xtest = test.loc[:, training_columns].copy()

In [6]:
def extractTimeFeats(data):
    """Split the time-of-day part of every 'epoch' value into its components.

    Each epoch is converted to a string; the text after the 'T' separator and
    before the first '.' is split on ':'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'epoch' column.

    Returns
    -------
    tuple of three lists of str
        First, second and third ':'-separated field (hours, minutes, seconds)
        of every row, in row order. The fields are not converted to int.
    """
    hours, minutes, seconds = [], [], []
    for stamp in data['epoch']:
        clock = str(stamp).split('T')[1].split('.')[0].split(':')
        hours.append(clock[0])
        minutes.append(clock[1])
        seconds.append(clock[2])
    return hours, minutes, seconds

In [7]:
# --- train: integer hour / minute / second plus day-of-month, then drop the raw timestamp
h, m, s = extractTimeFeats(X)
X['h'] = [int(value) for value in h]
X['m'] = [int(value) for value in m]
X['s'] = [int(value) for value in s]
# Day = third '-'-separated field of the date part (the text before 'T').
X['d'] = [int(str(stamp).split('T')[0].split('-')[2]) for stamp in train['epoch']]
X = X.drop(columns = 'epoch')

# --- test: same features
h, m, s = extractTimeFeats(Xtest)
Xtest['h'] = [int(value) for value in h]
Xtest['m'] = [int(value) for value in m]
Xtest['s'] = [int(value) for value in s]
# NOTE(review): the test day is shifted by 31 -- presumably because the test epochs fall in
# the month right after a 31-day train month, so 'd' continues the day count; confirm.
Xtest['d'] = [31 + int(str(stamp).split('T')[0].split('-')[2]) for stamp in test['epoch']]
Xtest = Xtest.drop(columns = 'epoch')

In [8]:
def _attachEarthFeatures(frame, earthData):
    """Left-join the hourly earth table onto `frame` using (d, h) == (day, hour).

    A left join keeps every row of `frame` in its original order. The default
    inner join silently drops rows that have no matching (day, hour), which
    breaks the positional pairing with `y` / `train` / `submission` that later
    cells rely on.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame containing the integer columns 'd' and 'h'.
    earthData : pandas.DataFrame
        Table with 'year', 'day', 'hour' and the measurement columns; it must
        have at most one row per (day, hour).

    Returns
    -------
    pandas.DataFrame
        `frame` plus the earth measurement columns, without 'd', 'year', 'day'
        and 'hour'. Measurements of unmatched rows are 0, the same placeholder
        the loading cell uses for missing earth values.

    Raises
    ------
    pandas.errors.MergeError
        If `earthData` has duplicate (day, hour) keys, which would multiply rows.
    """
    merged = frame.merge(earthData, how = 'left', left_on = ['d', 'h'], right_on = ['day', 'hour'],
                         validate = 'many_to_one')
    joinKeys = ['d', 'year', 'day', 'hour']
    # Columns contributed by the earth table (everything new except the join keys).
    addedColumns = [col for col in merged.columns if col not in frame.columns and col not in joinKeys]
    if addedColumns:
        merged[addedColumns] = merged[addedColumns].fillna(0)
    return merged.drop(columns = joinKeys)


X = _attachEarthFeatures(X, earth)
Xtest = _attachEarthFeatures(Xtest, earth)

# Downstream cells pair X with y (and Xtest with test/submission) purely by row position.
assert len(X) == len(y), 'X and y went out of sync after merging the earth data'
assert len(Xtest) == len(test), 'Xtest and test went out of sync after merging the earth data'

In [12]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Candidate regressors, keyed by the label printed in the log.
# ('linreg' was labelled 'logreg' before; the estimator is plain linear regression.)
models = {
      'linreg': LinearRegression(), 
      'lasso': Lasso(),
      'ridge': Ridge(),
#       'mlp': MLPRegressor(), 
      #'knn': KNeighborsRegressor(), 
      #'dtree': DecisionTreeRegressor(), 
      #'rfc': RandomForestRegressor(), 
      #'gbm': GradientBoostingRegressor()
     }

# Fraction of rows (taken from the top of the frame) used for fitting; the remaining
# tail is the validation part. The split uses this value instead of a repeated
# literal 0.8 and is computed once, since it does not depend on the model.
trainSize = 0.8
splitAt = int(X.shape[0] * trainSize)
Xtr, Xval = X.iloc[:splitAt, :], X.iloc[splitAt:, :]
Ytr, Yval = y.iloc[:splitAt, :], y.iloc[splitAt:, :]
# Ground truth of the validation rows; assumes X and train are row-aligned.
validTruth = train.iloc[splitAt:, :]

for name, model in models.items():
    print(f'Running model {name}...')
    # Select the columns first, then copy, so the assignments below write to an independent frame.
    submissionValid = validTruth[['id', 'sat_id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']].copy()
    # The same estimator instance is refitted separately for every target column.
    for predValue in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
        ytr,yval = Ytr[predValue].values, Yval[predValue].values
        model.fit(Xtr, ytr)
        ypred = model.predict(Xval)
        print(f'SMAPE on {predValue} ', smape(ypred, yval))
        submissionValid[predValue] = ypred
    getLBScore(validTruth, submissionValid)
    print('\n-----------------------------------\n')

Running model logreg...
SMAPE on x  0.8327224219980626
SMAPE on y  0.8236215090073176
SMAPE on z  0.7947025237412265
SMAPE on Vx  0.9857120780433081
SMAPE on Vy  0.9884593079651188
SMAPE on Vz  0.9851428683528857
Mean SMAPE  0.9243720088063124 ; LB score  7.562799119368757

-----------------------------------

Running model lasso...




SMAPE on x  0.8353008878445565


Реальный скор на лидерборде для модели из ячейки ниже — 64.63.

In [14]:
# LB ~ 64
# Shallow decision tree: validated on a random hold-out and used to fill `submission`.
# (The entry was labelled 'ridge' before, although the estimator is a decision tree.)
# Fixed random_state values make the hold-out split, the fitted trees and therefore the
# written submission reproducible across runs.
models = {
    'dtree': DecisionTreeRegressor(max_depth = 3, random_state = 42)
}

trainSize = 0.9
for name, model in models.items():
    print(f'Running model {name}...')
    Xtr, Xval = train_test_split(X, train_size = trainSize, random_state = 42)
    # X's index labels are used as row positions in y / train; this assumes X has a
    # default 0..n-1 index and is row-aligned with train.
    Ytr, Yval = y.iloc[Xtr.index], y.iloc[Xval.index]
    validTruth = train.iloc[Xval.index, :]
    # Select the columns first, then copy, so the assignments below write to an independent frame.
    submissionValid = validTruth[['id', 'sat_id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']].copy()
    # The same estimator instance is refitted separately for every target column.
    for predValue in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
        ytr,yval = Ytr[predValue].values, Yval[predValue].values
        model.fit(Xtr, ytr)
        ypred = model.predict(Xval)
        print(f'SMAPE on {predValue} ', smape(ypred, yval))
        submissionValid[predValue] = ypred
        # Test predictions for this target go straight into the submission frame.
        submission[predValue] = model.predict(Xtest)
    getLBScore(validTruth, submissionValid)
    print('\n-----------------------------------\n')

Running model ridge...




SMAPE on x  0.834780223017743
SMAPE on y  0.9432737939571062
SMAPE on z  0.8526274878404746
SMAPE on Vx  0.9786190960221042
SMAPE on Vy  0.9874047146710144
SMAPE on Vz  0.9757479135028767
Mean SMAPE  0.94057951015458 ; LB score  5.942048984542003

-----------------------------------



In [67]:
# Write the predictions without the DataFrame index column.
# `index` is documented as a bool; index=None only worked because None is falsy.
submission.to_csv('submission.csv', index = False)