In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
import gc
from tqdm import tqdm
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.cluster import KMeans
RND_SEED = 44


%matplotlib inline

PATH = '../'

In [5]:
%%time
train = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'Track 1/test.csv')
submission = pd.read_csv(PATH + 'Track 1/submission.csv')

Wall time: 2.56 s


In [6]:
def smape(satellite_predicted_values, satellite_true_values): 
    # the division, addition and subtraction are pointwise 
    return np.mean(np.abs(satellite_predicted_values - satellite_true_values) / (np.abs(satellite_predicted_values) + np.abs(satellite_true_values)))

In [7]:
training_columns = ['epoch', 'sat_id', 'x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']
prediction_columns = ['x','y','z','Vx', 'Vy', 'Vz']
X = train.loc[:, training_columns]
y = train.loc[:, prediction_columns]
Xtest = test.loc[:, training_columns]

In [8]:
X.drop(columns = 'epoch', inplace = True)
Xtest.drop(columns = 'epoch', inplace = True)

In [9]:
linearSat = [6,
 9,
 24,
 26,
 28,
 29,
 34,
 35,
 36,
 39,
 40,
 42,
 44,
 49,
 51,
 53,
 54,
 57,
 64,
 68,
 75,
 91,
 93,
 102,
 106,
 113,
 114,
 121,
 128,
 130,
 132,
 135,
 138,
 142,
 143,
 144,
 145,
 152,
 156,
 158,
 159,
 160,
 162,
 165,
 170,
 172,
 183,
 187,
 191,
 193,
 203,
 207,
 210,
 211,
 220,
 225,
 227,
 229,
 233,
 239,
 240,
 245,
 253,
 254,
 261,
 263,
 268,
 277,
 281,
 284,
 292,
 293,
 294,
 298,
 299,
 306,
 309,
 310,
 312,
 314,
 316,
 319,
 326,
 332,
 333,
 342,
 343,
 344,
 347,
 356,
 358,
 362,
 372,
 373,
 377,
 384,
 387,
 389,
 391,
 393,
 396,
 399,
 400,
 411,
 419,
 423,
 424,
 426,
 428,
 435,
 436,
 443,
 446,
 448,
 456,
 460,
 462,
 465,
 467,
 471,
 474,
 479,
 488,
 489,
 499,
 510,
 516,
 517,
 519,
 522,
 528,
 543,
 544,
 547,
 566,
 572,
 578,
 579,
 583,
 593]

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
rfc = LinearRegression()
trainSize = 0.8
cluster = KMeans(n_clusters=24, tol = 1e-3, algorithm = 'elkan', random_state=RND_SEED)
submissionValid = train.copy()[['id', 'sat_id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']]
smp = []
SMP = []
nTargets = 6
for sat_id in Xtest['sat_id'].unique():
    if sat_id not in linearSat:
        smp = []
        satX = X[X['sat_id'] == sat_id].drop(columns = ['sat_id'])
        satY = y[X['sat_id'] == sat_id]
        size = int(satX.shape[0] * trainSize)
        
        Xtr, Xval = satX.iloc[:size, :], satX.iloc[size:, :]
        ytr, yval = satY.iloc[:size, :], satY.iloc[size:, :]
        for i in range(nTargets):
            rfc.fit(Xtr, ytr.iloc[:, i])
            ypred = rfc.predict(Xval)
            smp.append(smape(ypred, yval.iloc[:, i]))
            SMP.append(smape(ypred, yval.iloc[:, i]))
        print(f'SatId:',sat_id, 'Score:', 100*(1- np.mean(smp)))
    else:
        satX = X[X['sat_id'] == sat_id].drop(columns = ['sat_id'])
        satY = y[X['sat_id'] == sat_id]
        satXtest = Xtest[Xtest['sat_id'] == sat_id].drop(columns = ['sat_id'])
        
        size = int(satX.shape[0] * trainSize)
        Xtr, Xval = satX.iloc[:size, :], satX.iloc[size:, :]
        ytr, yval = satY.iloc[:size, :], satY.iloc[size:, :]

        labelsTrain = cluster.fit_predict(Xtr)
        labelsTest = cluster.predict(Xval)
        smp = []
        for label in np.unique(labelsTest):
            for i in range(nTargets):
                rfc.fit(Xtr[labelsTrain == label], ytr[labelsTrain == label].iloc[:, i])
                ypred = rfc.predict(Xval[labelsTest == label])
                smp.append(smape(ypred, yval[labelsTest == label].iloc[:, i]))
                SMP.append(smape(ypred, yval[labelsTest == label].iloc[:, i]))
        print(f'SatId:',sat_id, 'Score:', 100*(1- np.mean(smp)))
print('Final', 100*(1- np.mean(SMP)))

SatId: 1 Score: 80.02550583038634
SatId: 2 Score: 85.21921223181886
SatId: 3 Score: 84.83233869388683
SatId: 4 Score: 82.88255117961916
SatId: 6 Score: 95.10226364034506
SatId: 9 Score: 99.33402435680362
SatId: 16 Score: 95.38404354286432
SatId: 20 Score: 62.830388411811164
SatId: 22 Score: 71.41881780297649
SatId: 24 Score: 99.33072960582506
SatId: 25 Score: 84.15128121482114
SatId: 26 Score: 88.4320910859689
SatId: 27 Score: 85.94836020567435
SatId: 28 Score: 85.56114452470293
SatId: 29 Score: 99.65802772982357
SatId: 32 Score: 86.45949102801703
SatId: 34 Score: 99.98317149911875
SatId: 35 Score: 70.51551013336164
SatId: 36 Score: 99.9340109180328
SatId: 37 Score: 20.2694957125634
SatId: 38 Score: 83.19207762871461
SatId: 39 Score: 99.69853000904641
SatId: 40 Score: 99.98712427345609
SatId: 41 Score: 82.4583849422288
SatId: 42 Score: 99.88643130097076
SatId: 44 Score: 99.74170086529512
SatId: 45 Score: 77.27991488574276
SatId: 49 Score: 99.55672186875555
SatId: 51 Score: 99.886569826

In [95]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import AdaBoostRegressor
model = LinearRegression()
cluster = KMeans(n_clusters=24, tol = 1e-3, algorithm = 'elkan', random_state=RND_SEED)
nTargets = 6
for sat_id in tqdm(Xtest['sat_id'].unique()):
    if sat_id not in linearSat:
        satX = X[X['sat_id'] == sat_id].drop(columns = ['sat_id'])
        satY = y[X['sat_id'] == sat_id]
        satXtest = Xtest[Xtest['sat_id'] == sat_id].drop(columns = ['sat_id'])
        
        for i in range(nTargets):
            model.fit(satX, satY.iloc[:, i])
            ypred = model.predict(satXtest)
            submission.loc[satXtest.index, submission.columns[i+1]] = ypred
    else:
        satX = X[X['sat_id'] == sat_id].drop(columns = ['sat_id'])
        satY = y[X['sat_id'] == sat_id]
        satXtest = Xtest[Xtest['sat_id'] == sat_id].drop(columns = ['sat_id'])
        

        labelsTrain = cluster.fit_predict(satX)
        labelsTest = cluster.predict(satXtest)
        smp = []
        for label in np.unique(labelsTest):
            for i in range(nTargets):
                model.fit(satX[labelsTrain == label], satY[labelsTrain == label].iloc[:, i])
                ypred = model.predict(satXtest[labelsTest == label])
                submission.loc[satXtest[labelsTest == label].index, submission.columns[i+1]] = ypred

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:44<00:00,  2.87it/s]


In [96]:
submission.to_csv('submission.csv', index = None)