In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
import gc
from tqdm import tqdm

%matplotlib inline

# Directory prefix that is concatenated in front of every CSV path read below
# (train.csv and the 'Track 1/' files); it must end with a path separator.
PATH = '../'

In [2]:
%%time
# Read the three competition CSVs (training set, Track 1 test set, Track 1
# submission template) in that order, each resolved relative to PATH.
train, test, submission = (
    pd.read_csv(PATH + relative_path)
    for relative_path in ('train.csv', 'Track 1/test.csv', 'Track 1/submission.csv')
)

Wall time: 2.57 s


In [3]:
def smape(satellite_predicted_values, satellite_true_values):
    """Return the mean of |pred - true| / (|pred| + |true|) over all elements.

    The result is a fraction (0 means a perfect match, 1 is the worst case),
    not a percentage, and no factor of 2 is applied.

    Parameters
    ----------
    satellite_predicted_values, satellite_true_values : array-like
        Numeric values of broadcast-compatible shape.  They are compared
        position by position (any pandas index is ignored).

    Returns
    -------
    float
        Mean pointwise error.  Elements where prediction and truth are both
        exactly 0 count as zero error instead of producing 0/0 = NaN, which
        would otherwise poison the whole mean.  Empty input still yields NaN.
    """
    predicted = np.asarray(satellite_predicted_values, dtype=float)
    actual = np.asarray(satellite_true_values, dtype=float)

    numerator = np.abs(predicted - actual)
    denominator = np.abs(predicted) + np.abs(actual)

    # Pre-fill with zeros and divide only where the denominator is non-zero:
    # a zero denominator implies pred == true == 0, i.e. an exact match.
    ratio = np.zeros(np.shape(denominator), dtype=float)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return np.mean(ratio)

In [4]:
# Feature columns: epoch, satellite id and the simulated position/velocity.
training_columns = [
    'epoch', 'sat_id',
    'x_sim', 'y_sim', 'z_sim',
    'Vx_sim', 'Vy_sim', 'Vz_sim',
]
# Target columns: the position/velocity values the models are fitted to.
prediction_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']

# Same feature selection for the training and the test frame.
X, Xtest = (frame.loc[:, training_columns] for frame in (train, test))
y = train.loc[:, prediction_columns]

In [5]:
# Rebind instead of dropping in place: X and Xtest were sliced out of
# train/test, so an in-place drop mutates a derived frame (pandas may emit
# SettingWithCopyWarning) and raises KeyError when this cell is run a second
# time.  errors='ignore' makes the cell safe to re-run.
X = X.drop(columns = 'epoch', errors = 'ignore')
Xtest = Xtest.drop(columns = 'epoch', errors = 'ignore')

In [6]:
def splitData(X,y, sat_id, perc = 0.8):
    """Split one satellite's rows into a leading and a trailing part.

    Rows of `X` whose 'sat_id' equals `sat_id` are selected (and the rows of
    `y` at the same index positions); the first int(n_rows * perc) of them, in
    their existing row order, form the training part and the remainder the
    validation part.  No shuffling is performed.

    Parameters
    ----------
    X : DataFrame with a 'sat_id' column.
    y : DataFrame sharing X's index.
    sat_id : value compared against X['sat_id'].
    perc : fraction of the satellite's rows assigned to the training part.

    Returns
    -------
    (Xtr, Xval, ytr, yval) : tuple of DataFrames.
    """
    belongs_to_sat = X['sat_id'] == sat_id
    sat_features = X[belongs_to_sat]
    sat_targets = y[belongs_to_sat]

    # Row position separating the training head from the validation tail.
    cut = int(sat_features.shape[0] * perc)

    return (
        sat_features.iloc[:cut, :],
        sat_features.iloc[cut:, :],
        sat_targets.iloc[:cut, :],
        sat_targets.iloc[cut:, :],
    )

In [7]:
# Satellite ids that the validation loop in this notebook routes through the
# KMeans + per-cluster LinearRegression branch; every id not listed here is
# scored with a single LinearRegression per target.  Ids are in ascending order.
linearSat = [
    5, 6, 8, 9, 11, 14, 20, 22, 24, 26,
    28, 29, 30, 31, 34, 35, 36, 39, 40, 42,
    43, 44, 46, 47, 49, 51, 53, 54, 55, 57,
    58, 60, 61, 64, 67, 68, 69, 71, 72, 74,
    75, 76, 79, 81, 83, 91, 93, 97, 102, 103,
    104, 106, 107, 108, 111, 113, 114, 115, 116, 119,
    120, 121, 122, 128, 129, 130, 131, 132, 135, 137,
    138, 139, 140, 142, 143, 144, 145, 146, 147, 148,
    152, 154, 156, 157, 158, 159, 160, 161, 162, 163,
    164, 165, 168, 170, 171, 172, 176, 179, 180, 183,
    187, 191, 193, 196, 198, 200, 203, 207, 210, 211,
    212, 214, 215, 216, 218, 220, 221, 222, 224, 225,
    226, 227, 229, 231, 232, 233, 234, 235, 237, 238,
    239, 240, 245, 250, 252, 253, 254, 259, 260, 261,
    262, 263, 265, 268, 271, 273, 276, 277, 279, 281,
    284, 285, 287, 289, 292, 293, 294, 295, 296, 297,
    298, 299, 300, 301, 302, 303, 305, 306, 307, 309,
    310, 311, 312, 313, 314, 316, 317, 319, 321, 322,
    324, 326, 330, 331, 332, 333, 335, 337, 339, 341,
    342, 343, 344, 345, 346, 347, 350, 352, 353, 355,
    356, 358, 360, 362, 367, 369, 370, 373, 374, 377,
    378, 379, 383, 384, 386, 387, 388, 389, 391, 393,
    394, 395, 396, 397, 398, 399, 400, 401, 403, 404,
    405, 408, 410, 411, 413, 414, 417, 419, 423, 424,
    425, 426, 428, 430, 431, 432, 433, 435, 436, 438,
    442, 443, 444, 446, 447, 448, 455, 456, 458, 459,
    460, 462, 463, 465, 466, 467, 469, 471, 474, 479,
    485, 487, 488, 489, 492, 496, 499, 504, 507, 510,
    515, 516, 517, 519, 520, 522, 524, 525, 526, 528,
    532, 534, 537, 540, 543, 544, 545, 547, 553, 556,
    557, 562, 563, 564, 566, 567, 570, 571, 572, 573,
    574, 578, 579, 582, 583, 586, 589, 591, 593, 594,
    599,
]

In [8]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans

RND_SEED = 44
trainSize = 0.5   # fraction of each satellite's rows (in row order) used for fitting
nTargets = 6      # number of target columns fitted and scored per satellite

cluster = KMeans(n_clusters=24, tol = 1e-3, algorithm = 'elkan', random_state=RND_SEED)
model = LinearRegression()
submissionValid = train.copy()[['id', 'sat_id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']]


def _targetScores(Xfit, yfit, Xhold, yhold):
    """Fit `model` on each target column in turn; return the hold-out SMAPE per target.

    Uses the notebook-level `model`, `nTargets` and `smape`.  The returned
    list has `nTargets` entries, ordered like the columns of `yfit`.
    """
    scores = []
    for target in range(nTargets):
        model.fit(Xfit, yfit.iloc[:, target])
        scores.append(smape(model.predict(Xhold), yhold.iloc[:, target]))
    return scores


# Set membership instead of scanning the list once per satellite.
clusteredSats = set(linearSat)

SMP = []   # every SMAPE value produced, across all satellites
smp = []   # SMAPE values of the satellite currently being processed
for sat_id in tqdm(Xtest['sat_id'].unique()):
    Xtr, Xval, ytr, yval = splitData(X,y, sat_id=sat_id, perc = trainSize)
    if sat_id not in clusteredSats:
        # One regression per target on the satellite's whole training part.
        smp = _targetScores(Xtr, ytr, Xval, yval)
    else:
        # Cluster the training rows, assign validation rows to those clusters,
        # then fit and score a separate regression inside each cluster that
        # actually occurs in the validation part.
        labelsTrain = cluster.fit_predict(Xtr)
        labelsTest = cluster.predict(Xval)
        smp = []
        for label in np.unique(labelsTest):
            inTrain = labelsTrain == label
            inVal = labelsTest == label
            smp.extend(_targetScores(Xtr[inTrain], ytr[inTrain], Xval[inVal], yval[inVal]))
    SMP.extend(smp)

# NOTE(review): each (satellite, cluster, target) SMAPE carries equal weight in
# this mean, so a clustered satellite contributes up to 24x as many terms as an
# unclustered one, regardless of how many rows each cluster holds -- confirm
# this is the intended aggregate.
print(100*(1- np.mean(SMP)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:41<00:00,  2.95it/s]


94.25310132962
