In [1]:
import numpy as np
import pandas as pd
import lightgbm
import time
import matplotlib.pyplot as plt
import seaborn as sns

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load in data
data = 'data/'

# Training observations and the metadata tables for both splits are read whole.
train = pd.read_csv(data + 'training_set.csv')
train_meta = pd.read_csv(data + 'training_set_metadata.csv')
test_meta = pd.read_csv(data + 'test_set_metadata.csv')

# The zipped test set is opened as a chunked reader (200000 rows per chunk);
# only the first chunk is pulled into memory as `test`.
test_chunk = pd.read_csv(data + 'test_set.csv.zip', compression='zip', chunksize=200000)
test = test_chunk.get_chunk()

In [3]:
# Feature engineering
def _add_flux_features(frame):
    """Append three flux-derived columns to ``frame`` in place.

    * ``uncertainty`` - ``flux_err`` expressed as a percentage of ``|flux|``.
    * ``flux_diff`` - ``flux`` minus the mean ``flux`` of the whole frame.
    * ``flux_diff_2`` - the square of that same deviation.
    """
    flux = frame['flux']
    frame['uncertainty'] = (100*frame['flux_err'])/abs(flux)
    frame['flux_diff'] = flux - flux.mean()
    frame['flux_diff_2'] = (flux - flux.mean())**2


# Both splits get the same columns; each one is centred on its own mean flux.
_add_flux_features(train)
_add_flux_features(test)

In [4]:
# Inspect the first 20 rows after ordering by object_id, then passband, then mjd.
train.sort_values(['object_id', 'passband', 'mjd']).head(20)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,uncertainty,flux_diff,flux_diff_2
54,615,59819.1532,0,6.878784,3.633152,0,52.816777,-17.809768,317.187846
55,615,59820.1047,0,39.364853,3.775619,1,9.591345,14.676301,215.393803
56,615,59821.1026,0,-10.422381,4.172683,0,40.035794,-35.110933,1232.777635
57,615,59822.1105,0,-65.48513,4.362876,1,6.662392,-90.173682,8131.292975
58,615,59823.1505,0,-113.349159,4.069051,1,3.589838,-138.037711,19054.409734
74,615,59851.1114,0,-68.502457,3.338555,1,4.873628,-93.191009,8684.564209
105,615,59874.0599,0,-97.353195,3.13399,1,3.219196,-122.041747,14894.188078
106,615,59875.0311,0,-97.52388,2.963075,1,3.038307,-122.212432,14935.878602
107,615,59876.0231,0,-108.672577,3.449714,1,3.174411,-133.361129,17785.190801
108,615,59877.0238,0,-116.913223,3.097836,1,2.649688,-141.601775,20051.062761


In [5]:
# Preview the first five rows of train, now including the engineered columns.
train.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,uncertainty,flux_diff,flux_diff_2
0,615,59750.4229,2,-544.810303,3.622952,1,0.664993,-569.498855,324328.946158
1,615,59750.4306,1,-816.434326,5.55337,1,0.680198,-841.122878,707487.696355
2,615,59750.4383,3,-471.385529,3.801213,1,0.806392,-496.074081,246089.494111
3,615,59750.445,4,-388.984985,11.395031,1,2.929427,-413.673537,171125.79544
4,615,59752.407,2,-681.858887,4.041204,1,0.592675,-706.547439,499209.283944


In [6]:
# Aggregate time-series features
core_stats = ['min', 'max', 'mean', 'var']
full_stats = ['mean', 'var', 'min', 'max', 'sum']

# Column -> statistics computed within every (object_id, passband) group.
aggregate = {
    'mjd': list(core_stats),
    'flux': list(core_stats),
    'flux_err': core_stats + ['sum'],
    'detected': ['mean', 'var'],
    'uncertainty': list(full_stats),
    'flux_diff': list(full_stats),
    'flux_diff_2': list(full_stats),
}

# as_index=False keeps object_id and passband as ordinary columns in the result.
per_band_groups = train.groupby(['object_id', 'passband'], as_index=False)
train = per_band_groups.agg(aggregate)

In [7]:
# Preview the grouped result: one row per (object_id, passband) pair.
train.head()

Unnamed: 0_level_0,object_id,passband,mjd,mjd,mjd,mjd,flux,flux,flux,flux,...,flux_diff,flux_diff,flux_diff,flux_diff,flux_diff,flux_diff_2,flux_diff_2,flux_diff_2,flux_diff_2,flux_diff_2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max,mean,var,min,max,mean,var,...,mean,var,min,max,sum,mean,var,min,max,sum
0,615,0,59819.1532,60617.0295,60278.94073,76785.911468,-116.913223,125.182808,-3.254554,7046.718507,...,-27.943107,7046.718507,-141.601775,100.494256,-1760.415719,7715.683042,40416910.0,0.032537,20051.06,486088.0
1,615,1,59750.4306,60624.1836,60175.754255,100428.682815,-1100.440063,660.626343,-385.699911,362147.956509,...,-410.388463,362147.956509,-1125.128615,635.937791,-23802.530866,524322.716955,196934200000.0,103.277225,1265914.0,30410720.0
2,615,2,59750.4229,60624.176,60175.746548,100428.690259,-681.858887,611.984558,-134.146566,207135.439497,...,-158.835118,207135.439497,-706.547439,587.296006,-9212.436839,228792.733498,26842710000.0,46.39221,499209.3,13269980.0
3,615,3,59750.4383,60624.1913,60175.761947,100428.678892,-530.644592,445.737061,-121.103501,112509.965946,...,-145.792053,112509.965946,-555.333144,421.048509,-8455.9391,131825.461795,10433640000.0,29.50849,308394.9,7645877.0
4,615,4,59750.445,60624.2022,60175.77294,100428.738115,-422.184509,381.953735,-55.954592,85149.252849,...,-80.643144,85149.252849,-446.873061,357.265183,-4677.302381,90184.479037,4088733000.0,618.382347,199695.5,5230700.0


In [8]:
# Flatten the two-level header into single names of the form '<column>_<statistic>'.
# Key columns have an empty second level, so they come out with a trailing '_'.
flattened = [upper + "_" + lower for upper, lower in train.columns.tolist()]
train.columns = pd.Index(flattened)

In [9]:
# Strip the trailing underscore from the two grouping-key columns.
key_renames = {'object_id_': 'object_id', 'passband_': 'passband'}
train = train.rename(columns=key_renames)

In [10]:
# Display the current column names of train.
train.columns

Index(['object_id', 'passband', 'mjd_min', 'mjd_max', 'mjd_mean', 'mjd_var',
       'flux_min', 'flux_max', 'flux_mean', 'flux_var', 'flux_err_min',
       'flux_err_max', 'flux_err_mean', 'flux_err_var', 'flux_err_sum',
       'detected_mean', 'detected_var', 'uncertainty_mean', 'uncertainty_var',
       'uncertainty_min', 'uncertainty_max', 'uncertainty_sum',
       'flux_diff_mean', 'flux_diff_var', 'flux_diff_min', 'flux_diff_max',
       'flux_diff_sum', 'flux_diff_2_mean', 'flux_diff_2_var',
       'flux_diff_2_min', 'flux_diff_2_max', 'flux_diff_2_sum'],
      dtype='object')

In [11]:
# Remember the column names train has at this point, as a plain Python list.
original_columns = train.columns.tolist()

In [12]:
# Keep 'object_id' out of the list: the names left here are later dropped from
# train, and the id column has to survive that drop.
# Filtering instead of list.remove() makes this cell safe to re-run; a second
# list.remove('object_id') would raise ValueError because the name is already gone.
original_columns = [name for name in original_columns if name != 'object_id']

In [13]:
# Preview train with its single-level column names.
train.head()

Unnamed: 0,object_id,passband,mjd_min,mjd_max,mjd_mean,mjd_var,flux_min,flux_max,flux_mean,flux_var,...,flux_diff_mean,flux_diff_var,flux_diff_min,flux_diff_max,flux_diff_sum,flux_diff_2_mean,flux_diff_2_var,flux_diff_2_min,flux_diff_2_max,flux_diff_2_sum
0,615,0,59819.1532,60617.0295,60278.94073,76785.911468,-116.913223,125.182808,-3.254554,7046.718507,...,-27.943107,7046.718507,-141.601775,100.494256,-1760.415719,7715.683042,40416910.0,0.032537,20051.06,486088.0
1,615,1,59750.4306,60624.1836,60175.754255,100428.682815,-1100.440063,660.626343,-385.699911,362147.956509,...,-410.388463,362147.956509,-1125.128615,635.937791,-23802.530866,524322.716955,196934200000.0,103.277225,1265914.0,30410720.0
2,615,2,59750.4229,60624.176,60175.746548,100428.690259,-681.858887,611.984558,-134.146566,207135.439497,...,-158.835118,207135.439497,-706.547439,587.296006,-9212.436839,228792.733498,26842710000.0,46.39221,499209.3,13269980.0
3,615,3,59750.4383,60624.1913,60175.761947,100428.678892,-530.644592,445.737061,-121.103501,112509.965946,...,-145.792053,112509.965946,-555.333144,421.048509,-8455.9391,131825.461795,10433640000.0,29.50849,308394.9,7645877.0
4,615,4,59750.445,60624.2022,60175.77294,100428.738115,-422.184509,381.953735,-55.954592,85149.252849,...,-80.643144,85149.252849,-446.873061,357.265183,-4677.302381,90184.479037,4088733000.0,618.382347,199695.5,5230700.0


In [14]:
# Give every aggregated statistic its own column per passband.
#
# For each statistic column and each passband code 0-5, the new column
# '<passband>_<code>_<statistic>' holds the statistic on rows of that passband
# and NaN on every other row.
band_label = str(train.columns[1])      # second column of train; used as the name prefix
stat_names = list(train.columns[2:])    # snapshot taken before any column is added

# Collect the new columns first and attach them with a single concat: inserting
# them one at a time fragments the frame and triggers pandas' PerformanceWarning.
banded = {}
for stat_name in stat_names:
    for band in range(6):
        new_name = band_label + '_' + str(band) + '_' + str(stat_name)
        # .where keeps the value where the row belongs to this passband, NaN elsewhere.
        banded[new_name] = train[stat_name].where(train['passband'] == band)

train = pd.concat([train, pd.DataFrame(banded, index=train.index)], axis=1)

In [15]:
# Preview train with the per-passband columns appended.
train.head()

Unnamed: 0,object_id,passband,mjd_min,mjd_max,mjd_mean,mjd_var,flux_min,flux_max,flux_mean,flux_var,...,passband_2_flux_diff_2_max,passband_3_flux_diff_2_max,passband_4_flux_diff_2_max,passband_5_flux_diff_2_max,passband_0_flux_diff_2_sum,passband_1_flux_diff_2_sum,passband_2_flux_diff_2_sum,passband_3_flux_diff_2_sum,passband_4_flux_diff_2_sum,passband_5_flux_diff_2_sum
0,615,0,59819.1532,60617.0295,60278.94073,76785.911468,-116.913223,125.182808,-3.254554,7046.718507,...,,,,,486088.031654,,,,,
1,615,1,59750.4306,60624.1836,60175.754255,100428.682815,-1100.440063,660.626343,-385.699911,362147.956509,...,,,,,,30410720.0,,,,
2,615,2,59750.4229,60624.176,60175.746548,100428.690259,-681.858887,611.984558,-134.146566,207135.439497,...,499209.283944,,,,,,13269980.0,,,
3,615,3,59750.4383,60624.1913,60175.761947,100428.678892,-530.644592,445.737061,-121.103501,112509.965946,...,,308394.901128,,,,,,7645877.0,,
4,615,4,59750.445,60624.2022,60175.77294,100428.738115,-422.184509,381.953735,-55.954592,85149.252849,...,,,199695.532892,,,,,,5230700.0,


In [16]:
# Collapse the rows of each object_id into a single row by summing every column.
# The aggregation is named by the string 'sum' rather than the builtin `sum`:
# pandas maps the builtin to its own NaN-skipping group sum today, but warns
# (FutureWarning, pandas >= 2.1) that the callable will later be used directly.
# Python's builtin sum does not skip NaN, which would turn the mostly-NaN
# per-passband columns into NaN.
# NOTE(review): a NaN-skipping sum over a group with no non-NaN value yields 0,
# not NaN — confirm that is the intended value for an object missing a passband.
train = train.groupby('object_id', as_index=False).agg('sum')

In [17]:
# Preview train after collapsing to one row per object_id.
train.head()

Unnamed: 0,object_id,passband,mjd_min,mjd_max,mjd_mean,mjd_var,flux_min,flux_max,flux_mean,flux_var,...,passband_2_flux_diff_2_max,passband_3_flux_diff_2_max,passband_4_flux_diff_2_max,passband_5_flux_diff_2_max,passband_0_flux_diff_2_sum,passband_1_flux_diff_2_sum,passband_2_flux_diff_2_sum,passband_3_flux_diff_2_sum,passband_4_flux_diff_2_sum,passband_5_flux_diff_2_sum
0,615,15,358573.3335,363737.9958,361165.222271,577435.707472,-3274.856368,2603.672646,-747.608971,860884.299781,...,499209.283944,308394.901128,199695.532892,200259.513428,486088.031654,30410720.0,13269980.0,7645877.0,5230700.0,5162743.0
1,713,15,358977.5867,364014.5281,361525.838849,581750.582331,-75.411404,70.097027,-8.215843,249.517401,...,1208.012295,1375.159663,1367.17675,1513.187926,56079.077326,38806.19,38196.44,39205.44,38925.82,42042.67
2,730,15,358809.9625,363908.7947,361321.522353,554272.986103,-40.133798,154.672128,14.578134,397.101259,...,758.307863,907.47654,931.767217,1922.678962,44294.350678,31500.71,27406.82,27353.2,26917.77,29593.54
3,745,15,358670.1405,363740.3866,361184.994953,570530.08956,-40.340398,959.450644,55.519947,4710.045401,...,38457.821989,31884.441315,25263.374974,13648.019345,39086.583278,57233.67,68733.67,73163.65,67621.79,47364.54
4,1124,15,358573.3335,363737.9958,361165.222271,577435.707472,-41.71591,541.748975,43.428951,2397.205604,...,6721.235199,13254.882989,14139.977349,7135.01749,36716.863391,27071.97,38010.03,50069.21,53815.51,43772.09


In [18]:
# Display the list of names currently held in original_columns.
original_columns

['passband',
 'mjd_min',
 'mjd_max',
 'mjd_mean',
 'mjd_var',
 'flux_min',
 'flux_max',
 'flux_mean',
 'flux_var',
 'flux_err_min',
 'flux_err_max',
 'flux_err_mean',
 'flux_err_var',
 'flux_err_sum',
 'detected_mean',
 'detected_var',
 'uncertainty_mean',
 'uncertainty_var',
 'uncertainty_min',
 'uncertainty_max',
 'uncertainty_sum',
 'flux_diff_mean',
 'flux_diff_var',
 'flux_diff_min',
 'flux_diff_max',
 'flux_diff_sum',
 'flux_diff_2_mean',
 'flux_diff_2_var',
 'flux_diff_2_min',
 'flux_diff_2_max',
 'flux_diff_2_sum']

In [19]:
# Remove every column named in original_columns from train.
train = train.drop(columns=original_columns)

In [20]:
# Preview train after the columns listed in original_columns were dropped.
train.head()

Unnamed: 0,object_id,passband_0_mjd_min,passband_1_mjd_min,passband_2_mjd_min,passband_3_mjd_min,passband_4_mjd_min,passband_5_mjd_min,passband_0_mjd_max,passband_1_mjd_max,passband_2_mjd_max,...,passband_2_flux_diff_2_max,passband_3_flux_diff_2_max,passband_4_flux_diff_2_max,passband_5_flux_diff_2_max,passband_0_flux_diff_2_sum,passband_1_flux_diff_2_sum,passband_2_flux_diff_2_sum,passband_3_flux_diff_2_sum,passband_4_flux_diff_2_sum,passband_5_flux_diff_2_sum
0,615,59819.1532,59750.4306,59750.4229,59750.4383,59750.445,59752.4435,60617.0295,60624.1836,60624.176,...,499209.283944,308394.901128,199695.532892,200259.513428,486088.031654,30410720.0,13269980.0,7645877.0,5230700.0,5162743.0
1,713,59851.2006,59825.2676,59825.26,59825.2752,59825.2862,59825.2971,60674.0798,60668.0723,60668.0647,...,1208.012295,1375.159663,1367.17675,1513.187926,56079.077326,38806.19,38196.44,39205.44,38925.82,42042.67
2,730,59818.274,59798.3281,59798.3205,59798.3357,59798.3466,59798.3576,60648.0642,60652.1365,60652.1289,...,758.307863,907.47654,931.767217,1922.678962,44294.350678,31500.71,27406.82,27353.2,26917.77,29593.54
3,745,59818.2219,59770.374,59770.3662,59770.3817,59770.3928,59770.4039,60620.1257,60624.0425,60624.0348,...,38457.821989,31884.441315,25263.374974,13648.019345,39086.583278,57233.67,68733.67,73163.65,67621.79,47364.54
4,1124,59819.1532,59750.4306,59750.4229,59750.4383,59750.445,59752.4435,60617.0295,60624.1836,60624.176,...,6721.235199,13254.882989,14139.977349,7135.01749,36716.863391,27071.97,38010.03,50069.21,53815.51,43772.09


In [21]:
# Join the metadata table with train on object_id (merge defaults to an inner join).
# NOTE(review): the merged frame is only displayed here, not assigned — confirm a
# later cell stores it if it is meant to be used downstream.
train_meta.merge(train, on='object_id')

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,passband_2_flux_diff_2_max,passband_3_flux_diff_2_max,passband_4_flux_diff_2_max,passband_5_flux_diff_2_max,passband_0_flux_diff_2_sum,passband_1_flux_diff_2_sum,passband_2_flux_diff_2_sum,passband_3_flux_diff_2_sum,passband_4_flux_diff_2_sum,passband_5_flux_diff_2_sum
0,615,349.046051,-61.943836,320.796530,-51.753706,1,0.0000,0.0000,0.0000,,...,4.992093e+05,3.083949e+05,1.996955e+05,2.002595e+05,4.860880e+05,3.041072e+07,1.326998e+07,7.645877e+06,5.230700e+06,5.162743e+06
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,...,1.208012e+03,1.375160e+03,1.367177e+03,1.513188e+03,5.607908e+04,3.880619e+04,3.819644e+04,3.920544e+04,3.892582e+04,4.204267e+04
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.2320,0.2262,0.0157,40.2561,...,7.583079e+02,9.074765e+02,9.317672e+02,1.922679e+03,4.429435e+04,3.150071e+04,2.740682e+04,2.735320e+04,2.691777e+04,2.959354e+04
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,...,3.845782e+04,3.188444e+04,2.526337e+04,1.364802e+04,3.908658e+04,5.723367e+04,6.873367e+04,7.316365e+04,6.762179e+04,4.736454e+04
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,...,6.721235e+03,1.325488e+04,1.413998e+04,7.135017e+03,3.671686e+04,2.707197e+04,3.801003e+04,5.006921e+04,5.381551e+04,4.377209e+04
5,1227,35.683594,-5.379379,171.992947,-59.253501,1,0.0000,0.0000,0.0000,,...,7.552602e+02,8.055243e+02,1.080100e+03,1.397542e+03,4.483584e+04,3.125737e+04,3.167984e+04,3.135047e+04,3.025120e+04,3.393190e+04
6,1598,347.846710,-64.760857,318.929827,-49.143596,1,0.1352,0.1820,0.0304,39.7279,...,1.600637e+06,9.565544e+05,5.079710e+05,3.948310e+05,1.758654e+05,3.504372e+06,2.702932e+06,1.753384e+06,9.689403e+05,7.320698e+05
7,1632,348.595886,-63.072620,320.023289,-50.713060,1,0.6857,0.7014,0.0100,43.1524,...,7.630805e+02,9.221610e+02,9.288892e+02,1.537761e+03,3.833936e+04,3.515165e+04,3.433583e+04,3.262539e+04,2.880343e+04,2.555677e+04
8,1920,149.414062,3.433834,234.919132,42.245550,1,0.3088,0.3229,0.3360,41.1401,...,4.290689e+04,4.037663e+04,2.678881e+04,1.572918e+04,1.646764e+04,8.991627e+04,1.511053e+05,1.493320e+05,1.148741e+05,7.122908e+04
9,1926,149.414062,1.940072,236.565366,41.393323,1,0.0000,0.0000,0.0000,,...,7.292674e+02,9.713690e+02,1.135734e+03,5.999010e+03,1.756032e+04,2.767968e+04,2.636975e+04,2.704638e+04,2.937581e+04,3.851595e+04
