In [1]:
import sys
import logging
import math

import glob
import os
import sys

import pandas as pd
import numpy as np
from dask.distributed import Client

from cesium import featurize

from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

# Allow up to 1000 rows and 1000 columns when a DataFrame is rendered.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

# Put the project checkout at the front of the import path
# (presumably so the `src.*` imports in a later cell resolve; verify).
sys.path.insert(0, "/opt/vssexclude/personal/kaggle/k_tab_aug")

In [2]:
# Create a dask.distributed Client with default arguments and keep a handle to it.
client = Client()

In [3]:
# Display the client's scheduler handle as the cell output.
client.scheduler

<pooled rpc to 'tcp://127.0.0.1:43521'>

In [4]:
# Load the autoreload extension; mode 2 picks up edits to imported modules without a kernel restart.
%load_ext autoreload
%autoreload 2
# Turn off Jedi-based tab completion.
%config Completer.use_jedi = False

In [5]:
import src.config.constants as constants
import src.munging as process_data
import src.common as common

In [6]:
# Obtain the logger named "main" from the project's `common` module; it is passed to the data helpers below.
logger = common.get_logger("main")

In [9]:
# Feature names passed to cesium through the `features_to_use` argument of
# `featurize.featurize_time_series` further down. Order is kept exactly as authored.
features_to_use = [
    # Observation-time / cadence statistics
    # (presumably cesium's cadence feature group; TODO confirm).
    "all_times_nhist_numpeaks",
    "all_times_nhist_peak1_bin",
    "all_times_nhist_peak2_bin",
    "all_times_nhist_peak3_bin",
    "all_times_nhist_peak4_bin",
    "all_times_nhist_peak_1_to_2",
    "all_times_nhist_peak_1_to_3",
    "all_times_nhist_peak_1_to_4",
    "all_times_nhist_peak_2_to_3",
    "all_times_nhist_peak_2_to_4",
    "all_times_nhist_peak_3_to_4",
    "all_times_nhist_peak_val",
    "avg_double_to_single_step",
    "avg_err",
    "avgt",
    "cad_probs_1",
    "cad_probs_10",
    "cad_probs_20",
    "cad_probs_30",
    "cad_probs_40",
    "cad_probs_50",
    "cad_probs_100",
    "cad_probs_500",
    "cad_probs_1000",
    "cad_probs_5000",
    "cad_probs_10000",
    "cad_probs_50000",
    "cad_probs_100000",
    "cad_probs_500000",
    "cad_probs_1000000",
    "cad_probs_5000000",
    "cad_probs_10000000",
    "cads_avg",
    "cads_med",
    "cads_std",
    "mean",
    "med_double_to_single_step",
    "med_err",
    "n_epochs",
    "std_double_to_single_step",
    "std_err",
    "total_time",
    # Summary statistics of the values themselves
    # (presumably cesium's general feature group; TODO confirm).
    "amplitude",
    "flux_percentile_ratio_mid20",
    "flux_percentile_ratio_mid35",
    "flux_percentile_ratio_mid50",
    "flux_percentile_ratio_mid65",
    "flux_percentile_ratio_mid80",
    "max_slope",
    "maximum",
    "median",
    "median_absolute_deviation",
    "minimum",
    "percent_amplitude",
    "percent_beyond_1_std",
    "percent_close_to_median",
    "percent_difference_flux_percentile",
    "period_fast",
    "qso_log_chi2_qsonu",
    "qso_log_chi2nuNULL_chi2nu",
    "skew",
    "std",
    "stetson_j",
    "stetson_k",
    "weighted_average",
    # Frequency / period-folding features
    # (presumably cesium's Lomb-Scargle feature group; TODO confirm).
    "fold2P_slope_10percentile",
    "fold2P_slope_90percentile",
    "freq1_amplitude1",
    "freq1_amplitude2",
    "freq1_amplitude3",
    "freq1_amplitude4",
    "freq1_freq",
    "freq1_lambda",
    "freq1_rel_phase2",
    "freq1_rel_phase3",
    "freq1_rel_phase4",
    "freq1_signif",
    "freq2_amplitude1",
    "freq2_amplitude2",
    "freq2_amplitude3",
    "freq2_amplitude4",
    "freq2_freq",
    "freq2_rel_phase2",
    "freq2_rel_phase3",
    "freq2_rel_phase4",
    "freq3_amplitude1",
    "freq3_amplitude2",
    "freq3_amplitude3",
    "freq3_amplitude4",
    "freq3_freq",
    "freq3_rel_phase2",
    "freq3_rel_phase3",
    "freq3_rel_phase4",
    "freq_amplitude_ratio_21",
    "freq_amplitude_ratio_31",
    "freq_frequency_ratio_21",
    "freq_frequency_ratio_31",
    "freq_model_max_delta_mags",
    "freq_model_min_delta_mags",
    "freq_model_phi1_phi2",
    "freq_n_alias",
    "freq_signif_ratio_21",
    "freq_signif_ratio_31",
    "freq_varrat",
    "freq_y_offset",
    "linear_trend",
    "medperc90_2p_p",
    "p2p_scatter_2praw",
    "p2p_scatter_over_mad",
    "p2p_scatter_pfold_over_mad",
    "p2p_ssqr_diff_over_var",
    "scatter_res_raw",
]

In [11]:
# Load the processed frames through the project helper. In the recorded run the log reports
# train_df with shape (250000, 101) and test_df with shape (150000, 100).
# NOTE(review): the `combined_df` returned here is replaced by the concat in the next cell.
train_df, test_df, combined_df = process_data.read_processed_data(logger=logger, data_dir=constants.PROCESSED_DATA_DIR)

[INFO]2021-08-27 13:10:26,282:main:Reading Data from /opt/vssexclude/personal/kaggle/k_tab_aug/data/processed...
[INFO]2021-08-27 13:10:26,601:main:Shape of train_df : (250000, 101)
[INFO]2021-08-27 13:10:26,743:main:Shape of test_df : (150000, 100)
[INFO]2021-08-27 13:10:26,754:main:Shape of sample_submission_df : (150000, 1)


In [12]:
# Stack the train rows (with the "loss" column removed) on top of the test rows.
combined_df = pd.concat(
    [train_df.drop(columns="loss"), test_df],
)

In [13]:
# Preview the first rows of the stacked frame.
combined_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63,f64,f65,f66,f67,f68,f69,f70,f71,f72,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82,f83,f84,f85,f86,f87,f88,f89,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
0,-0.00235,59,0.766739,-1.35046,42.272701,16.685699,30.3599,1.2673,0.392007,1.09101,1.96874,1.87464,117.286003,6.7162,0.985656,0.973428,4004232,0.377896,1.03382,0.574531,0.122371,8.18617,1517.829956,3.13219,0.112836,5.01868,116.765999,10891,8.19366,5.79715,1.09999,14.8684,-0.275434,0.915721,167.800003,-7.53414,4.23632,1.62956,1.1444,-0.303139,4.09209,3.22617,0.074799,0.259497,289.492004,327.464996,5.38931,7.39479,-0.48995,20.292299,2.4566,1.44772,-10639.0,85.600502,-0.178513,2815,-234.772003,1.83317,88.560501,0.367916,8575340000.0,70.973297,0.380057,0.031812,1.09527,0.563482,0.122689,1.16072,1.69391,1.07217,65.154297,0.022504,-5.6068,1.79866,0.528068,6696.299805,-0.562078,1.30102,6.71624,1.14347,2.29983,0.010485,-0.127223,0.231086,4.51614,0.594494,397,0.264022,8.6879,15.0701,0.376622,-42.439899,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898
1,0.784462,145,-0.463845,-0.530421,27324.900391,3.47545,160.498001,0.828007,3.73586,1.28138,-2.73947,-0.529506,157.669998,0.696384,1.44123,0.159056,23567462,-0.089583,-0.711628,-1.04586,0.13992,3.69294,-123.353996,7.74097,-0.852302,8.51025,161.175003,87801,12.0202,1.78393,1.231,10.1497,-0.018724,1.01128,127.401001,11.8214,5.99681,-0.95385,1.37937,1.07953,0.711923,2.90602,0.051206,0.755122,140.893005,29.5252,14.2296,3.53213,-0.405681,42.535702,1.43533,0.939771,138312.0,59.881001,-0.070959,1435,1046.880005,1.5677,29.430599,2.45515,4518200000.0,75.560204,1.98799,0.318177,1.14901,0.723574,0.184821,-0.348303,-7.17633,1.46258,43.112099,-0.060801,64.045502,2.35845,5.75969,3958.139893,1.57661,-1.24179,5.91412,0.959826,2.56631,0.000652,-0.335617,-0.271723,5.10319,8.70622,98,0.210513,7.86416,3.3719,0.147973,-184.132004,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383
2,0.317816,19,-0.432571,-0.382644,1383.26001,19.7129,31.1026,-0.515354,34.430801,1.2421,2.9018,-0.96034,118.589996,7.69642,1.48876,0.387277,235760,0.055568,0.26856,0.718133,0.027133,11.6734,270.247009,3.44051,-0.679071,13.3781,150.362,14173,1.69954,7.04728,1.10513,7.62871,0.685721,0.809485,120.064003,194.427002,6.77866,0.634136,0.934386,0.92698,0.741116,2.42205,0.260362,0.626639,369.57901,370.023987,4.48893,7.8429,1.19601,43.534302,2.59182,1.25671,168881.0,83.841904,2.71323,2911,23256.900391,3.91896,97.5578,4.38538,844497000.0,99.493301,5.09804,0.258909,1.16878,0.049053,0.173547,0.937,2.05947,1.22245,50.2267,-0.026215,71.684898,2.43417,1.90456,27165.800781,-0.773223,-1.83339,4.98548,1.17087,1.17201,0.016848,-0.235581,-0.724935,3.22565,4.17099,105,-0.155451,8.91829,0.186334,0.335985,7.43721,37.218102,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055
3,0.210753,17,-0.616454,0.946362,-119.252998,4.08235,185.257004,1.38331,-47.5214,1.0913,-1.512,-1.29234,125.460999,7.34323,-3.09239,0.713795,1146032,0.326534,0.454842,0.219958,0.037982,5.15195,4893.859863,6.89751,-0.830552,4.43184,132.854996,77147,32.808998,4.0639,1.11911,3.91776,0.500353,0.206448,120.411003,233.537003,7.75131,0.625762,-1.94081,1.33321,-5.79317,1.56512,0.309483,0.134558,84.863701,24.3353,4.8712,2.55104,-0.3723,14.896,1.10013,0.892193,17006.599609,78.407799,0.425686,1723,375.23999,1.94539,98.150002,-0.109164,3543970000.0,60.808201,2.35759,0.23908,1.16116,0.535797,0.222185,1.16312,2.36343,1.4153,116.181999,0.018674,55.442799,2.22824,4.3036,2643.76001,-1.66632,0.792398,6.45162,1.07733,2.90676,0.023736,-0.091992,-0.098701,4.2782,5.34753,512,0.855981,8.27663,4.06665,0.33649,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288
4,0.439671,20,0.968126,-0.092546,74.302002,12.3065,72.185997,-0.233964,24.399099,1.10151,1.77348,-0.546781,147.186005,17.3943,0.964678,0.964894,19272478,0.121071,0.422461,-0.1031,-0.00091,8.14191,162.712997,1.56561,-0.300743,7.56458,160.994995,5780,-1.54254,8.09081,1.60582,7.04632,-1.05685,1.72744,126.848,0.910761,6.08868,0.150619,1.13461,1.55355,16.436399,2.48867,0.229194,0.37476,465.292999,76.859299,0.763062,8.20657,-0.646135,72.767403,1.44481,0.858525,-40791.898438,70.799797,0.169193,1199,-120.388,1.68786,84.064903,-0.081201,6379450000.0,103.989998,4.3963,0.248451,1.18082,0.54646,0.140713,0.804404,7.55191,-2.50995,68.245903,0.00123,47.288502,0.461938,1.07244,703.401001,0.691108,5.01408,6.06393,1.12025,1.73348,-0.001272,-0.333872,-0.063781,1.1142,5.23399,109,-0.158318,5.43062,0.991616,0.528518,290.657013,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197


In [14]:
# Show (rows, columns) of the stacked frame; the recorded output is (400000, 100).
combined_df.shape

(400000, 100)

In [13]:
# Min-max scale every column of combined_df into a new frame, leaving combined_df untouched.
# MinMaxScaler already fits and scales each column independently, so one fit over the
# whole frame yields the same per-column scaling as fitting one scaler per column.
# Building a fresh DataFrame also avoids writing float results into existing
# (possibly integer-typed) columns through `.loc`, which recent pandas warns about.
combined_df_min_max = pd.DataFrame(
    MinMaxScaler().fit_transform(combined_df),
    index=combined_df.index,      # keep the original row labels
    columns=combined_df.columns,  # keep the original column names and order
)

In [14]:
# Convert the scaled frame's float64 columns to float32 through the project helper;
# the recorded log shows one "Changing dtype ... float64 ... float32" line per column.
combined_df_min_max = process_data.change_dtype(
        logger, combined_df_min_max, np.float64, np.float32
    )

[INFO]2021-08-27 10:22:36,232:main:Changing dtype of [f0] from [<class 'numpy.float64'>] to [<class 'numpy.float32'>]
[INFO]2021-08-27 10:22:36,295:main:Changing dtype of [f1] from [<class 'numpy.float64'>] to [<class 'numpy.float32'>]
[INFO]2021-08-27 10:22:36,355:main:Changing dtype of [f2] from [<class 'numpy.float64'>] to [<class 'numpy.float32'>]
[INFO]2021-08-27 10:22:36,412:main:Changing dtype of [f3] from [<class 'numpy.float64'>] to [<class 'numpy.float32'>]
[INFO]2021-08-27 10:22:36,468:main:Changing dtype of [f4] from [<class 'numpy.float64'>] to [<class 'numpy.float32'>]
[INFO]2021-08-27 10:22:36,524:main:Changing dtype of [f5] from [<class 'numpy.float64'>] to [<class 'numpy.float32'>]
[INFO]2021-08-27 10:22:36,581:main:Changing dtype of [f6] from [<class 'numpy.float64'>] to [<class 'numpy.float32'>]
[INFO]2021-08-27 10:22:36,637:main:Changing dtype of [f7] from [<class 'numpy.float64'>] to [<class 'numpy.float32'>]
[INFO]2021-08-27 10:22:36,692:main:Changing dtype of [f8

### Create inputs for cesium

In [15]:
# One time axis (the integers 0..99) for every row of combined_df.
# NOTE(review): a later cell rebuilds `a` with only 100 entries; confirm which one is intended.
a = [np.arange(start=0, stop=100, step=1) for _ in range(0, len(combined_df))]

In [21]:
# 100 independent time axes, each holding the integers 0..99.
a = [np.arange(start=0, stop=100, step=1) for _ in range(100)]

In [24]:
# Sanity check: number of time axes and the length of the first one.
len(a), len(a[0])

(100, 100)

In [22]:
# First 100 rows of the scaled frame, as a list with one 1-D array per row.
ts = [row for row in combined_df_min_max.iloc[:100].to_numpy()]

In [23]:
# Sanity check: number of value series and the length of the first one.
len(ts), len(ts[0])

(100, 100)

In [30]:
from cesium import featurize

# Compute the requested cesium features for each (time axis, value series) pair.
# No measurement errors are supplied; scheduler="processes" is passed through as given.
fset_cesium = featurize.featurize_time_series(
    times=a,
    values=ts,
    errors=None,
    features_to_use=features_to_use,
    scheduler="processes",
)

In [48]:
# Replace the column labels with their level-0 values, i.e. keep only the first level of the column index.
fset_cesium.columns = fset_cesium.columns.get_level_values(0)

In [51]:
# Write the feature table, index included, to "cesium.parquet" (a path relative to the working directory).
# NOTE(review): the following cell reads a different file (cesium_features_1.parquet under
# constants.FEATURES_DATA_DIR); confirm which artifact is the intended one.
fset_cesium.to_parquet("cesium.parquet", index=True)

In [15]:
# Load a stored cesium feature table from the project's features directory.
# NOTE(review): this is not the file written by the `to_parquet` cell above; presumably produced by a separate run. Verify.
df = pd.read_parquet(f"{constants.FEATURES_DATA_DIR}/cesium_features_1.parquet")

In [17]:
# Preview the first rows of the loaded feature table.
df.head()

feature,all_times_nhist_numpeaks,all_times_nhist_peak1_bin,all_times_nhist_peak2_bin,all_times_nhist_peak3_bin,all_times_nhist_peak4_bin,all_times_nhist_peak_1_to_2,all_times_nhist_peak_1_to_3,all_times_nhist_peak_1_to_4,all_times_nhist_peak_2_to_3,all_times_nhist_peak_2_to_4,all_times_nhist_peak_3_to_4,all_times_nhist_peak_val,avg_err,avgt,cad_probs_1,cad_probs_10,cad_probs_20,cad_probs_30,cad_probs_40,cad_probs_50,cad_probs_100,cad_probs_500,cad_probs_1000,cad_probs_5000,cad_probs_10000,cad_probs_50000,cad_probs_100000,cad_probs_500000,cad_probs_1000000,cad_probs_5000000,cad_probs_10000000,cads_avg,cads_med,cads_std,mean,med_err,n_epochs,std_err,total_time,amplitude,flux_percentile_ratio_mid20,flux_percentile_ratio_mid35,flux_percentile_ratio_mid50,flux_percentile_ratio_mid65,flux_percentile_ratio_mid80,max_slope,maximum,median,median_absolute_deviation,minimum,percent_amplitude,percent_beyond_1_std,percent_close_to_median,percent_difference_flux_percentile,period_fast,qso_log_chi2_qsonu,qso_log_chi2nuNULL_chi2nu,skew,std,stetson_j,stetson_k,weighted_average
0,1.0,,,,,,,,,,,0.019896,0.0001,49.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.384039,0.0001,100.0,0.0,99.0,0.45084,0.19113,0.338361,0.452699,0.55985,0.772959,0.890374,0.924067,0.362063,0.149278,0.022386,0.404069,0.33,0.29,0.602679,3.30573,7.122306,-0.123582,0.56339,0.208602,1.085198,1.007635,0.384039
1,1.0,,,,,,,,,,,0.019896,0.0001,49.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.389141,0.0001,100.0,0.0,99.0,0.456057,0.143732,0.23817,0.437306,0.616274,0.794307,0.771707,0.952506,0.339848,0.146471,0.040393,0.431236,0.34,0.37,0.592125,11.713204,6.959596,0.051253,0.562886,0.20987,0.997002,0.986508,0.389141
2,1.0,,,,,,,,,,,0.019896,0.0001,49.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.414719,0.0001,100.0,0.0,99.0,0.413268,0.203387,0.361056,0.544165,0.681001,0.904587,0.648589,0.867214,0.415744,0.160334,0.040679,0.412608,0.38,0.28,0.562971,65.30343,6.766104,0.152233,0.037125,0.200383,1.097396,1.058024,0.414719
3,1.0,,,,,,,,,,,0.019896,0.0001,49.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.370996,0.0001,100.0,0.0,99.0,0.392384,0.226197,0.321468,0.458245,0.606283,0.787765,0.733048,0.841225,0.326606,0.132212,0.056456,0.377504,0.29,0.28,0.53447,4.935687,6.795354,0.067326,0.603137,0.194884,0.904983,1.01047,0.370996
4,1.0,,,,,,,,,,,0.019896,0.0001,49.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.376182,0.0001,100.0,0.0,99.0,0.400262,0.177094,0.319867,0.476586,0.571985,0.773294,0.729045,0.824837,0.38153,0.146866,0.024313,0.38959,0.33,0.3,0.582511,29.220779,6.69712,0.118499,0.204622,0.190352,0.932655,1.035006,0.376182
