In [1]:
import numpy as np
import pandas as pd

pd.options.mode.copy_on_write = True

In [2]:
# Load data
dtypes = {
    'object_id': np.int32,
    'mjd': np.float64,
    'passband': np.int8,
    'flux': np.float32,
    'flux_err': np.float32,
    'detected': np.bool_
}

df = pd.read_csv('data/test_set_batch1.csv', dtype=dtypes)
df.head()

Unnamed: 0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str
0,0,13,59798.3205,2,-1.299735,1.357315,False,red
1,1,13,59798.3281,1,-2.095392,1.148654,False,green
2,2,13,59798.3357,3,-0.923794,1.763655,False,infrared
3,3,13,59798.3466,4,-4.009815,2.602911,False,near-z
4,4,13,59798.3576,5,-3.403503,5.367328,False,near-y


In [3]:
# Annotate data
sketches = [
    [4],
    [2, 7, 3, 5],
    [8, 3, 6, 3, 0, 5, 2, 7]
]


df['mjd_annot'] = pd.cut(df['mjd'], bins=10)
df['flux_annot'] = pd.cut(df['flux'], bins=10)
df['flux_err_annot'] = pd.cut(df['flux_err'], bins=10)
df.head()

Unnamed: 0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str,mjd_annot,flux_annot,flux_err_annot
0,0,13,59798.3205,2,-1.299735,1.357315,False,red,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"
1,1,13,59798.3281,1,-2.095392,1.148654,False,green,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"
2,2,13,59798.3357,3,-0.923794,1.763655,False,infrared,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"
3,3,13,59798.3466,4,-4.009815,2.602911,False,near-z,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"
4,4,13,59798.3576,5,-3.403503,5.367328,False,near-y,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"


In [4]:
df.max()

index                          10855957
object_id                        342868
mjd                          60674.0798
passband                              5
flux                       61436.507812
flux_err                     339.724915
detected                           True
passband_str                ultraviolet
mjd_annot         (60564.905, 60674.08]
flux_annot        (47989.44, 61436.508]
flux_err_annot       (305.799, 339.725]
dtype: object

In [5]:
df.min()

index                                    0
object_id                               13
mjd                             59582.3282
passband                                 0
flux                         -73034.171875
flux_err                          0.463753
detected                             False
passband_str                         green
mjd_annot           (59581.236, 59691.503]
flux_annot        (-73168.643, -59587.104]
flux_err_annot              (0.124, 34.39]
dtype: object

In [6]:
# Filters
def filter_isin(df, col, sketch):
    return df.loc[df[col].cat.codes.isin(sketch)]

filtered1 = filter_isin(df, 'mjd_annot', sketches[0])
filtered2 = filter_isin(df, 'mjd_annot', sketches[1])
filtered3 = filter_isin(df, 'mjd_annot', sketches[2])

def filter_xs(df, sketch):
    return pd.concat([df.xs(i) for i in sketch])

In [7]:
# MultiIndex
mi = pd.MultiIndex.from_arrays([df['flux_annot'].cat.codes, df['object_id']], names=['flux_annot', 'object_id'])
df_mi = df.set_index(mi)

df_mi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str,mjd_annot,flux_annot,flux_err_annot
flux_annot,object_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5,13,0,13,59798.3205,2,-1.299735,1.357315,False,red,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"
5,13,1,13,59798.3281,1,-2.095392,1.148654,False,green,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"
5,13,2,13,59798.3357,3,-0.923794,1.763655,False,infrared,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"
5,13,3,13,59798.3466,4,-4.009815,2.602911,False,near-z,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"
5,13,4,13,59798.3576,5,-3.403503,5.367328,False,near-y,"(59691.503, 59800.679]","(-5798.832, 7648.236]","(0.124, 34.39]"


In [8]:
max_possible_flux = (df['flux'] + df['flux_err']).max()
max_possible_flux

61729.234375

In [9]:
# %%timeit
max_possible_flux = (df['flux'] + df['flux_err']).max()

In [10]:
%%timeit
filtered = filter_isin(df, 'flux_annot', [9])
max_possible_flux = (filtered['flux'] + filtered['flux_err']).max()

30.1 ms ± 4.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
%%timeit
mi = pd.MultiIndex.from_arrays([df['flux_annot'].cat.codes, df['object_id']], names=['flux_annot', 'object_id'])
df_mi = df.set_index(mi).sort_index()
filtered = filter_xs(df_mi, [9])
max_possible_flux = (filtered['flux'] + filtered['flux_err']).max()

2.15 s ± 102 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%%timeit
filtered = filter_xs(df_mi, [9])
max_possible_flux = (filtered['flux'] + filtered['flux_err']).max()

14.9 ms ± 781 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%%timeit
df['flux'].sum()

13.6 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%%timeit
df['flux'].prod()

  return umr_prod(a, axis, dtype, out, keepdims, initial, where)
  return umr_prod(a, axis, dtype, out, keepdims, initial, where)


41.6 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%%timeit
df['flux'].mean()

28.3 ms ± 2.89 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%%timeit
filtered1['flux'].sum()

370 μs ± 40.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [17]:
%%timeit
filter_isin(df, sketches[0])['flux'].sum()

TypeError: filter_isin() missing 1 required positional argument: 'sketch'

In [None]:
%%timeit
df_mi['flux'].sum()

18.3 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
filter_xs(df_mi, sketches[0])['flux'].sum()

3.21 ms ± 724 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
