In [2]:
import numpy as np
import pandas as pd

# Enable pandas copy-on-write mode for the whole session.
pd.options.mode.copy_on_write = True

# PLAsTiCC Astronomical Data
- `object_id` -- Unique identifier, `int32`
- `mjd` -- Modified Julian Date, time of observation, `float64`
- `passband` -- Specific LSST passband integer, `int8`
- `flux` -- Measured flux in the passband observation, `float32`
- `flux_err` -- Uncertainty on `flux`, `float32`
- `detected` -- Whether the object's brightness differs significantly (at the 3-sigma level) from the reference template, `bool`

In [3]:
# Load test data
# Explicit dtypes, matching the column list documented in the markdown above.
dtypes = {
    'object_id': np.int32,
    'mjd': np.float64,
    'passband': np.int8,
    'flux': np.float32,
    'flux_err': np.float32,
    'detected': np.bool_
}

# Presumably the row/line counts of the corresponding CSV files (not used in
# this cell) -- TODO confirm.
batch1_rows = 10855959
batch2_rows = 44281696
sample_rows = 1000001

# Number of rows to read from the CSV; pass nrows=None to read the whole file.
N_ROWS = 1_000_000

# NOTE(review): the recorded `df.info()` output further down lists `index` and
# `passband_str` columns that this cell does not create; they presumably came
# from a cell that is no longer in the notebook.
df = pd.read_csv('data/test_set_batch1.csv', dtype=dtypes, nrows=N_ROWS)
# Alternative inputs:
# df = pd.read_csv('data/test_set_batch2.csv', dtype=dtypes)
# df = pd.read_csv('data/test_set_sample.csv', dtype=dtypes)

In [4]:
# Overview of the loaded frame: columns, dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   index         1000000 non-null  int64  
 1   object_id     1000000 non-null  int32  
 2   mjd           1000000 non-null  float64
 3   passband      1000000 non-null  int8   
 4   flux          1000000 non-null  float32
 5   flux_err      1000000 non-null  float32
 6   detected      1000000 non-null  bool   
 7   passband_str  1000000 non-null  object 
dtypes: bool(1), float32(2), float64(1), int32(1), int64(1), int8(1), object(1)
memory usage: 36.2+ MB


In [5]:
def column_memory_info(df):
    """Build a per-column memory report for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row for the index (labelled ``'Index'``) plus one row per column
        of ``df``, with the columns:

        - ``col_MB``: deep memory usage, in bytes divided by 1024**2,
        - ``dtype``: the column dtype (the index dtype is stored as a string),
        - ``num_objects``: non-null count for columns, total length for the
          index.
    """
    bytes_per_mib = 1024 ** 2
    usage_mib = df.memory_usage(index=True, deep=True) / bytes_per_mib
    report = usage_mib.to_frame(name='col_MB')

    # Both series are labelled by column name only, so the 'Index' row is
    # left empty by these assignments and is filled in explicitly below.
    report['dtype'] = df.dtypes
    report['num_objects'] = df.count()

    report.loc['Index', 'dtype'] = str(df.index.dtype)
    report.loc['Index', 'num_objects'] = df.index.size

    # Convert the counts to integers now that the 'Index' row is filled in.
    report['num_objects'] = report['num_objects'].astype(int)
    return report

# Per-column memory report for the loaded frame.
column_memory_info(df)

Unnamed: 0,col_MB,dtype,num_objects
Index,0.000126,int64,1000000
index,7.629395,int64,1000000
object_id,3.814697,int32,1000000
mjd,7.629395,float64,1000000
passband,0.953674,int8,1000000
flux,3.814697,float32,1000000
flux_err,3.814697,float32,1000000
detected,0.953674,bool,1000000
passband_str,53.032151,object,1000000


In [6]:
# Number of distinct values in each column.
df.nunique()

index           1000000
object_id          3036
mjd                1638
passband              6
flux             949056
flux_err         821938
detected              2
passband_str          6
dtype: int64

In [7]:
# Distinct object ids present in the loaded rows (plain unique values, not binned intervals).
object_id_intervals = df['object_id'].unique()

In [8]:
# Print the observed (min, max) range of each continuous column.
for value_col in ('mjd', 'flux', 'flux_err'):
    print(df[value_col].min(), df[value_col].max())

# Cut each continuous column into 10 equal-width bins. For mjd, also keep the
# integer bin code as a compact int8 column.
mjd_bins = pd.cut(df['mjd'], bins=10)
df['mjd_annot'] = mjd_bins
df['mjd_annot_int'] = mjd_bins.cat.codes.astype(np.int8)

for value_col in ('flux', 'flux_err'):
    df[f'{value_col}_annot'] = pd.cut(df[value_col], bins=10)
df

59582.3282 60674.0798
-10383.8935546875 39288.9453125
0.46375399827957153 186.75535583496094


Unnamed: 0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str,mjd_annot,mjd_annot_int,flux_annot,flux_err_annot
0,0,13,59798.3205,2,-1.299735,1.357315,False,red,"(59691.503, 59800.679]",1,"(-449.326, 4517.958]","(0.277, 19.093]"
1,1,13,59798.3281,1,-2.095392,1.148654,False,green,"(59691.503, 59800.679]",1,"(-449.326, 4517.958]","(0.277, 19.093]"
2,2,13,59798.3357,3,-0.923794,1.763655,False,infrared,"(59691.503, 59800.679]",1,"(-449.326, 4517.958]","(0.277, 19.093]"
3,3,13,59798.3466,4,-4.009815,2.602911,False,near-z,"(59691.503, 59800.679]",1,"(-449.326, 4517.958]","(0.277, 19.093]"
4,4,13,59798.3576,5,-3.403503,5.367328,False,near-y,"(59691.503, 59800.679]",1,"(-449.326, 4517.958]","(0.277, 19.093]"
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999995,32300,60583.1640,0,-0.295705,3.016026,False,ultraviolet,"(60564.905, 60674.08]",9,"(-449.326, 4517.958]","(0.277, 19.093]"
999996,999996,32300,60584.1591,0,-1.179237,2.365768,False,ultraviolet,"(60564.905, 60674.08]",9,"(-449.326, 4517.958]","(0.277, 19.093]"
999997,999997,32300,60585.1601,0,-1.001320,1.888240,False,ultraviolet,"(60564.905, 60674.08]",9,"(-449.326, 4517.958]","(0.277, 19.093]"
999998,999998,32300,60586.1564,0,3.691630,1.857002,False,ultraviolet,"(60564.905, 60674.08]",9,"(-449.326, 4517.958]","(0.277, 19.093]"


In [9]:
# Example selection: bin codes 0, 2 and 5, matched against the mjd bin codes in the cells below.
mjd_sketch_cats = np.array([0, 2, 5])


In [None]:
sk1 = [0, 2, 5]

# One equality test per wanted bin code, OR-ed together into a query string.
cond = ' or '.join(map('mjd_annot_int == {}'.format, sk1))
cond

'mjd_annot_int == 0 or mjd_annot_int == 2 or mjd_annot_int == 5'

In [None]:
# Bin codes to keep, one selection per annotated column.
sk1, sk2, sk3 = mjd_sketch_cats, [4, 3, 7], [1, 6, 8]
col1, col2, col3 = 'mjd_annot', 'flux_annot', 'flux_err_annot'

sketches = [sk1, sk2, sk3]
columns = [col1, col2, col3]

# A row is kept only if every column's bin code is in that column's selection;
# `@sketches[i]` makes query() read the selection from this namespace.
cond = ' and '.join(
    f'{columns[pos]}.cat.codes in @sketches[{pos}]' for pos in range(len(columns))
)
df.query(cond)

Unnamed: 0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str,mjd_annot,mjd_annot_int,flux_annot,flux_err_annot
37445,37445,1063,59896.0992,3,5907.763184,28.547117,True,infrared,"(59800.679, 59909.854]",2,"(4517.958, 9485.242]","(19.093, 37.722]"
37446,37446,1063,59896.1101,4,4584.966797,22.460413,True,near-z,"(59800.679, 59909.854]",2,"(4517.958, 9485.242]","(19.093, 37.722]"
37513,37513,1063,60192.3192,4,7017.147949,33.507481,True,near-z,"(60128.204, 60237.379]",5,"(4517.958, 9485.242]","(19.093, 37.722]"
37514,37514,1063,60192.3301,5,6532.17627,33.086098,True,near-y,"(60128.204, 60237.379]",5,"(4517.958, 9485.242]","(19.093, 37.722]"
37533,37533,1063,60212.2409,4,6871.523438,32.845257,True,near-z,"(60128.204, 60237.379]",5,"(4517.958, 9485.242]","(19.093, 37.722]"
37534,37534,1063,60212.2519,5,6423.333496,32.625515,True,near-y,"(60128.204, 60237.379]",5,"(4517.958, 9485.242]","(19.093, 37.722]"
133651,133651,4025,60183.2922,4,5624.978027,27.545336,True,near-z,"(60128.204, 60237.379]",5,"(4517.958, 9485.242]","(19.093, 37.722]"
133652,133652,4025,60183.3031,5,7454.17627,37.345146,True,near-y,"(60128.204, 60237.379]",5,"(4517.958, 9485.242]","(19.093, 37.722]"
133665,133665,4025,60209.1965,3,7008.894043,34.657139,True,infrared,"(60128.204, 60237.379]",5,"(4517.958, 9485.242]","(19.093, 37.722]"
133670,133670,4025,60212.183,3,4820.763184,24.084248,True,infrared,"(60128.204, 60237.379]",5,"(4517.958, 9485.242]","(19.093, 37.722]"


In [None]:
# df.query('mjd_annot.cat.codes in @mjd_sketch_cats')

In [None]:
# df[df['mjd_annot'].cat.codes.isin(mjd_sketch_cats)]

In [None]:
%%timeit
# Boolean mask via Series.isin on codes read from the categorical column on every run.
df[df['mjd_annot'].cat.codes.isin(mjd_sketch_cats)]

30 ms ± 4.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit 
# Mask built by np.isin on the raw codes array instead of Series.isin.
df[np.isin(df['mjd_annot'].cat.codes.values, mjd_sketch_cats)]

24.7 ms ± 3.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
# Series.isin on the precomputed integer codes column (no .cat.codes lookup).
df[df['mjd_annot_int'].isin(mjd_sketch_cats)]

29.4 ms ± 2.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
# Precomputed-codes mask applied through .loc rather than plain [] indexing.
df.loc[df['mjd_annot_int'].isin(mjd_sketch_cats)]

32.6 ms ± 5.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
# Selection expressed as a query string; @ pulls mjd_sketch_cats from the notebook namespace.
df.query('mjd_annot_int in @mjd_sketch_cats')

41.5 ms ± 4.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Show the rows whose mjd bin code is in the selection.
sketch = mjd_sketch_cats
df.query('mjd_annot_int in @sketch')

Unnamed: 0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str,mjd_annot,mjd_annot_int
5,5,13,59801.3553,2,-1.778855,2.448943,False,red,"(59800.679, 59909.854]",2
6,6,13,59801.3629,1,2.491993,3.540421,False,green,"(59800.679, 59909.854]",2
7,7,13,59801.3705,3,1.644129,2.284999,False,infrared,"(59800.679, 59909.854]",2
8,8,13,59801.3815,4,-0.158192,2.515900,False,near-z,"(59800.679, 59909.854]",2
9,9,13,59801.3924,5,-6.457387,5.381231,False,near-y,"(59800.679, 59909.854]",2
...,...,...,...,...,...,...,...,...,...,...
999907,999907,32300,60226.3337,2,0.991051,0.951415,False,red,"(60128.204, 60237.379]",5
999908,999908,32300,60226.3413,1,-0.170953,0.953229,False,green,"(60128.204, 60237.379]",5
999909,999909,32300,60226.3489,3,2.333871,1.803218,False,infrared,"(60128.204, 60237.379]",5
999910,999910,32300,60226.3599,4,-3.905768,3.082055,False,near-z,"(60128.204, 60237.379]",5


# MultiIndex

In [None]:
# Index the frame by (mjd bin code, object id) and sort it, so that rows can
# be selected by bin code through the index.
mi = pd.MultiIndex.from_arrays(
    [df['mjd_annot'].cat.codes, df['object_id']],
    names=['mjd_cat', 'object_id'],
)
df_mi = df.set_index(mi)
df_mi = df_mi.sort_index()
df_mi

Unnamed: 0_level_0,Unnamed: 1_level_0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str,mjd_annot,mjd_annot_int
mjd_cat,object_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,96,3782,96,59582.3282,0,40.029842,2.979331,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3783,96,59583.2409,0,63.055912,2.795795,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3784,96,59584.2432,0,84.849129,2.927637,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3785,96,59585.2363,0,110.552193,3.367064,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3786,96,59586.2371,0,134.320419,3.861461,True,ultraviolet,"(59581.236, 59691.503]",0
...,...,...,...,...,...,...,...,...,...,...,...
9,32300,999995,32300,60583.1640,0,-0.295705,3.016026,False,ultraviolet,"(60564.905, 60674.08]",9
9,32300,999996,32300,60584.1591,0,-1.179237,2.365768,False,ultraviolet,"(60564.905, 60674.08]",9
9,32300,999997,32300,60585.1601,0,-1.001320,1.888240,False,ultraviolet,"(60564.905, 60674.08]",9
9,32300,999998,32300,60586.1564,0,3.691630,1.857002,False,ultraviolet,"(60564.905, 60674.08]",9


In [None]:
# %%timeit
# Inspect the sorted (mjd_cat, object_id) index.
sketch = mjd_sketch_cats
df_mi.index

MultiIndex([(0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            ...
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300)],
           names=['mjd_cat', 'object_id'], length=1000000)

In [None]:
# %%timeit
# Select the wanted bin codes by passing the array straight to .loc on the MultiIndexed frame.
df_mi.loc[mjd_sketch_cats]

Unnamed: 0_level_0,Unnamed: 1_level_0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str,mjd_annot,mjd_annot_int
mjd_cat,object_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,96,3782,96,59582.3282,0,40.029842,2.979331,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3783,96,59583.2409,0,63.055912,2.795795,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3784,96,59584.2432,0,84.849129,2.927637,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3785,96,59585.2363,0,110.552193,3.367064,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3786,96,59586.2371,0,134.320419,3.861461,True,ultraviolet,"(59581.236, 59691.503]",0
...,...,...,...,...,...,...,...,...,...,...,...
5,32300,999907,32300,60226.3337,2,0.991051,0.951415,False,red,"(60128.204, 60237.379]",5
5,32300,999908,32300,60226.3413,1,-0.170953,0.953229,False,green,"(60128.204, 60237.379]",5
5,32300,999909,32300,60226.3489,3,2.333871,1.803218,False,infrared,"(60128.204, 60237.379]",5
5,32300,999910,32300,60226.3599,4,-3.905768,3.082055,False,near-z,"(60128.204, 60237.379]",5


In [None]:
# %%timeit
# Selection spelled out with IndexSlice: the wanted codes on the first level, a full slice on the second.
df_mi.loc[pd.IndexSlice[mjd_sketch_cats, :]]

Unnamed: 0_level_0,Unnamed: 1_level_0,index,object_id,mjd,passband,flux,flux_err,detected,passband_str,mjd_annot,mjd_annot_int
mjd_cat,object_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,96,3782,96,59582.3282,0,40.029842,2.979331,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3783,96,59583.2409,0,63.055912,2.795795,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3784,96,59584.2432,0,84.849129,2.927637,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3785,96,59585.2363,0,110.552193,3.367064,True,ultraviolet,"(59581.236, 59691.503]",0
0,96,3786,96,59586.2371,0,134.320419,3.861461,True,ultraviolet,"(59581.236, 59691.503]",0
...,...,...,...,...,...,...,...,...,...,...,...
5,32300,999907,32300,60226.3337,2,0.991051,0.951415,False,red,"(60128.204, 60237.379]",5
5,32300,999908,32300,60226.3413,1,-0.170953,0.953229,False,green,"(60128.204, 60237.379]",5
5,32300,999909,32300,60226.3489,3,2.333871,1.803218,False,infrared,"(60128.204, 60237.379]",5
5,32300,999910,32300,60226.3599,4,-3.905768,3.082055,False,near-z,"(60128.204, 60237.379]",5


In [None]:
# Index of df_mi, displayed again for reference.
df_mi.index

MultiIndex([(0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            (0,    96),
            ...
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300),
            (9, 32300)],
           names=['mjd_cat', 'object_id'], length=1000000)

# Groupby

In [None]:
%%timeit
# Time grouping the rows by mjd bin code and reading the groupby's `indices` attribute.
df.groupby(df['mjd_annot'].cat.codes).indices
# pd.concat([df.loc[df.groupby(df['mjd_annot'].cat.codes).groups[mjd_cat]] for mjd_cat in mjd_sketch_cats])

36.1 ms ± 6.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
