In [1]:
import numpy as np
import pandas as pd

# Explicitly switch pandas' copy-on-write mode off for this session.
pd.set_option("mode.copy_on_write", False)

In [2]:
# Read the small-cities CSV into a DataFrame and display the whole frame.
cities = pd.read_csv(filepath_or_buffer='data/smallcities.csv')
cities

Unnamed: 0,popden,city,state
0,4200,Anchorage,AK
1,6000,San Diego,CA
2,5000,Sacramento,CA
3,7000,New York,NY
4,2000,Buffalo,NY
5,3700,Austin,TX
6,2500,Houstin,TX


In [3]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   popden  7 non-null      int64 
 1   city    7 non-null      object
 2   state   7 non-null      object
dtypes: int64(1), object(2)
memory usage: 300.0+ bytes


In [4]:
# Annotate 'state' and 'popden' columns
cities['state'] = cities['state'].astype('category')

# Bin the integer state codes into 3 intervals once, then reuse the result for
# both the interval-valued column and its integer bin codes (previously the
# same pd.cut call was evaluated twice).
state_bins = pd.cut(cities['state'].cat.codes, bins=3)
cities['state_annot_cat'] = state_bins
cities['state_annot_int'] = state_bins.cat.codes

# Split population density at explicit edges: (1000, 4000] and (4000, 9000].
cities['popden_annot'] = pd.cut(cities['popden'], bins=[1000, 4000, 9000])

cities

Unnamed: 0,popden,city,state,state_annot_cat,state_annot_int,popden_annot
0,4200,Anchorage,AK,"(-0.003, 1.0]",0,"(4000, 9000]"
1,6000,San Diego,CA,"(-0.003, 1.0]",0,"(4000, 9000]"
2,5000,Sacramento,CA,"(-0.003, 1.0]",0,"(4000, 9000]"
3,7000,New York,NY,"(1.0, 2.0]",1,"(4000, 9000]"
4,2000,Buffalo,NY,"(1.0, 2.0]",1,"(1000, 4000]"
5,3700,Austin,TX,"(2.0, 3.0]",2,"(1000, 4000]"
6,2500,Houstin,TX,"(2.0, 3.0]",2,"(1000, 4000]"


In [5]:
# Show the integer codes behind the categorical state annotation.
cities.loc[:, 'state_annot_cat'].cat.codes

0    0
1    0
2    0
3    1
4    1
5    2
6    2
dtype: int8

In [6]:
# Build a two-level index: the state annotation bin first, then the original
# row label, so rows can be selected by annotation via .loc.
index_keys = ['state_annot_cat', cities.index]
cities_by_state_idx = cities.set_index(keys=index_keys)
cities_by_state_idx

Unnamed: 0_level_0,Unnamed: 1_level_0,popden,city,state,state_annot_int,popden_annot
state_annot_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(-0.003, 1.0]",0,4200,Anchorage,AK,0,"(4000, 9000]"
"(-0.003, 1.0]",1,6000,San Diego,CA,0,"(4000, 9000]"
"(-0.003, 1.0]",2,5000,Sacramento,CA,0,"(4000, 9000]"
"(1.0, 2.0]",3,7000,New York,NY,1,"(4000, 9000]"
"(1.0, 2.0]",4,2000,Buffalo,NY,1,"(1000, 4000]"
"(2.0, 3.0]",5,3700,Austin,TX,2,"(1000, 4000]"
"(2.0, 3.0]",6,2500,Houstin,TX,2,"(1000, 4000]"


In [7]:
# Sketch to decode into categories, written as a binary literal.
city_sketch = 0b0011    # == 3

def decode_sketch(sketch):
    """Return the positions of the set bits in an integer bitmask.

    Positions are counted from the least-significant bit (bit 0), so
    ``0b0011`` decodes to ``[0, 1]`` and ``0b0100`` decodes to ``[2]``.

    Parameters
    ----------
    sketch : int
        Non-negative integer whose set bits mark the selected categories.

    Returns
    -------
    list of int
        Set-bit positions in ascending order; empty when ``sketch`` is 0.

    Raises
    ------
    ValueError
        If ``sketch`` is negative.
    """
    if sketch < 0:
        raise ValueError("sketch must be a non-negative integer")
    # bin() prints the most-significant bit first and drops leading zeros, so
    # the digits are reversed to make the enumerate index equal the bit position.
    bits = bin(sketch)[2:]
    return [pos for pos, bit in enumerate(reversed(bits)) if bit == "1"]

# Decode the sketch into the list of category codes it selects, then display it.
city_sketch_cats = decode_sketch(city_sketch)
city_sketch_cats


[0, 1]

In [8]:
%%timeit
# Look up the first-level index values at the sketch positions, then select
# the matching rows of the MultiIndexed frame with .loc.
sketch_level_values = cities_by_state_idx.index.levels[0][city_sketch_cats]
cities_by_state_idx.loc[sketch_level_values]

328 μs ± 13.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [9]:
%%timeit
# Boolean-mask filter driven by the categorical column's integer codes.
annot_codes = cities['state_annot_cat'].cat.codes
cities[annot_codes.isin(city_sketch_cats)]

283 μs ± 6.65 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
%%timeit
# Boolean-mask filter driven by the precomputed integer annotation column.
in_sketch = cities['state_annot_int'].isin(city_sketch_cats)
cities[in_sketch]

255 μs ± 12.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
