In [1]:
import base64
import datetime
import json
import logging
import sqlite3
import uuid

import datasketch
import datasketches
import pandas as pd

import sketch

Hello, welcome to sketch!


In [2]:
def load_from_sqlite(path):
    """Read every table of a SQLite database file into pandas DataFrames.

    Parameters
    ----------
    path : str or path-like
        Location of the SQLite database file.

    Returns
    -------
    dict
        Maps each table name to a DataFrame holding all of its rows. Each
        frame's ``attrs`` is updated with ``table_name`` and ``source``
        (the ``path`` argument).
    """
    conn = sqlite3.connect(path)
    try:
        # ``sqlite_master`` is the catalogue name understood by every SQLite
        # version; ``sqlite_schema`` is only an alias in newer releases.
        tables = pd.read_sql(
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", conn
        )
        logging.info(f'Found {len(tables)} tables in file {path}')
        all_tables = {}
        for table in tables.name:
            # Quote as an SQL identifier (double quotes, embedded quotes doubled)
            # so names such as "./index" or "it's" are read correctly.
            quoted = '"' + table.replace('"', '""') + '"'
            all_tables[table] = pd.read_sql(f"SELECT * FROM {quoted}", conn)
            all_tables[table].attrs |= {'table_name': table, 'source': path}
        return all_tables
    finally:
        # Release the file handle even when a query fails.
        conn.close()

In [16]:
class SketchBase:
    """Base class for a named summary ("sketch") wrapping a single value.

    The wrapped value lives in ``self.data``. Subclasses implement
    ``from_series`` to compute it, and may override ``pack`` / ``unpack``
    when the value needs converting for ``to_dict`` / ``from_dict``.
    """

    def __init__(self, data):
        self.data = data

    @classmethod
    def from_series(cls, series):
        """Build a sketch from a series; subclasses must override.

        Raises
        ------
        NotImplementedError
            Always, on classes that do not override this method.
        """
        # This is a classmethod, so only ``cls`` is available here.
        raise NotImplementedError(f"Need from_series method for {cls}")

    def pack(self):
        """Return the value stored under ``'data'`` by ``to_dict``."""
        return self.data

    @staticmethod
    def unpack(data):
        """Inverse of ``pack``: turn the stored form back into the sketch value."""
        return data

    def to_dict(self):
        """Return ``{'name': <class name>, 'data': <packed value>}``."""
        return {'name': self.__class__.__name__, 'data': self.pack()}

    @classmethod
    def from_dict(cls, data):
        """Rebuild a sketch from a dict produced by ``to_dict``.

        The class is selected by ``data['name']``: ``cls`` itself, or any
        direct or indirect subclass of ``cls`` with that name. When several
        subclasses share the name the last one found wins; when none matches,
        ``cls`` is used.
        """
        tcls = cls
        if data['name'] != cls.__name__:
            # Walk the whole subclass tree, not only the direct children.
            pending = list(cls.__subclasses__())
            while pending:
                candidate = pending.pop(0)
                if candidate.__name__ == data['name']:
                    tcls = candidate
                pending.extend(candidate.__subclasses__())
        return tcls(data=tcls.unpack(data['data']))
        

class Rows(SketchBase):
    """Sketch whose value is the series' ``size`` as a plain int."""

    @classmethod
    def from_series(cls, series):
        """Create the sketch from ``series.size``."""
        n_entries = int(series.size)
        return cls(data=n_entries)

class Count(SketchBase):
    """Sketch whose value is ``series.count()`` as a plain int.

    For a pandas Series, ``count()`` is the number of non-missing entries.
    """

    @classmethod
    def from_series(cls, series):
        """Create the sketch from ``series.count()``."""
        n_counted = int(series.count())
        return cls(data=n_counted)
    
    
class MinHash(SketchBase):
    """Sketch holding a ``datasketch.LeanMinHash`` of a series' values.

    Values are hashed via their ``str()`` form encoded as UTF-8. The packed
    representation is the serialized LeanMinHash, base64-encoded to text.
    """

    @classmethod
    def from_series(cls, series):
        """Hash every element of ``series`` and wrap the resulting LeanMinHash."""
        hasher = datasketch.MinHash()
        encoded_values = [str(value).encode('utf-8') for value in series]
        hasher.update_batch(encoded_values)
        return cls(data=datasketch.LeanMinHash(hasher))

    def pack(self):
        """Serialize the LeanMinHash and return it as a base64 text string."""
        raw = bytearray(self.data.bytesize())
        self.data.serialize(raw)
        encoded = base64.b64encode(raw)
        return encoded.decode('utf-8')

    @staticmethod
    def unpack(data):
        """Decode a base64 string produced by ``pack`` back into a LeanMinHash."""
        raw = base64.b64decode(data)
        return datasketch.LeanMinHash.deserialize(raw)
        

def get_sketchpad(series, context=None):
    """Compute all sketches for one series and bundle them into a sketchpad dict.

    Parameters
    ----------
    series : series-like
        The data to summarise; its ``name`` is recorded in the result and it is
        passed to each sketch class's ``from_series``.
    context : dict, optional
        Extra information stored under ``['context']['context']``. Defaults to
        an empty dict. A dict argument is shallow-copied so the sketchpad does
        not alias the caller's object.

    Returns
    -------
    dict
        Keys ``version``, ``metadata`` (``id``, ``creation_start``,
        ``creation_end``), ``context`` and ``sketches`` (one ``to_dict()``
        entry per sketch class).
    """
    # Avoid the shared-mutable-default pitfall and detach from the caller's dict.
    if context is None:
        context = {}
    elif isinstance(context, dict):
        context = dict(context)

    sketches = [Rows, Count, MinHash]
    skpad = {
        'version': '0.0.1',
        'metadata': {
            'id': str(uuid.uuid4()),
            'creation_start': _utc_now_iso()
        },
        'context': {
            'name': series.name,
            'context': context
        },
        'sketches': []
    }
    for skcls in sketches:
        skpad['sketches'].append(skcls.from_series(series).to_dict())

    skpad['metadata']['creation_end'] = _utc_now_iso()
    return skpad


def _utc_now_iso():
    """Return the current UTC time as a naive ISO-8601 string (no offset suffix)."""
    # Same text format as the deprecated ``datetime.utcnow().isoformat()``.
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.replace(tzinfo=None).isoformat()
        
def get_sketchbook_for_dataframe(df, sketchbook=None):
    """Append one sketchpad per column of ``df`` to a sketchbook list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are sketched; ``df.attrs`` is passed as the
        context of every sketchpad.
    sketchbook : list, optional
        Existing list to extend in place. A new list is created only when
        this is ``None``, so an empty list supplied by the caller is filled
        rather than silently replaced.

    Returns
    -------
    list
        The sketchbook (the same object as ``sketchbook`` when one was given).
    """
    if sketchbook is None:
        sketchbook = []
    # ``items()`` yields one Series per column even when labels are duplicated,
    # whereas ``df[label]`` would return a DataFrame in that case.
    for _, column in df.items():
        sketchbook.append(get_sketchpad(column, context=df.attrs))
    return sketchbook

In [17]:
def get_sketch_by_name(sketchpad, name):
    """Return the sketch called ``name`` from a sketchpad, rebuilt as an object.

    The lookup scans ``sketchpad['sketches']`` for entries whose ``'name'``
    equals ``name``. Exactly one match is rebuilt via
    ``SketchBase.from_dict``; zero or several matches give ``None``.
    """
    matches = []
    for entry in sketchpad['sketches']:
        if entry['name'] == name:
            matches.append(entry)
    if len(matches) != 1:
        return None
    return SketchBase.from_dict(matches[0])

In [18]:
# Brute-force proof of concept: compares one sketchpad against every other, one by one.
def slow_find_jaccard_similar(sketchpad, sketchbook):
    """Print the MinHash Jaccard estimate of ``sketchpad`` against each sketchbook entry.

    Entries of ``sketchbook`` for which no MinHash sketch is found are skipped.

    Raises
    ------
    RuntimeError
        If ``sketchpad`` itself yields no MinHash sketch.
    """
    reference = get_sketch_by_name(sketchpad, 'MinHash')
    if reference is None:
        raise RuntimeError("No minhash in base sketchpad")
    for candidate_pad in sketchbook:
        candidate = get_sketch_by_name(candidate_pad, 'MinHash')
        if not candidate:
            continue
        print(reference.data.jaccard(candidate.data))

In [None]:
# TODO: rename "sketchbook" to "portfolio" throughout.

In [6]:
# Load every table of the SQLite file into a dict of DataFrames keyed by table name.
all_tables = load_from_sqlite('datasets/fivethirtyeight.db')

In [19]:
# Build one sketchpad per column of the './index' table.
skpads = get_sketchbook_for_dataframe(all_tables['./index'])

In [21]:
# Compare the first column's MinHash with every column's (itself included); one estimate is printed per column.
slow_find_jaccard_similar(skpads[0], skpads)

1.0
0.0
0.0


In [24]:
# Show the value held by the Count sketch of the first column.
get_sketch_by_name(skpads[0], 'Count').data

194

In [22]:
# Serialise the first sketchpad to a JSON string (json is imported in the notebook's import cell).
json.dumps(skpads[0])

'{"version": "0.0.1", "metadata": {"id": "606aa2f8-f08a-46a4-9a4f-da641e99ddaa", "creation_start": "2022-07-15T18:07:34.014006", "creation_end": "2022-07-15T18:07:34.016996"}, "context": {"name": "dataset_url", "context": {"table_name": "./index", "source": "datasets/fivethirtyeight.db"}}, "sketches": [{"name": "Rows", "data": 194}, {"name": "Count", "data": 194}, {"name": "MinHash", "data": "AQAAAAAAAACAAAAApMJIAO73PQHilHsAUSFWAWpQIQDjkhEDzfBnA9O+2AAI4t8EWj4ZANM6EQApIy0BVVxPAC8fNAFk3sIGxZS7BIZ8mgFmyr0CUxI6AHkg0QCUr9gAAywkATn4DwBzvY4CC6SqASyxZwCRz28ATrRVAL0KaAIOEScA3bbwBrznhgAbuVgANFUJBH3pOwDE3YADH4ufALDE0gAzQjUBFiwJAHejdwE73TUD3FiJAT2JdwXxr4gBH/e6ANh+mQM2JicA0NxcAHrVmgBhfqcF1sp8BQN9rgJzDjMAG+jkBpVPzAJQpc0A8HPtAMIHxQAEzN4AKWh0AXNERwHMq8wEFugdAeCqHAFCeisBK897ATwvuACo+6kCIOqIADZlzwAEWmIEvAzQAHH4zgFZIW0BBdrYADwSNwBb8EgD4f80ALsrHQBBUTUCSC51Am8zuQPoMmMC18dYAJzDIgCBrN4DSbyPAOK/PwErVA4AY65TBFQyNQBH5gAAGjE4Adz0NQCTDgwCG9lvAftsWAALjDwCF13YAJXvFwDnPCABPzlDBPcggwCpN5cAsH1zAFiLZwBT2toC1sFyAYKwlwGY