In [4]:
import sketch
import pandas as pd
import sqlite3
import datasketches
import datasketch
import base64
import uuid
import datetime
import logging
import heapq

Hello, welcome to sketch!


In [1]:
class SketchBase:
    def __init__(self, data):
        self.name = self.__class__.__name__
        self.data = data

    @classmethod
    def from_series(cls, series):
        raise NotImplementedError(f"Need from_series method for {self.__class__}")
    
    def pack(self):
        return self.data
    
    @staticmethod
    def unpack(data):
        return data
        
    def to_dict(self):
        return {'name': self.__class__.__name__, 'data': self.pack()}
    
    @classmethod
    def from_dict(cls, data):
        tcls = cls
        if data['name'] != cls.__name__:
            for subclass in cls.__subclasses__():
                if subclass.__name__ == data['name']:
                    tcls = subclass
        return tcls(data=tcls.unpack(data['data']))
        

class Rows(SketchBase):
    @classmethod
    def from_series(cls, series):
        return cls(data=int(series.size))

class Count(SketchBase):
    @classmethod
    def from_series(cls, series):
        return cls(data=int(series.count()))
    
    
class MinHash(SketchBase):
    @classmethod
    def from_series(cls, series):
        minhash = datasketch.MinHash()
        minhash.update_batch([str(x).encode('utf-8') for x in series])
        lmh = datasketch.LeanMinHash(minhash)
        return cls(data=lmh)

    def pack(self):
        buf = bytearray(self.data.bytesize())
        self.data.serialize(buf)
        return base64.b64encode(buf).decode('utf-8')
         
    @staticmethod
    def unpack(data):
        return datasketch.LeanMinHash.deserialize(base64.b64decode(data))

In [2]:
class SketchPad:
    verison = '0.0.1'
    sketches = [Rows, Count, MinHash]
    
    def __init__(self, context=None):
        self.version = '0.0.1'
        self.id = str(uuid.uuid4())
        self.metadata = {
            'id': self.id,
            'creation_start': datetime.datetime.utcnow().isoformat()
        }
        self.context = context or {}
        # TODO: consider alternate naming convention
        # so can do dictionary lookups
        self.sketches = []
    
    @classmethod
    def from_series(cls, series, context=None):
        sp = cls(context=context)
        for skcls in cls.sketches:
            sp.sketches.append(skcls.from_series(series))
        sp.metadata['creation_end'] = datetime.datetime.utcnow().isoformat()
        sp.context['column_name'] = series.name
        return sp
    
    def get_sketch_by_name(self, name):
        sketches = [sk for sk in self.sketches if sk.name == name]
        if len(sketches) == 1:
            return sketches[0]
        return None
    
    def get_sketchdata_by_name(self, name):
        sketch = self.get_sketch_by_name(name)
        return sketch.data if sketch else None
    
    def minhash_jaccard(self, other):
        self_minhash = self.get_sketchdata_by_name('MinHash')
        other_minhash = other.get_sketchdata_by_name('MinHash')
        if self_minhash is None or other_minhash is None:
            return None
        return self_minhash.jaccard(other_minhash)
    
    def to_dict(self):
        return {
            'version': self.version,
            'metadata': self.metadata,
            'sketches': [s.to_dict() for s in self.sketches],
            'context': self.context
        }

    @classmethod
    def from_dict(cls, data):
        assert data['version'] == cls.version
        sp = cls()
        sp.id = data['metadata']['id']
        sp.metadata = data['metadata']
        sp.context = data['context']
        sp.sketches = [SketchBase(s) for s in data['sketches']]
        return sp

In [3]:
class Portfolio:
    def __init__(self, sketchpads=None):
        self.sketchpads = {sp.id: sp for sp in (sketchpads or [])}
    
    def add_dataframe(self, df):
        for col in df.columns:
            sp = SketchPad.from_series(df[col], context=df.attrs)
            self.add_sketchpad(sp)
    
    def add_dataframes(self, dfs):
        for df in dfs:
            self.add_dataframe(df)
            
    def add_sketchpad(self, sketchpad):
        self.sketchpads[sketchpad.id] = sketchpad
        
    def add_sqlite(self, sqlite_db_path):
        conn = sqlite3.connect(sqlite_db_path)
        tables = pd.read_sql("SELECT name FROM sqlite_schema WHERE type='table' ORDER BY name;", conn)
        logging.info(f'Found {len(tables)} tables in file {sqlite_db_path}')
        all_tables = {}
        for i, table in enumerate(tables.name):
            df = pd.read_sql(f"SELECT * from '{table}'", conn)
            df.attrs |= {'table_name': table, 'source': sqlite_db_path}
            self.add_dataframe(df)
        return list(tables.name)
        
    def closest_overlap(self, sketchpad, n=5):
        scores = []
        for sp in self.sketchpads.values():
            score = sketchpad.minhash_jaccard(sp)
            heapq.heappush(scores, (score, sp.id))
        top_n = heapq.nlargest(n, scores, key=lambda x: x[0])
        return [(s, self.sketchpads[i]) for s, i in top_n]

In [9]:
pf = Portfolio()
_ = pf.add_sqlite('datasets/fivethirtyeight.db')

['./index',
 'ahca-polls/ahca_polls',
 'airline-safety/airline-safety',
 'alcohol-consumption/drinks',
 'antiquities-act/actions_under_antiquities_act',
 'august-senate-polls/august_senate_polls',
 'avengers/avengers',
 'bachelorette/bachelorette',
 'bad-drivers/bad-drivers',
 'bechdel/movies',
 'biopics/biopics',
 'births/US_births_1994-2003_CDC_NCHS',
 'births/US_births_2000-2014_SSA',
 'bob-ross/elements-by-episode',
 'cabinet-turnover/cabinet-turnover',
 'candy-power-ranking/candy-data',
 'chess-transfers/transfers',
 'classic-rock/classic-rock-song-list',
 'college-majors/all-ages',
 'college-majors/grad-students',
 'college-majors/majors-list',
 'college-majors/recent-grads',
 'college-majors/women-stem',
 'comic-characters/dc-wikia-data',
 'comic-characters/marvel-wikia-data',
 'comma-survey/comma-survey',
 'congress-age/congress-terms',
 'congress-generic-ballot/generic_topline_historical',
 'congress-resignations/congressional_resignations',
 'cousin-marriage/cousin-marriage-d

In [93]:
import random
random_sketchpad = random.choice(list(pf.sketchpads.values()))

In [94]:
random_sketchpad.context

{'table_name': 'mlb-allstar-teams/allstar_player_talent',
 'source': 'datasets/fivethirtyeight.db',
 'column_name': 'TOTper9innASG'}

In [95]:
result = pf.closest_overlap(random_sketchpad)
[(s, x.context) for s, x in result]

[(1.0,
  {'table_name': 'mlb-allstar-teams/allstar_player_talent',
   'source': 'datasets/fivethirtyeight.db',
   'column_name': 'TOTper9innASG'}),
 (0.3359375,
  {'table_name': 'foul-balls/foul-balls',
   'source': 'datasets/fivethirtyeight.db',
   'column_name': 'used_zone'}),
 (0.328125,
  {'table_name': 'police-killings/police_killings',
   'source': 'datasets/fivethirtyeight.db',
   'column_name': 'college'}),
 (0.328125,
  {'table_name': 'police-killings/police_killings',
   'source': 'datasets/fivethirtyeight.db',
   'column_name': 'college'}),
 (0.328125,
  {'table_name': 'food-world-cup/food-world-cup-data',
   'source': 'datasets/fivethirtyeight.db',
   'column_name': 'Location (Census Region)'})]

In [96]:
print(get_uniques(random_sketchpad))
print('---')
for x in result:
    print(x, get_uniques(x[1]))

      TOTper9innASG
0         -0.140823
1         -0.124369
2         -0.097249
3         -0.086248
4         -0.085601
...             ...
3784       0.443212
3785       0.460196
3786       0.461654
3787       0.536816
3788       0.542349

[3789 rows x 1 columns]
---
(1.0, <__main__.SketchPad object at 0x7f76e151dc00>)       TOTper9innASG
0         -0.140823
1         -0.124369
2         -0.097249
3         -0.086248
4         -0.085601
...             ...
3784       0.443212
3785       0.460196
3786       0.461654
3787       0.536816
3788       0.542349

[3789 rows x 1 columns]
(0.3359375, <__main__.SketchPad object at 0x7f76e0b12230>)    used_zone
0          1
1          2
2          3
3          4
4          5
5          6
6          7
(0.328125, <__main__.SketchPad object at 0x7f76dfa9a2c0>)       college
0         NaN
1    0.013547
2    0.014118
3    0.015886
4    0.022668
..        ...
451  0.755110
452  0.779063
453  0.794070
454  0.824971
455  0.828070

[456 rows x 1 columns]


In [30]:
# def cardinality_spectogram(self, 

In [31]:
def run_sql(sql, path='datasets/fivethirtyeight.db'):
    conn = sqlite3.connect(path)
    table = pd.read_sql(sql, conn)
    return table

def get_uniques(sketchpad):
    conn = sqlite3.connect(sketchpad.context['source'])
    table = pd.read_sql(f"""
        select 
            "{sketchpad.context['column_name']}"
        from
            "{sketchpad.context['table_name']}"
        group by "{sketchpad.context['column_name']}"
    """, conn)
    return table

         win
0   0.000093
1   0.000126
2   0.000414
3   0.000442
4   0.000585
5   0.000959
6   0.001025
7   0.001069
8   0.001768
9   0.002170
10  0.002504
11  0.002735
12  0.003225
13  0.003517
14  0.004288
15  0.005310
16  0.005797
17  0.006044
18  0.008657
19  0.008964
20  0.009364
21  0.010011
22  0.014904
23  0.016036
24  0.017892
25  0.028030
26  0.037192
27  0.041899
28  0.076761
29  0.106981
30  0.127799
31  0.453437
(1.0, <__main__.SketchPad object at 0x7f76e0551870>)          win
0   0.000093
1   0.000126
2   0.000414
3   0.000442
4   0.000585
5   0.000959
6   0.001025
7   0.001069
8   0.001768
9   0.002170
10  0.002504
11  0.002735
12  0.003225
13  0.003517
14  0.004288
15  0.005310
16  0.005797
17  0.006044
18  0.008657
19  0.008964
20  0.009364
21  0.010011
22  0.014904
23  0.016036
24  0.017892
25  0.028030
26  0.037192
27  0.041899
28  0.076761
29  0.106981
30  0.127799
31  0.453437
(1.0, <__main__.SketchPad object at 0x7f76e0552a70>)          win
0   0.000093
1   0.0001

In [None]:
get_uniques(

In [None]:
run_sql("Select 'F*G' from 'classic-rock/classic-rock-song-list' limit 10")

In [None]:
run_sql("Select 'rees-davies' from 'next-bechdel/nextBechdel_allTests' limit 10")