# Pivoting from an interpreter to a compiler to Pandas vectors

Clepy • Cleveland, Ohio • May 2016

In [None]:
# from __future__ import division
%pylab inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import seaborn as sb
pd.options.display.max_colwidth = 128
pylab.rcParams['figure.figsize'] = 12, 7

from re import search

In [None]:
from functools import partial

def test(categorizer, **kw):
    categorize = partial(categorizer, **kw)
    assert categorize('Delta Airlines') == 'Airfare'
    assert categorize('Delta Faucet') == 'Plumbing'
    assert categorize('Comfort Inn') == 'Lodging'
    assert categorize('Drake Hotel') == 'Lodging'
    assert categorize('Sushi Avenue') == 'Restaurant'
    print 'Tests passed'

In [None]:
def categorize(text):
    """Return the category for a transaction description (hard-coded rules).

    'Delta' means 'Airfare' unless 'Faucet' is also present ('Plumbing');
    otherwise 'Hotel' or 'Inn' means 'Lodging'; everything else is
    'Restaurant'.  All checks are plain substring tests.
    """
    if 'Delta' not in text:
        is_lodging = any(word in text for word in ('Hotel', 'Inn'))
        return 'Lodging' if is_lodging else 'Restaurant'
    return 'Plumbing' if 'Faucet' in text else 'Airfare'

# Spot-check two descriptions, then run the shared test table.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(categorize('Comfort Inn'))
print(categorize('Sushi Avenue'))
test(categorize)

In [None]:
# Categorization rules written as YAML.  An entry delimited by slashes
# (/Delta/) is a pattern tested against the transaction text; a bare entry
# (Lodging) is a category name.  Either kind may be a mapping key whose value
# is a nested list of sub-rules.  The empty pattern // is the catch-all.
rule_text = """\
- /Delta/:
  - /Faucet/:
    - Plumbing
  - Airfare
- Lodging:
  - /Hotel/
  - /Inn/
- //:
  - Restaurant
"""

In [None]:
import yaml
# safe_load only builds plain Python containers and scalars, which is all the
# rule text needs.  yaml.load without an explicit Loader can construct
# arbitrary objects and is rejected by current PyYAML releases.
rules = yaml.safe_load(rule_text)
rules

# Iteration 1: Interpreter

```
- /Delta/:
  - /Faucet/:
    - Plumbing
  - Airfare
- Lodging:
  - /Hotel/
  - /Inn/
- //:
  - Restaurant
```

In [None]:
def categorize1(text, rules, category=None):
    """Interpret the rule tree directly and return the first matching category.

    Parameters
    ----------
    text : str
        Transaction description to classify.
    rules : list
        Rule list as parsed from YAML.  Each item is either a string or a
        single-key dict mapping a string to a list of sub-rules.  A string
        wrapped in slashes is a regular expression searched in ``text``;
        any other string is a category name.
    category : str or None
        Category inherited from an enclosing rule, returned when a bare
        pattern matches.

    Returns
    -------
    str or None
        The category of the first rule that applies, or None if none does.
    """
    for rule in rules:
        subrules = None
        if isinstance(rule, dict):
            # A YAML mapping entry carries exactly one key; items() works on
            # both Python 2 and Python 3 (iteritems() is Python 2 only).
            rule, subrules = next(iter(rule.items()))
        if rule.startswith('/'):
            if not search(rule[1:-1], text):
                continue
            # A matching pattern yields whatever category is in effect here.
            result = category
        else:
            result = rule
        if subrules:
            result = categorize1(text, subrules, result)
        # Keep the outcome in a local: a failed sub-rule must not erase the
        # inherited category that later sibling rules still rely on.
        if result is not None:
            return result
    return None

# print(...) with one argument behaves the same on Python 2 and Python 3.
print(categorize1('Comfort Inn', rules))
test(categorize1, rules=rules)

# Iteration 2: Compiler

In [None]:
# https://docs.python.org/2/library/ast.html#abstract-grammar

import ast

# Parse an arithmetic expression into its abstract syntax tree and show the root node.
tree = ast.parse('2 + 3 * 4')
tree

In [None]:
# Textual dump of the parsed tree; print(...) form runs on Python 2 and 3.
print(ast.dump(tree))

In [None]:
import astunparse
# Same tree through astunparse.dump; print(...) form runs on Python 2 and 3.
print(astunparse.dump(tree))

In [None]:
import astunparse
# Scratch cell: put source text between the triple quotes to inspect its AST.
# print(...) form runs on Python 2 and 3.
print(astunparse.dump(ast.parse("""
    
    """)))

In [None]:
from ast import *

# Hand-build the AST for the literal expression `42`, then compile and evaluate it.
tree = Expression(body=Num(n=42))
# fix_missing_locations fills in line/column info in place and returns the same node.
fixed = fix_missing_locations(tree)
code = compile(fixed, '<luca>', 'eval')
eval(code)

In [None]:
# AST for `42 + n`; the value of `n` is supplied through the locals mapping at eval time.
tree = Expression(
    body=BinOp(left=Num(n=42), op=Add(), right=Name(id='n', ctx=Load()))
)
fixed = fix_missing_locations(tree)
code = compile(fixed, '<luca>', 'eval')
eval(code, {}, {'n': 1})

In [None]:
# Statement-level tree: `m = 2` followed by the expression statement `42 + m`.
tree = Interactive([
    Assign([Name(id='m', ctx=Store())], Num(n=2)),
    Expr(BinOp(Num(n=42), Add(), Name(id='m', ctx=Load())))
    ])
fixed = fix_missing_locations(tree)
# 'single' mode compiles interactive statements; expression statements that
# evaluate to something other than None are printed.
code = compile(tree, '<luca>', 'single')
# The tree defines and reads only `m`, so it needs no pre-populated locals.
eval(code, {}, {})

In [None]:
from dis import dis
# Disassemble the most recently compiled `code` object into bytecode instructions.
dis(code)

```
- /Delta/:
  - /Faucet/:
    - Plumbing
  - Airfare
- Lodging:
  - /Hotel/
  - /Inn/
- //:
  - Restaurant
```

In [None]:
def compile_test(rule):
    """Build the AST for ``'<pattern>' in text`` from a ``/pattern/`` rule.

    The surrounding slashes are dropped and the remainder becomes a string
    constant; ``text`` is looked up by name when the expression is evaluated.
    """
    needle = rule[1:-1]
    return Compare(
        left=Str(needle),
        ops=[In()],
        comparators=[Name(id='text', ctx=Load())],
    )

In [None]:
def compile_rules(rules, category=''):
    """Compile a rule list into one expression that yields the first truthy result.

    Parameters
    ----------
    rules : list
        Rules to compile; each is handed to ``compile_rule``.
    category : str
        Category inherited from the enclosing rule ('' when there is none).

    Returns
    -------
    ast.expr
        ``a or b or ...`` over the compiled rules.  A single rule is returned
        as is, and an empty list compiles to the empty string (no category).
    """
    compiled = [compile_rule(r, category) for r in rules]
    if not compiled:
        # Nothing to try: a falsy constant lets an enclosing `or` move on.
        return Str('')
    if len(compiled) == 1:
        # `or` over one operand is just that operand, and the Python 3
        # compiler rejects a BoolOp with fewer than two values.
        return compiled[0]
    return BoolOp(Or(), compiled)

In [None]:
def compile_rule(rule, category):
    if isinstance(rule, dict):
        rule, subrules = next(rule.iteritems())
        if rule.startswith('/'):
            return BoolOp(And(), [compile_test(rule), compile_rules(subrules, category)])
        return compile_rules(subrules, rule)
    elif rule.startswith('/'):
        return BoolOp(And(), [compile_test(rule), Str(category)])
    return Str(rule)

In [None]:
# Compile the whole rule tree into one boolean expression over `text`.
tree = compile_rules(rules)
fixed = fix_missing_locations(tree)
code = compile(Expression(tree), '<luca>', 'eval')


def categorize2(text, _code=code):
    """Categorize ``text`` by evaluating the compiled rule expression.

    The code object is bound as a default argument when this cell runs, so
    re-running another cell that rebinds the notebook-global ``code`` cannot
    silently change what this function evaluates.
    """
    return eval(_code, None, {'text': text})


# print(...) with one argument behaves the same on Python 2 and Python 3.
print(categorize2('Comfort Inn'))
print(categorize2('Delta Airlines'))
test(categorize2)

In [None]:
# Bytecode of the current `code` object (the compiled rule expression when cells run in order).
dis(code)

In [None]:
# Time the interpreter (categorize1) against the compiled expression (categorize2) on one description.
%timeit categorize1('Sushi Avenue', rules)
%timeit categorize2('Sushi Avenue')

# Iteration 3: Pandas

In [None]:
# NumPy

# Element-wise arithmetic on an array; `arange` comes from the %pylab namespace.
# print(...) with one argument behaves the same on Python 2 and Python 3.
x = arange(8)
print(x)
print(x + 1)
print(x * 10)
print(x * x)

In [None]:
# Functions such as sin apply element-wise too; print(...) runs on Python 2 and 3.
print(sin(x))

In [None]:
# Uncomment the next line to plot over a dense grid instead of the 8-point array above.
#x = linspace(0, 10, 1000)
# Plot sin(x) - x/2 against x.
plot(x, sin(x) - x / 2.0)

In [None]:
# Pandas

# Load the transactions file, asking pandas to parse the Date column as dates.
df = pd.read_csv('transactions.txt', parse_dates=['Date'])
df.head()

In [None]:
# Boolean Series: True where the negated Amount exceeds 500.
# Unary minus binds tighter than the comparison, made explicit here.
(-df.Amount).gt(500)

In [None]:
# Show the first rows of the transactions frame again.
df.head()

In [None]:
# Work on a new frame with an added Category column initialised to None.
c = df.assign(Category=None)
c.head()

In [None]:
# First attempt: label rows whose Description contains 'Delta' as 'Airfare'.
match = c.Description.str.contains('Delta')
# NOTE(review): the right-hand side only holds the matching rows, so after
# assignment the non-matching rows are presumably left null by index
# alignment; harmless here because every Category starts out empty.
c.Category = c.Category[match].fillna('Airfare')
c.head()

In [None]:
# Alternative: Series.mask replaces values where the condition is True.
# The result is only displayed; it is not assigned back to c.Category.
mask = c.Description.str.contains('Inn')
c.Category.mask(mask, 'Lodging')
#c.Category = c.Category[match].fillna('Lodging')
#c.head()

In [None]:
# Scratch check of fillna: blank out a copy of the descriptions, then fill with 7.
cc = c.Description.copy()
cc[:] = None
cc.fillna(7)

In [None]:
def categorize3(descriptions, rules, category=None):
    """Categorize a whole Series of descriptions with vectorised operations.

    Parameters
    ----------
    descriptions : pandas.Series
        Transaction descriptions (strings).
    rules : list
        Rule list in the same format the other categorizers take.
    category : str or None
        Category inherited from an enclosing rule.

    Returns
    -------
    pandas.Series
        Object-dtype Series aligned with ``descriptions.index``; rows that no
        rule matched stay null.
    """
    # Explicit object dtype: the column will hold category strings, and the
    # dtype inferred for an all-null Series differs between pandas versions.
    cats = pd.Series(None, index=descriptions.index, dtype=object)
    for rule in rules:
        # Earlier rules win: only rows that are still null take this rule's result.
        cats = cats.fillna(run_rule(descriptions, rule, category))
    return cats

In [None]:
def run_rule(descriptions, rule, category):
    """Evaluate one rule against every description at once.

    Parameters
    ----------
    descriptions : pandas.Series
        Transaction descriptions (strings).
    rule : str or dict
        A ``/pattern/`` string, a category name, or a single-key dict mapping
        either of those to a list of sub-rules.
    category : str or None
        Category in effect for bare patterns.

    Returns
    -------
    pandas.Series or str
        A Series holding a category where the rule applies and null
        elsewhere, or the bare category name when ``rule`` is a plain
        category string.

    Notes
    -----
    Patterns are matched as literal substrings (``regex=False``).
    """
    if isinstance(rule, dict):
        # A YAML mapping entry carries exactly one key; items() works on
        # both Python 2 and Python 3 (iteritems() is Python 2 only).
        head, subrules = next(iter(rule.items()))
        if not head.startswith('/'):
            # Category with sub-rules: the key becomes the inherited category.
            return categorize3(descriptions, subrules, head)
        # Pattern with sub-rules: keep sub-rule results only where the pattern matches.
        mask = descriptions.str.contains(head[1:-1], regex=False)
        cats = categorize3(descriptions, subrules, category)
        return cats.where(mask, None)
    if rule.startswith('/'):
        # Bare pattern: the inherited category where it matches, null elsewhere.
        mask = descriptions.str.contains(rule[1:-1], regex=False)
        cats = pd.Series(category, index=descriptions.index)
        return cats.where(mask, None)
    return rule

#print categorize3(df.Description, rules)
#test(categorize1, rules=rules)
# Time the vectorised categorizer over the whole Description column.
%timeit c3 = categorize3(df.Description, rules)

In [None]:
# Run all three categorizers over the same column and check that they agree.
c1 = [categorize1(text, rules) for text in df.Description]
c2 = [categorize2(text) for text in df.Description]
c3 = categorize3(df.Description, rules)
print(c1 == c2)
# Fully parenthesised: on Python 3, `print (c1 == c3).all()` would call
# .all() on print's return value (None).
print((c1 == c3).all())

In [None]:
# Wall-clock timing of each approach over the original transactions frame.
%time c1 = [categorize1(text, rules) for text in df.Description]
%time c2 = [categorize2(text) for text in df.Description]
%time c3 = categorize3(df.Description, rules)

In [None]:
# Replicate the transactions 100000 times to get a frame large enough to benchmark.
# ignore_index=True gives a fresh 0..n-1 index directly, instead of
# reset_index() keeping the repeated old index as an extra 'index' column.
bigdf = pd.concat([df] * 100000, ignore_index=True)

In [None]:
# (rows, columns) of the replicated frame.
bigdf.shape

In [None]:
# Same three approaches, timed over the replicated frame.
%time [categorize1(text, rules) for text in bigdf.Description]
%time [categorize2(text) for text in bigdf.Description]
%time categorize3(bigdf.Description, rules)

# PyCon!

# PyCon

Portland, Oregon

2016 May 28 – June 5

# PyCon

Portland, Oregon

2017

# PyCon

2018–2019 — Early May

# PyCon

2018–2019 — Early May

**Cleveland, Ohio**

In [None]:
# Redisplay the parsed rule tree.
rules

In [None]:
# Same tree shape as `rules`, with float codes in place of the category names.
# By position against the YAML rule text: 1.0 stands where Airfare was,
# 2.0 Plumbing, 3.0 Lodging and 4.0 Restaurant.
rules2 = [
    {'/Delta/': [{'/Faucet/': [2.0]}, 1.0]},
    {3.0: ['/Hotel/', '/Inn/']},
    {'//': [4.0]},
]

In [None]:
nan = float('nan')
from time import time

def categorize4(descriptions, rules, category=nan):
    """Vectorised categorizer over numeric category codes, with timing output.

    Parameters
    ----------
    descriptions : pandas.Series
        Transaction descriptions (strings).
    rules : list
        Rule list whose categories are floats (see ``rules2``).
    category : float
        Category code inherited from an enclosing rule; NaN means none.

    Returns
    -------
    pandas.Series
        Series of category codes aligned with ``descriptions.index``; rows
        that no rule matched stay NaN.

    Notes
    -----
    Prints the time spent allocating the result Series on every call,
    including the recursive calls made through ``run_rule4``.
    """
    t0 = time()
    cats = pd.Series(nan, index=descriptions.index)
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(time() - t0)
    for rule in rules:
        # Earlier rules win: only rows that are still NaN take this rule's result.
        cats = cats.fillna(run_rule4(descriptions, rule, category))
    return cats

def run_rule4(descriptions, rule, category):
    """Evaluate one numeric-coded rule against every description, printing timings.

    Parameters
    ----------
    descriptions : pandas.Series
        Transaction descriptions (strings).
    rule : str, float or dict
        A ``/pattern/`` string, a category code, or a single-key dict mapping
        either of those to a list of sub-rules.
    category : float
        Category code in effect for bare patterns; NaN means none.

    Returns
    -------
    pandas.Series or float
        A Series holding the category code where the rule applies and NaN
        elsewhere, or the bare code when ``rule`` is a plain category.

    Notes
    -----
    Patterns are matched as literal substrings (``regex=False``).  Timing
    lines are printed with the labels 'a1' (pattern with sub-rules) and
    'a2' (bare pattern).
    """
    if isinstance(rule, dict):
        # A mapping entry carries exactly one key; items() works on both
        # Python 2 and Python 3 (iteritems() is Python 2 only).
        rule, subrules = next(iter(rule.items()))
        if isinstance(rule, str) and rule.startswith('/'):
            # Benchmark only: time str.find for comparison; its result is discarded.
            t0 = time()
            descriptions.str.find(rule[1:-1])
            # '%s' formatting emits the same text as the Python 2 statement
            # `print 'a1', x` and also runs on Python 3.
            print('a1 %s' % (time() - t0))
            # Second 'a1' line: the str.contains call whose result is actually used.
            t0 = time()
            mask = descriptions.str.contains(rule[1:-1], regex=False)
            print('a1 %s' % (time() - t0))
            cats = categorize4(descriptions, subrules, category)
            return cats.where(mask, nan)
        else:
            # Category with sub-rules: the key becomes the inherited category.
            return categorize4(descriptions, subrules, rule)
    elif isinstance(rule, str) and rule.startswith('/'):
        t0 = time()
        mask = descriptions.str.contains(rule[1:-1], regex=False)
        print('a2 %s' % (time() - t0))
        cats = pd.Series(category, index=descriptions.index)
        return cats.where(mask, nan)
    return rule

#print categorize4(df.Description, rules2)
#test(categorize1, rules=rules)

In [None]:
# Manual wall-clock timing of categorize4 over the replicated frame.
t0 = time()
categorize4(bigdf.Description, rules2)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(time() - t0)

In [None]:
# Time categorize4 on the original frame.
# The trailing None makes the cell's last expression None — presumably so the
# resulting Series is not displayed.
%time categorize4(df.Description, rules2)
None

In [None]:
# Keep the numeric-coded result and display it.
c4 = categorize4(df.Description, rules2)
c4

In [None]:
# All four approaches timed over the replicated frame.
%time [categorize1(text, rules) for text in bigdf.Description]
%time [categorize2(text) for text in bigdf.Description]
%time categorize3(bigdf.Description, rules)
%time categorize4(bigdf.Description, rules2)

In [None]:
# Show the installed pandas version.
pd.__version__