In [None]:
# from __future__ import division
%pylab inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import seaborn as sb
pd.options.display.max_colwidth = 128
pylab.rcParams['figure.figsize'] = 12, 7

In [None]:
from re import search

In [None]:
%config 

In [None]:
import IPython.core.formatters

In [None]:
IPython.core.formatters

In [None]:
%config DisplayFormatter

In [None]:
#%config PlainTextFormatter.pprint = False

In [None]:
from functools import partial

def test(categorizer, **kw):
    categorize = partial(categorizer, **kw)
    assert categorize('Delta Airlines') == 'Airfare'
    assert categorize('Delta Faucet') == 'Plumbing'
    assert categorize('Comfort Inn') == 'Lodging'
    assert categorize('Drake Hotel') == 'Lodging'
    assert categorize('Sushi Avenue') == 'Restaurant'
    print 'Tests passed'

In [None]:
def categorize(text):
    """Return the category of ``text`` using a fixed, ordered rule table.

    The patterns are tried top to bottom and the first one that matches
    anywhere in ``text`` decides the category. The final ``'.'`` pattern
    matches any text containing a character other than a newline, so it
    acts as the catch-all. Returns None when no pattern matches.
    """
    ordered_rules = (
        ('Delta', 'Airfare'),
        ('Hotel', 'Lodging'),
        ('Inn', 'Lodging'),
        ('.', 'Restaurant'),
    )
    for pattern, label in ordered_rules:
        if search(pattern, text) is not None:
            return label
    return None

print categorize('Comfort Inn')
print categorize('Sushi Avenue')

In [None]:
import yaml

In [None]:
rule_text = """\
- /Delta/:
  - /Faucet/:
    - Plumbing
  - Airfare
- Lodging:
  - /Hotel/
  - /Inn/
- //:
  - Restaurant
"""

In [None]:
# Parse the YAML rule tree into nested lists/dicts of plain strings.
# safe_load is used instead of yaml.load: calling yaml.load without an
# explicit Loader is deprecated since PyYAML 5.1 and a TypeError in 6.0,
# and the rule text only needs plain scalars, lists and mappings.
rules = yaml.safe_load(rule_text)
rules

In [None]:
def categorize(text):
    """Categorize ``text`` with hard-coded rules, defaulting to 'Unknown'.

    Patterns are evaluated lazily in table order; the label of the first
    pattern found in ``text`` is returned. When nothing matches (for
    example an empty string), the result is 'Unknown'.
    """
    table = [
        ('Delta', 'Airfare'),
        ('Hotel', 'Lodging'),
        ('Inn', 'Lodging'),
        ('.', 'Restaurant'),
    ]
    hits = (label for pattern, label in table if search(pattern, text))
    return next(hits, 'Unknown')

print categorize('Comfort Inn')
print categorize('Sushi Avenue')

# Iteration 1: Interpreter

```
- /Delta/:
  - /Faucet/:
    - Plumbing
  - Airfare
- Lodging:
  - /Hotel/
  - /Inn/
- //:
  - Restaurant
```

In [None]:
def categorize1(text, rules, category=None):
    """Interpret a rule tree against ``text`` and return the first category.

    Each entry of ``rules`` is either a string or a one-item dict mapping a
    string to a list of sub-rules. A string of the form ``/pattern/`` is a
    regular-expression test applied with ``re.search``; any other string is
    a category name.

    Parameters
    ----------
    text : str
        The description to categorize.
    rules : list
        The rule tree (or sub-tree) to evaluate, in priority order.
    category : str or None
        Category inherited from the enclosing rule; returned by a matching
        regex rule that has no sub-rules of its own.

    Returns
    -------
    str or None
        The category of the first rule that produces one, or None when no
        rule applies.
    """
    for rule in rules:
        subrules = None
        if isinstance(rule, dict):
            # One-item mapping: {rule: [subrules]}. items() works on both
            # Python 2 and 3, unlike iteritems().
            rule, subrules = next(iter(rule.items()))
        # Work on a per-rule copy so that a failed nested lookup cannot
        # wipe out the category inherited from the parent for the rules
        # that follow in this list.
        result = category
        if rule.startswith('/'):
            if not search(rule[1:-1], text):
                continue
        else:
            result = rule
        if subrules:
            result = categorize1(text, subrules, result)
        if result is not None:
            return result
    return None

print categorize1('Comfort Inn', rules)
test(categorize1, rules=rules)

# Iteration 2: Compiler

In [None]:
import ast

tree = ast.parse('2 + 3 * 4')
tree

In [None]:
# 2 + 3 * 4
# https://docs.python.org/2/library/ast.html#abstract-grammar

tree.body[0]

In [None]:
print ast.dump(tree)

In [None]:
import astunparse
print astunparse.dump(tree)

In [None]:
import astunparse
print astunparse.dump(ast.parse("""
    
    """))

In [None]:
import astunparse
print astunparse.dump(ast.parse("""
if content == 1:
    return 'foo'
"""))

In [None]:
from ast import (AST, If, Name, Return, Str, Param, FunctionDef, Interactive,
                 arguments, fix_missing_locations, parse, Num, Expression, Add, BinOp, Mult, Pow, Load, Expr,
                Module, Assign)
from ast import *


tree = Expression(Num(n=42))
fixed = fix_missing_locations(tree)
code = compile(tree, '<luca>', 'eval')
eval(code)

In [None]:
tree = Expression(BinOp(Num(n=42), Add(), Name(id='n', ctx=Load())))
fixed = fix_missing_locations(tree)
code = compile(tree, '<luca>', 'eval')
eval(code, {}, {'n': 1})

In [None]:
tree = Interactive([
    Assign([Name(id='m', ctx=Store())], Num(n=2)),
    Expr(BinOp(Num(n=42), Add(), Name(id='m', ctx=Load())))
    ])
fixed = fix_missing_locations(tree)
code = compile(tree, '<luca>', 'single')
eval(code, {}, {'n': 1})

In [None]:
from dis import dis
dis(code)

```
- /Delta/:
  - /Faucet/:
    - Plumbing
  - Airfare
- Lodging:
  - /Hotel/
  - /Inn/
- //:
  - Restaurant
```

In [None]:
def compile_test(rule):
    """Translate one rule string into an AST expression node.

    A ``/body/`` rule becomes the comparison ``'body' in text``, where
    ``text`` is a name looked up when the compiled code is evaluated.
    Any other string becomes a string literal node holding the rule.
    """
    if not rule.startswith('/'):
        return Str(rule)
    needle = Str(rule[1:-1])
    haystack = Name(id='text', ctx=Load())
    return Compare(needle, [In()], [haystack])

In [None]:
def compile_rules(rules, category=''):
    return BoolOp(Or(), [compile_rule(r, category) for r in rules])

def compile_rule(rule, category):
    if isinstance(rule, dict):
        rule, subrules = next(rule.iteritems())
        if rule.startswith('/'):
            return BoolOp(And(), [compile_test(rule), compile_rules(subrules, category)])
        return compile_rules(subrules, rule)
    elif rule.startswith('/'):
        return BoolOp(And(), [compile_test(rule), Str(category)])
    return Str(rule)

In [None]:
# Compile the rule tree into a single expression AST.
tree = compile_rules(rules)
# fix_missing_locations fills in line/column info on `tree` in place and
# returns the same node; `fixed` itself is not used below.
fixed = fix_missing_locations(tree)
# Wrap the expression so it can be compiled in 'eval' mode.
code = compile(Expression(tree), '<luca>', 'eval')
# The compiled code reads the description from the local name `text`.
categorize2 = lambda text: eval(code, None, {'text': text})

print categorize2('Comfort Inn')
print categorize2('Delta Airlines')
test(categorize2)

In [None]:
dis(code)

In [None]:
%timeit categorize1('Sushi Avenue', rules)
%timeit categorize2('Sushi Avenue')

# Iteration 3: Pandas

In [None]:
# NumPy

x = arange(8)
print x
print x + 1
print x * 10
print x * x

In [None]:
print sin(x)

In [None]:
#x = linspace(0, 10, 1000)
plot(x, sin(x) - x / 2.0)

In [None]:
# Pandas

df = pd.read_csv('transactions.txt', parse_dates=['Date'])
df.head()

In [None]:
-df.Amount > 500

In [None]:
df.head()

In [None]:
c = df.assign(Category=None)
c.head()

In [None]:
match = c.Description.str.contains('Delta')
c.Category = c.Category[match].fillna('Airfare')
c.head()

In [None]:
mask = c.Description.str.contains('Inn')
c.Category.mask(mask, 'Lodging')
#c.Category = c.Category[match].fillna('Lodging')
#c.head()

In [None]:
cc = c.Description.copy()
cc[:] = None
cc.fillna(7)

In [505]:
def categorize3(descriptions, rules, category=None):
    """Vectorized rule interpreter: categorize a whole Series at once.

    Parameters
    ----------
    descriptions : pandas.Series
        String descriptions to categorize.
    rules : list
        Rule strings or one-item ``{rule: [subrules]}`` dicts, in priority
        order.
    category : str or None
        Category inherited from the enclosing rule.

    Returns
    -------
    pandas.Series
        Same index as ``descriptions``; each entry is the category from
        the first rule that applies, or missing when none does.
    """
    # Explicit object dtype: the accumulator holds category strings, and
    # the dtype pandas picks for an all-missing Series varies by version.
    cats = pd.Series(None, index=descriptions.index, dtype=object)
    for rule in rules:
        # fillna only touches rows that are still missing, so earlier
        # rules take priority over later ones.
        cats = cats.fillna(run_rule(descriptions, rule, category))
    return cats

def run_rule(descriptions, rule, category):
    """Evaluate one rule; return a Series of categories or a scalar name.

    For ``/body/`` rules the body is matched as a literal substring
    (``regex=False``), and rows that do not contain it are left missing.
    """
    subrules = None
    if isinstance(rule, dict):
        # items() instead of iteritems() so this runs on Python 2 and 3.
        rule, subrules = next(iter(rule.items()))
    if not rule.startswith('/'):
        if subrules is None:
            # Bare category name: applies to every row still missing.
            return rule
        return categorize3(descriptions, subrules, rule)
    mask = descriptions.str.contains(rule[1:-1], regex=False)
    if subrules is None:
        cats = pd.Series(category, index=descriptions.index)
    else:
        cats = categorize3(descriptions, subrules, category)
    # Keep results only where the pattern matched; the rest become missing.
    return cats.where(mask)

#print categorize3(df.Description, rules)
#test(categorize1, rules=rules)
%timeit c3 = categorize3(df.Description, rules)

100 loops, best of 3: 4.74 ms per loop


In [462]:
c1 = [categorize1(text, rules) for text in df.Description]
c2 = [categorize2(text) for text in df.Description]
c3 = categorize3(df.Description, rules)
assert c1 == c2
assert (c1 == c3).all()

In [472]:
%time c1 = [categorize1(text, rules) for text in df.Description]
%time c2 = [categorize2(text) for text in df.Description]
%time c3 = categorize3(df.Description, rules)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.13 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 195 µs
CPU times: user 16 ms, sys: 4 ms, total: 20 ms
Wall time: 18.9 ms


In [479]:
bigdf = pd.concat([df] * 100000).reset_index()

In [488]:
bigdf.shape

(1900000, 5)

In [485]:
%time [categorize1(text, rules) for text in bigdf.Description]
%time [categorize2(text) for text in bigdf.Description]
%time categorize3(bigdf.Description, rules)

CPU times: user 18.8 s, sys: 104 ms, total: 18.9 s
Wall time: 18.7 s
CPU times: user 1.36 s, sys: 84 ms, total: 1.45 s
Wall time: 1.32 s
CPU times: user 7.22 s, sys: 1.09 s, total: 8.31 s
Wall time: 8.28 s


0             Airfare
1             Airfare
2          Restaurant
3          Restaurant
4             Lodging
5          Restaurant
6          Restaurant
7             Airfare
8             Lodging
9          Restaurant
10         Restaurant
11         Restaurant
12         Restaurant
13            Lodging
14         Restaurant
15         Restaurant
16         Restaurant
17         Restaurant
18            Lodging
19            Airfare
20            Airfare
21         Restaurant
22         Restaurant
23            Lodging
24         Restaurant
25         Restaurant
26            Airfare
27            Lodging
28         Restaurant
29         Restaurant
              ...    
1899970       Lodging
1899971    Restaurant
1899972    Restaurant
1899973    Restaurant
1899974    Restaurant
1899975       Lodging
1899976    Restaurant
1899977    Restaurant
1899978    Restaurant
1899979    Restaurant
1899980       Lodging
1899981       Airfare
1899982       Airfare
1899983    Restaurant
1899984   

In [489]:
rules

[{'/Delta/': [{'/Faucet/': ['Plumbing']}, 'Airfare']},
 {'Lodging': ['/Hotel/', '/Inn/']},
 {'//': ['Restaurant']}]

In [490]:
rules2 = [
    {'/Delta/': [{'/Faucet/': [2.0]}, 1.0]},
    {3.0: ['/Hotel/', '/Inn/']},
    {'//': [4.0]},
]

In [501]:
nan = float('nan')

def categorize4(descriptions, rules, category=nan):
    """Vectorized rule interpreter using float category codes.

    Same algorithm as ``categorize3``, but the accumulator is a float
    Series with NaN as the "no category yet" marker, so categories are
    expected to be numeric codes (as in ``rules2``).

    Parameters
    ----------
    descriptions : pandas.Series
        String descriptions to categorize.
    rules : list
        Rules in priority order; ``/body/`` strings are pattern tests,
        anything else is a category code.
    category : float
        Code inherited from the enclosing rule (NaN for none).

    Returns
    -------
    pandas.Series
        Category code per description, NaN where no rule applied.
    """
    cats = pd.Series(nan, index=descriptions.index)
    for rule in rules:
        # fillna only touches rows that are still NaN, so earlier rules win.
        cats = cats.fillna(run_rule4(descriptions, rule, category))
    return cats

def run_rule4(descriptions, rule, category):
    """Evaluate one rule for ``categorize4``.

    Deliberately not named ``run_rule``: that name belongs to the helper of
    ``categorize3``, and redefining it here would silently change what
    ``categorize3`` does once this cell has run. Sub-rules recurse into
    ``categorize4`` so the whole evaluation stays on the float/NaN path.
    """
    if isinstance(rule, dict):
        # items() instead of iteritems() so this runs on Python 2 and 3.
        rule, subrules = next(iter(rule.items()))
        if isinstance(rule, str) and rule.startswith('/'):
            mask = descriptions.str.contains(rule[1:-1])
            cats = categorize4(descriptions, subrules, category)
            return cats.where(mask, nan)
        else:
            return categorize4(descriptions, subrules, rule)
    elif isinstance(rule, str) and rule.startswith('/'):
        mask = descriptions.str.contains(rule[1:-1])
        cats = pd.Series(category, index=descriptions.index)
        return cats.where(mask, nan)
    # Bare category code: applies to every row still missing.
    return rule

#print categorize4(df.Description, rules2)
#test(categorize1, rules=rules)
%timeit categorize4(df.Description, rules2)

100 loops, best of 3: 4.03 ms per loop


In [496]:
c4 = categorize4(df.Description, rules2)
c4

0     1.0
1     1.0
2     4.0
3     4.0
4     3.0
5     4.0
6     4.0
7     1.0
8     3.0
9     4.0
10    4.0
11    4.0
12    4.0
13    3.0
14    4.0
15    4.0
16    4.0
17    4.0
18    3.0
dtype: float64

In [502]:
%time [categorize1(text, rules) for text in bigdf.Description]
%time [categorize2(text) for text in bigdf.Description]
%time categorize3(bigdf.Description, rules)
%time categorize4(bigdf.Description, rules2)

CPU times: user 17.8 s, sys: 88 ms, total: 17.9 s
Wall time: 17.7 s
CPU times: user 1.27 s, sys: 28 ms, total: 1.3 s
Wall time: 1.26 s
CPU times: user 7.17 s, sys: 1.01 s, total: 8.18 s
Wall time: 8.17 s
CPU times: user 5.73 s, sys: 1.71 s, total: 7.44 s
Wall time: 7.39 s


0          1.0
1          1.0
2          4.0
3          4.0
4          3.0
5          4.0
6          4.0
7          1.0
8          3.0
9          4.0
10         4.0
11         4.0
12         4.0
13         3.0
14         4.0
15         4.0
16         4.0
17         4.0
18         3.0
19         1.0
20         1.0
21         4.0
22         4.0
23         3.0
24         4.0
25         4.0
26         1.0
27         3.0
28         4.0
29         4.0
          ... 
1899970    3.0
1899971    4.0
1899972    4.0
1899973    4.0
1899974    4.0
1899975    3.0
1899976    4.0
1899977    4.0
1899978    4.0
1899979    4.0
1899980    3.0
1899981    1.0
1899982    1.0
1899983    4.0
1899984    4.0
1899985    3.0
1899986    4.0
1899987    4.0
1899988    1.0
1899989    3.0
1899990    4.0
1899991    4.0
1899992    4.0
1899993    4.0
1899994    3.0
1899995    4.0
1899996    4.0
1899997    4.0
1899998    4.0
1899999    3.0
dtype: float64