In [62]:
import json
import re
from collections import OrderedDict
from importlib import reload
from string import punctuation

import pandas as pd
import pycrfsuite
from numpy.random import uniform, seed

import address_compare.feature_functions as ff
from address_compare.constants import DIRECTIONS, STREET_TYPES
from address_compare.feature_functions import WordFeatures1 as wf
from address_compare.parsers import hyphen_parse
# Lower-cased unit designators matched by the `f_unit_type` feature.
# Each designator appears exactly once.
UNIT_TYPES = ['bld', 'bldg', 'lot', 'ste', 'suite', 'unit', 'apt']

def tag_from_str(string, tagger, ff):
    """Tokenize an address string, featurize every token and tag the sequence.

    Args:
        string: Address text; it is split with ``hyphen_parse``.
        tagger: Object whose ``tag(features)`` returns one label per item.
        ff: Feature applicator; ``ff.exec_all(token)`` is called once per token.

    Returns:
        Whatever ``tagger.tag`` returns for the feature sequence.
    """
    feature_seq = []
    for token in hyphen_parse(string):
        feature_seq.append(ff.exec_all(token))
    return tagger.tag(feature_seq)

In [63]:
# Read the hand-tagged addresses; each record carries "raw_address",
# "tags" and "tokens" (see the preview printed below).
with open('data/tagged_addresses.json') as f:
    td = json.loads(f.read())

# Pretty-print only the first three records as a sanity check.
print(json.dumps(td[:3], indent=2))

[
  {
    "raw_address": "612 S ALASKA ST",
    "tags": [
      "STREET_NUMBER",
      "PRE_DIRECTION",
      "STREET_NAME",
      "STREET_TYPE"
    ],
    "tokens": [
      "612",
      "S",
      "ALASKA",
      "ST"
    ]
  },
  {
    "raw_address": "540 RONLEE LN NW STE B",
    "tags": [
      "STREET_NUMBER",
      "STREET_NAME",
      "STREET_TYPE",
      "POST_DIRECTION",
      "UNIT_TYPE",
      "UNIT_NUMBER"
    ],
    "tokens": [
      "540",
      "RONLEE",
      "LN",
      "NW",
      "STE",
      "B"
    ]
  },
  {
    "raw_address": "624 SUNSET PARK DR",
    "tags": [
      "STREET_NUMBER",
      "STREET_NAME",
      "STREET_NAME",
      "STREET_TYPE"
    ],
    "tokens": [
      "624",
      "SUNSET",
      "PARK",
      "DR"
    ]
  }
]


In [None]:
# Baseline trainer: the raw token strings are appended as the items
# (no feature function is applied here).
trainer1 = pycrfsuite.Trainer()
seed(1749)
for item in td:
    # One uniform draw per address: roughly 80% get group 1, the rest group 0.
    trainer1.append(
        xseq=item["tokens"],
        yseq=item["tags"],
        group=1 if uniform() < 0.8 else 0,
    )

In [None]:
# Train and write the model to the file 'model1', passing group 0 as the holdout group.
# NOTE(review): presumably sequences appended with group=0 are then used for
# evaluation only — confirm against the pycrfsuite Trainer.train docs.
trainer1.train('model1', holdout=0)

In [None]:
# NOTE(review): Trainer.message looks like a callback that receives the training
# log text; calling it with no argument may raise TypeError — confirm it is needed.
trainer1.message()

In [None]:
# Create a tagger and open 'model1', the file name passed to trainer1.train.
tagger1 = pycrfsuite.Tagger()
tagger1.open('model1')

In [None]:
# Inspect the baseline model: display its `transitions` attribute.
inf = tagger1.info()
inf.transitions

In [65]:
def crfparse(s, tagger):
    """Tag an address string and group its tokens by predicted label.

    The string is split with ``hyphen_parse`` and the raw tokens are passed
    straight to ``tagger.tag`` (no feature extraction happens here).

    Args:
        s: Address string to parse.
        tagger: Object whose ``tag(tokens)`` returns one label per token.

    Returns:
        OrderedDict with the keys UNIT_TYPE, UNIT_NUMBER, STREET_NUMBER,
        PRE_DIRECTION, STREET_NAME, STREET_TYPE, POST_DIRECTION and UNKNOWN
        (in that order), each mapping to the list of tokens given that label.
        Tokens whose label is not one of these keys are collected under
        UNKNOWN instead of raising ``KeyError``.
    """
    components = ('UNIT_TYPE', 'UNIT_NUMBER', 'STREET_NUMBER', 'PRE_DIRECTION',
                  'STREET_NAME', 'STREET_TYPE', 'POST_DIRECTION', 'UNKNOWN')
    tokens = hyphen_parse(s)
    tags = tagger.tag(tokens)
    parsed_address = OrderedDict((name, []) for name in components)
    # Fallback bucket for any label outside the fixed schema above.
    unknown = parsed_address['UNKNOWN']
    for token, tag in zip(tokens, tags):
        parsed_address.get(tag, unknown).append(token)
    return parsed_address

In [66]:
# Smoke test of the baseline parser. It needs `tagger1`; the NameError recorded
# below means the tagger1 cells had not been run in that session.
crfparse("123 E 5TH AVE", tagger1)

NameError: name 'tagger1' is not defined

In [67]:
# Tag a hyphenated example with the baseline tagger.
# NOTE(review): only a cell's last expression is displayed, so the tagging
# result is never shown and the bare `td` would dump the whole dataset —
# confirm which of the two was meant to be inspected.
tagger1.tag(hyphen_parse("123-W St John Blvd"))
td

NameError: name 'tagger1' is not defined

In [None]:
# Scratch check of substring membership. '#h' is a string literal, so the
# variable `h` assigned above plays no part in the expression.
h = 'sdfs'
'#' in '#h'

In [84]:
class WordFeatures2(ff.FeatureFunctionApplicator):
    """Per-token feature functions for the address CRF.

    Every ``f_*`` method receives one token string and returns a bool (an int
    for ``f_length``). The method names show up as the feature names of the
    trained model (see the ``state_features`` listing in this notebook), so
    renaming a method changes the feature set and requires retraining.
    """

    def f_is_street_name(self, s: str) -> bool:
        """True if the lower-cased, punctuation-stripped token is in STREET_TYPES.

        Despite the name, this tests street *types*; the name is kept because
        it is the feature key used by the trained model.
        """
        return s.lower().strip(punctuation) in STREET_TYPES

    def f_is_direction(self, s: str) -> bool:
        """True if the lower-cased token is in DIRECTIONS (punctuation is not stripped)."""
        return s.lower() in DIRECTIONS

#     def f_self_tolower(self, s: str):
#         return s.lower()

    def f_is_digit(self, s: str) -> bool:
        """True if the token consists only of digit characters (``str.isdigit``)."""
        return s.isdigit()

    def f_has_digit(self, s: str) -> bool:
        """True if the token contains at least one digit."""
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        return re.search(r'\d', s) is not None

    def f_length(self, s: str) -> int:
        """Number of characters in the token."""
        return len(s)

    def f_pound(self, s: str) -> bool:
        """True if the token contains a '#' character."""
        return '#' in s

    def f_unit_type(self, s: str) -> bool:
        """True if the lower-cased, punctuation-stripped token is in UNIT_TYPES."""
        return s.lower().strip(punctuation) in UNIT_TYPES

    def f_ends_in_hyphen(self, s: str) -> bool:
        """True if the token's last character is '-'; False for an empty token."""
        # endswith() avoids the IndexError that s[-1] raises on ''.
        return s.endswith('-')

#     def f_last_char(self, s: str):
#         return s[-1]

#     def f_first_char(self, s:str):
#         return s[0]

In [85]:
# Import and reload the feature module, then fix the RNG seed for the split.
import address_compare.feature_functions as ff
reload(ff)
seed(1729)

trainer2 = pycrfsuite.Trainer()

# One 0/1 flag per address, in dataset order; reused later to pick the
# group-0 addresses for evaluation.
group = []

with open('data/tagged_addresses.json') as f:
    td = json.loads(f.read())

for item in td:
    # Roughly 70% of the addresses get group 1, the rest group 0.
    g = 1 if uniform() < 0.7 else 0
    features = [WordFeatures2().exec_all(tok) for tok in item['tokens']]
    # Show any record whose token and tag counts disagree.
    if len(item['tags']) != len(features):
        print(item['tokens'])
    trainer2.append(xseq=features, yseq=item['tags'], group=g)
    group.append(g)

In [86]:
# Train on the WordFeatures2 features and write the model to 'model3',
# passing group 0 as the holdout group (logged below as "Holdout group: 1").
trainer2.train('model3', holdout = 0)

Holdout group: 1

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 75
Seconds required: 0.001

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 1171.768176
Feature norm: 1.000000
Error norm: 938.190202
Active features: 75
Line search trials: 1
Line search step: 0.001181
Seconds required for this iteration: 0.001
Performance by label (#match, #model, #ref) (precision, recall, F1):
    STREET_NUMBER: (0, 0, 58) (0.0000, 0.0000, 0.0000)
    PRE_DIRECTION: (0, 0, 18) (0.0000, 0.0000, 0.0000)
    STREET_NAME: (70, 276, 70) (0.2536, 1.0000, 0.4046)
    STREET_TYPE: (0, 0, 59) (0.0000, 0.0000, 0.0000)
    POST_DIRECTION: (0, 0, 10) (0.0000, 0.0000, 0.0000)
    UNIT_TYPE: (0, 0, 28) (0.0000

In [87]:
# Create a tagger and open 'model3', the file name passed to trainer2.train.
tagger3 = pycrfsuite.Tagger()
tagger3.open('model3')

<contextlib.closing at 0x2cbd5c42e48>

In [88]:
# Rank the learned (feature, label) state weights from largest to smallest.
info = tagger3.info()
sorted(
    ((weight, key) for key, weight in info.state_features.items()),
    reverse=True,
)

[(2.790513, ('f_is_street_name', 'STREET_TYPE')),
 (2.320536, ('f_unit_type', 'UNIT_TYPE')),
 (2.089241, ('f_is_direction', 'POST_DIRECTION')),
 (1.801797, ('f_is_digit', 'STREET_NUMBER')),
 (1.769487, ('f_is_direction', 'PRE_DIRECTION')),
 (1.417533, ('f_pound', 'UNIT_NUMBER')),
 (1.092292, ('f_has_digit', 'UNIT_NUMBER')),
 (1.016775, ('f_ends_in_hyphen', 'UNIT_NUMBER')),
 (0.996625, ('f_has_digit', 'STREET_NUMBER')),
 (0.440395, ('f_length', 'STREET_NAME')),
 (0.221288, ('f_is_digit', 'UNIT_NUMBER')),
 (0.179069, ('f_length', 'STREET_NUMBER')),
 (0.059434, ('f_length', 'UNIT_NUMBER')),
 (0.014285, ('f_length', 'UNIT_TYPE')),
 (-0.023212, ('f_ends_in_hyphen', 'PRE_DIRECTION')),
 (-0.035907, ('f_ends_in_hyphen', 'POST_DIRECTION')),
 (-0.048991, ('f_ends_in_hyphen', 'STREET_TYPE')),
 (-0.08792, ('f_pound', 'PRE_DIRECTION')),
 (-0.094938, ('f_unit_type', 'PRE_DIRECTION')),
 (-0.120838, ('f_is_street_name', 'STREET_NAME')),
 (-0.146737, ('f_length', 'STREET_TYPE')),
 (-0.163169, ('f_pound

In [89]:
# Tag one example address with tagger3 using WordFeatures2 features.
tag_from_str("2779 Lougheed Hwy", tagger3, WordFeatures2())

['STREET_NUMBER', 'STREET_NAME', 'STREET_TYPE']

In [90]:
# Collect [tokens, reference tags, predicted tags] for every address whose
# split flag in `group` is 0.
test_pred = [
    [
        item['tokens'],
        item['tags'],
        tagger3.tag([WordFeatures2().exec_all(w) for w in item['tokens']]),
    ]
    for item, flag in zip(td, group)
    if not flag
]

In [91]:
# Show only the addresses where the prediction differs from the reference tags.
[[tokens, gold, predicted] for tokens, gold, predicted in test_pred if gold != predicted]

[[['7019', 'RD', '9', 'NW', 'STE', 'J'],
  ['STREET_NUMBER',
   'STREET_TYPE',
   'STREET_NAME',
   'POST_DIRECTION',
   'UNIT_TYPE',
   'UNIT_NUMBER'],
  ['STREET_NUMBER',
   'STREET_NAME',
   'STREET_TYPE',
   'POST_DIRECTION',
   'UNIT_TYPE',
   'UNIT_NUMBER']],
 [['485', 'STATE', 'ROUTE', '409'],
  ['STREET_NUMBER', 'STREET_TYPE', 'STREET_TYPE', 'STREET_NAME'],
  ['STREET_NUMBER', 'STREET_NAME', 'STREET_TYPE', 'STREET_NUMBER']]]

In [50]:
def crfparse2(s, tagger, ff):
    """Tag an address string via feature extraction and group tokens by label.

    The string is split with ``hyphen_parse``, each token is turned into
    features with ``ff.exec_all`` and the feature sequence is tagged.

    Args:
        s: Address string to parse.
        tagger: Object whose ``tag(features)`` returns one label per item.
        ff: Feature applicator; ``ff.exec_all(token)`` is called once per token.

    Returns:
        OrderedDict with the keys UNIT_TYPE, UNIT_NUMBER, STREET_NUMBER,
        PRE_DIRECTION, STREET_NAME, STREET_TYPE, POST_DIRECTION and UNKNOWN
        (in that order), each mapping to the list of tokens given that label.
        Tokens whose label is not one of these keys are collected under
        UNKNOWN instead of raising ``KeyError``.
    """
    components = ('UNIT_TYPE', 'UNIT_NUMBER', 'STREET_NUMBER', 'PRE_DIRECTION',
                  'STREET_NAME', 'STREET_TYPE', 'POST_DIRECTION', 'UNKNOWN')
    tokens = hyphen_parse(s)
    features = [ff.exec_all(t) for t in tokens]
    tags = tagger.tag(features)
    parsed_address = OrderedDict((name, []) for name in components)
    # Fallback bucket for any label outside the fixed schema above.
    unknown = parsed_address['UNKNOWN']
    for token, tag in zip(tokens, tags):
        parsed_address.get(tag, unknown).append(token)
    return parsed_address

In [100]:
# Parse an address with a hyphenated unit prefix using tagger3 and WordFeatures2.
crfparse2("122-1500 West Georgia Peach Street", tagger3, WordFeatures2())

OrderedDict([('UNIT_TYPE', []),
             ('UNIT_NUMBER', ['122-']),
             ('STREET_NUMBER', ['1500']),
             ('PRE_DIRECTION', ['West']),
             ('STREET_NAME', ['Georgia', 'Peach']),
             ('STREET_TYPE', ['Street']),
             ('POST_DIRECTION', []),
             ('UNKNOWN', [])])

In [101]:
from address_compare.standardizers import standardizer

In [None]:
# NOTE(review): standardizer is called with no arguments and its result is not
# stored; its signature is not visible here — confirm this call is intended.
standardizer()

In [93]:
# Parse an example with the baseline tagger. It needs `tagger1`; the NameError
# recorded below means the tagger1 cells had not been run in that session.
out = crfparse("45678 910 ave Ste 123", tagger1)

NameError: name 'tagger1' is not defined

In [None]:
# Work on a copy: a plain `out2 = out` binds both names to the same
# OrderedDict, so the assignment below would also change `out`.
out2 = out.copy()
# NOTE(review): the other values in this mapping are lists of tokens, while
# this one is a bare string — confirm that is what the consumer expects.
out2['STREET_NUMBER'] = '56789'

In [None]:
# Gather every token labelled UNIT_TYPE across the tagged dataset, in order.
unit_types = [
    token
    for d in td
    for tag, token in zip(d['tags'], d['tokens'])
    if tag == 'UNIT_TYPE'
]

['C-', '12']