# Using the pre-trained random forest model to detect matches

In [1]:
import pandas as pd
from address_compare.tagging import AddressTagger
from address_compare.prob_matchers import MatchClassifier

In [2]:
s1 = pd.Series(["#1401 750 Jervis Street", "25 West King Edward Avenue", "950 EAST 10TH AVE, Apt 1"])
s2 = pd.Series(["#750 1401 Jervis Street", "Unit 1, 950 E 10 AVE", "25 W King Edward Ave", '123 Fake Street'])
print(s1)
print(s2)

0       #1401 750 Jervis Street
1    25 West King Edward Avenue
2      950 EAST 10TH AVE, Apt 1
dtype: object
0    #750 1401 Jervis Street
1       Unit 1, 950 E 10 AVE
2       25 W King Edward Ave
3            123 Fake Street
dtype: object


In [3]:
at = AddressTagger()
ad1 = at.series_to_address_df(s1)
ad2 = at.series_to_address_df(s2)

In [4]:
ad1

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
0,750,,JERVIS,STREET,,,1401.0
1,25,WEST,KING EDWARD,AVENUE,,,
2,950,EAST,10TH,AVENUE,,APARTMENT,1.0


In [5]:
ad2

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
0,1401,,JERVIS,STREET,,,750.0
1,950,EAST,10,AVENUE,,UNIT,1.0
2,25,WEST,KING EDWARD,AVENUE,,,
3,123,,FAKE,STREET,,,


In [16]:
from itertools import product
d1 = pd.DataFrame()
d2 = pd.DataFrame()
i1 = []
i2 = []
for a, b in product(ad1.index, ad2.index):
    i1.append(a)
    i2.append(b)
df1 = ad1.iloc[i1,:].reset_index()
df2 = ad2.iloc[i2,:].reset_index()

In [17]:
df1

Unnamed: 0,index,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
0,0,750,,JERVIS,STREET,,,1401.0
1,0,750,,JERVIS,STREET,,,1401.0
2,0,750,,JERVIS,STREET,,,1401.0
3,0,750,,JERVIS,STREET,,,1401.0
4,1,25,WEST,KING EDWARD,AVENUE,,,
5,1,25,WEST,KING EDWARD,AVENUE,,,
6,1,25,WEST,KING EDWARD,AVENUE,,,
7,1,25,WEST,KING EDWARD,AVENUE,,,
8,2,950,EAST,10TH,AVENUE,,APARTMENT,1.0
9,2,950,EAST,10TH,AVENUE,,APARTMENT,1.0


In [18]:
df2

Unnamed: 0,index,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
0,0,1401,,JERVIS,STREET,,,750.0
1,1,950,EAST,10,AVENUE,,UNIT,1.0
2,2,25,WEST,KING EDWARD,AVENUE,,,
3,3,123,,FAKE,STREET,,,
4,0,1401,,JERVIS,STREET,,,750.0
5,1,950,EAST,10,AVENUE,,UNIT,1.0
6,2,25,WEST,KING EDWARD,AVENUE,,,
7,3,123,,FAKE,STREET,,,
8,0,1401,,JERVIS,STREET,,,750.0
9,1,950,EAST,10,AVENUE,,UNIT,1.0


In [19]:
threshold = 0.95
pp = MatchClassifier()
probs = pp.predict_prob(df1, df2)
print(probs)
matches = pd.DataFrame(dict(id1 = df1['index'], id2 = df2['index']))[probs > threshold]
matches

[ 0.61496032  0.          0.          1.          0.          0.          1.
  0.          0.          1.          0.          0.        ]


Unnamed: 0,id1,id2
3,0,3
6,1,2
9,2,1


In [10]:
ad1.loc[matches.id1,:]

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
0,750,,JERVIS,STREET,,,1401.0
1,25,WEST,KING EDWARD,AVENUE,,,
2,950,EAST,10TH,AVENUE,,APARTMENT,1.0


In [11]:
ad2.loc[matches.id2,:]

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
3,123,,FAKE,STREET,,,
2,25,WEST,KING EDWARD,AVENUE,,,
1,950,EAST,10,AVENUE,,UNIT,1.0


In [12]:
from address_compare.prob_matchers import ProbMatcher
pm = ProbMatcher()
pm.match_probabilities(ad1, ad2, -1)

[ 0.61496032  0.          0.          1.          0.          0.          1.
  0.          0.          1.          0.          0.        ]


Unnamed: 0,index_1,index_2,probs
0,0,0,0.61496
1,0,1,0.0
2,0,2,0.0
3,0,3,1.0
4,1,0,0.0
5,1,1,0.0
6,1,2,1.0
7,1,3,0.0
8,2,0,0.0
9,2,1,1.0


In [13]:
ad1

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
0,750,,JERVIS,STREET,,,1401.0
1,25,WEST,KING EDWARD,AVENUE,,,
2,950,EAST,10TH,AVENUE,,APARTMENT,1.0


In [14]:
ad1[:1]

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
0,750,,JERVIS,STREET,,,1401


In [15]:
ad2[-1:]

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
3,123,,FAKE,STREET,,,


In [20]:
from address_compare.prob_matchers import features_from_tagged_addresses

In [24]:
features_from_tagged_addresses(ad1[:1], ad2[-1:])

Exception ignored in: 'editdistance.bycython.eval'
TypeError: object of type 'float' has no len()
Exception ignored in: 'editdistance.bycython.eval'
TypeError: object of type 'float' has no len()
Exception ignored in: 'editdistance.bycython.eval'
TypeError: object of type 'float' has no len()
Exception ignored in: 'editdistance.bycython.eval'
TypeError: object of type 'float' has no len()
Exception ignored in: 'editdistance.bycython.eval'
TypeError: object of type 'float' has no len()
Exception ignored in: 'editdistance.bycython.eval'
TypeError: object of type 'float' has no len()


ValueError: Can only compare identically-labeled Series objects