# Using the pre-trained random forest model to detect matches

In [1]:
import pandas as pd
from address_compare.crf_tagger import AddressTagger
from address_compare.prob_matchers import ProbPredict

In [2]:
# Two small, differently formatted address lists to compare against each other.
# s2 has one extra entry ("123 Fake Street") with no counterpart in s1.
s1 = pd.Series(
    [
        "#1401-750 Jervis Street",
        "25 West King Edward Avenue",
        "1-950 EAST 10TH AVE",
    ]
)
s2 = pd.Series(
    [
        "#750 1401 Jervis Street",
        "Unit 1, 950 E 10 AVE",
        "25 W King Edward Ave",
        "123 Fake Street",
    ]
)
for sample in (s1, s2):
    print(sample)

0       #1401-750 Jervis Street
1    25 West King Edward Avenue
2           1-950 EAST 10TH AVE
dtype: object
0    #750 1401 Jervis Street
1       Unit 1, 950 E 10 AVE
2       25 W King Edward Ave
3            123 Fake Street
dtype: object


In [3]:
# Run both raw address series through the same tagger instance so they are
# converted to address DataFrames in an identical way.
at = AddressTagger()
ad1, ad2 = (at.series_to_address_df(raw) for raw in (s1, s2))

In [4]:
from itertools import product

# Build the Cartesian product of the two address tables: row k of df1 and
# row k of df2 together form one candidate pair to be scored.
#
# Row *positions* are enumerated (not index labels) because `.iloc` is strictly
# positional; passing labels to it is only correct when the index happens to be
# exactly 0..n-1. For that default index the result is unchanged.
position_pairs = list(product(range(len(ad1)), range(len(ad2))))
i1 = [left_pos for left_pos, _ in position_pairs]
i2 = [right_pos for _, right_pos in position_pairs]

# reset_index() moves each row's original label into a column (named `index`
# when the index is unnamed), so a pair can be traced back to ad1 / ad2.
df1 = ad1.iloc[i1, :].reset_index()
df2 = ad2.iloc[i2, :].reset_index()

In [5]:
# Minimum score a candidate pair needs in order to be reported as a match.
threshold = 0.95

# Score the row-aligned candidate pairs (row k of df1 against row k of df2).
pp = ProbPredict()
probs = pp.predict_prob(df1, df2)
print(probs)

# Pair up the original row labels of both sides, then keep only the pairs
# whose score exceeds the threshold.
candidate_ids = pd.DataFrame({"id1": df1["index"], "id2": df2["index"]})
matches = candidate_ids[probs > threshold]
matches

[ 0.          0.          0.          0.          0.          0.
  0.99276684  0.          0.          1.          0.1         0.        ]


Unnamed: 0,id1,id2
6,1,2
9,2,1


In [6]:
# Parsed rows from the first list that took part in a match (looked up by label).
ad1.loc[matches["id1"], :]

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
1,25,WEST,KING EDWARD,AVENUE,,,
2,950,EAST,10TH,AVENUE,,,1.0


In [7]:
# Parsed rows from the second list that took part in a match (looked up by label).
ad2.loc[matches["id2"], :]

Unnamed: 0,STREET_NUMBER,PRE_DIRECTION,STREET_NAME,STREET_TYPE,POST_DIRECTION,UNIT_TYPE,UNIT_NUMBER
2,25,WEST,KING EDWARD,AVENUE,,,
1,950,EAST,10,AVENUE,,UNIT,1.0
