In [33]:
%load_ext autoreload
%autoreload 2

from amd_bkg_matchids import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. load Amadeus & Booking hotels

In [3]:
# Load both hotel catalogues and build the geo index used for candidate lookup
# (see the log output below this cell).
# NOTE(review): the meaning of the positional `True` flag is not visible here -- confirm
# against initialize_matching in amd_bkg_matchids.
amdh, bkgh, matches, geo_index, namdh = initialize_matching(True)

Loading Amadeus hotels
Loaded 6007 hotels
Loading Booking hotels
Loaded 882902 hotels
6007 Amadeus hotels left to match to 882902 Booking hotels
Building Geo Index


# 3. generate candidates

In [59]:
# Only the first 1000 Amadeus hotels are processed here.
# NOTE(review): 6 and 0.7 are undocumented positional arguments -- presumably a
# search limit and a similarity threshold; confirm against extract_candidates.
candidates = extract_candidates(amdh[:1000], geo_index, 6, 0.7, namdh)

100.00 %


In [60]:
# NOTE(review): this displays the entire candidates dict (amd_id -> list of candidate
# dicts); consider showing only a small sample to keep the notebook output readable.
candidates

{'005abffd-22b2-40cb-8e47-9f0a3dabde1f': [{'amd_id': '005abffd-22b2-40cb-8e47-9f0a3dabde1f',
   'bkg_id': 1194989,
   'chain': u'BAYMONT INNS',
   'chain_bkg': u'Baymont Inn &amp; Suites',
   'dist': 0.0005285519384731723,
   'name': u'BAYMONT INN & SUITES RAPID CIT',
   'name_bkg': u'Baymont Inn & Suites Rapid City',
   'name_sim': 0.9130434782608695,
   'name_sim_sw': 0.9803921568627451},
  {'amd_id': '005abffd-22b2-40cb-8e47-9f0a3dabde1f',
   'bkg_id': 301390,
   'chain': u'BAYMONT INNS',
   'chain_bkg': u'Microtel Inns &amp; Suites',
   'dist': 4.275018226990847,
   'name': u'BAYMONT INN & SUITES RAPID CIT',
   'name_bkg': u'Microtel Inn & Suites by Wyndham Rapid City',
   'name_sim': 0.6181818181818182,
   'name_sim_sw': 0.8695652173913043},
  {'amd_id': '005abffd-22b2-40cb-8e47-9f0a3dabde1f',
   'bkg_id': 677063,
   'chain': u'BAYMONT INNS',
   'chain_bkg': u'Main Stay Suites',
   'dist': 1.365533629544177,
   'name': u'BAYMONT INN & SUITES RAPID CIT',
   'name_bkg': u'MainStay S

# 4. extract features

In [5]:
import itertools
def concatenate(ll):
    """Flatten one level of nesting: an iterable of iterables becomes a single list."""
    flat = []
    for chunk in ll:
        flat.extend(chunk)
    return flat

In [48]:
# Flatten the per-hotel candidate lists into one DataFrame, one row per candidate pair.
mdf = pd.DataFrame(concatenate(candidates.values()))

In [50]:
def norm_text(column):
    """Return `column` as unicode text with `normalize` applied to every value.

    NOTE(review): the cast to unicode runs before `fillna('')`, so missing values
    have presumably already been stringified when the fill happens -- confirm
    whether that ordering is intended before changing it.
    """
    as_text = column.astype(unicode)
    filled = as_text.fillna('')
    cleaned = filled.apply(normalize)
    return cleaned.astype(unicode)

In [8]:
def row_name_included(row):
    """Tell whether row['name'] and row['name_bkg'] contain one another.

    Both values are passed through `normalize` and stripped of spaces first.
    Returns False when either one is empty after that clean-up.
    """
    left = normalize(row['name']).replace(' ', '')
    right = normalize(row['name_bkg']).replace(' ', '')
    if not (left and right):
        return False
    return left in right or right in left

def row_chain_included(row):
    """Tell whether row['chain'] and row['chain_bkg'] contain one another.

    Both values are passed through `normalize` and stripped of spaces first.
    Returns False when either one is empty after that clean-up.
    """
    left = normalize(row['chain']).replace(' ', '')
    right = normalize(row['chain_bkg']).replace(' ', '')
    if not (left and right):
        return False
    return left in right or right in left

def row_chain_sim(row):
    """Score row['chain'] against row['chain_bkg'] with `get_name_sim` (default options)."""
    chain, chain_bkg = row['chain'], row['chain_bkg']
    return get_name_sim(chain, chain_bkg)

def row_chain_sim_sw(row):
    """Score row['chain'] against row['chain_bkg'] with `get_name_sim(..., swap_words=True)`."""
    chain, chain_bkg = row['chain'], row['chain_bkg']
    return get_name_sim(chain, chain_bkg, swap_words=True)

# 5. use classifier to pick valid matches

In [39]:
from scipy.sparse import hstack

In [36]:
# Load the pickled classifier used below to accept/reject candidate pairs.
# NOTE: pickle.load executes arbitrary code from the file -- only load a
# 'rdf_matches.pickle' that comes from a trusted source.
with open('rdf_matches.pickle','rb') as f:
    clf = pickle.load(f)

In [12]:
def load_chain_encoders():
    """Read the four pickled chain encoders from 'chain_encoders.pickle'.

    Returns:
        The tuple (le_chain, ohe, le_chain_bk, ohe_bk), in the order stored
        in the pickle.

    NOTE: pickle.load can execute arbitrary code; the file must be trusted.
    """
    with open('chain_encoders.pickle', 'rb') as handle:
        stored = pickle.load(handle)
    le_chain, ohe, le_chain_bk, ohe_bk = stored
    return le_chain, ohe, le_chain_bk, ohe_bk

In [57]:
def add_features(X):
    """Normalize the text columns of `X` and append the derived comparison features.

    `X` is modified in place and also returned. The columns 'name', 'name_bkg',
    'chain' and 'chain_bkg' are replaced by their `norm_text` versions, then
    'name_included', 'chain_included', 'chain_sim' and 'chain_sim_sw' are
    computed row by row, in that order.
    """
    text_columns = ['name', 'name_bkg', 'chain', 'chain_bkg']
    for text_col in text_columns:
        X[text_col] = norm_text(X[text_col])

    # (new column, row-wise function) pairs; the order is the column order.
    row_features = [
        ('name_included', row_name_included),
        ('chain_included', row_chain_included),
        ('chain_sim', row_chain_sim),
        ('chain_sim_sw', row_chain_sim_sw),
    ]
    for feature_name, row_func in row_features:
        X[feature_name] = X.apply(row_func, axis=1)

    return X

In [52]:
def pre_process(X):
    """Turn a feature frame into the sparse matrix that is fed to the classifier.

    Args:
        X: DataFrame holding at least the columns 'chain', 'chain_bkg', 'name',
            'name_bkg', 'dist', 'name_sim', 'name_sim_sw', 'chain_sim',
            'chain_sim_sw', 'name_included' and 'chain_included'.
            Missing values in the text and flag columns are filled in place
            on the caller's frame.

    Returns:
        The horizontal stack of the numeric features (cast to float) and the
        encoded 'chain' and 'chain_bkg' columns.
    """
    # Assign the filled columns back explicitly: `X.col.fillna(..., inplace=True)`
    # is a chained in-place update that pandas does not guarantee to propagate
    # to the frame, and attribute access such as `X.name` is fragile.
    for text_col in ('chain', 'chain_bkg', 'name', 'name_bkg'):
        X[text_col] = X[text_col].fillna('')
    for flag_col in ('chain_included', 'name_included'):
        X[flag_col] = X[flag_col].fillna(False)

    Xnum = X[[u'dist', u'name_sim', u'name_sim_sw', u'chain_sim', u'chain_sim_sw',
              u'name_included', u'chain_included']]

    # NOTE: the encoders are re-read from disk on every call.
    le_chain, ohe, le_chain_bk, ohe_bk = load_chain_encoders()

    # Each chain column goes through its first encoder, is reshaped into a single
    # column, and then through its second encoder.
    Xchain = le_chain.transform(X['chain'])
    Xchain = ohe.transform(Xchain.reshape(-1, 1))

    Xchain_bkg = le_chain_bk.transform(X['chain_bkg'])
    Xchain_bkg = ohe_bk.transform(Xchain_bkg.reshape(-1, 1))

    return hstack((Xnum.astype(float), Xchain, Xchain_bkg))

In [152]:
# Outputs of this cell:
#   retrain_set / retrain_labels -- candidate rows (as dicts) with a 1/0 label,
#                                   collected for future re-training
#   matches                      -- amd_id -> bkg_id of the selected candidate
# NOTE: this rebinds `matches`, which was also returned by initialize_matching above.
retrain_set = []
retrain_labels = []
matches = {}

# Columns that contribute (together with the inverse distance) to the tie-break score.
compare_cols = 'name_sim_sw name_included chain_included chain_sim_sw'.split()


def _score_candidates(cands, compare_cols):
    """Score each accepted candidate relative to its competitors.

    Every column in `compare_cols` contributes the row's share of the column
    total. The distance contributes the row's share of the summed inverse
    distances; a zero distance earns the full bonus of 1 and a missing
    distance earns nothing.

    Returns a Series of scores carrying the same index as `cands`.
    """
    col_totals = dict((c, cands[c].sum()) for c in compare_cols)

    # Sum 1/d over the real, non-zero distances (not over the notnull() booleans).
    usable_dist = cands['dist'][cands['dist'].notnull() & (cands['dist'] != 0)]
    inv_dist_total = (1.0 / usable_dist).sum()

    def score_row(row):
        # float() keeps boolean/integer features from being truncated by
        # Python 2 integer division.
        score = sum(float(row[c]) / col_totals[c] for c in compare_cols if col_totals[c])
        dist = row['dist']
        if pd.isnull(dist):
            return score
        if dist == 0:
            return score + 1
        return score + (1.0 / dist) / inv_dist_total

    return cands.apply(score_row, axis=1)


for amd_id, match_cands in candidates.items():
    # filter candidates with classifier
    if not match_cands:
        continue
    X = pd.DataFrame(match_cands)
    X = add_features(X)
    X['match'] = clf.predict(pre_process(X))
    X = X[X['match'] == 1]

    ncands = X.shape[0]

    if ncands == 0:
        match = None
    elif ncands == 1:
        # Positional access: the surviving row keeps its original label,
        # which is not necessarily 0 after filtering.
        match = X.iloc[0]
    else:
        # pick best if many different
        scores = _score_candidates(X, compare_cols)
        ind_max = scores.idxmax()  # index *label* of the best row
        match = X.loc[ind_max, :]

        # mark the 'losers' as non-matches for future re-training
        for i in X.index:
            if i != ind_max:
                retrain_set.append(X.loc[i, :].to_dict())
                retrain_labels.append(0)

    if match is not None:
        retrain_set.append(match.to_dict())
        retrain_labels.append(1)
        matches[match['amd_id']] = match['bkg_id']