In [27]:
%load_ext autoreload
%autoreload 2

from leo_bkg_matchids import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Load Leonardo & Booking hotels

In [2]:
# Load both hotel catalogues and build the geo index that is later handed to
# extract_candidates (see the log output of this cell).
# NOTE(review): the meaning of the `True` flag is defined in leo_bkg_matchids — confirm.
# The `matches` value returned here is replaced by an empty dict before the
# matching loop in section 4.
leoh, bkgh, matches, geo_index, nleoh = initialize_matching(True)

Loading Leonardo hotels
Loaded 70170 hotels
Loading Booking hotels
Missing id:
{u'chain': u'', u'title': u'', u'hotel_id': None, u'lat': u'', u'lng': u'', u'_id': ObjectId('5936fabbdccdc4145c14c5e6')}
Loaded 884718 hotels
70170 Leonardo hotels left to match to 884718 Booking hotels
Building Geo Index


# 2. Generate candidates

In [3]:
# Only the first 1000 Leonardo hotels are processed here.
# NOTE(review): 6 and 0.7 look like distance / name-similarity thresholds —
# confirm against extract_candidates and consider naming them as constants.
candidates = extract_candidates(leoh[:1000], geo_index, 6, 0.7, nleoh)

100.00 %


In [4]:
len(candidates)

981

In [6]:
candidates.items()[0]

('1051736',
 [{'bkg_id': 1347394,
   'chain': '',
   'chain_bkg': None,
   'dist': 3.7073948414551023,
   'leo_id': '1051736',
   'name': u'Best Western Anaheim Sportstown',
   'name_bkg': u'Anaheim Beauty',
   'name_sim': 0.45,
   'name_sim_sw': 0.8461538461538461},
  {'bkg_id': 1788838,
   'chain': '',
   'chain_bkg': None,
   'dist': 3.07506313120018,
   'leo_id': '1051736',
   'name': u'Best Western Anaheim Sportstown',
   'name_bkg': u'BEST WESTERN Plus Meridian Inn & Suites Anaheim-Orange',
   'name_sim': 0.5428571428571428,
   'name_sim_sw': 0.8363636363636363},
  {'bkg_id': 513858,
   'chain': '',
   'chain_bkg': None,
   'dist': 2.2639554691890225,
   'leo_id': '1051736',
   'name': u'Best Western Anaheim Sportstown',
   'name_bkg': u'BEST WESTERN PLUS Anaheim Inn',
   'name_sim': 0.68,
   'name_sim_sw': 0.8},
  {'bkg_id': 43739,
   'chain': '',
   'chain_bkg': None,
   'dist': 2.529163795274538,
   'leo_id': '1051736',
   'name': u'Best Western Anaheim Sportstown',
   'name_b

# 3. Extract features

In [30]:
import itertools
def concatenate(ll):
    """Flatten one level of nesting.

    Returns a single list holding, in order, the items of every iterable
    contained in ``ll``.
    """
    flattened = []
    for chunk in ll:
        flattened.extend(chunk)
    return flattened

In [31]:
# Flatten the per-hotel candidate lists into one frame with a row per candidate pair.
# NOTE(review): `mdf` is not referenced again in the visible cells.
mdf = pd.DataFrame(concatenate(candidates.values()))

In [32]:
def norm_text(column):
    """Return ``column`` as unicode text with ``normalize`` applied to every value.

    NOTE(review): the cast to unicode happens *before* ``fillna('')``, so
    missing values are presumably already stringified by the time ``fillna``
    runs (the sample output shows a ``None`` chain_bkg ending up as u'none').
    Confirm whether the pickled encoders rely on this before changing the order.
    """
    as_text = column.astype(unicode)
    without_nulls = as_text.fillna('')
    normalized = without_nulls.apply(normalize)
    return normalized.astype(unicode)

In [33]:
def row_name_included(row):
    """Tell whether one hotel name contains the other.

    Both ``row['name']`` and ``row['name_bkg']`` are passed through
    ``normalize`` and stripped of spaces before the substring test.
    Returns False when either name is empty after that clean-up.
    """
    leo_name = normalize(row['name']).replace(' ', '')
    bkg_name = normalize(row['name_bkg']).replace(' ', '')
    if leo_name and bkg_name:
        return (leo_name in bkg_name) or (bkg_name in leo_name)
    return False

def row_chain_included(row):
    """Tell whether one chain name contains the other.

    Both ``row['chain']`` and ``row['chain_bkg']`` are passed through
    ``normalize`` and stripped of spaces before the substring test.
    Returns False when either chain is empty after that clean-up.
    """
    leo_chain = normalize(row['chain']).replace(' ', '')
    bkg_chain = normalize(row['chain_bkg']).replace(' ', '')
    if leo_chain and bkg_chain:
        return (leo_chain in bkg_chain) or (bkg_chain in leo_chain)
    return False

def row_chain_sim(row):
    """Similarity of the row's ``chain`` and ``chain_bkg`` values, as computed by ``get_name_sim``."""
    leo_chain, bkg_chain = row['chain'], row['chain_bkg']
    return get_name_sim(leo_chain, bkg_chain)

def row_chain_sim_sw(row):
    """Same as ``row_chain_sim`` but calls ``get_name_sim`` with ``swap_words=True``."""
    leo_chain, bkg_chain = row['chain'], row['chain_bkg']
    return get_name_sim(leo_chain, bkg_chain, swap_words=True)

# 4. Use classifier to pick valid matches

In [34]:
from scipy.sparse import hstack

In [35]:
# Load the pre-trained match classifier used by the matching loop (`clf.predict`).
# SECURITY: unpickling executes arbitrary code — only load this file from a trusted source.
with open('rdf_matches.pickle','rb') as f:
    clf = pickle.load(f)

In [36]:
def load_chain_encoders():
    """Read the four chain encoders stored in ``chain_encoders.pickle``.

    Returns the tuple ``(le_chain, ohe, le_chain_bk, ohe_bk)``. The pickled
    object must unpack into exactly four items.
    """
    with open('chain_encoders.pickle', 'rb') as handle:
        encoders = pickle.load(handle)
    le_chain, ohe, le_chain_bk, ohe_bk = encoders
    return (le_chain, ohe, le_chain_bk, ohe_bk)

In [37]:
def add_features(X):
    """Add the derived feature columns to the candidate frame ``X``.

    ``X`` is modified in place and also returned. The four text columns are
    replaced by their ``norm_text`` version, then the inclusion flags and the
    chain-similarity columns are appended.
    """
    text_columns = ('name', 'name_bkg', 'chain', 'chain_bkg')
    for text_col in text_columns:
        X[text_col] = norm_text(X[text_col])

    row_features = (
        ('name_included', row_name_included),
        ('chain_included', row_chain_included),
    )
    for feature_name, row_func in row_features:
        X[feature_name] = X.apply(row_func, axis=1)

    # No chain names on this dataset: the chain similarities are set to a
    # constant instead of being computed row-wise with row_chain_sim /
    # row_chain_sim_sw.
    for constant_col in ('chain_sim', 'chain_sim_sw'):
        X[constant_col] = 0.0

    return X

In [38]:
def pre_process(X):
    """Build the feature matrix passed to ``clf.predict`` from a frame produced by ``add_features``.

    Missing values in the text and inclusion columns of ``X`` are filled
    (``X`` itself is updated), then the numeric/boolean features are stacked
    horizontally with the encoded ``chain`` and ``chain_bkg`` columns.

    Returns the result of ``scipy.sparse.hstack``.

    NOTE(review): the encoders are re-read from disk on every call; presumably
    they are label + one-hot encoders fitted at training time — confirm.
    """
    # Explicit column assignment instead of `X.col.fillna(..., inplace=True)`:
    # in-place mutation through attribute access is a chained assignment and
    # is not guaranteed to reach the frame (pandas Copy-on-Write).
    for text_col in ('chain', 'chain_bkg', 'name', 'name_bkg'):
        X[text_col] = X[text_col].fillna('')
    for flag_col in ('chain_included', 'name_included'):
        X[flag_col] = X[flag_col].fillna(False)

    numeric_cols = [u'dist', u'name_sim', u'name_sim_sw', u'chain_sim', u'chain_sim_sw',
                    u'name_included', u'chain_included']
    Xnum = X[numeric_cols]

    le_chain, ohe, le_chain_bk, ohe_bk = load_chain_encoders()

    # Each encoder pair is applied as: labels -> column vector -> second encoder.
    Xchain = le_chain.transform(X['chain'])
    Xchain = ohe.transform(Xchain.reshape(-1, 1))

    Xchain_bkg = le_chain_bk.transform(X['chain_bkg'])
    Xchain_bkg = ohe_bk.transform(Xchain_bkg.reshape(-1, 1))

    return hstack((Xnum.astype(float), Xchain, Xchain_bkg))

In [40]:
# Records (and 0/1 labels) collected for future re-training of the classifier,
# plus the final leo_id -> bkg_id mapping.
retrain_set = []
retrain_labels = []
matches = {}

# Features whose share of the per-hotel total is summed into a candidate's score.
compare_cols = 'name_sim_sw name_included chain_included chain_sim_sw'.split()

for leo_id, match_cands in candidates.items():
    # filter candidates with classifier
    if not match_cands:
        continue
    X = pd.DataFrame(match_cands)
    X = add_features(X)
    X['match'] = clf.predict(pre_process(X))
    X = X[X.match == 1]

    ncands = X.shape[0]

    if ncands == 0:
        match = None
    elif ncands == 1:
        match = X.iloc[0, :]
    else:
        # Several candidates were accepted: pick the best one.
        sums = {c: X[c].sum() for c in compare_cols}

        # Sum of inverse distances over the rows that have a usable distance.
        # (Previously this iterated over the boolean `notnull()` mask itself,
        # which yields a row count or a ZeroDivisionError, not the distances.)
        valid_dist = X.dist[X.dist.notnull() & (X.dist != 0)]
        sums['inv_dist'] = (1.0 / valid_dist).sum()

        def score_row(row):
            """Score one candidate: its share of each feature total plus a proximity bonus."""
            # float() keeps boolean/int features from being truncated by
            # integer division on Python 2.
            score = sum(float(row[c]) / sums[c] for c in compare_cols if sums[c])
            dist = row['dist']
            if pd.notnull(dist):
                if dist:
                    score += (1.0 / dist) / sums['inv_dist']
                else:
                    # zero distance: full proximity bonus
                    score += 1
            # a missing distance adds no bonus instead of turning the score into NaN
            return score

        # Kept as a local Series: assigning `X.score = ...` sets an attribute,
        # not a column. idxmax() returns the index *label*, which is what
        # `.loc` expects (argmax() returns a position in current pandas).
        scores = X.apply(score_row, axis=1)
        ind_max = scores.idxmax()
        match = X.loc[ind_max, :]

        # mark the 'losers' as non-matches for future re-training
        for i in X.index:
            if i != ind_max:
                retrain_set.append(X.loc[i, :].to_dict())
                retrain_labels.append(0)

    if match is not None:
        d = match.to_dict()
        retrain_set.append(d)
        retrain_labels.append(1)
        matches[d['leo_id']] = d['bkg_id']

## Explore matches

In [41]:
len(matches)

913

In [42]:
# Keep only the records labelled as matches, then show ten of them at random.
match_details = []
for position, details in enumerate(retrain_set):
    if retrain_labels[position]:
        match_details.append(details)
from random import sample
sample(match_details, 10)

[{'bkg_id': 183208,
  'chain': u'',
  'chain_bkg': u'none',
  'chain_included': False,
  'chain_sim': 0.0,
  'chain_sim_sw': 0.0,
  'dist': 0.17170930307355864,
  'leo_id': '1050526',
  'match': 1.0,
  'name': u'radisson rochester ap',
  'name_bkg': u'radisson rochester ap',
  'name_included': True,
  'name_sim': 1.0,
  'name_sim_sw': 1.0},
 {'bkg_id': 1484917,
  'chain': u'',
  'chain_bkg': u'none',
  'chain_included': False,
  'chain_sim': 0.0,
  'chain_sim_sw': 0.0,
  'dist': 0.65591960151953865,
  'leo_id': '1051256',
  'match': 1.0,
  'name': u'nai harn',
  'name_bkg': u'nai harn',
  'name_included': True,
  'name_sim': 1.0,
  'name_sim_sw': 1.0},
 {'bkg_id': 46199,
  'chain': u'',
  'chain_bkg': u'none',
  'chain_included': False,
  'chain_sim': 0.0,
  'chain_sim_sw': 0.0,
  'dist': 0.062120032726664956,
  'leo_id': '1050559',
  'match': 1.0,
  'name': u'comfort inn gatineau',
  'name_bkg': u'comfort inn gatineau',
  'name_included': True,
  'name_sim': 1.0,
  'name_sim_sw': 1.0}

# 5. Filter matches with selected ids

In [43]:
with open('bkgids.txt') as f:
    # One Booking id per line; blank lines (e.g. a trailing newline) are
    # skipped instead of crashing int('').
    bkgids = [int(line) for line in (raw.strip() for raw in f) if line]

# Set lookup keeps the filter linear instead of scanning the list once per match.
bkgid_set = set(bkgids)
filtered_matches = {lid: bid for (lid, bid) in matches.items() if bid in bkgid_set}

In [44]:
len(filtered_matches)

49

In [None]:
# Keep the records labelled as matches whose Booking id is one of the selected ids.
# The ids are put in a set once so each membership test is O(1) rather than
# a scan of the whole `bkgids` list.
selected_bkgids = set(bkgids)
match_details = [d for i, d in enumerate(retrain_set)
                 if retrain_labels[i] and d['bkg_id'] in selected_bkgids]

In [50]:
# Trim every match record down to the fields that get exported.
detail_fields = 'bkg_id leo_id name name_bkg dist name_sim name_sim_sw'.split()
match_details = [
    dict((field, record[field]) for field in detail_fields)
    for record in match_details
]

In [51]:
len(match_details)

49

In [52]:
match_details[0]

{'bkg_id': 25477,
 'dist': 0.48955903850312055,
 'leo_id': '1050813',
 'name': u'phoenician luxury collection scottsdale',
 'name_bkg': u'scottsdale camelback',
 'name_sim': 0.33898305084745761,
 'name_sim_sw': 0.82608695652173914}

In [53]:
# Write the trimmed match records to match_details.json (overwrites any existing file).
with open('match_details.json', 'w') as f:
    json.dump(match_details, f)