# Agoda Mapping Challenge

- Compute embeddings of data
    - TF-IDF (term frequency-inverse document frequency)
- Embeddings are computed either column-wise (one field at a time) or on a concatenation of several columns
- Compute cosine similarity between embeddings
    - Gives a similarity matrix per field
- Total similarity matrix is an addition/multiplication combination of field matrices
- For each row choose the column corresponding to the row maximum
    - In case of collision (i.e. two or more hotels mapped to the same hotel) – go for the next unassigned index with the largest similarity
    - If the similarity falls below a fixed threshold – do not map any hotel

Testing was performed on all labelled examples plus some rows from the unlabeled data.
The unlabeled rows simulate hotels without a true mapping.


In [232]:
#@title Mode { display-mode: "form" }
# "test": labelled examples plus the first 10 rows of each partner sheet (accuracy is printed);
# "real": the full Partner1/Partner2 sheets (the mapping is written to CSV).
MODE = "test" #@param ["test", "real"]

In [233]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

Import data:

In [234]:
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet
# name ('examples', 'Partner1', 'Partner2' are used below).
# keep_default_na=False stops pandas from turning its default NA markers into
# NaN (presumably so empty cells stay '' - verify against the workbook).
data = pd.read_excel(
    'mappinghotelsdataset.xlsx',
    sheet_name=None,
    keep_default_na=False,
)

Preview data:

In [235]:
# Labelled pairs: each row holds one hotel as listed by partner 1 (p1.* columns)
# next to the same hotel as listed by partner 2 (p2.* columns).
data['examples'].head()

Unnamed: 0,p1.key,p1.hotel_name,p1.city_name,p1.country_code,p1.hotel_address,p1.star_rating,p1.postal_code,p2.key,p2.hotel_name,p2.city_name,p2.country_code,p2.hotel_address,p2.star_rating,p2.postal_code
0,074BF1CC1F1C150E080EBB9855D23EAC,Grand Malioboro Hotel,Jambi,ID,Jl. Iskandar Muda no. 168 Jambi,3.0,,CBEF956F35D16548C939056575C7E0C7,Grand Malioboro Hotel,Jambi,ID,"Jalan Iskandar Muda No. 168, Sei Asam, Pasar J...",3.0,36113
1,103756D573E5A0C80ED374C9637DB142,Full House Resort,Mae Suai / Wiang Pa Pao (Chiang Rai),TH,171 Moo.1 Tambol Pa Ngiu Ampor Wiangpapao Chia...,3.0,57170.0,603122E5379E7354B532B876BB149C8D,Fullhouse Resort,Ban Pa Ngiu,TH,171 Moo.1 Tambol Pangiu Chiangrai,3.0,57170
2,341522B8405C1A26EA3786E3F702AA0C,Ondas Do Mar Beach Resort Phase 1,Goa,IN,"Holiday St, Gaura Waddo",2.0,403516.0,DC03E5D79546157E53DB083C349BF03F,Ondas Do Mar Beach Resort Phase -1,Calangute,IN,"Holiday Street, Gaura Waddo, Calangute,Bardez,Goa",0.0,403516
3,85F1C44C25A14B5E918B6D2917FEF5E2,7 Days Inn Guangzhou Baiyun Yongtai Metro 2nd ...,Guangzhou,CN,"Building 3, No. 116 Tongtai Road",1.5,,739B4F2C65D4DBCE62C30CE8C18FCBFF,7Days Inn Guangzhou Baiyun Yongtai Subway Stat...,Guangzhou,CN,"No. 3 Building, No. 116 Tongtai Road, Baiyun D...",2.0,510420
4,58F58899B2131D5BB416262DD7000418,Hampton Inn Rome,Rome (NY),US,1352 Floyd Avenue,2.5,13441.0,B78EC933B2CF6DD45038CC86611EEA49,Hampton Inn Rome,Rome,US,1352 Floyd Avenue,4.0,13441


In [236]:
# Partner 1 hotels to be mapped (p1.* columns only).
data['Partner1'].head()

Unnamed: 0,p1.key,p1.hotel_name,p1.city_name,p1.country_code,p1.hotel_address,p1.star_rating,p1.postal_code
0,5E876BFEA81A39E42E3019FE17303D52,Elite Grande Hotel,Manama,BH,"Bldg 3378, Road 2845, Area 428",4.0,5458.0
1,4F315989358CC0F3F7869F569887743D,Quality Inn West Chester,West Chester (OH),US,8567 Cincinnati Dayton Road,3.0,45069.0
2,A4EEBCBB9932DADE591248DFFFBDC068,MAP5 Village Resort,Goa,IN,Vithaldas Wadoo,3.0,403512.0
3,2833BE9FD49A063A36D3DE1E5E28ABC4,Hampton Inn & Suites San Jose Hotel,San Jose (CA),US,55 Old Tully Road,3.0,95111.0
4,F7C20B50AE5C6C807BAABB65B8926F07,Favehotel Daeng Tompo,Makassar,ID,"Daeng Tompo Street number 28, Losari",3.0,


Choose data:

In [237]:
# Select the two hotel tables (df1 = partner 1, df2 = partner 2) to be matched.
if MODE == 'real':
  # Full partner sheets.
  df1 = data['Partner1']
  df2 = data['Partner2']
elif MODE == 'test':
  # Labelled examples (first 7 columns = p1.*, the rest = p2.*) followed by the
  # first 10 rows of each partner sheet.
  # pd.concat is used because DataFrame.append was removed in pandas 2.0.
  # reset_index() (without drop=True) keeps the old row labels in an extra
  # 'index' column.
  df1 = pd.concat([data['examples'].iloc[:, :7], data['Partner1'].iloc[:10, :]]).reset_index()
  df2 = pd.concat([data['examples'].iloc[:, 7:], data['Partner2'].iloc[:10, :]]).reset_index()
else:
  raise ValueError("Invalid mode")

Helper function to compute TF-IDF vectors and their pairwise cosine similarities:

In [238]:
def pw_sim(docs1 : list, docs2 : list):
  """Return the pairwise TF-IDF similarity between two collections of strings.

  One character-level TF-IDF vocabulary is fitted on both collections together
  so that their vectors share the same feature space.

  Args:
    docs1: strings that become the rows of the result.
    docs2: strings that become the columns of the result.

  Returns:
    Sparse matrix of shape (len(docs1), len(docs2)). Entry (i, j) is the dot
    product of the TF-IDF vectors of docs1[i] and docs2[j], which is their
    cosine similarity under TfidfVectorizer's default L2 row normalisation.
  """
  docs1, docs2 = list(docs1), list(docs2)
  n_first = len(docs1)
  vect = TfidfVectorizer(analyzer='char')
  tfidf = vect.fit_transform(docs1 + docs2)
  # docs1 fills the first n_first rows and docs2 all rows after them, so both
  # slices must split at len(docs1). Splitting the second half at len(docs2)
  # only gives the right rows when the two inputs have equal length.
  tfidf1 = tfidf[:n_first]
  tfidf2 = tfidf[n_first:]
  # '@' is matrix multiplication for scipy sparse matrices and sparse arrays.
  pairwise_similarity = tfidf1 @ tfidf2.T
  return pairwise_similarity

Compute cosine similarities for fields:

In [239]:
def _as_text(column):
  """Return *column* as a Series of str, with missing values turned into ''."""
  return column.fillna('').astype(str)

# Every text field gets the same normalisation before vectorising, so a numeric
# or missing cell in any column cannot crash the vectorizer.
pw_sim_names = pw_sim(_as_text(df1['p1.hotel_name']).to_list(), _as_text(df2['p2.hotel_name']).to_list())

pw_sim_adresses = pw_sim(_as_text(df1['p1.hotel_address']).to_list(), _as_text(df2['p2.hotel_address']).to_list())

# "Location" = address + city + postal code joined into one string per hotel.
locations1 = _as_text(df1['p1.hotel_address']) + _as_text(df1['p1.city_name']) + _as_text(df1['p1.postal_code'])
locations2 = _as_text(df2['p2.hotel_address']) + _as_text(df2['p2.city_name']) + _as_text(df2['p2.postal_code'])
pw_sim_locs = pw_sim(locations1.to_list(), locations2.to_list())

# Country gate: 1 where both hotels carry the same country code, 0 otherwise.
# The codes are compared exactly (case-insensitively). A character-level TF-IDF
# cosine ignores character order, so it would treat e.g. 'ES'/'SE' or 'GB'/'BG'
# as the same country.
country1 = _as_text(df1['p1.country_code']).str.strip().str.lower().to_numpy(dtype=str)
country2 = _as_text(df2['p2.country_code']).str.strip().str.lower().to_numpy(dtype=str)
# An empty code matches nothing (an empty string has a zero TF-IDF vector, so
# it never passed a similarity cut either).
same_country = (country1[:, None] == country2[None, :]) & (country1[:, None] != '')
# Kept sparse so that downstream .multiply() / .toarray() calls keep working.
pw_sim_country = sparse.csr_matrix(same_country.astype(int))

We use the following combined similarity matrix:

In [240]:
# Combined score: the 0/1 country matrix gates, element-wise, the sum of the
# name, address and location similarities.
pw = pw_sim_country.multiply(pw_sim_names + pw_sim_adresses + pw_sim_locs)
# Alternative weighting (not used): halve the address and location terms.
# pw = pw_sim_country.multiply((pw_sim_names+0.5*pw_sim_adresses+0.5*pw_sim_locs))

We can now search for the maximum of each row. In case of a duplicate assignment we go for the next unassigned index with the largest similarity.

In [241]:
THRESHOLD = 1.5 #fine tuned on test data


def predict3(pairwise_similarity):
  """Map every df1 row to at most one distinct df2 row.

  Each row first takes the column with the highest similarity. Columns claimed
  by several rows are then re-distributed greedily (highest similarity first)
  among those rows, using only columns nobody holds yet.

  Reads the module-level names df1, df2, MODE, data and THRESHOLD.

  NOTE(review): THRESHOLD is only checked while resolving collisions; a row
  whose best column is claimed by no other row is mapped whatever its score.
  Confirm this is intended.

  Args:
    pairwise_similarity: sparse matrix (must provide .toarray()) whose rows
      correspond to df1 rows and whose columns correspond to df2 rows.

  Returns:
    Tuple (predicted, results, wrong_indexes):
      predicted: integer array with one entry per df1 row, holding the
        positional index of the mapped df2 row or -1 when unmapped.
      results: DataFrame with the mapped df1 rows next to their df2 rows.
      wrong_indexes: in test mode, row indexes of data['examples'] whose
        predicted p2.key differs from the labelled one; None otherwise.

  Raises:
    AssertionError: if a df2 row is still assigned to more than one df1 row.
  """
  # Densify once and reuse the array for the argmax and the collision handling.
  sim_dense = pairwise_similarity.toarray()
  # Best df2 column for every df1 row.
  dir1to2 = np.argmax(sim_dense, axis=1)

  # solve conflicts = duplicates by choosing next highest value
  def remove_duplicates(x):
    """Resolve, in place, columns assigned to several rows; return x."""
    u, c = np.unique(x, return_counts=True)
    dups = u[c > 1]
    for dup in dups:
      # Rows competing for column `dup` (at least two, so this stays 1-D).
      idup = np.squeeze(np.argwhere(x==dup))
      # Release all of them; `dup` itself becomes available again.
      x[x==dup] = -1
      # Fancy indexing returns a copy, so marking entries with -1 below does
      # not alter sim_dense.
      sims = sim_dense[idup,:]

      # -1 marks an exhausted row/column; stop once nothing is left to try.
      while (sims >= 0).any():
        imax, jmax = np.unravel_index(np.argmax(sims), sims.shape)
        if jmax in x: #already assigned
          sims[:,jmax] = -1
          continue
        if sims[imax, jmax] > THRESHOLD:
          x[idup[imax]] = jmax
        # Otherwise the similarity is too low and the row stays unmapped (-1).
        # Either way this row is settled.
        sims[imax,:] = -1
    return x
  dir1to2 = remove_duplicates(dir1to2)
  predicted = dir1to2.copy()

  #Ensure no duplicates
  assert len(np.unique(predicted[predicted>=0])) == np.sum(predicted>=0), 'Failed to remove conflicts'

  # Concatenate to get results
  is_mapped = predicted >= 0
  df2_sorted = df2.iloc[predicted[is_mapped],:].reset_index(drop=True)
  df1_filtered = df1[is_mapped].reset_index(drop=True)
  results = pd.concat([df1_filtered, df2_sorted], axis=1)

  #Validate in test mode, when GT is known
  if MODE=='test':
    results_keys_gt = data['examples']
    # Left merge keeps every labelled example; unmapped ones get a missing
    # p2.key_pred and therefore count as errors.
    merged_with_gt = results_keys_gt.merge(results, on='p1.key', how='left', suffixes=('_gt', '_pred'))
    kk = (merged_with_gt['p2.key_gt'] == merged_with_gt['p2.key_pred'])
    print(f"Accuracy: {100*kk.sum() / len(kk):.1f}%")
    print("Number of errors: ", (~kk).sum())
    wrong_indexes = merged_with_gt.index[~kk].tolist()
  else:
    wrong_indexes = None
  return predicted, results, wrong_indexes

Execute:

In [242]:
# predicted: mapped df2 row index per df1 row (-1 = unmapped);
# df_results: the mapped df1 rows side by side with their df2 rows;
# errors: row indexes of wrongly mapped examples (None outside test mode).
predicted, df_results, errors = predict3(pw)

Accuracy: 99.0%
Number of errors:  5


The known errors (in test mode only obviously):

In [243]:
if MODE=='test':
  # Predicted df2 index for each wrongly mapped example (-1 = left unmapped).
  # `errors` indexes rows of data['examples']; presumably this lines up with
  # `predicted` because the examples are the first rows of df1 in test mode.
  print(predicted[errors])

[-1 -1 23 -1 -1]


We see how many hotels have not been mapped (count and share of all rows):

In [244]:
# Unmapped hotels carry -1 in `predicted`: print their count, then their share.
print(f"{np.count_nonzero(predicted < 0)}")
print(f"{100 * np.count_nonzero(predicted < 0) / len(predicted):.1f}%")

10
2.0%


And finally we save the results:

In [245]:
if MODE=='real':
  # Key-only mapping; dropna() discards rows where either key is missing.
  df_results[['p1.key', 'p2.key']].dropna().to_csv('mappings.csv', index=False)
  # Full side-by-side rows of the mapped hotels (all columns of df_results).
  df_results.to_csv('mappings_full.csv', index=False)