In [1]:
import pandas as pd
import csv
import numpy as np

#Load Data
dfRevngo = pd.read_csv('hotels_revngo.com.csv', sep=',')
dfIhg = pd.read_csv('hotels_www.ihg.com.csv', sep=',')
dfNighttours = pd.read_csv('hotels_www.nighttours.com.csv', sep=',')
dfTouristlink = pd.read_csv('hotels_www.touristlink.com.csv', sep=',')

#Load True Matches
dfIhgGoldStandard = pd.read_csv('goldstandard_revngo_www.ihg.com_other.csv', sep=',', header=None)
dfNighttoursGoldStandard = pd.read_csv('goldstandard_revngo_www.nighttours.com.csv', sep=',', header=None)
dfTouristlinkGoldStandard = pd.read_csv('goldstandard_revngo_www.touristlink.com.csv', sep=',', header=None)

In [2]:
display(dfRevngo.describe())

Unnamed: 0,hotel_name,hotel_url,source,source_reduced,uri,postalcode,streetaddress,addresscountry,addresslocality,frequent_part_hotel_name,infrequent_part_hotel_name
count,39216,39216,39216,39216,39216,37834,39189,35732,39189,28242,39201
unique,39216,39216,39216,1,39216,18995,39070,2273,11628,1831,38408
top,G\u00E4stehaus Balthasar Neumann Br\u00FChl,https://revngo.com/amazon-tupana-jungle-lodge-...,<https://revngo.com/knights-inn-monroe>,revngo.com,revngo.com_8669,0,8240 Sunny Beach,United States,Rome,hotel,indianapolis
freq,1,1,1,39216,1,110,7,2364,428,8818,11


In [3]:
dfRevngoInfo = dfRevngo.filter(items = ['hotel_name', 'postalcode', 'streetaddress', 'addresscountry', 'addresslocality', 'frequent_part_hotel_name', 'infrequent_part_hotel_name']).describe()

datatypes = {'hotel_name': ['string'], 'postalcode': ['string'], 'streetaddress': ['string'], 
             'addresscountry': ['string'], 'addresslocality': ['string'], 'frequent_part_hotel_name': ['string'], 'infrequent_part_hotel_name': ['string']}

dfRevngoProfiling = pd.DataFrame(data= datatypes)
dfRevngoProfiling = dfRevngoProfiling.append((dfRevngoInfo.loc[['count']] / len(dfRevngo)).astype(np.float).round(3))
dfRevngoProfiling = dfRevngoProfiling.append((dfRevngoInfo.loc[['unique']] / dfRevngoInfo.loc[['count']].values).astype(np.float).round(3))

dfRevngoProfiling.index = ['Data Type', 'Density', 'Uniqueness']

dfRevngoProfiling.to_csv('Revngo_hotel_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)
print(dfRevngoProfiling.transpose().filter(items=['Density', 'Uniqueness']).to_latex())

\begin{tabular}{lll}
\toprule
{} & Density & Uniqueness \\
\midrule
addresscountry             &   0.911 &      0.064 \\
addresslocality            &   0.999 &      0.297 \\
frequent\_part\_hotel\_name   &    0.72 &      0.065 \\
hotel\_name                 &       1 &          1 \\
infrequent\_part\_hotel\_name &       1 &       0.98 \\
postalcode                 &   0.965 &      0.502 \\
streetaddress              &   0.999 &      0.997 \\
\bottomrule
\end{tabular}



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [4]:
dfIhgInfo = dfIhg.filter(items = ['hotel_name', 'postalcode', 'streetaddress', 'addresscountry', 'addresslocality', 'frequent_part_hotel_name', 'infrequent_part_hotel_name']).describe(include='all')

#display(dfDnbInfo)
datatypes = {'hotel_name': ['string'], 'postalcode': ['string'], 'streetaddress': ['string'], 
             'addresscountry': ['string'], 'addresslocality': ['string'], 'frequent_part_hotel_name': ['string'], 'infrequent_part_hotel_name': ['string']}

dfIhgProfiling = pd.DataFrame(data= datatypes)
dfIhgProfiling = dfIhgProfiling.append((dfIhgInfo.loc[['count']] / len(dfIhg)).astype(np.float).round(3), sort = False)
dfIhgProfiling = dfIhgProfiling.append((dfIhgInfo.loc[['unique']] / dfIhgInfo.loc[['count']].values).astype(np.float).round(3), sort = False)

dfIhgProfiling.index = ['Data Type', 'Density', 'Uniqueness']

In [5]:
from nltk import word_tokenize, ngrams
from nltk.metrics import jaccard_distance
import numpy as np

mapping = {"hotel_name": "hotel_name", "postalcode": "postalcode", "streetaddress": "streetaddress", "addresscountry": "addresscountry", "addresslocality": "addresslocality", "frequent_part_hotel_name": "frequent_part_hotel_name", "infrequent_part_hotel_name": "infrequent_part_hotel_name"}
jaccard_similarity_counter = {"hotel_name" : 0, "postalcode": 0, "postalcode":0, "streetaddress": 0, "addresscountry": 0, "addresslocality": 0, "frequent_part_hotel_name": 0, "infrequent_part_hotel_name": 0}

for index, row in dfIhgGoldStandard.iterrows():
    rowRevngo = dfRevngo[dfRevngo['uri'] == row[0]]
    rowIhg = dfIhg[dfIhg['uri'] == row[1]]
    for key in mapping:
        if type(rowIhg[key].values[0]) not in[np.float64, float] and type(rowRevngo[mapping[key]].values[0]) not in[np.float64, float]:
            jaccard_similarity_counter[key] += jaccard_distance(set(ngrams(rowRevngo[mapping[key]].values[0], 2)), set(ngrams(rowIhg[key].values[0], 2)))
    
for key in jaccard_similarity_counter:
    jaccard_similarity_counter[key] = round(jaccard_similarity_counter[key] / len(dfIhgGoldStandard), 3)
 
#display(jaccard_similarity_counter)
dfIhgProfiling = dfIhgProfiling.append(pd.DataFrame(data=jaccard_similarity_counter, index=['Heterogeneity']))
dfIhgProfiling.to_csv('IHG_hotel_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)
print(dfIhgProfiling.transpose().filter(items=['Density', 'Uniqueness', 'Heterogeneity']).to_latex())

\begin{tabular}{llll}
\toprule
{} & Density & Uniqueness & Heterogeneity \\
\midrule
addresscountry             &    0.63 &      0.015 &         0.493 \\
addresslocality            &    0.63 &      0.232 &         0.139 \\
frequent\_part\_hotel\_name   &       1 &      0.033 &         0.012 \\
hotel\_name                 &       1 &      0.318 &         0.336 \\
infrequent\_part\_hotel\_name &   0.999 &      0.261 &         0.042 \\
postalcode                 &   0.623 &      0.349 &         0.324 \\
streetaddress              &   0.403 &      0.389 &         0.197 \\
\bottomrule
\end{tabular}



In [6]:
dfNighttoursInfo = dfNighttours.filter(items = ['hotel_name', 'postalcode', 'streetaddress', 'addresscountry', 'addresslocality', 'frequent_part_hotel_name', 'infrequent_part_hotel_name']).describe(include='all')
datatypes = {'hotel_name': ['string'], 'postalcode': ['string'], 'streetaddress': ['string'], 
             'addresscountry': ['string'], 'addresslocality': ['string'], 'frequent_part_hotel_name': ['string'], 'infrequent_part_hotel_name': ['string']}

dfNighttoursProfiling = pd.DataFrame(data= datatypes)
dfNighttoursProfiling = dfNighttoursProfiling.append((dfNighttoursInfo.loc[['count']] / len(dfNighttours)).astype(np.float).round(3), sort = False)
dfNighttoursProfiling = dfNighttoursProfiling.append((dfNighttoursInfo.loc[['unique']] / dfNighttoursInfo.loc[['count']].values).astype(np.float).round(3), sort = False)

dfNighttoursProfiling.index = ['Data Type', 'Density', 'Uniqueness']

In [7]:
from nltk import word_tokenize, ngrams
from nltk.metrics import jaccard_distance
import numpy as np

mapping = {"hotel_name": "hotel_name", "postalcode": "postalcode", "streetaddress": "streetaddress", "addresscountry": "addresscountry", "addresslocality": "addresslocality", "frequent_part_hotel_name": "frequent_part_hotel_name", "infrequent_part_hotel_name": "infrequent_part_hotel_name"}
jaccard_similarity_counter = {"hotel_name" : 0, "postalcode": 0, "postalcode":0, "streetaddress": 0, "addresscountry": 0, "addresslocality": 0, "frequent_part_hotel_name": 0, "infrequent_part_hotel_name": 0}

for index, row in dfNighttoursGoldStandard.iterrows():
    rowRevngo = dfRevngo[dfRevngo['uri'] == row[0]]
    rowNighttours = dfNighttours[dfNighttours['uri'] == row[1]]
    for key in mapping:
        if len(rowNighttours[key].values) > 0 and len(rowRevngo[mapping[key]].values) > 0:
            if type(rowNighttours[key].values[0]) not in[np.float64, float] and type(rowRevngo[mapping[key]].values[0]) not in[np.float64, float]:
                if len(set(ngrams(rowRevngo[mapping[key]].values[0], 2))) > 0 and len(set(ngrams(rowNighttours[key].values[0], 2))) > 0:
                    jaccard_similarity_counter[key] += jaccard_distance(set(ngrams(rowRevngo[mapping[key]].values[0], 2)), set(ngrams(rowNighttours[key].values[0], 2)))
    
for key in jaccard_similarity_counter:
    jaccard_similarity_counter[key] = round(jaccard_similarity_counter[key] / len(dfNighttoursGoldStandard),3)
 
#display(jaccard_similarity_counter)
dfNighttoursProfiling = dfNighttoursProfiling.append(pd.DataFrame(data=jaccard_similarity_counter, index=['Heterogeneity']))
dfNighttoursProfiling.to_csv('nighttours_hotel_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)

print(dfNighttoursProfiling.transpose().filter(items=['Density', 'Uniqueness', 'Heterogeneity']).to_latex())

\begin{tabular}{llll}
\toprule
{} & Density & Uniqueness & Heterogeneity \\
\midrule
addresscountry             &   0.987 &      0.027 &         0.892 \\
addresslocality            &   0.433 &      0.208 &         0.107 \\
frequent\_part\_hotel\_name   &   0.702 &      0.166 &         0.065 \\
hotel\_name                 &       1 &      0.998 &         0.268 \\
infrequent\_part\_hotel\_name &   0.982 &       0.97 &         0.118 \\
postalcode                 &   0.393 &      0.614 &         0.063 \\
streetaddress              &   0.992 &      0.985 &         0.468 \\
\bottomrule
\end{tabular}



In [8]:
dfTouristlinkInfo = dfTouristlink.filter(items = ['hotel_name', 'postalcode', 'streetaddress', 'addresscountry', 'addresslocality', 'frequent_part_hotel_name', 'infrequent_part_hotel_name']).describe(include='all')
datatypes = {'hotel_name': ['string'], 'postalcode': ['string'], 'streetaddress': ['string'], 
             'addresscountry': ['string'], 'addresslocality': ['string'], 'frequent_part_hotel_name': ['string'], 'infrequent_part_hotel_name': ['string']}

dfTouristlinkProfiling = pd.DataFrame(data= datatypes)
dfTouristlinkProfiling = dfTouristlinkProfiling.append((dfTouristlinkInfo.loc[['count']] / len(dfTouristlink)).astype(np.float).round(3), sort = False)
dfTouristlinkProfiling = dfTouristlinkProfiling.append((dfTouristlinkInfo.loc[['unique']] / dfTouristlinkInfo.loc[['count']].values).astype(np.float).round(3), sort = False)

dfTouristlinkProfiling.index = ['Data Type', 'Density', 'Uniqueness']

In [9]:
from nltk import word_tokenize, ngrams
from nltk.metrics import jaccard_distance
import numpy as np

mapping = {"hotel_name": "hotel_name", "postalcode": "postalcode", "streetaddress": "streetaddress", "addresscountry": "addresscountry", "addresslocality": "addresslocality", "frequent_part_hotel_name": "frequent_part_hotel_name", "infrequent_part_hotel_name": "infrequent_part_hotel_name"}
jaccard_similarity_counter = {"hotel_name" : 0, "postalcode": 0, "postalcode":0, "streetaddress": 0, "addresscountry": 0, "addresslocality": 0, "frequent_part_hotel_name": 0, "infrequent_part_hotel_name": 0}

for index, row in dfTouristlinkGoldStandard.iterrows():
    rowRevngo = dfRevngo[dfRevngo['uri'] == row[0]]
    rowTouristlink = dfTouristlink[dfTouristlink['uri'] == row[1]]
    for key in mapping:
        if type(rowTouristlink[key].values[0]) not in[np.float64, float] and type(rowRevngo[mapping[key]].values[0]) not in[np.float64, float]:
            jaccard_similarity_counter[key] += jaccard_distance(set(ngrams(rowRevngo[mapping[key]].values[0], 2)), set(ngrams(rowTouristlink[key].values[0], 2)))
    
for key in jaccard_similarity_counter:
    jaccard_similarity_counter[key] = round(jaccard_similarity_counter[key] / len(dfTouristlinkGoldStandard),3)
 
#display(jaccard_similarity_counter)
dfTouristlinkProfiling = dfTouristlinkProfiling.append(pd.DataFrame(data=jaccard_similarity_counter, index=['Heterogeneity']))
dfTouristlinkProfiling.to_csv('touristlink_hotel_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)

print(dfTouristlinkProfiling.transpose().filter(items=['Density', 'Uniqueness', 'Heterogeneity']).to_latex())

\begin{tabular}{llll}
\toprule
{} & Density & Uniqueness & Heterogeneity \\
\midrule
addresscountry             &   0.998 &      0.044 &          0.88 \\
addresslocality            &   0.998 &      0.851 &         0.214 \\
frequent\_part\_hotel\_name   &   0.741 &      0.126 &         0.049 \\
hotel\_name                 &       1 &      0.979 &         0.372 \\
infrequent\_part\_hotel\_name &   0.984 &      0.963 &         0.089 \\
postalcode                 &   0.993 &      0.842 &         0.124 \\
streetaddress              &   0.998 &      0.982 &         0.573 \\
\bottomrule
\end{tabular}



In [10]:
#dfDBPediaProfiling.to_csv('DBPedia_author_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)
#dfVIAFProfiling.to_csv('viaf_author_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)
#dfWikiProfiling.to_csv('wiki_author_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)
#dfDnbProfiling.to_csv('Dnb_author_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)


In [11]:
dfAggregatedProfiling = dfRevngoProfiling.transpose().append(dfIhgProfiling.transpose())
dfAggregatedProfiling = dfAggregatedProfiling.append(dfNighttoursProfiling.transpose())
dfAggregatedProfiling = dfAggregatedProfiling.append(dfTouristlinkProfiling.transpose())

dfSummedProfiling = pd.DataFrame()
for index in set(dfAggregatedProfiling.index):
    profil = {}
    dfSubset = dfAggregatedProfiling.loc[index]
    profil['Attribute'] = index
    profil['Density-Mean'] = dfSubset['Density'].mean()
    profil['Density-Std'] = dfSubset['Density'].std()
    profil['Heterogeneity-Mean'] = dfSubset['Heterogeneity'].mean()
    profil['Heterogeneity-Std'] = dfSubset['Heterogeneity'].std()
    profil['Uniqueness-Mean'] = dfSubset['Uniqueness'].mean()
    profil['Uniqueness-Std'] = dfSubset['Uniqueness'].std()
    dfProfil = pd.DataFrame(profil, index=[0])
    dfSummedProfiling = dfSummedProfiling.append(dfProfil)

display(dfSummedProfiling)
dfSummedProfiling.to_csv('hotel_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Attribute,Density-Mean,Density-Std,Heterogeneity-Mean,Heterogeneity-Std,Uniqueness-Mean,Uniqueness-Std
0,addresscountry,0.8815,0.172071,0.755,0.226978,0.0375,0.021299
0,addresslocality,0.765,0.281362,0.153333,0.054921,0.397,0.304993
0,hotel_name,1.0,0.0,0.325333,0.052814,0.82375,0.337299
0,infrequent_part_hotel_name,0.99125,0.00957,0.083,0.038354,0.7935,0.355069
0,frequent_part_hotel_name,0.79075,0.140407,0.042,0.027185,0.0975,0.05978
0,streetaddress,0.848,0.296683,0.412667,0.194011,0.83825,0.29957
0,postalcode,0.7435,0.287914,0.170333,0.13653,0.57675,0.207527
