In [1]:
import json 
import pandas as pd
import numpy as np
import uuid 
import rdflib
import collections

In [2]:
def nt_to_csv(path, source):
    # converts source and target files
    # load the .nt file
    g = rdflib.Graph()
    if source: 
        g.parse(path+'/source', format="nt")
    else:
        g.parse(path+'/target', format="nt")
    # put it into a dictionary
    phone_source = collections.defaultdict(dict)
    for s,p,o in g:
        phone_source[str(s)][str(p)] = str(o)

    phone_source = pd.DataFrame.from_dict(phone_source, orient='index')
    
    if source: # product pages
        d =[]
        for i in list(phone_source.index):
            d.append(i[24:])
        phone_source['page_id']=d
        phone_source=change_col_names(phone_source, 'page')
    else: # catalog
        d =[]
        for i in list(phone_source.index):
            d.append(i[28:])
        phone_source['catalog_id']=d
        phone_source=change_col_names(phone_source, 'cat')
    

    return phone_source

In [3]:
def change_col_names(df, prefix):
    # change column names of the dataframe and returns it
    result=[]
    for c in df.columns:
        if(c== 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'):
            result.append('page_type')
        elif(c == 'page_id'):
            result.append('page_id')
        elif (c== 'catalog_id'):
            result.append('catalog_id')
        else:
            result.append(prefix+'_'+c.split('http://schema.org/Product/')[1])
    df.columns=result
    return df

In [4]:
def get_goldstandard(correspondence_path):
    # 'GoldStandard/Correspondences/phonecorrespndences.csv'
    cor_phone=pd.read_csv(correspondence_path+'/correspondence.csv',sep=';',names=['id_webpage','catalog_id','match'],header=0)
    return cor_phone

In [5]:
def get_product_cat(correspondence_path):
    # gets the product catalog
    id_prodcat_info= pd.read_json(correspondence_path+'/product_catalog.json', encoding='utf-8')
    id_prodcat_info.columns=['prodcat_'+c for c in id_prodcat_info.columns]
    id_prodcat_info = id_prodcat_info[['prodcat_product_name','prodcat_id']]
    return id_prodcat_info

In [6]:
def merge(prod_cat,gold,target,source):
    # Merge step 1:
    # We need product catalog ids to merge with target, because the target doesn't have names and therefore cannnot 
    # be joined with the gold standard
    prod_cat=pd.merge(target, prod_cat, how='left', left_on='catalog_id',right_on='prodcat_id')
    # Merge step 2: Add web page columns and add it to the correspondences
    phone_getsource=pd.merge(gold, source, how='inner',left_on='id_webpage',right_on='page_id')
    # Merge step 3: Add product catalog columns and add it to the correspondences
    phone_result=pd.merge(phone_getsource, prod_cat, how='inner',left_on='catalog_id',right_on='prodcat_product_name')
    phone_result=phone_result.drop(['page_type_y','page_type_x','catalog_id_x','catalog_id_y','page_id'],axis=1)
    print('Product catalog shape (target):',target.shape)
    print('Product catalog after merging with extracted:',prod_cat.shape)
    print('Gold standard shape:',gold.shape)
    print('Product pages shape (source):',source.shape)
    print('Merge 1- web pages:',phone_getsource.shape)
    print('Merge 2- all:',phone_result.shape)
    print('Number of unique products from prodcat:',phone_result['prodcat_id'].nunique())
    print('Number of unique webpages:',phone_result['id_webpage'].nunique())
    print('Matches:')
    print(phone_result['match'].value_counts())
    return phone_result
    

In [7]:
def get_dataset(path):
    source_df=nt_to_csv(path, True)
    target_df=nt_to_csv(path, False)
    gold_df=get_goldstandard(path)
    prod_cat=get_product_cat(path)
    result=merge(prod_cat,gold_df,target_df,source_df)
    result=get_unique_ids(result)
    return result

In [8]:
def get_unique_ids(df):
    norows=df.shape[0]
    ids=[str(uuid.uuid1()) for i in range(0,norows)]
    df['id']=ids
    return df

# 1. Phones

In [9]:
phone_path ='/Users/bengikoseoglu/Documents/Masters/Semester4/Thesis/Dataset/Phone'
phone_result=get_dataset(phone_path)
phone_result.to_csv('phone_merged.csv')
phone_result.head()

Product catalog shape (target): (50, 31)
Product catalog after merging with extracted: (50, 33)
Gold standard shape: (24999, 3)
Product pages shape (source): (447, 29)
Merge 1- web pages: (22349, 32)
Merge 2- all: (21455, 60)
Number of unique products from prodcat: 48
Number of unique webpages: 447
Matches:
0    21237
1      218
Name: match, dtype: int64


Unnamed: 0,id_webpage,match,page_brand,page_mpn,page_display_size,page_url,page_rear_cam_resolution,page_warc,page_color,page_product_code,...,cat_brand,cat_ram,cat_manufacturer,cat_voltage,cat_display_resolution,cat_dimensions,cat_height,prodcat_product_name,prodcat_id,id
0,node5fd6d025f29b89ff319c853d0e33744,0,htc,,5 in,http://www.ebay.com/itm/blocked-htc-one-m9-32g...,20.0mp,ebay.com0.warc.nq.gz,gold on silver,,...,apple,1gb,,1810 mah,750 x 1334 pixels,138.1 x 67 x 6.9 mm,5.4 in,iphone 6 64gb,a7b16fc1-3a7d-4265-b4a6-a1a1cd548b80,3fb875ca-a0e6-11e9-85f7-d0817aaa94da
1,nodee9affe60fbe185b64feb5811d2e43159,0,htc,htc6535lvw,5 in,http://www.ebay.com/itm/works-perfect-htc-one-...,20 mp,ebay.com0.warc.nq.gz,gunmetal gray,,...,apple,1gb,,1810 mah,750 x 1334 pixels,138.1 x 67 x 6.9 mm,5.4 in,iphone 6 64gb,a7b16fc1-3a7d-4265-b4a6-a1a1cd548b80,3fb87822-a0e6-11e9-85f7-d0817aaa94da
2,nodeb1d0ceb9797fd339c5f7c04aa76a7af1,0,htc,6735a,5 in,http://www.ebay.com/itm/unlocked-htc-one-m9-32...,20 mp,ebay.com0.warc.nq.gz,gunmetal gray,,...,apple,1gb,,1810 mah,750 x 1334 pixels,138.1 x 67 x 6.9 mm,5.4 in,iphone 6 64gb,a7b16fc1-3a7d-4265-b4a6-a1a1cd548b80,3fb8787c-a0e6-11e9-85f7-d0817aaa94da
3,node12b450a88f0d6b897758f676414c61,0,htc,opja120,,http://www.ebay.com/itm/refurbished-htc-one-m9...,,ebay.com0.warc.nq.gz,gold,,...,apple,1gb,,1810 mah,750 x 1334 pixels,138.1 x 67 x 6.9 mm,5.4 in,iphone 6 64gb,a7b16fc1-3a7d-4265-b4a6-a1a1cd548b80,3fb878b8-a0e6-11e9-85f7-d0817aaa94da
4,node9ace9d5daf49116fed3f11241dff6d,0,htc,ebay_htconem932gbgoldonsilverunlocked,5 in,http://www.ebay.com/itm/nice-factory-unlocked-...,20 mp,ebay.com0.warc.nq.gz,gold on silver,,...,apple,1gb,,1810 mah,750 x 1334 pixels,138.1 x 67 x 6.9 mm,5.4 in,iphone 6 64gb,a7b16fc1-3a7d-4265-b4a6-a1a1cd548b80,3fb878ea-a0e6-11e9-85f7-d0817aaa94da


In [10]:
prod_cat=get_product_cat(phone_path)
gold=get_goldstandard(phone_path)
print(set(prod_cat['prodcat_product_name'].unique())- set(phone_result['prodcat_product_name'].unique()))
print(set(gold['id_webpage'].unique())- set(phone_result['id_webpage'].unique()))

{'iphone 4s 32gb', 'sony xperia z5'}
{'node1e6988f78fdb248ff2cb121f5756e84', 'nodeb688213d65c725effd495f87e17831', 'node58c27cec51b514e4baa22bcc1973a342', 'node29d8fe8d3d70a71d4022c43ba9e890df', 'node316992e7c7355ece9b2d0e06d4adf', 'node5d17ce778fea342eebf37cdd4f5194d0', 'node83f8e462117b54abb673d66a9290fdb5', 'nodebaca1fd1a43f877cf5cde21421ebdc', 'node3034bfed2337d67f5408c943e2ae86', 'node2b58ef6f35df3d828d85c4522561528', 'nodebbae7edcb08fe8639ca7825a38d34ac8', 'nodebe7cc2d974994dd414df2b54a37ecad', 'nodeb344450f47ad750755a446b26ac9482', 'node3445a7e1efb9e21e7caac93dc8679', 'nodebbc9cdbb628f304d5faf5fba82938', 'node195684ba6b3f111769989f468486c746', 'nodeeb9ab6329baec23bb20259374c24ea6', 'node27cc192e9b2c9fa028437865469b11f0', 'node27db81be323cda2c7171bf93059625b', 'node1d3d7b2b5f58d76ea4bf36b674a64c', 'node17ab56e772c94925cad38e6f74f9fe5', 'nodebb2a4b92a124482eb8d10ad97e5a660', 'nodebbd475b33b8e8276ce6d0bc82a2b18a', 'nodebec93d5f58875104b1d80f43c66d2b', 'nodebf8e2fa05b2dee4843ba231a7

# 2. Headphone

In [15]:
hp_path ='/Users/bengikoseoglu/Documents/Masters/Semester4/Thesis/Dataset/Headphone'
hp_result=get_dataset(hp_path)
hp_result.to_csv('hp_merged.csv')
hp_result.head()

Product catalog shape (target): (51, 39)
Product catalog after merging with extracted: (51, 41)
Gold standard shape: (25499, 3)
Product pages shape (source): (444, 30)
Merge 1- web pages: (22643, 33)
Merge 2- all: (23087, 69)
Number of unique products from prodcat: 50
Number of unique webpages: 444
Matches:
0    22823
1      264
Name: match, dtype: int64


Unnamed: 0,id_webpage,match,page_headphones_form_factor,page_mpn,page_product_gtin,page_warc,page_color,page_brand,page_model,page_url,...,cat_sound_output_mode,cat_sensitivity,cat_color,cat_microphone_audio_details,cat_headphones_cup_type,cat_aditional_features,cat_mdoel,prodcat_product_name,prodcat_id,id
0,node449b610bd3ad6439d1b26190266d5,0,ear-cup (over the ear),does not apply,does not apply,ebay.com0.warc.nq.gz,black,akg,k712 pro,http://www.ebay.com/itm/brand-new-akg-k712-pro...,...,stereo,99 db,black,,closed,,,audio technica ath-m50x,d7130c90-df60-4e0d-a898-ad8e2821dc7f,fa07e48a-7a42-11e9-a70d-9a0011fd23e0
1,node4b4dee215b51f2eb45a114409ce7eda6,0,ear-cup (over the ear),does not apply,does not apply,ebay.com0.warc.nq.gz,black,akg,k712 pro,http://www.ebay.com/itm/brand-new-akg-k712-pro...,...,stereo,99 db,black,,closed,,,audio technica ath-m50x,d7130c90-df60-4e0d-a898-ad8e2821dc7f,fa07e5f2-7a42-11e9-a70d-9a0011fd23e0
2,node52a9f6bc8fa1772a0f6a2832641fa1,0,,does not apply,does not apply,ebay.com0.warc.nq.gz,,akg,k712 pro,http://www.ebay.com/itm/brand-new-akg-k712-pro...,...,stereo,99 db,black,,closed,,,audio technica ath-m50x,d7130c90-df60-4e0d-a898-ad8e2821dc7f,fa07e656-7a42-11e9-a70d-9a0011fd23e0
3,node6d72a28b4e6d7d1aff11ebab95221fa1,0,ear-cup (over the ear),does not apply,does not apply,ebay.com0.warc.nq.gz,black,akg,k712 pro,http://www.ebay.com/itm/brand-new-akg-k712-pro...,...,stereo,99 db,black,,closed,,,audio technica ath-m50x,d7130c90-df60-4e0d-a898-ad8e2821dc7f,fa07e69c-7a42-11e9-a70d-9a0011fd23e0
4,node88b5b595637fff2942878d19b2358,0,,does not apply,does not apply,ebay.com0.warc.nq.gz,,akg,,http://www.ebay.com/itm/brand-new-akg-k712-pro...,...,stereo,99 db,black,,closed,,,audio technica ath-m50x,d7130c90-df60-4e0d-a898-ad8e2821dc7f,fa07e6d8-7a42-11e9-a70d-9a0011fd23e0


In [16]:
prod_cat=get_product_cat(hp_path)
gold=get_goldstandard(hp_path)
print(set(prod_cat['prodcat_product_name'].unique())- set(hp_result['prodcat_product_name'].unique()))
print(set(gold['id_webpage'].unique())- set(hp_result['id_webpage'].unique()))

{'audio technica 1is'}
{'node82f07ec6d9273b68f123f53fcdb175', 'node15ece85cf178c2166fca09c1411c05a', 'node427c11f5c8417ff4a1484470553365f', 'node3bc544ea77e049729449daea1a9e1e9a', 'node3bede52a51e2ad386f3b14c4e5e1ba', 'node9d69e1f42b6353599a78698f44b8a6', 'node68aa41f7ede7c40e3bbd47dfcead199', 'node2e43deaed2df483d77b04c5b3e9db47', 'node949445e642ee76afb6ec51aa7488794', 'node3248215eb14f80a570a39d7e29ec9d', 'node11ae878fc6a623f81a4455fee81c21', 'nodee9e1a2488b31947e90c32abcf416f114', 'node6632d9343aa8b41318ee6b951e8ec33b', 'node2eba4892ebe4a67621f3f8912040b05c', 'node2ac13dab8e6e79a1aeee49eefed89eab', 'node17f52cd4b2c46ac25435fbb405e6beb', 'node59b2b179cf95b142dc448e84551da2', 'node20c02bcab35254fb2978303e9e7541', 'node64ac34673d9b549093109c4c242dad35', 'node4b709bfdc92d7b9984e8fc20db85b457', 'node28c6d64f3dcc68ae33fb11dd3b499a', 'nodeb2b0ff2db0f5d3648616df6f121d4264', 'node241ec5524d053a3574161892d4b32c1', 'node50f575778311b97189ddcb3d6add6156', 'node0602dabfd0cb83451a7929bbeb66', 'no

# 3. TVs

In [17]:
tv_path='/Users/bengikoseoglu/Documents/Masters/Semester4/Thesis/Dataset/TVs'
tv_result=get_dataset(tv_path)
tv_result.to_csv('tv_merged.csv')
tv_result.head()

Product catalog shape (target): (60, 78)
Product catalog after merging with extracted: (60, 80)
Gold standard shape: (29999, 3)
Product pages shape (source): (428, 479)
Merge 1- web pages: (25679, 482)
Merge 2- all: (25679, 556)
Number of unique products from prodcat: 60
Number of unique webpages: 428
Matches:
0    25495
1      184
Name: match, dtype: int64


Unnamed: 0,id_webpage,match,page_closed_captions_(cc),page_subwoofer_out,page_dimming_type,page_auto_photo_mode,page_mhl,page_5.1_channel_audio_out,page_wi-fi_standard,page_composite_video_input(s),...,cat_timer_functions,cat_input_video_formats,cat_memory,cat_brightness,cat_color,cat_speakers_qty,cat_3d_technology,prodcat_product_name,prodcat_id,id
0,node2adaa221ac4791c47f29f345972f1c91,0,,,,,,,,,...,,,,,,2,passive,lg 55eg9600,84e2838d-a419-4a5b-a284-9463b01cea52,fd165238-7a42-11e9-a70d-9a0011fd23e0
1,nodeb452c577135dea4d89c4fcd40b9ac4f,0,,,,,,,,,...,,,,,,2,passive,lg 55eg9600,84e2838d-a419-4a5b-a284-9463b01cea52,fd16536e-7a42-11e9-a70d-9a0011fd23e0
2,nodef741bd19171e3c5f35e514b43c112972,0,,,,,,,,,...,,,,,,2,passive,lg 55eg9600,84e2838d-a419-4a5b-a284-9463b01cea52,fd1653d2-7a42-11e9-a70d-9a0011fd23e0
3,node62aa28f37676c392289b6393a7a550e8,0,,,,,,,,,...,,,,,,2,passive,lg 55eg9600,84e2838d-a419-4a5b-a284-9463b01cea52,fd165418-7a42-11e9-a70d-9a0011fd23e0
4,node66e1a3264db98d4cab3dbe4f9295858,0,,,,,,,,,...,,,,,,2,passive,lg 55eg9600,84e2838d-a419-4a5b-a284-9463b01cea52,fd16545e-7a42-11e9-a70d-9a0011fd23e0


In [18]:
prod_cat=get_product_cat(tv_path)
gold=get_goldstandard(tv_path)
print(set(prod_cat['prodcat_product_name'].unique())- set(tv_result['prodcat_product_name'].unique()))
print(set(gold['id_webpage'].unique())- set(tv_result['id_webpage'].unique()))

set()
{'node3ff0b1197849cb836a92b727b558f', 'node57c878704a407a1da0d18b17ce9e0fa', 'nodee3a0d88b28c16768f5b2dff1e2bb813', 'nodeabc4a961b7c7cd47dbb53c6f907c65', 'node3fccecec87624cfea7cfa8f596356d0', 'nodefa27dc9d8eb1ba24b36c6c585ace7ce', 'node922c412f1611baaed3fd56ad96cfc619', 'nodee4dd2d090fce8371076b168ab3ce6e3', 'node2b6cda69d3fecce68389a9b67b4781b6', 'node13f0a760be21f42b4af44abeff03e4c', 'node554ec0d5735ea69bd570664e35f739be', 'node4827488809c812aa37c7436b2132cc', 'node9e1f4b59182244970e5fe46eb23ffb', 'node9caabf345a82ca84128ba54fea88cf54', 'node6771d29823aceb14cc0ed7742c767d5', 'noded44297f2a4fc617a620779baee5bd70', 'node6f20a69e32d6be8fa83e1f68673c1b4', 'node7ded2cb9ae2b5bdc238ff198cdf650cd', 'noded7ade45451a0c5a4e72fa653416da2', 'node36c1b44185ee2e737860bff34841564e', 'node8a4739a97abcbf808b4a2ebaae75fc72', 'node9de86550b2862fc7b7289bbc8572f', 'node7f20f7b02fa0763c755bd21d28a4e', 'node7c6f5876aee14aab219704b25bd8bba', 'node8fac74218525e2a1c5a40a13bd9acf8', 'node4ea767a8631dd041