## Adding the product titles to the labeled dataset

In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk

import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  0%|          | 0/24 [00:00<?, ?it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:02<00:00, 11.75it/s]

>>> Dataframe created successfully!






In [4]:
df.head()

Unnamed: 0,source,spec_number,spec_id,page_title
0,www.wexphotographic.com,154,www.wexphotographic.com//154,nikon coolpix aw120 digital camera - camouflag...
1,www.wexphotographic.com,553,www.wexphotographic.com//553,canon ixus 150 digital camera - red (9148b007a...
2,www.wexphotographic.com,601,www.wexphotographic.com//601,fuji finepix s1 digital camera (p10nc12730a) -...
3,www.wexphotographic.com,197,www.wexphotographic.com//197,nikon coolpix s5300 digital camera - black (vn...
4,www.wexphotographic.com,178,www.wexphotographic.com//178,fuji finepix s8600 digital camera - red (p10nc...


In [5]:
labeled = pd.read_csv('../datasets/labeled/sigmod_large_labelled_dataset.csv')
labeled.head()

Unnamed: 0,left_spec_id,right_spec_id,label
0,www.ebay.com//53278,www.garricks.com.au//31,1
1,www.ebay.com//24817,www.ebay.com//58782,0
2,www.ebay.com//43019,www.ebay.com//58782,0
3,www.ebay.com//42055,www.ebay.com//54403,0
4,buy.net//6145,www.ebay.com//44280,0


In [6]:
joined = labeled.merge(df, left_on='left_spec_id', right_on='spec_id')
joined.rename(columns={'page_title': 'left_page_title'}, inplace=True)
joined.drop('spec_id', axis=1, inplace=True)
joined.head()

Unnamed: 0,left_spec_id,right_spec_id,label,source,spec_number,left_page_title
0,www.ebay.com//53278,www.garricks.com.au//31,1,www.ebay.com,53278,nikon d3200 24 2 mp digital slr camera black k...
1,www.ebay.com//53278,www.priceme.co.nz//2246,0,www.ebay.com,53278,nikon d3200 24 2 mp digital slr camera black k...
2,www.ebay.com//53278,www.shopbot.com.au//1376,0,www.ebay.com,53278,nikon d3200 24 2 mp digital slr camera black k...
3,www.ebay.com//53278,www.flipkart.com//2193,0,www.ebay.com,53278,nikon d3200 24 2 mp digital slr camera black k...
4,www.ebay.com//53278,www.ebay.com//58781,0,www.ebay.com,53278,nikon d3200 24 2 mp digital slr camera black k...


In [7]:
right_joined = labeled.merge(df, left_on='right_spec_id', right_on='spec_id')
right_joined.rename(columns={'page_title': 'right_page_title'}, inplace=True)
right_joined.drop('spec_id', axis=1, inplace=True)
right_joined.head()

Unnamed: 0,left_spec_id,right_spec_id,label,source,spec_number,right_page_title
0,www.ebay.com//53278,www.garricks.com.au//31,1,www.garricks.com.au,31,nikon d3200 black w/ 18-55mm vr lens
1,www.ebay.com//48947,www.garricks.com.au//31,0,www.garricks.com.au,31,nikon d3200 black w/ 18-55mm vr lens
2,www.ebay.com//55173,www.garricks.com.au//31,1,www.garricks.com.au,31,nikon d3200 black w/ 18-55mm vr lens
3,www.ebay.com//42569,www.garricks.com.au//31,0,www.garricks.com.au,31,nikon d3200 black w/ 18-55mm vr lens
4,www.ebay.com//55623,www.garricks.com.au//31,0,www.garricks.com.au,31,nikon d3200 black w/ 18-55mm vr lens


In [8]:
joined['right_page_title'] = right_joined['right_page_title']
joined.drop('spec_number', axis=1, inplace=True)
joined.drop('source', axis=1, inplace=True)
joined.head()

Unnamed: 0,left_spec_id,right_spec_id,label,left_page_title,right_page_title
0,www.ebay.com//53278,www.garricks.com.au//31,1,nikon d3200 24 2 mp digital slr camera black k...,nikon d3200 black w/ 18-55mm vr lens
1,www.ebay.com//53278,www.priceme.co.nz//2246,0,nikon d3200 24 2 mp digital slr camera black k...,nikon d3200 black w/ 18-55mm vr lens
2,www.ebay.com//53278,www.shopbot.com.au//1376,0,nikon d3200 24 2 mp digital slr camera black k...,nikon d3200 black w/ 18-55mm vr lens
3,www.ebay.com//53278,www.flipkart.com//2193,0,nikon d3200 24 2 mp digital slr camera black k...,nikon d3200 black w/ 18-55mm vr lens
4,www.ebay.com//53278,www.ebay.com//58781,0,nikon d3200 24 2 mp digital slr camera black k...,nikon d3200 black w/ 18-55mm vr lens


In [9]:
joined.to_csv("../datasets/labeled/labeled_with_titles_large.csv", index = False)