In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk

import matplotlib.pyplot as plt

In [10]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    df = df.isna().sum().sort_values()[:20].keys()
    return df

In [3]:
most_freq_columns_buy = create_dataframe('../datasets/unlabeled/2013_camera_specs', "buy.net")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
  2%|▏         | 8/358 [00:00<00:04, 71.03it/s]

>>> Creating dataframe...



100%|██████████| 358/358 [00:08<00:00, 41.13it/s]

>>> Dataframe created successfully!






In [5]:
most_freq_columns_gosale = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.gosale.com")


  0%|          | 0/1002 [00:00<?, ?it/s][A
  1%|          | 8/1002 [00:00<00:12, 77.58it/s][A

>>> Creating dataframe...




  2%|▏         | 16/1002 [00:00<00:12, 77.34it/s][A
  2%|▏         | 24/1002 [00:00<00:12, 78.10it/s][A
  3%|▎         | 33/1002 [00:00<00:12, 79.66it/s][A
  4%|▍         | 42/1002 [00:00<00:11, 82.32it/s][A
  5%|▌         | 52/1002 [00:00<00:10, 86.37it/s][A
  6%|▋         | 63/1002 [00:00<00:10, 90.76it/s][A
  7%|▋         | 73/1002 [00:00<00:10, 91.35it/s][A
  8%|▊         | 82/1002 [00:00<00:10, 90.66it/s][A
  9%|▉         | 91/1002 [00:01<00:10, 90.44it/s][A
 10%|█         | 101/1002 [00:01<00:09, 92.67it/s][A
 11%|█         | 111/1002 [00:01<00:09, 93.19it/s][A
 12%|█▏        | 121/1002 [00:01<00:09, 92.52it/s][A
 13%|█▎        | 131/1002 [00:01<00:09, 91.17it/s][A
 14%|█▍        | 141/1002 [00:01<00:09, 90.68it/s][A
 15%|█▌        | 151/1002 [00:01<00:09, 89.30it/s][A
 16%|█▌        | 161/1002 [00:01<00:09, 90.86it/s][A
 17%|█▋        | 171/1002 [00:01<00:09, 88.23it/s][A
 18%|█▊        | 180/1002 [00:02<00:09, 84.50it/s][A
 19%|█▉        | 189/1002 [00:02<00:

>>> Dataframe created successfully!



In [11]:
dataset_path = "../datasets/unlabeled/2013_camera_specs"

for source in tqdm(os.listdir(dataset_path)):
    if source != "www.ebay.com" and source !="www.alibaba.com":
        most_freq = create_dataframe(dataset_path, source)
        print(source)
        print(most_freq.to_list())
        print()
    




  0%|          | 0/24 [00:00<?, ?it/s][A[A[A

>>> Creating dataframe...






  4%|▍         | 1/24 [00:01<00:41,  1.82s/it][A[A[A

>>> Dataframe created successfully!

www.wexphotographic.com
['<page title>', 'the information is', 'the description needs', 'spec_number', 'spec_id', 'source', 'screen size inches', 'screen resolution pixels', 'requires', 'please add', 'optical zoom x', 'needs additional', 'max aperture wide', 'max aperture tele', 'iso min', 'iso max', 'megapixels', 'cant find a', 'focal length wide', 'an']

>>> Creating dataframe...






  8%|▊         | 2/24 [00:16<02:05,  5.72s/it][A[A[A

>>> Dataframe created successfully!

www.price-hunt.com
['spec_number', '<page title>', 'source', 'spec_id', 'model id', 'brand', 'lcd screen size', 'optical sensor resolution in megapixel', 'weight', 'sensor type', 'type', 'upgradeable memory', 'memory card type', 'battery type', 'color', 'image display resolution', 'usb cable', 'image format', 'video format', 'dimensions']

>>> Creating dataframe...






 12%|█▎        | 3/24 [00:19<01:44,  5.00s/it][A[A[A

>>> Dataframe created successfully!

www.henrys.com
['spec_number', 'spec_id', 'source', '<page title>', 'white balance modes', 'colour', 'camera type', 'battery', 'effective pixels', 'aspect ratio', 'operatingstorage temperature', 'sensor typesize', 'viewfinder type', 'display screen type', 'remote control', 'self timer', 'metering method', 'shutter', 'builtin flash', 'iso sensitivity']

>>> Creating dataframe...






 21%|██        | 5/24 [00:25<01:22,  4.33s/it][A[A[A

>>> Dataframe created successfully!

www.flipkart.com
['spec_id', 'spec_number', '<page title>', 'source', '12 months', '6 months', 'emi tenure', '9 months', '3 months', 'brand', 'type', 'lcd screen size', 'lcd display', 'image display resolution', 'color', 'additional features', 'focal length', '18 months', 'model id', '24 months']

>>> Creating dataframe...






 25%|██▌       | 6/24 [00:33<01:39,  5.53s/it][A[A[A

>>> Dataframe created successfully!

buy.net
['<page title>', 'source', 'spec_id', 'spec_number', 'camera type', 'lcd screen size', 'url', 'memory card support', 'effective megapixels', 'height', 'width', 'depth', 'weight', 'image sensor', 'flash', 'maximum video capture resolution', 'warranty information', 'image stabilization', 'total pixels', 'focal length']

>>> Creating dataframe...






 29%|██▉       | 7/24 [00:38<01:28,  5.22s/it][A[A[A

>>> Dataframe created successfully!

www.ilgs.net
['<page title>', 'spec_number', 'spec_id', 'source', 'ean', 'manufacturer', 'pdf url', 'product promotion', 'short description', 'sku', 'specification', 'megapixel', 'colour of product', 'compatible memory cards', 'sensor type', 'weight', 'battery type', 'iso sensitivity', 'builtin flash', 'white balance']

>>> Creating dataframe...






 33%|███▎      | 8/24 [00:41<01:13,  4.58s/it][A[A[A

>>> Dataframe created successfully!

www.walmart.com
['<page title>', 'store information not available', 'spec_number', 'spec_id', 'source', 'product in inches l x w x h', 'model no', 'shipping weight in pounds', 'walmart no', 'type', 'additional features', 'optical zoom', 'connector type', 'self timer delay', 'software', 'flash modes', 'digital zoom', 'focal length equivalent to 35mm camera', 'features', 'effective flash range']

>>> Creating dataframe...






 38%|███▊      | 9/24 [00:43<00:58,  3.93s/it][A[A[A

>>> Dataframe created successfully!

www.pcconnection.com
['warranty  parts', '<page title>', 'warranty  labor', 'spec_number', 'spec_id', 'source', 'returns policy', 'camera type', 'megapixels', 'optical sensor type', 'display size', 'optical sensor size', 'flash type', 'color', 'shooting modes', '35mm equivalent focal length max', '35mm equivalent focal length min', 'contents', 'power notes', 'display technology']

>>> Creating dataframe...






 42%|████▏     | 10/24 [00:47<00:55,  3.95s/it][A[A[A

>>> Dataframe created successfully!

cammarkt.com
['brand', 'source', 'spec_id', 'spec_number', 'specs', 'manufacturer', '<page title>', 'weight', 'part number', 'image sensor type', 'catalog number', 'resolution', 'camera resolution', 'family line', 'upc', 'lcd screen size', 'height', 'depth', 'width', 'color']

>>> Creating dataframe...






 46%|████▌     | 11/24 [00:56<01:08,  5.29s/it][A[A[A

>>> Dataframe created successfully!

www.priceme.co.nz
['<page title>', 'spec_number', 'spec_id', 'source', 'light sensitivity', 'still image format', 'resolution', 'max image resolution', 'optical sensor', 'shutter speed', 'image stabilizer', 'digital zoom', 'optical zoom', 'lens aperture', 'max focal length', 'min focal length', '3d support', 'focus adjustment', 'storage media', 'colour']

>>> Creating dataframe...






 50%|█████     | 12/24 [01:08<01:28,  7.36s/it][A[A[A

>>> Dataframe created successfully!

www.gosale.com
['<page title>', 'spec_number', 'spec_id', 'source', 'product name', 'manufacturer', 'deal first added on', 'last updated', 'upc', 'ean13', 'retail price', 'product number mpn', 'camera type', 'ean14', 'weight', 'megapixels', 'megapixel range', 'dimensions', 'optical zoom', 'lcd screen size']

>>> Creating dataframe...






 54%|█████▍    | 13/24 [01:09<00:59,  5.44s/it][A[A[A

>>> Dataframe created successfully!

www.garricks.com.au
['<page title>', 'spec_number', 'spec_id', 'source', 'sensor details', 'memory type', 'resolution', 'image format', 'lens mount', 'viewfinder type', 'video recording format', 'flash unit', 'special attribute', 'image stabilization', 'zoom range', 'weight', 'autofocus array']

>>> Creating dataframe...






 58%|█████▊    | 14/24 [01:18<01:06,  6.61s/it][A[A[A

>>> Dataframe created successfully!

www.pricedekho.com
['<page title>', 'spec_number', 'spec_id', 'source', 'face detection', 'screen size', 'video display resolution', 'usb', 'hdmi', 'gps', 'external memory', 'color', 'metering', 'additional features', 'bangalore', 'self timer', 'sensor size', 'sensor type', 'focal length', 'image display resolution']

>>> Creating dataframe...






 62%|██████▎   | 15/24 [01:19<00:43,  4.88s/it][A[A[A

>>> Dataframe created successfully!

www.ukdigitalcameras.co.uk
['<page title>', 'brand', 'camera resolution', 'colour', 'spec_number', 'spec_id', 'lcd size', 'source', 'mpn', 'hd video', '35mm equivalent', 'lens wide mm', 'lens tele mm', 'features', 'optical zoom', 'optical zoom range', 'variangle lcd', 'included lens', 'waterproof depth']

>>> Creating dataframe...






 67%|██████▋   | 16/24 [01:22<00:33,  4.24s/it][A[A[A

>>> Dataframe created successfully!

www.camerafarm.com.au
['spec_number', '<page title>', 'source', 'spec_id', 'part no', 'ship weight', 'rating', 'our price', 'rrp', 'brand', 'usually ships', 'date added', 'you save', 'manuf no', 'exposure compensation', 'exposure modes', 'scene modes', 'selftimer', 'incamera image editing', 'top continuous shooting speed at full resolution']

>>> Creating dataframe...






 71%|███████   | 17/24 [01:28<00:33,  4.78s/it][A[A[A

>>> Dataframe created successfully!

www.mypriceindia.com
['<page title>', 'source', 'spec_id', 'spec_number', 'camera resolution', 'video format', 'maximum shutter speed', 'minimum shutter speed', 'iso rating', 'self timer', 'auto focus', 'image format', 'white balancing', 'lens type', 'focal length', 'face detection', 'aperture range', 'shooting modes', 'conitnous shots', 'manual focus']

>>> Creating dataframe...






 75%|███████▌  | 18/24 [01:43<00:47,  7.99s/it][A[A[A

>>> Dataframe created successfully!

www.eglobalcentral.co.uk
['<page title>', 'spec_number', 'spec_id', 'source', 'weight', 'battery', 'minimum aperture', 'iso sensitivity', 'max resolution', 'aspect ratio', 'angle of view', 'focal length', 'effective pixels', 'dimensions w x h x d', 'usb', 'hdmi', 'sensor type', 'gps', 'sensor size', 'storage type']

>>> Creating dataframe...






 83%|████████▎ | 20/24 [01:48<00:25,  6.29s/it][A[A[A

>>> Dataframe created successfully!

www.shopbot.com.au
['<page title>', 'source', 'spec_id', 'spec_number', 'product name', 'digital camera warehouse', 'camerastorecomau', 'shop', 'the bad', 'the good', 'dimensions', 'camera type', 'sensor resolution', 'lcd size', 'video resolution', 'digital camera warehouse nsw vic qld', 'optical image stabilization', 'weight', 'camerastorecomau sa', 'nsw']

>>> Creating dataframe...






 88%|████████▊ | 21/24 [02:06<00:28,  9.66s/it][A[A[A

>>> Dataframe created successfully!

www.shopmania.in
['category', '<page title>', 'user reviews', 'spec_number', 'spec_id', 'source', 'product name', 'brand', 'product rating', 'resolution', 'screen size', 'sensor type', 'light sensitivity iso', 'image format', 'digital video format', 'memory type', 'connector type', 'image resolutions', 'self timer delay', 'sensor size']

>>> Creating dataframe...






 92%|█████████▏| 22/24 [02:13<00:18,  9.12s/it][A[A[A

>>> Dataframe created successfully!

www.cambuy.com.au
['spec_number', '<page title>', 'spec_id', 'source', 'dimensions w x h x d', 'weight', 'type', 'exposure compensation', 'monitor', 'speed', 'effective pixels', 'supplied accessories', 'focal length', 'selftimer', 'white balance', 'metering modes', 'tripod socket', 'file system', 'image sensor', 'supported languages']

>>> Creating dataframe...






 96%|█████████▌| 23/24 [02:47<00:16, 16.40s/it][A[A[A

>>> Dataframe created successfully!

www.buzzillions.com
['spec_number', '<page title>', 'specification', 'spec_id', 'source', 'megapixels', 'lcd viewer', 'lens', 'storage media type', 'optical zoom', 'still picture capture', 'video movie capture', 'electronic flash', 'anti shake mode', 'camera color', 'trackinglinkbaseurl', 'trackinglinkuri', 'digital zoom', 'model', 'sound']

>>> Creating dataframe...






100%|██████████| 24/24 [02:51<00:00, 12.75s/it][A[A[A

>>> Dataframe created successfully!

www.canon-europe.com
['<page title>', 'still image type', 'spec_number', 'spec_id', 'source', 'movies', 'modes', 'metering modes', 'maximum fnumber', 'iso sensitivity', 'focal length', 'exposure compensation', 'continuous shooting', 'construction', 'zoom', 'redeye reduction', 'builtin flash range', 'ae lock', 'slow sync speed', 'drive modes']



In [12]:
most_freq_alibaba = create_dataframe(dataset_path, "www.alibaba.com")

>>> Creating dataframe...

>>> Dataframe created successfully!



In [13]:
most_freq_alibaba

Index(['source', '<page title>', 'spec_id', 'spec_number', 'payment terms',
       'supply ability', 'fob price', 'minorder quantity', 'port', 'weight',
       'power supply', 'image sensor', 'power consumption', 'min illumination',
       'frame rate', 'dimensions', 'bit rate', 'lens', 'interface', 'network'],
      dtype='object')