In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk

import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in tqdm(os.listdir(os.path.join(dataset_path, source))):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    df = df.isna().sum().sort_values()[:20].keys()
    return df

In [3]:
most_freq_columns_buy = create_dataframe('../datasets/unlabeled/2013_camera_specs', "buy.net")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
  2%|▏         | 8/358 [00:00<00:04, 71.03it/s]

>>> Creating dataframe...



100%|██████████| 358/358 [00:08<00:00, 41.13it/s]

>>> Dataframe created successfully!






In [None]:
most_freq_columns_ebay = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.ebay.com")

  0%|          | 14/14274 [00:00<01:57, 121.61it/s]

>>> Creating dataframe...



  9%|▊         | 1237/14274 [01:30<31:06,  6.99it/s]