In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import ast
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  4%|▍         | 1/24 [00:00<00:02,  9.33it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:07<00:00,  3.18it/s]


>>> Dataframe created successfully!



## Title

In [4]:
df.head()

Unnamed: 0,source,spec_number,spec_id,page_title
0,www.mypriceindia.com,50,www.mypriceindia.com//50,"panasonic hc v130 price in india, bangalore, h..."
1,www.mypriceindia.com,34,www.mypriceindia.com//34,canon eos 1100d (ef-s 18-55 mm is ii) price in...
2,www.mypriceindia.com,47,www.mypriceindia.com//47,"panasonic lumix dmc tz30 price in india, banga..."
3,www.mypriceindia.com,40,www.mypriceindia.com//40,"sony alpha ilce 7s (body only) price in india,..."
4,www.mypriceindia.com,726,www.mypriceindia.com//726,"samsung st72 price in india, bangalore, hydera..."


In [5]:
df = df.drop(columns = ["source", "spec_number"], axis = 1)

In [6]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

In [7]:
stopWords = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])

In [8]:
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"

In [9]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [10]:
df["page_title"] = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

### Modelwords

In [11]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [12]:
## In the data replace lumix with panasonic

In [13]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']

In [14]:
df["page_title"] = df["page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [15]:
df.head()

Unnamed: 0,spec_id,page_title
0,www.mypriceindia.com//50,"[panasonic, v130]"
1,www.mypriceindia.com//34,"[canon, 1100d]"
2,www.mypriceindia.com//47,"[panasonic, tz30]"
3,www.mypriceindia.com//40,"[sony, 7s]"
4,www.mypriceindia.com//726,"[samsung, st72]"


In [16]:
df["brand"] = [[] for _ in range(len(df))]

In [17]:
# See how many products have more than one brand
for index, row in df.iterrows():
    for brand in row["page_title"]:
        if brand in brands:
            df.at[index, "brand"].append(brand)
            row["page_title"].remove(brand)

In [18]:
def clean_mp_mm_g_oz(value):
    if not isinstance(value, list) and pd.isna(value):
        return np.nan
    regex = r"[0-9]+mm(\n|)"
    regex2 = r"[0-9]+mp(\n|)"
    regex3 = r"[0-9]+oz"
    regex4 = r"[0-9]+g(\n|)$"
    repl = value
    for e in repl:
        if bool(re.match(regex, e)) or bool(re.match(regex2, e)) or bool(re.match(regex3, e)) or bool(re.match(regex4, e)):
            repl.remove(e)
    return repl

In [19]:
df["page_title"] = df["page_title"].apply(lambda row : clean_mp_mm_g_oz(row))

In [20]:
df.head()

Unnamed: 0,spec_id,page_title,brand
0,www.mypriceindia.com//50,[v130],[panasonic]
1,www.mypriceindia.com//34,[1100d],[canon]
2,www.mypriceindia.com//47,[tz30],[panasonic]
3,www.mypriceindia.com//40,[7s],[sony]
4,www.mypriceindia.com//726,[st72],[samsung]


## Load cleaned datasets

In [21]:
import os
import glob

os.chdir("../datasets/unlabeled/cleaned")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
df_cleaned = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv

In [22]:
df_cleaned = df_cleaned.reset_index(drop = True)

In [23]:
df_cleaned.drop(columns=["page_title"], inplace=True)

In [24]:
df_cleaned.head()

Unnamed: 0,spec_id,megapixels,short_descr,manufacturer,type,weight,screen_size,dimensions,brand,dots
0,www.buzzillions.com//2560,16.0,,,,,,,,
1,www.buzzillions.com//2075,7.0,,,,,,,,
2,www.buzzillions.com//2425,16.0,,,,,,,,
3,www.buzzillions.com//1966,10.0,,,,,,,,
4,www.buzzillions.com//953,14.0,,,,,,,,


## Merge clean with title

In [25]:
df = df.merge(df_cleaned, on="spec_id")

In [26]:
df.head()

Unnamed: 0,spec_id,page_title,brand_x,megapixels,short_descr,manufacturer,type,weight,screen_size,dimensions,brand_y,dots
0,www.mypriceindia.com//50,[v130],[panasonic],8.9,,,,,,,,
1,www.mypriceindia.com//34,[1100d],[canon],12.2,,,,,,,,
2,www.mypriceindia.com//47,[tz30],[panasonic],14.0,,,,,,,,
3,www.mypriceindia.com//40,[7s],[sony],12.4,,,,,,,,
4,www.mypriceindia.com//726,[st72],[samsung],16.2,,,,,,,,


In [27]:
df.rename(columns={"brand_x" : "brand_from_title", "brand_y" : "brand_descr"}, inplace=True)

In [28]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,megapixels,short_descr,manufacturer,type,weight,screen_size,dimensions,brand_descr,dots
0,www.mypriceindia.com//50,[v130],[panasonic],8.9,,,,,,,,
1,www.mypriceindia.com//34,[1100d],[canon],12.2,,,,,,,,
2,www.mypriceindia.com//47,[tz30],[panasonic],14.0,,,,,,,,
3,www.mypriceindia.com//40,[7s],[sony],12.4,,,,,,,,
4,www.mypriceindia.com//726,[st72],[samsung],16.2,,,,,,,,


In [29]:
def clean_short_descr(line):
    pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")
    brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']
    if not isinstance(line, list) and pd.isna(line):
        return np.nan
    else:
        line = ast.literal_eval(line)
        return list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line)))

In [30]:
df["short_descr"] = df["short_descr"].apply(clean_short_descr)

In [31]:
df["short_descr"] = df["short_descr"].apply(lambda row : clean_mp_mm_g_oz(row))

In [32]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,megapixels,short_descr,manufacturer,type,weight,screen_size,dimensions,brand_descr,dots
0,www.mypriceindia.com//50,[v130],[panasonic],8.9,,,,,,,,
1,www.mypriceindia.com//34,[1100d],[canon],12.2,,,,,,,,
2,www.mypriceindia.com//47,[tz30],[panasonic],14.0,,,,,,,,
3,www.mypriceindia.com//40,[7s],[sony],12.4,,,,,,,,
4,www.mypriceindia.com//726,[st72],[samsung],16.2,,,,,,,,


## Add units to megapixels and screen_size

In [33]:
df["megapixels"] = df["megapixels"].apply(lambda value: str(value) + "mp" if not pd.isna(value) else np.nan)

In [34]:
df["screen_size"] = df["screen_size"].apply(lambda value: str(value) + "in" if not pd.isna(value) else np.nan)

In [35]:
df["weight"] = df["weight"].apply(lambda value: str(value) + "g" if not pd.isna(value) else np.nan)

In [36]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,megapixels,short_descr,manufacturer,type,weight,screen_size,dimensions,brand_descr,dots
0,www.mypriceindia.com//50,[v130],[panasonic],8.9mp,,,,,,,,
1,www.mypriceindia.com//34,[1100d],[canon],12.2mp,,,,,,,,
2,www.mypriceindia.com//47,[tz30],[panasonic],14.0mp,,,,,,,,
3,www.mypriceindia.com//40,[7s],[sony],12.4mp,,,,,,,,
4,www.mypriceindia.com//726,[st72],[samsung],16.2mp,,,,,,,,


In [37]:
df.isna().sum()

spec_id                 0
page_title              0
brand_from_title        0
megapixels          13234
short_descr         28062
manufacturer        28495
type                28355
weight              24366
screen_size         16123
dimensions          28340
brand_descr         14132
dots                29239
dtype: int64

In [38]:
len(df)

29786

In [39]:
def create_brands_column(row):
    repl = row["brand_from_title"]
    if not pd.isna(row["brand_descr"]):
        repl.append(row["brand_descr"])
    if not pd.isna(row["manufacturer"]):
        repl.append(row["manufacturer"])
    return tuple(set(repl))

In [40]:
df["merged_brands"] = df.apply(create_brands_column, axis = 1)

In [41]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,megapixels,short_descr,manufacturer,type,weight,screen_size,dimensions,brand_descr,dots,merged_brands
0,www.mypriceindia.com//50,[v130],[panasonic],8.9mp,,,,,,,,,"(panasonic,)"
1,www.mypriceindia.com//34,[1100d],[canon],12.2mp,,,,,,,,,"(canon,)"
2,www.mypriceindia.com//47,[tz30],[panasonic],14.0mp,,,,,,,,,"(panasonic,)"
3,www.mypriceindia.com//40,[7s],[sony],12.4mp,,,,,,,,,"(sony,)"
4,www.mypriceindia.com//726,[st72],[samsung],16.2mp,,,,,,,,,"(samsung,)"


In [42]:
df.drop(columns = ["brand_from_title", "brand_descr", "manufacturer"], inplace=True)

In [43]:
def merge_attributes(row):
    row_set = set()
    for column in row[1:]:
        if not isinstance(column, list):
            if not pd.isna(column):
                row_set.update([column])
        else:
            row_set.update(set(column))
            
    row["attributes"] = row_set
    return row

In [44]:
df = df.apply(merge_attributes, axis = 1)

In [45]:
needed = ["spec_id", "attributes"]

In [46]:
df = df[needed]

In [47]:
df.head()

Unnamed: 0,spec_id,attributes
0,www.mypriceindia.com//50,"{v130, 8.9mp, (panasonic,)}"
1,www.mypriceindia.com//34,"{12.2mp, (canon,), 1100d}"
2,www.mypriceindia.com//47,"{(panasonic,), 14.0mp, tz30}"
3,www.mypriceindia.com//40,"{(sony,), 7s, 12.4mp}"
4,www.mypriceindia.com//726,"{16.2mp, (samsung,), st72}"


In [None]:
df['attributes'] = 