In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import ast
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  0%|          | 0/24 [00:00<?, ?it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:03<00:00,  6.89it/s]

>>> Dataframe created successfully!






## Title

In [4]:
df.head()

Unnamed: 0,source,spec_number,spec_id,page_title
0,www.wexphotographic.com,154,www.wexphotographic.com//154,nikon coolpix aw120 digital camera - camouflag...
1,www.wexphotographic.com,553,www.wexphotographic.com//553,canon ixus 150 digital camera - red (9148b007a...
2,www.wexphotographic.com,601,www.wexphotographic.com//601,fuji finepix s1 digital camera (p10nc12730a) -...
3,www.wexphotographic.com,197,www.wexphotographic.com//197,nikon coolpix s5300 digital camera - black (vn...
4,www.wexphotographic.com,178,www.wexphotographic.com//178,fuji finepix s8600 digital camera - red (p10nc...


In [5]:
df = df.drop(columns = ["source", "spec_number"], axis = 1)

In [6]:
stopWords = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])

In [7]:
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"

In [8]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [9]:
df["page_title"] = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

### Modelwords

In [10]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [11]:
## In the data replace lumix with panasonic

In [12]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']

In [13]:
df["page_title"] = df["page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [14]:
df.head()

Unnamed: 0,spec_id,page_title
0,www.wexphotographic.com//154,"[aw120, nikon, vna593e1]"
1,www.wexphotographic.com//553,"[canon, 9148b007aa]"
2,www.wexphotographic.com//601,"[fuji, p10nc12730a, s1]"
3,www.wexphotographic.com//197,"[vna540e1, nikon, s5300]"
4,www.wexphotographic.com//178,"[fuji, p10nc12690a, s8600]"


In [15]:
df["brand"] = [[] for _ in range(len(df))]

In [16]:
# Create brand column
for index, row in df.iterrows():
    for brand in row["page_title"]:
        if brand in brands:
            if not brand in df.at[index, "brand"]:
                df.at[index, "brand"].append(brand)
                row["page_title"].remove(brand)

In [17]:
def clean_mp_mm_g_oz(value):
    if not isinstance(value, list) and pd.isna(value):
        return np.nan
    regex = r"[0-9]+mm(\n|)"
    regex2 = r"[0-9]+mp(\n|)"
    regex3 = r"[0-9]+oz"
    regex4 = r"[0-9]+g(\n|)$"
    repl = value
    for e in repl:
        if bool(re.match(regex, e)) or bool(re.match(regex2, e)) or bool(re.match(regex3, e)) or bool(re.match(regex4, e)):
            repl.remove(e)
    return repl

In [18]:
df["page_title"] = df["page_title"].apply(lambda row : clean_mp_mm_g_oz(row))

In [19]:
df.head()

Unnamed: 0,spec_id,page_title,brand
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon]
1,www.wexphotographic.com//553,[9148b007aa],[canon]
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji]
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon]
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji]


## Load cleaned datasets

In [20]:
import os
import glob

os.chdir("../datasets/unlabeled/cleaned")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
df_cleaned = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv

In [21]:
df_cleaned = df_cleaned.reset_index(drop = True)

In [22]:
df_cleaned.head()

Unnamed: 0,spec_id,brand,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.canon-europe.com//115,canon,,,,,,,,
1,www.canon-europe.com//154,canon,,,,,,,,
2,www.canon-europe.com//103,canon,,,,,,,,
3,www.canon-europe.com//20,canon,,,,,,,,
4,www.canon-europe.com//98,canon,,,,,,,,


## Merge clean with title

In [23]:
df = df.merge(df_cleaned, on="spec_id")

In [24]:
df.head()

Unnamed: 0,spec_id,page_title,brand_x,brand_y,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0,,3.0,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0,,2.7,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4,,3.0,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0,,3.0,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0,,3.0,,


In [25]:
df.rename(columns={"brand_x" : "brand_from_title", "brand_y" : "brand_descr"}, inplace=True)

In [26]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0,,3.0,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0,,2.7,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4,,3.0,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0,,3.0,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0,,3.0,,


In [27]:
def clean_short_descr(line):
    pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")
    brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dorr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']
    if not isinstance(line, list) and pd.isna(line):
        return np.nan
    else:
        line = ast.literal_eval(line)
        return list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line)))

In [28]:
df["short_descr"] = df["short_descr"].apply(clean_short_descr)

In [29]:
df["short_descr"] = df["short_descr"].apply(lambda row : clean_mp_mm_g_oz(row))

In [30]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0,,3.0,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0,,2.7,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4,,3.0,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0,,3.0,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0,,3.0,,


## Add units to megapixels and screen_size

In [31]:
df["megapixels"] = df["megapixels"].apply(lambda value: str(value) + "mp" if not pd.isna(value) else np.nan)

In [32]:
df["screen_size"] = df["screen_size"].apply(lambda value: str(value) + "in" if not pd.isna(value) else np.nan)

In [33]:
df["weight"] = df["weight"].apply(lambda value: str(value) + "g" if not pd.isna(value) else np.nan)

In [34]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0mp,,3.0in,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0mp,,2.7in,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4mp,,3.0in,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0mp,,3.0in,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0mp,,3.0in,,


In [35]:
df.isna().sum() / len(df)

spec_id             0.000000
page_title          0.000000
brand_from_title    0.000000
brand_descr         0.466058
weight              0.818035
manufacturer        0.956657
short_descr         0.942120
megapixels          0.444303
dimensions          0.953837
screen_size         0.541295
type                0.951957
dots                0.981636
dtype: float64

In [36]:
len(df)

29786

In [37]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0mp,,3.0in,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0mp,,2.7in,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4mp,,3.0in,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0mp,,3.0in,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0mp,,3.0in,,


## Clean brands

In [38]:
def clean_title_brands(title):
    new_title = []
    for e in title:
        if 'fuji' in e:
            new_title.append("fuji")
        elif "nikkor" in e or "nikkon" in e:
            new_title.append("nikon")
        elif "butterfly" in e:
            new_title.append("butterfly")
        elif "blackmagic" in e:
            new_title.append("blackmagic")
        else:
            new_title.append(e)
    return list(set(new_title))

In [39]:
df["brand_from_title"] = df["brand_from_title"].apply(clean_title_brands)

In [40]:
def clean_manufacturer(e):
    if pd.isna(e):
        return e
    if 'fuji' in e:
        return "fuji"
    elif "nikkor" in e or "nikkon" in e or "niko9" in e:
        return "nikon"
    elif "penx9" in e:
        return "pentax"
    elif "canu9" in e or "canon cameras us" in e:
        return "canon"
    elif "butterfly" in e:
        return "butterfly"
    elif "blackmagic" in e:
        return "blackmagic"
    elif "leica camera" in e:
        return "leica"
    elif "samsung pleomax zirex" in e:
        return "samsung"
    elif "digital" in e or "lomo cameras" in e or "micro solution of japan" in e or "ricoh cameras usa" in e:
        np.nan
    else:
        return e

In [41]:
df["manufacturer"] = df["manufacturer"].apply(clean_manufacturer)

In [42]:
def clean_type(camera):
    if pd.isna(camera):
        return camera
    if "slr" in camera:
        return "dslr"
    elif "point shoot" in camera:
        return "point shoot"
    elif "compact" in camera:
        return "compact"
    elif "mirrorless" in camera:
        return "mirrorless"
    else:
        return camera

In [43]:
df["type"] = df["type"].apply(clean_type)

In [44]:
df["brand_descr"] = df["brand_descr"].apply(clean_manufacturer)

In [45]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0mp,,3.0in,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0mp,,2.7in,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4mp,,3.0in,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0mp,,3.0in,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0mp,,3.0in,,


## Merge brands together

In [46]:
def create_brands_column(row):
    repl = row["brand_from_title"]
    if not pd.isna(row["brand_descr"]):
        repl.append(row["brand_descr"])
    if not pd.isna(row["manufacturer"]):
        repl.append(row["manufacturer"])
    return tuple(set(repl))

In [47]:
df["merged_brands"] = df.apply(create_brands_column, axis = 1)

In [48]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots,merged_brands
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0mp,,3.0in,,,"(nikon,)"
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0mp,,2.7in,,,"(canon,)"
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4mp,,3.0in,,,"(fuji,)"
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0mp,,3.0in,,,"(nikon,)"
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0mp,,3.0in,,,"(fuji,)"


In [49]:
sum(df.apply(lambda row : row["page_title"] == [], axis = 1))

5140

In [50]:
df.drop(columns = ["brand_from_title", "brand_descr", "manufacturer"], inplace=True)

In [51]:
df.head()

Unnamed: 0,spec_id,page_title,weight,short_descr,megapixels,dimensions,screen_size,type,dots,merged_brands
0,www.wexphotographic.com//154,"[aw120, vna593e1]",,,16.0mp,,3.0in,,,"(nikon,)"
1,www.wexphotographic.com//553,[9148b007aa],,,16.0mp,,2.7in,,,"(canon,)"
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",,,16.4mp,,3.0in,,,"(fuji,)"
3,www.wexphotographic.com//197,"[vna540e1, s5300]",,,16.0mp,,3.0in,,,"(nikon,)"
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",,,16.0mp,,3.0in,,,"(fuji,)"


## Create column with everything

In [52]:
def merge_all_features(row):
    attrs = []
    for e in row["page_title"]:
        if not pd.isna(e):
            attrs.append(e)
    if isinstance(row["short_descr"], list):# and not pd.isna(row["short_descr"]):
        for e in row["short_descr"]:
            if not pd.isna(e):
                attrs.append(e)
    if not pd.isna(row["weight"]):
        attrs.append(row["weight"])
    for attr in row[4:-1]:
        if not pd.isna(attr):
            attrs.append(attr)
    for e in row["merged_brands"]:
        if not pd.isna(e):
            attrs.append(e)
    row["features"] = list(set(attrs))
    return row

In [53]:
line_df = df.apply(merge_all_features, axis = 1)

In [54]:
line_df = line_df[["spec_id", "features"]]

In [55]:
line_df.head()

Unnamed: 0,spec_id,features
0,www.wexphotographic.com//154,"[nikon, aw120, 16.0mp, vna593e1, 3.0in]"
1,www.wexphotographic.com//553,"[16.0mp, canon, 9148b007aa, 2.7in]"
2,www.wexphotographic.com//601,"[s1, fuji, p10nc12730a, 16.4mp, 3.0in]"
3,www.wexphotographic.com//197,"[vna540e1, nikon, s5300, 16.0mp, 3.0in]"
4,www.wexphotographic.com//178,"[fuji, 16.0mp, s8600, 3.0in, p10nc12690a]"


In [56]:
line_df["lenghts"] = line_df["features"].apply(len)

In [57]:
line_df.head()

Unnamed: 0,spec_id,features,lenghts
0,www.wexphotographic.com//154,"[nikon, aw120, 16.0mp, vna593e1, 3.0in]",5
1,www.wexphotographic.com//553,"[16.0mp, canon, 9148b007aa, 2.7in]",4
2,www.wexphotographic.com//601,"[s1, fuji, p10nc12730a, 16.4mp, 3.0in]",5
3,www.wexphotographic.com//197,"[vna540e1, nikon, s5300, 16.0mp, 3.0in]",5
4,www.wexphotographic.com//178,"[fuji, 16.0mp, s8600, 3.0in, p10nc12690a]",5


In [58]:
line_df.lenghts.value_counts()

4     9786
3     6036
2     4382
5     3668
0     2131
1     1884
6     1175
7      465
8      222
9       13
11      11
10      11
28       1
12       1
Name: lenghts, dtype: int64

In [59]:
line_df = line_df.loc[(line_df["lenghts"] >= 2)]

In [60]:
def list_to_combinations(features):
    if len(features) >= 3:
        list_of_tuples = list(itertools.combinations(features, 3))
    else:
        list_of_tuples = [tuple(features)]
    return list_of_tuples

In [61]:
line_df["features"] = line_df["features"].apply(list_to_combinations)

In [62]:
line_df.head()

Unnamed: 0,spec_id,features,lenghts
0,www.wexphotographic.com//154,"[(nikon, aw120, 16.0mp), (nikon, aw120, vna593...",5
1,www.wexphotographic.com//553,"[(16.0mp, canon, 9148b007aa), (16.0mp, canon, ...",4
2,www.wexphotographic.com//601,"[(s1, fuji, p10nc12730a), (s1, fuji, 16.4mp), ...",5
3,www.wexphotographic.com//197,"[(vna540e1, nikon, s5300), (vna540e1, nikon, 1...",5
4,www.wexphotographic.com//178,"[(fuji, 16.0mp, s8600), (fuji, 16.0mp, 3.0in),...",5


In [63]:
feature_map = {}

In [64]:
def create_dict(row):
    global feature_map
    features = row["features"]
    for tup in features:
        sorted_tup = tuple(sorted(list(tup)))
        if sorted_tup in feature_map:
            feature_map[sorted_tup].append(row["spec_id"])
        else:
            feature_map[sorted_tup] = [row["spec_id"]]

In [65]:
line_df.apply(create_dict, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
29774    None
29775    None
29776    None
29777    None
29781    None
Length: 25771, dtype: object

In [66]:
matches = []

In [67]:
for value in tqdm(feature_map.values()):
    combs = list(itertools.combinations(value, 2))
    for comb in combs:
        if comb[0] < comb[1]:
            temp_dict = {"left_spec_id" : comb[0], "right_spec_id" : comb[1]}
        else:
            temp_dict = {"left_spec_id" : comb[1], "right_spec_id" : comb[0]}
        matches.append(temp_dict)

100%|██████████| 70338/70338 [00:01<00:00, 53935.84it/s]


In [68]:
matches = pd.DataFrame(matches)

In [69]:
len(matches)

1406806

In [70]:
matches = matches.drop_duplicates()

In [71]:
len(matches)

903413

In [72]:
matches.head()

Unnamed: 0,left_spec_id,right_spec_id
0,www.wexphotographic.com//154,www.wexphotographic.com//181
1,www.wexphotographic.com//154,www.wexphotographic.com//187
2,www.henrys.com//76,www.wexphotographic.com//154
3,www.henrys.com//132,www.wexphotographic.com//154
4,www.pcconnection.com//12171,www.wexphotographic.com//154


In [74]:
matches.to_csv("/Users/gfotiadis/programming/sigmod/src/matches.csv", index=False)