In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import ast
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  8%|▊         | 2/24 [00:00<00:01, 12.53it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:09<00:00,  2.45it/s]

>>> Dataframe created successfully!






## Title

In [4]:
df.head()

Unnamed: 0,source,spec_number,spec_id,page_title
0,www.wexphotographic.com,154,www.wexphotographic.com//154,nikon coolpix aw120 digital camera - camouflag...
1,www.wexphotographic.com,553,www.wexphotographic.com//553,canon ixus 150 digital camera - red (9148b007a...
2,www.wexphotographic.com,601,www.wexphotographic.com//601,fuji finepix s1 digital camera (p10nc12730a) -...
3,www.wexphotographic.com,197,www.wexphotographic.com//197,nikon coolpix s5300 digital camera - black (vn...
4,www.wexphotographic.com,178,www.wexphotographic.com//178,fuji finepix s8600 digital camera - red (p10nc...


In [5]:
df = df.drop(columns = ["source", "spec_number"], axis = 1)

In [6]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

In [7]:
stopWords = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])

In [8]:
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"

In [9]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [10]:
df["page_title"] = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

### Modelwords

In [11]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [12]:
## In the data replace lumix with panasonic

In [13]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']

In [14]:
df["page_title"] = df["page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [15]:
df.head()

Unnamed: 0,spec_id,page_title
0,www.wexphotographic.com//154,"[aw120, nikon, vna593e1]"
1,www.wexphotographic.com//553,"[9148b007aa, canon]"
2,www.wexphotographic.com//601,"[p10nc12730a, fuji, s1]"
3,www.wexphotographic.com//197,"[vna540e1, nikon, s5300]"
4,www.wexphotographic.com//178,"[p10nc12690a, fuji, s8600]"


In [16]:
df["brand"] = [[] for _ in range(len(df))]

In [17]:
# Create brand column
for index, row in df.iterrows():
    for brand in row["page_title"]:
        if brand in brands:
            if not brand in df.at[index, "brand"]:
                df.at[index, "brand"].append(brand)
                row["page_title"].remove(brand)

In [18]:
def clean_mp_mm_g_oz(value):
    if not isinstance(value, list) and pd.isna(value):
        return np.nan
    regex = r"[0-9]+mm(\n|)"
    regex2 = r"[0-9]+mp(\n|)"
    regex3 = r"[0-9]+oz"
    regex4 = r"[0-9]+g(\n|)$"
    repl = value
    for e in repl:
        if bool(re.match(regex, e)) or bool(re.match(regex2, e)) or bool(re.match(regex3, e)) or bool(re.match(regex4, e)):
            repl.remove(e)
    return repl

In [19]:
df["page_title"] = df["page_title"].apply(lambda row : clean_mp_mm_g_oz(row))

In [20]:
df.head()

Unnamed: 0,spec_id,page_title,brand
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon]
1,www.wexphotographic.com//553,[9148b007aa],[canon]
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji]
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon]
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji]


## Load cleaned datasets

In [21]:
import os
import glob

os.chdir("../datasets/unlabeled/cleaned")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
df_cleaned = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv

In [22]:
df_cleaned = df_cleaned.reset_index(drop = True)

In [23]:
df_cleaned.head()

Unnamed: 0,spec_id,brand,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.canon-europe.com//115,canon,,,,,,,,
1,www.canon-europe.com//154,canon,,,,,,,,
2,www.canon-europe.com//103,canon,,,,,,,,
3,www.canon-europe.com//20,canon,,,,,,,,
4,www.canon-europe.com//98,canon,,,,,,,,


## Merge clean with title

In [24]:
df = df.merge(df_cleaned, on="spec_id")

In [25]:
df.head()

Unnamed: 0,spec_id,page_title,brand_x,brand_y,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0,,3.0,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0,,2.7,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4,,3.0,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0,,3.0,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0,,3.0,,


In [26]:
df.rename(columns={"brand_x" : "brand_from_title", "brand_y" : "brand_descr"}, inplace=True)

In [27]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0,,3.0,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0,,2.7,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4,,3.0,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0,,3.0,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0,,3.0,,


In [28]:
def clean_short_descr(line):
    pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")
    brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dorr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']
    if not isinstance(line, list) and pd.isna(line):
        return np.nan
    else:
        line = ast.literal_eval(line)
        return list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line)))

In [29]:
df["short_descr"] = df["short_descr"].apply(clean_short_descr)

In [30]:
df["short_descr"] = df["short_descr"].apply(lambda row : clean_mp_mm_g_oz(row))

In [31]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0,,3.0,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0,,2.7,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4,,3.0,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0,,3.0,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0,,3.0,,


## Add units to megapixels and screen_size

In [32]:
df["megapixels"] = df["megapixels"].apply(lambda value: str(value) + "mp" if not pd.isna(value) else np.nan)

In [33]:
df["screen_size"] = df["screen_size"].apply(lambda value: str(value) + "in" if not pd.isna(value) else np.nan)

In [34]:
df["weight"] = df["weight"].apply(lambda value: str(value) + "g" if not pd.isna(value) else np.nan)

In [35]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0mp,,3.0in,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0mp,,2.7in,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4mp,,3.0in,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0mp,,3.0in,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0mp,,3.0in,,


In [36]:
df.isna().sum() / len(df)

spec_id             0.000000
page_title          0.000000
brand_from_title    0.000000
brand_descr         0.466058
weight              0.818035
manufacturer        0.956657
short_descr         0.942120
megapixels          0.444303
dimensions          0.953837
screen_size         0.541295
type                0.951957
dots                0.981636
dtype: float64

In [37]:
len(df)

29786

In [38]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0mp,,3.0in,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0mp,,2.7in,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4mp,,3.0in,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0mp,,3.0in,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0mp,,3.0in,,


## Clean brands

In [39]:
def clean_title_brands(title):
    new_title = []
    for e in title:
        if 'fuji' in e:
            new_title.append("fuji")
        elif "nikkor" in e or "nikkon" in e:
            new_title.append("nikon")
        elif "butterfly" in e:
            new_title.append("butterfly")
        elif "blackmagic" in e:
            new_title.append("blackmagic")
        else:
            new_title.append(e)
    return list(set(new_title))

In [40]:
df["brand_from_title"] = df["brand_from_title"].apply(clean_title_brands)

In [41]:
def clean_manufacturer(e):
    if pd.isna(e):
        return e
    if 'fuji' in e:
        return "fuji"
    elif "nikkor" in e or "nikkon" in e or "niko9" in e:
        return "nikon"
    elif "penx9" in e:
        return "pentax"
    elif "canu9" in e or "canon cameras us" in e:
        return "canon"
    elif "butterfly" in e:
        return "butterfly"
    elif "blackmagic" in e:
        return "blackmagic"
    elif "leica camera" in e:
        return "leica"
    elif "samsung pleomax zirex" in e:
        return "samsung"
    elif "digital" in e or "lomo cameras" in e or "micro solution of japan" in e or "ricoh cameras usa" in e:
        np.nan
    else:
        return e

In [42]:
df["manufacturer"] = df["manufacturer"].apply(clean_manufacturer)

In [43]:
def clean_type(camera):
    if pd.isna(camera):
        return camera
    if "slr" in camera:
        return "dslr"
    elif "point shoot" in camera:
        return "point shoot"
    elif "compact" in camera:
        return "compact"
    elif "mirrorless" in camera:
        return "mirrorless"
    else:
        return camera

In [44]:
df["type"] = df["type"].apply(clean_type)

In [45]:
df["brand_descr"] = df["brand_descr"].apply(clean_manufacturer)

In [46]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0mp,,3.0in,,
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0mp,,2.7in,,
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4mp,,3.0in,,
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0mp,,3.0in,,
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0mp,,3.0in,,


## Merge brands together

In [47]:
def create_brands_column(row):
    repl = row["brand_from_title"]
    if not pd.isna(row["brand_descr"]):
        repl.append(row["brand_descr"])
    if not pd.isna(row["manufacturer"]):
        repl.append(row["manufacturer"])
    return tuple(set(repl))

In [48]:
df["merged_brands"] = df.apply(create_brands_column, axis = 1)

In [49]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,weight,manufacturer,short_descr,megapixels,dimensions,screen_size,type,dots,merged_brands
0,www.wexphotographic.com//154,"[aw120, vna593e1]",[nikon],,,,,16.0mp,,3.0in,,,"(nikon,)"
1,www.wexphotographic.com//553,[9148b007aa],[canon],,,,,16.0mp,,2.7in,,,"(canon,)"
2,www.wexphotographic.com//601,"[p10nc12730a, s1]",[fuji],,,,,16.4mp,,3.0in,,,"(fuji,)"
3,www.wexphotographic.com//197,"[vna540e1, s5300]",[nikon],,,,,16.0mp,,3.0in,,,"(nikon,)"
4,www.wexphotographic.com//178,"[p10nc12690a, s8600]",[fuji],,,,,16.0mp,,3.0in,,,"(fuji,)"


In [50]:
sum(df.apply(lambda row : row["page_title"] == [], axis = 1))

5137

In [51]:
df.drop(columns = ["brand_from_title", "brand_descr", "manufacturer"], inplace=True)

### Statistics on matches

In [52]:
# df.head()

In [53]:
# labeled = pd.read_csv("/Users/gfotiadis/programming/sigmod/datasets/created/with_details/combined_csv.csv")

In [54]:
# labeled.head()

In [55]:
# left_joined = labeled.merge(df, left_on='left_spec_id', right_on='spec_id')
# left_joined.rename(columns={'page_title': 'left_page_title'}, inplace=True)
# left_joined.drop('spec_id', axis=1, inplace=True)

In [56]:
# left_joined.head()

In [57]:
# right_joined = labeled.merge(df, left_on='right_spec_id', right_on='spec_id')
# right_joined.rename(columns={'page_title': 'right_page_title'}, inplace=True)
# right_joined.drop('spec_id', axis=1, inplace=True)
# right_joined.head()

In [58]:
# fully_joined = pd.merge(left_joined, right_joined, how="inner", left_on=["left_spec_id", "right_spec_id"], right_on=["left_spec_id", "right_spec_id"], suffixes=("_left", "_right"))

In [59]:
# fully_joined.right_page_title.apply(lambda title : np.nan if title == [] else title).isna().sum()

In [60]:
# fully_joined.left_page_title.apply(lambda title : np.nan if title == [] else title).isna().sum()

In [61]:
# df.head()

In [62]:
grouped = df.groupby("merged_brands")

In [63]:
unbranded_til_100 = grouped.get_group(())

In [64]:
for gname, group in grouped:
    if len(group) < 100:
        unbranded_til_100 = pd.concat([group, unbranded_til_100])

In [65]:
len(unbranded_til_100)

8769

In [66]:
len(grouped.groups)

476

In [67]:
def get_merged_df(dataframe):
    merged = dataframe.drop(columns=["merged_brands"], axis = 1)
    merged = (merged.merge(merged, on=merged.assign(key_col=1)['key_col'], suffixes=('', '_right'))
 .query('spec_id < spec_id_right') # filter out joins on the same row and keep unique combinations
 .reset_index(drop=True))
    merged.drop(columns = ["key_0"], axis = 1, inplace=True)
    merged.rename(columns = {"spec_id" : "left_spec_id", "spec_id_right" : "right_spec_id"}, inplace=True)
    merged.reset_index(inplace=True)
    return merged

In [82]:
count_title = 0
count_no_title = 0
matched_title = 0
matched_no_title = 0

In [83]:
def determine_match(row):
    global count_title
    global count_no_title
    global matched_title
    global matched_no_title
    
    if row["page_title"] == [] or row["page_title_right"] == []:
        target = 0.55
        count_no_title += 1
    else:
        target = 1.3
        count_title += 1
            
    dim_weight = 0.9
    dots_weight = 0.9
    mp_weight = 0.44
    scr_weight = 0.54
    type_weight = 0.49
    weight_weight = 0.81
    descr_weight= 0.9
    title_weight = 0.9
    
    
    score = 0
    dim_l = row["dimensions"]
    dim_r = row["dimensions_right"]
    dots_l = row["dots"]
    dots_r = row["dots_right"]
    megapixels_l = row["megapixels"]
    megapixels_r = row["megapixels_right"]
    screen_size_l = row["screen_size"]
    screen_size_r = row["screen_size_right"]
    short_descr_l = row["short_descr"]
    short_descr_r = row["short_descr_right"]
    type_l = row["type"]
    type_r = row["type_right"]
    weight_l = row["weight"]
    weight_r = row["weight_right"]
    page_title_l = row["page_title"]
    page_title_r = row["page_title_right"]
    
    dimensions_regex = r"([0-9]+\.[0-9]+|[0-9]+)h([0-9]+\.[0-9]+|[0-9]+)w([0-9]+\.[0-9]+|[0-9]+)d"
    dimensions_regex_2 = r"h([0-9]+\.[0-9]+|[0-9]+)w([0-9]+\.[0-9]+|[0-9]+)d([0-9]+\.[0-9]+|[0-9]+)"
    
    if not pd.isna(dim_l) and not pd.isna(dim_r):
        if re.match(dimensions_regex, dim_l) == None:
            groups_l = re.match(dimensions_regex_2, dim_l).groups(1)
        else:
            groups_l = re.match(dimensions_regex, dim_l).groups(1)
        if re.match(dimensions_regex, dim_r) == None:
            groups_r = re.match(dimensions_regex_2, dim_r).groups(1)
        else:
            groups_r = re.match(dimensions_regex, dim_r).groups(1)
        if np.sum(np.abs(np.array(groups_l).astype(float) - np.array(groups_r).astype(float))) <= 0.3:
            score += dim_weight
    if not pd.isna(dots_l) and not pd.isna(dots_r) and dots_l == dots_r:
        score += dots_weight
    if not pd.isna(megapixels_l) and not pd.isna(megapixels_r) and abs(float(megapixels_l.replace("mp", "")) - float(megapixels_r.replace("mp", ""))) <= 0.2:
        score += mp_weight
    if not pd.isna(screen_size_l) and not pd.isna(screen_size_r) and abs(float(screen_size_l.replace("in", "")) - float(screen_size_r.replace("in", ""))) <= 0.2:
        score += scr_weight
    if not pd.isna(type_l) and not pd.isna(type_r) and type_l == type_r:
        score += type_weight
    if not pd.isna(weight_l) and not pd.isna(weight_r) and abs(float(weight_l.replace("g", "")) - float(weight_r.replace("g", ""))) <= 0.2:
        score += weight_weight
        
    if isinstance(short_descr_r, list) and isinstance(short_descr_l, list):
        for spec1 in short_descr_l:
            for spec2 in short_descr_r:
                if spec1 == spec2:  
                    score += descr_weight
    if isinstance(page_title_r, list) and isinstance(page_title_l, list):
        for spec1 in page_title_l:
            for spec2 in page_title_r:
                if spec1 == spec2:  
                    score += title_weight
                    
    if score >= target:
        if row["page_title"] == [] or row["page_title_right"] == []:
            matched_no_title += 1
        else:
            matched_title += 1
    return score >= target
    

In [84]:
tqdm.pandas()

  from pandas import Panel


In [85]:
for gname, group in grouped:
    labels = []
    
    if len(group) == 1 or gname == ():
        continue
    #brand_and_unbranded = pd.concat([group, unbranded])
    
    print("CALCULATING FOR BRAND = ", gname)
    merged = get_merged_df(group)

    #logic
    
    print("NUMBER OF COMPARISONS: ", len(merged))
    labels.append(list(merged.progress_apply(determine_match, axis = 1)))
    labels = sum(labels, [])
    merged["label"] = labels
    print("MATCHED ", sum(merged["label"]), " OUT OF ", len(merged["label"]))
    del labels
    merged = merged.loc[merged['label'] == True]
    cols = ["left_spec_id", "right_spec_id"]
    merged = merged[cols]
    if not '/' in gname[0]:
        merged.to_csv("/Users/gfotiadis/programming/sigmod/datasets/created/with_details/{}_matches_labeled.csv".format(gname), index = False)
    else:
        merged.to_csv("/Users/gfotiadis/programming/sigmod/datasets/created/with_details/{}_matches_labeled.csv".format(gname[0].replace("/", "")), index = False)
     

100%|██████████| 1/1 [00:00<00:00, 600.99it/s]
100%|██████████| 1/1 [00:00<00:00, 516.54it/s]
100%|██████████| 55/55 [00:00<00:00, 3193.21it/s]
100%|██████████| 1/1 [00:00<00:00, 531.60it/s]
100%|██████████| 21/21 [00:00<00:00, 3127.85it/s]
100%|██████████| 28/28 [00:00<00:00, 3212.44it/s]
100%|██████████| 1/1 [00:00<00:00, 613.11it/s]
100%|██████████| 1/1 [00:00<00:00, 469.27it/s]

CALCULATING FOR BRAND =  ('advancedplus',)
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('agfa',)
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('aiptek',)
NUMBER OF COMPARISONS:  55
MATCHED  1  OUT OF  55
CALCULATING FOR BRAND =  ('apex',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('apple',)
NUMBER OF COMPARISONS:  21
MATCHED  0  OUT OF  21
CALCULATING FOR BRAND =  ('argus',)
NUMBER OF COMPARISONS:  28
MATCHED  1  OUT OF  28
CALCULATING FOR BRAND =  ('audiovox',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('axess',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('axis',)
NUMBER OF COMPARISONS: 


100%|██████████| 10/10 [00:00<00:00, 1857.04it/s]
100%|██████████| 10/10 [00:00<00:00, 2088.07it/s]
100%|██████████| 1/1 [00:00<00:00, 571.43it/s]
100%|██████████| 171/171 [00:00<00:00, 4202.59it/s]
100%|██████████| 105/105 [00:00<00:00, 4407.55it/s]
100%|██████████| 1/1 [00:00<00:00, 580.37it/s]

 10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('bell',)
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('bell & howell',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('bell & howell', 'bell')
NUMBER OF COMPARISONS:  171
MATCHED  12  OUT OF  171
CALCULATING FOR BRAND =  ('benq',)
NUMBER OF COMPARISONS:  105
MATCHED  9  OUT OF  105
CALCULATING FOR BRAND =  ('big mikes',)
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('blackmagic',)
NUMBER OF COMPARISONS: 


100%|██████████| 15/15 [00:00<00:00, 2697.42it/s]
100%|██████████| 1/1 [00:00<00:00, 661.46it/s]
100%|██████████| 15/15 [00:00<00:00, 2884.67it/s]
100%|██████████| 3/3 [00:00<00:00, 1345.91it/s]
100%|██████████| 1/1 [00:00<00:00, 367.92it/s]
100%|██████████| 28/28 [00:00<00:00, 3337.71it/s]
  0%|          | 0/1596 [00:00<?, ?it/s]

 15
MATCHED  1  OUT OF  15
CALCULATING FOR BRAND =  ('blackmagic', 'none')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('bosch',)
NUMBER OF COMPARISONS:  15
MATCHED  6  OUT OF  15
CALCULATING FOR BRAND =  ('brinno',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('bushnell',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('butterfly',)
NUMBER OF COMPARISONS:  28
MATCHED  21  OUT OF  28
CALCULATING FOR BRAND =  ('butterfly', 'canon')
NUMBER OF COMPARISONS:  1596


100%|██████████| 1596/1596 [00:00<00:00, 4443.40it/s]
100%|██████████| 171/171 [00:00<00:00, 3529.33it/s]
100%|██████████| 3/3 [00:00<00:00, 1687.40it/s]
100%|██████████| 21/21 [00:00<00:00, 3357.75it/s]
100%|██████████| 15/15 [00:00<00:00, 1904.08it/s]
100%|██████████| 10/10 [00:00<00:00, 2416.07it/s]


MATCHED  608  OUT OF  1596
CALCULATING FOR BRAND =  ('butterfly', 'fuji')
NUMBER OF COMPARISONS:  171
MATCHED  171  OUT OF  171
CALCULATING FOR BRAND =  ('butterfly', 'sony')
NUMBER OF COMPARISONS:  3
MATCHED  3  OUT OF  3
CALCULATING FOR BRAND =  ('bvi', 'canon')
NUMBER OF COMPARISONS:  21
MATCHED  15  OUT OF  21
CALCULATING FOR BRAND =  ('bvi', 'fuji')
NUMBER OF COMPARISONS:  15
MATCHED  10  OUT OF  15
CALCULATING FOR BRAND =  ('bvi', 'olympus')
NUMBER OF COMPARISONS:  10
MATCHED  10  OUT OF  10
CALCULATING FOR BRAND =  ('canon',)


  0%|          | 0/14372841 [00:00<?, ?it/s]

NUMBER OF COMPARISONS:  14372841


100%|██████████| 14372841/14372841 [48:52<00:00, 4901.41it/s] 


MATCHED  194551  OUT OF  14372841


100%|██████████| 1/1 [00:00<00:00, 651.09it/s]
100%|██████████| 1/1 [00:00<00:00, 700.69it/s]
100%|██████████| 21/21 [00:00<00:00, 2968.97it/s]
100%|██████████| 3/3 [00:00<00:00, 1275.51it/s]
100%|██████████| 1/1 [00:00<00:00, 552.46it/s]
100%|██████████| 153/153 [00:00<00:00, 3932.76it/s]
  0%|          | 0/45 [00:00<?, ?it/s]

CALCULATING FOR BRAND =  ('canon', 'ebasket')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('canon', 'fuji')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('canon', 'gid')
NUMBER OF COMPARISONS:  21
MATCHED  15  OUT OF  21
CALCULATING FOR BRAND =  ('canon', 'nikon')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('canon', 'samsung')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('canon', 'sigma')
NUMBER OF COMPARISONS:  153
MATCHED  1  OUT OF  153
CALCULATING FOR BRAND =  ('canon', 'tamron')
NUMBER OF COMPARISONS:  45


100%|██████████| 45/45 [00:00<00:00, 3513.99it/s]
100%|██████████| 36/36 [00:00<00:00, 3154.60it/s]
100%|██████████| 171/171 [00:00<00:00, 3666.54it/s]
  0%|          | 0/22791 [00:00<?, ?it/s]

MATCHED  0  OUT OF  45
CALCULATING FOR BRAND =  ('carl', 'sony')
NUMBER OF COMPARISONS:  36
MATCHED  4  OUT OF  36
CALCULATING FOR BRAND =  ('carl', 'sony', 'zeiss')
NUMBER OF COMPARISONS:  171
MATCHED  19  OUT OF  171
CALCULATING FOR BRAND =  ('casio',)
NUMBER OF COMPARISONS:  22791


100%|██████████| 22791/22791 [00:04<00:00, 4755.38it/s]
100%|██████████| 3/3 [00:00<00:00, 1664.85it/s]
100%|██████████| 1/1 [00:00<00:00, 744.99it/s]
100%|██████████| 55/55 [00:00<00:00, 4400.65it/s]
100%|██████████| 6/6 [00:00<00:00, 2083.26it/s]
  0%|          | 0/595 [00:00<?, ?it/s]

MATCHED  247  OUT OF  22791
CALCULATING FOR BRAND =  ('celestron',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('celltime inc',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('cobra',)
NUMBER OF COMPARISONS:  55
MATCHED  1  OUT OF  55
CALCULATING FOR BRAND =  ('coby',)
NUMBER OF COMPARISONS:  6
MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('coleman',)
NUMBER OF COMPARISONS:  595


100%|██████████| 595/595 [00:00<00:00, 4371.01it/s]
100%|██████████| 6/6 [00:00<00:00, 2204.24it/s]
100%|██████████| 1/1 [00:00<00:00, 514.64it/s]
100%|██████████| 1/1 [00:00<00:00, 427.34it/s]
100%|██████████| 1/1 [00:00<00:00, 585.80it/s]
100%|██████████| 1/1 [00:00<00:00, 632.43it/s]
100%|██████████| 21/21 [00:00<00:00, 3255.24it/s]
100%|██████████| 3/3 [00:00<00:00, 1360.02it/s]
100%|██████████| 6/6 [00:00<00:00, 2016.17it/s]
100%|██████████| 1/1 [00:00<00:00, 567.18it/s]
100%|██████████| 3/3 [00:00<00:00, 1440.85it/s]

MATCHED  102  OUT OF  595
CALCULATING FOR BRAND =  ('coleman', 'none')
NUMBER OF COMPARISONS:  6
MATCHED  6  OUT OF  6
CALCULATING FOR BRAND =  ('coleman', 'xtreme')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('contax',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('contour',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('contour, inc', 'contour')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('covert',)
NUMBER OF COMPARISONS:  21
MATCHED  0  OUT OF  21
CALCULATING FOR BRAND =  ('crayola',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('crayola', 'sakar')
NUMBER OF COMPARISONS:  6
MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('creative',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('digiflip',)
NUMBER OF COMPARISONS:  3
MATCHED 


100%|██████████| 1/1 [00:00<00:00, 520.58it/s]
100%|██████████| 210/210 [00:00<00:00, 4641.89it/s]
100%|██████████| 1/1 [00:00<00:00, 727.80it/s]
100%|██████████| 1/1 [00:00<00:00, 682.56it/s]
100%|██████████| 10/10 [00:00<00:00, 2701.99it/s]
100%|██████████| 1/1 [00:00<00:00, 636.56it/s]
100%|██████████| 66/66 [00:00<00:00, 4071.60it/s]

 0  OUT OF  3
CALCULATING FOR BRAND =  ('discovery kids', 'discovery')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('disney',)
NUMBER OF COMPARISONS:  210
MATCHED  15  OUT OF  210
CALCULATING FOR BRAND =  ('disney', 'none')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('disney', 'sakar')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('dji',)
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('drift',)
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('dxg',)
NUMBER OF COMPARISONS:  66
MATCHED  5  OUT OF  66



100%|██████████| 1/1 [00:00<00:00, 468.85it/s]
100%|██████████| 36/36 [00:00<00:00, 3342.30it/s]
100%|██████████| 6/6 [00:00<00:00, 2054.19it/s]
100%|██████████| 1/1 [00:00<00:00, 588.92it/s]
100%|██████████| 45/45 [00:00<00:00, 3808.08it/s]
100%|██████████| 1/1 [00:00<00:00, 498.79it/s]
100%|██████████| 6/6 [00:00<00:00, 1576.90it/s]
100%|██████████| 6/6 [00:00<00:00, 1604.66it/s]
100%|██████████| 3/3 [00:00<00:00, 1254.65it/s]

CALCULATING FOR BRAND =  ('dxg', 'none')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('easypix',)
NUMBER OF COMPARISONS:  36
MATCHED  4  OUT OF  36
CALCULATING FOR BRAND =  ('ebasket', 'samsung')
NUMBER OF COMPARISONS:  6
MATCHED  6  OUT OF  6
CALCULATING FOR BRAND =  ('element',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('emerson',)
NUMBER OF COMPARISONS:  45
MATCHED  2  OUT OF  45
CALCULATING FOR BRAND =  ('emerson', 'none')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('epson',)
NUMBER OF COMPARISONS:  6
MATCHED  1  OUT OF  6
CALCULATING FOR BRAND =  ('fisher price',)
NUMBER OF COMPARISONS:  6
MATCHED  2  OUT OF  6
CALCULATING FOR BRAND =  ('flip',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3



100%|██████████| 3/3 [00:00<00:00, 1083.43it/s]
100%|██████████| 1/1 [00:00<00:00, 515.65it/s]
100%|██████████| 1/1 [00:00<00:00, 404.23it/s]
100%|██████████| 3/3 [00:00<00:00, 1020.84it/s]
100%|██████████| 21/21 [00:00<00:00, 2810.21it/s]


CALCULATING FOR BRAND =  ('flip', 'flip video')
NUMBER OF COMPARISONS:  3
MATCHED  1  OUT OF  3
CALCULATING FOR BRAND =  ('flip', 'nikon')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('flip', 'vivitar')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('foscam',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('fotopix',)
NUMBER OF COMPARISONS:  21
MATCHED  9  OUT OF  21
CALCULATING FOR BRAND =  ('fuji',)


  0%|          | 0/1078246 [00:00<?, ?it/s]

NUMBER OF COMPARISONS:  1078246


100%|██████████| 1078246/1078246 [03:42<00:00, 4839.94it/s]
100%|██████████| 1/1 [00:00<00:00, 689.51it/s]
100%|██████████| 1/1 [00:00<00:00, 654.03it/s]
100%|██████████| 3/3 [00:00<00:00, 1332.09it/s]
  0%|          | 0/276 [00:00<?, ?it/s]

MATCHED  9103  OUT OF  1078246
CALCULATING FOR BRAND =  ('fuji', 'nikon')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('fuji', 'olympus')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('fuji', 'polaroid')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('fvanor',)
NUMBER OF COMPARISONS:  276


100%|██████████| 276/276 [00:00<00:00, 3549.79it/s]
100%|██████████| 15/15 [00:00<00:00, 3286.56it/s]
100%|██████████| 1/1 [00:00<00:00, 650.58it/s]
 11%|█▏        | 455/4005 [00:00<00:00, 4542.07it/s]

MATCHED  276  OUT OF  276
CALCULATING FOR BRAND =  ('garmin',)
NUMBER OF COMPARISONS:  15
MATCHED  3  OUT OF  15
CALCULATING FOR BRAND =  ('gateway',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('ge',)
NUMBER OF COMPARISONS:  4005


100%|██████████| 4005/4005 [00:00<00:00, 4787.52it/s]
100%|██████████| 1/1 [00:00<00:00, 462.54it/s]
100%|██████████| 6/6 [00:00<00:00, 1902.90it/s]
  0%|          | 0/36046 [00:00<?, ?it/s]

MATCHED  412  OUT OF  4005
CALCULATING FOR BRAND =  ('general electric',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('global point',)
NUMBER OF COMPARISONS:  6
MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('gopro',)
NUMBER OF COMPARISONS:  36046


100%|██████████| 36046/36046 [00:06<00:00, 5294.16it/s]
100%|██████████| 6/6 [00:00<00:00, 2654.90it/s]
100%|██████████| 3/3 [00:00<00:00, 1629.28it/s]
100%|██████████| 190/190 [00:00<00:00, 4816.26it/s]
100%|██████████| 1/1 [00:00<00:00, 680.23it/s]
100%|██████████| 3/3 [00:00<00:00, 1517.48it/s]
100%|██████████| 1/1 [00:00<00:00, 630.82it/s]
100%|██████████| 21/21 [00:00<00:00, 3215.32it/s]
  0%|          | 0/3240 [00:00<?, ?it/s]

MATCHED  861  OUT OF  36046
CALCULATING FOR BRAND =  ('gopro', 'none')
NUMBER OF COMPARISONS:  6
MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('gopro', 'olympus')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('hasselblad',)
NUMBER OF COMPARISONS:  190
MATCHED  29  OUT OF  190
CALCULATING FOR BRAND =  ('hello',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('hello', 'hello kitty')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('hello', 'sakar')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('hoya',)
NUMBER OF COMPARISONS:  21
MATCHED  0  OUT OF  21
CALCULATING FOR BRAND =  ('hp',)
NUMBER OF COMPARISONS:  3240


100%|██████████| 3240/3240 [00:00<00:00, 4917.64it/s]
100%|██████████| 6/6 [00:00<00:00, 1661.11it/s]
100%|██████████| 1/1 [00:00<00:00, 739.61it/s]
100%|██████████| 1/1 [00:00<00:00, 513.06it/s]
100%|██████████| 1/1 [00:00<00:00, 598.84it/s]
100%|██████████| 28/28 [00:00<00:00, 3668.30it/s]
100%|██████████| 36/36 [00:00<00:00, 3996.48it/s]
100%|██████████| 3/3 [00:00<00:00, 1399.81it/s]
100%|██████████| 1/1 [00:00<00:00, 674.76it/s]
100%|██████████| 3/3 [00:00<00:00, 1532.07it/s]
100%|██████████| 6/6 [00:00<00:00, 1856.43it/s]

MATCHED  91  OUT OF  3240
CALCULATING FOR BRAND =  ('htc',)
NUMBER OF COMPARISONS:  6
MATCHED  3  OUT OF  6
CALCULATING FOR BRAND =  ('humminbird',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('iclick',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('innovage',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('insignia',)
NUMBER OF COMPARISONS:  28
MATCHED  0  OUT OF  28
CALCULATING FOR BRAND =  ('intova',)
NUMBER OF COMPARISONS:  36
MATCHED  2  OUT OF  36
CALCULATING FOR BRAND =  ('ion',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('ion', 'leica')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('iris',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('jazz',)
NUMBER OF COMPARISONS:  6



100%|██████████| 21/21 [00:00<00:00, 2465.44it/s]
100%|██████████| 1/1 [00:00<00:00, 657.41it/s]
100%|██████████| 3/3 [00:00<00:00, 1365.63it/s]
100%|██████████| 1/1 [00:00<00:00, 486.35it/s]


MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('jvc',)
NUMBER OF COMPARISONS:  21
MATCHED  1  OUT OF  21
CALCULATING FOR BRAND =  ('jvc', 'none')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('keedox',)
NUMBER OF COMPARISONS:  3
MATCHED  1  OUT OF  3
CALCULATING FOR BRAND =  ('kenko',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('kodak',)


  0%|          | 31/283881 [00:00<15:18, 309.20it/s]

NUMBER OF COMPARISONS:  283881


100%|██████████| 283881/283881 [00:57<00:00, 4929.53it/s]
100%|██████████| 1/1 [00:00<00:00, 517.56it/s]
100%|██████████| 3/3 [00:00<00:00, 1477.04it/s]
100%|██████████| 3/3 [00:00<00:00, 1306.64it/s]
100%|██████████| 1/1 [00:00<00:00, 522.72it/s]
  0%|          | 0/13366 [00:00<?, ?it/s]

MATCHED  2300  OUT OF  283881
CALCULATING FOR BRAND =  ('kodak', 'hp')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('kodak', 'none')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('konica minolta', 'konica')
NUMBER OF COMPARISONS:  3
MATCHED  1  OUT OF  3
CALCULATING FOR BRAND =  ('lego',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('leica',)
NUMBER OF COMPARISONS:  13366


100%|██████████| 13366/13366 [00:02<00:00, 5023.38it/s]
100%|██████████| 21/21 [00:00<00:00, 3917.30it/s]
100%|██████████| 231/231 [00:00<00:00, 4624.52it/s]
100%|██████████| 1/1 [00:00<00:00, 550.43it/s]
100%|██████████| 190/190 [00:00<00:00, 3946.33it/s]
100%|██████████| 3/3 [00:00<00:00, 1068.43it/s]


MATCHED  576  OUT OF  13366
CALCULATING FOR BRAND =  ('leica', 'none')
NUMBER OF COMPARISONS:  21
MATCHED  1  OUT OF  21
CALCULATING FOR BRAND =  ('leica', 'panasonic')
NUMBER OF COMPARISONS:  231
MATCHED  23  OUT OF  231
CALCULATING FOR BRAND =  ('leica', 'ricoh')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('lg',)
NUMBER OF COMPARISONS:  190
MATCHED  23  OUT OF  190
CALCULATING FOR BRAND =  ('limited too',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3


100%|██████████| 1/1 [00:00<00:00, 616.90it/s]
100%|██████████| 120/120 [00:00<00:00, 3905.22it/s]
100%|██████████| 21/21 [00:00<00:00, 3583.56it/s]
100%|██████████| 190/190 [00:00<00:00, 3965.95it/s]
100%|██████████| 10/10 [00:00<00:00, 2493.79it/s]
100%|██████████| 1/1 [00:00<00:00, 521.94it/s]

CALCULATING FOR BRAND =  ('little acorn', 'ltl', 'acorn')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('lowepro',)
NUMBER OF COMPARISONS:  120
MATCHED  0  OUT OF  120
CALCULATING FOR BRAND =  ('lowrance',)
NUMBER OF COMPARISONS:  21
MATCHED  0  OUT OF  21
CALCULATING FOR BRAND =  ('lytro',)
NUMBER OF COMPARISONS:  190
MATCHED  10  OUT OF  190
CALCULATING FOR BRAND =  ('magnavox',)
NUMBER OF COMPARISONS:  10
MATCHED  1  OUT OF  10
CALCULATING FOR BRAND =  ('mamiya',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1



100%|██████████| 6/6 [00:00<00:00, 1869.12it/s]
100%|██████████| 1/1 [00:00<00:00, 442.48it/s]
100%|██████████| 1/1 [00:00<00:00, 289.70it/s]
100%|██████████| 276/276 [00:00<00:00, 4182.77it/s]
  0%|          | 0/903 [00:00<?, ?it/s]

CALCULATING FOR BRAND =  ('mattel',)
NUMBER OF COMPARISONS:  6
MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('midland',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('mikona',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('minolta',)
NUMBER OF COMPARISONS:  276
MATCHED  16  OUT OF  276
CALCULATING FOR BRAND =  ('minolta', 'konica minolta')
NUMBER OF COMPARISONS:  903


100%|██████████| 903/903 [00:00<00:00, 4386.13it/s]
100%|██████████| 45/45 [00:00<00:00, 4105.44it/s]
100%|██████████| 1/1 [00:00<00:00, 621.56it/s]
100%|██████████| 10/10 [00:00<00:00, 2758.68it/s]
100%|██████████| 3/3 [00:00<00:00, 1386.24it/s]
100%|██████████| 1/1 [00:00<00:00, 667.14it/s]
100%|██████████| 1/1 [00:00<00:00, 667.14it/s]
100%|██████████| 6/6 [00:00<00:00, 2165.36it/s]
100%|██████████| 15/15 [00:00<00:00, 2812.70it/s]
100%|██████████| 36/36 [00:00<00:00, 3964.27it/s]


MATCHED  170  OUT OF  903
CALCULATING FOR BRAND =  ('minolta', 'konica minolta', 'konica')
NUMBER OF COMPARISONS:  45
MATCHED  4  OUT OF  45
CALCULATING FOR BRAND =  ('minolta', 'sony')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('minox',)
NUMBER OF COMPARISONS:  10
MATCHED  6  OUT OF  10
CALCULATING FOR BRAND =  ('minox', 'leica')
NUMBER OF COMPARISONS:  3
MATCHED  1  OUT OF  3
CALCULATING FOR BRAND =  ('mitsuba',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('mitsubishi',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('motorola',)
NUMBER OF COMPARISONS:  6
MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('mustek',)
NUMBER OF COMPARISONS:  15
MATCHED  1  OUT OF  15
CALCULATING FOR BRAND =  ('naxa',)
NUMBER OF COMPARISONS:  36
MATCHED  0  OUT OF  36


100%|██████████| 21/21 [00:00<00:00, 3223.32it/s]
  0%|          | 0/67896 [00:00<?, ?it/s]

CALCULATING FOR BRAND =  ('nest',)
NUMBER OF COMPARISONS:  21
MATCHED  0  OUT OF  21
CALCULATING FOR BRAND =  ('night',)
NUMBER OF COMPARISONS:  67896


100%|██████████| 67896/67896 [00:13<00:00, 5072.35it/s]
100%|██████████| 1/1 [00:00<00:00, 645.18it/s]


MATCHED  632  OUT OF  67896
CALCULATING FOR BRAND =  ('night', 'chobi')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('nikon',)


  0%|          | 0/10897446 [00:00<?, ?it/s]

NUMBER OF COMPARISONS:  10897446


100%|██████████| 10897446/10897446 [36:53<00:00, 4922.88it/s]


MATCHED  122043  OUT OF  10897446


100%|██████████| 3/3 [00:00<00:00, 343.22it/s]
100%|██████████| 1/1 [00:00<00:00, 652.30it/s]
100%|██████████| 55/55 [00:00<00:00, 3495.84it/s]
100%|██████████| 66/66 [00:00<00:00, 3438.72it/s]
100%|██████████| 1/1 [00:00<00:00, 552.68it/s]
  0%|          | 0/4851 [00:00<?, ?it/s]

CALCULATING FOR BRAND =  ('nikon', 'lowepro')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('nikon', 'olympus')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('nikon', 'sigma')
NUMBER OF COMPARISONS:  55
MATCHED  2  OUT OF  55
CALCULATING FOR BRAND =  ('nikon', 'tamron')
NUMBER OF COMPARISONS:  66
MATCHED  0  OUT OF  66
CALCULATING FOR BRAND =  ('nokia',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('none',)
NUMBER OF COMPARISONS:  4851


100%|██████████| 4851/4851 [00:00<00:00, 4871.47it/s]
100%|██████████| 528/528 [00:00<00:00, 4310.10it/s]
100%|██████████| 10/10 [00:00<00:00, 2321.27it/s]
100%|██████████| 28/28 [00:00<00:00, 3476.01it/s]


MATCHED  19  OUT OF  4851
CALCULATING FOR BRAND =  ('none', 'canon')
NUMBER OF COMPARISONS:  528
MATCHED  1  OUT OF  528
CALCULATING FOR BRAND =  ('none', 'casio')
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('none', 'fuji')
NUMBER OF COMPARISONS:  28
MATCHED  0  OUT OF  28
CALCULATING FOR BRAND =  ('none', 'intova')


100%|██████████| 3/3 [00:00<00:00, 958.11it/s]
100%|██████████| 231/231 [00:00<00:00, 3895.58it/s]
100%|██████████| 10/10 [00:00<00:00, 1120.60it/s]
  0%|          | 0/171 [00:00<?, ?it/s]

NUMBER OF COMPARISONS:  3
MATCHED  3  OUT OF  3
CALCULATING FOR BRAND =  ('none', 'nikon')
NUMBER OF COMPARISONS:  231
MATCHED  4  OUT OF  231
CALCULATING FOR BRAND =  ('none', 'olympus')
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('none', 'samsung')
NUMBER OF COMPARISONS:  171


100%|██████████| 171/171 [00:00<00:00, 3262.73it/s]


MATCHED  3  OUT OF  171
CALCULATING FOR BRAND =  ('olympus',)


  0%|          | 0/1004653 [00:00<?, ?it/s]

NUMBER OF COMPARISONS:  1004653


100%|██████████| 1004653/1004653 [03:19<00:00, 5042.55it/s]
100%|██████████| 1/1 [00:00<00:00, 723.16it/s]
100%|██████████| 1/1 [00:00<00:00, 630.15it/s]
100%|██████████| 10/10 [00:00<00:00, 1745.23it/s]
100%|██████████| 1/1 [00:00<00:00, 443.47it/s]

MATCHED  23653  OUT OF  1004653
CALCULATING FOR BRAND =  ('olympus', 'sigma')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('oregon scientific',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('oregon scientific', 'oregon')
NUMBER OF COMPARISONS:  10
MATCHED  1  OUT OF  10
CALCULATING FOR BRAND =  ('packard',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('packard', 'hp')
NUMBER OF COMPARISONS: 


100%|██████████| 3/3 [00:00<00:00, 1179.72it/s]


 3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('panasonic',)


  0%|          | 0/796953 [00:00<?, ?it/s]

NUMBER OF COMPARISONS:  796953


100%|██████████| 796953/796953 [02:42<00:00, 4894.16it/s]
100%|██████████| 66/66 [00:00<00:00, 4064.61it/s]
100%|██████████| 1/1 [00:00<00:00, 695.46it/s]
100%|██████████| 36/36 [00:00<00:00, 3704.49it/s]
100%|██████████| 1/1 [00:00<00:00, 613.38it/s]
100%|██████████| 3/3 [00:00<00:00, 1161.00it/s]
100%|██████████| 3/3 [00:00<00:00, 1029.19it/s]

MATCHED  6375  OUT OF  796953
CALCULATING FOR BRAND =  ('panasonic', 'butterfly')
NUMBER OF COMPARISONS:  66
MATCHED  66  OUT OF  66
CALCULATING FOR BRAND =  ('panasonic', 'canon', 'nikon')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('panasonic', 'none')
NUMBER OF COMPARISONS:  36
MATCHED  1  OUT OF  36
CALCULATING FOR BRAND =  ('panasonic', 'olympus')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('panasonic', 'sharp')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('pelco',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3





CALCULATING FOR BRAND =  ('pentax',)


  0%|          | 248/172578 [00:00<01:09, 2475.87it/s]

NUMBER OF COMPARISONS:  172578


100%|██████████| 172578/172578 [00:34<00:00, 5005.73it/s]
100%|██████████| 153/153 [00:00<00:00, 3745.61it/s]
100%|██████████| 1/1 [00:00<00:00, 489.25it/s]
100%|██████████| 10/10 [00:00<00:00, 2593.56it/s]
100%|██████████| 3/3 [00:00<00:00, 1426.47it/s]
100%|██████████| 10/10 [00:00<00:00, 2211.37it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

MATCHED  4076  OUT OF  172578
CALCULATING FOR BRAND =  ('pentax', 'bvi')
NUMBER OF COMPARISONS:  153
MATCHED  153  OUT OF  153
CALCULATING FOR BRAND =  ('pentax', 'nikon')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('pentax', 'ricoh')
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('pentax', 'sigma')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('pentax', 'tamron')
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('philips',)
NUMBER OF COMPARISONS:  153


100%|██████████| 153/153 [00:00<00:00, 4061.80it/s]
100%|██████████| 3/3 [00:00<00:00, 1264.11it/s]
  7%|▋         | 456/6555 [00:00<00:01, 4556.41it/s]

MATCHED  0  OUT OF  153
CALCULATING FOR BRAND =  ('pioneer',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('polaroid',)
NUMBER OF COMPARISONS:  6555


100%|██████████| 6555/6555 [00:01<00:00, 4975.96it/s]
100%|██████████| 1/1 [00:00<00:00, 632.15it/s]
100%|██████████| 1/1 [00:00<00:00, 560.21it/s]
100%|██████████| 3/3 [00:00<00:00, 1414.92it/s]
100%|██████████| 1/1 [00:00<00:00, 617.63it/s]
100%|██████████| 1/1 [00:00<00:00, 518.46it/s]
100%|██████████| 28/28 [00:00<00:00, 3537.58it/s]
100%|██████████| 1/1 [00:00<00:00, 542.60it/s]
  0%|          | 0/4371 [00:00<?, ?it/s]

MATCHED  166  OUT OF  6555
CALCULATING FOR BRAND =  ('praktica',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('precision',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('premier',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('proscan',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('quantumfx',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('rca',)
NUMBER OF COMPARISONS:  28
MATCHED  0  OUT OF  28
CALCULATING FOR BRAND =  ('rca', 'polaroid')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('ricoh',)
NUMBER OF COMPARISONS:  4371


100%|██████████| 4371/4371 [00:00<00:00, 4982.93it/s]
100%|██████████| 1/1 [00:00<00:00, 611.59it/s]
100%|██████████| 153/153 [00:00<00:00, 4965.21it/s]


MATCHED  523  OUT OF  4371
CALCULATING FOR BRAND =  ('rollei',)
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('sakar',)
NUMBER OF COMPARISONS:  153
MATCHED  2  OUT OF  153
CALCULATING FOR BRAND =  ('samsung',)


  0%|          | 0/540280 [00:00<?, ?it/s]

NUMBER OF COMPARISONS:  540280


100%|██████████| 540280/540280 [01:49<00:00, 4942.83it/s]
100%|██████████| 91/91 [00:00<00:00, 4774.42it/s]
100%|██████████| 1/1 [00:00<00:00, 413.44it/s]
100%|██████████| 1/1 [00:00<00:00, 507.91it/s]
  0%|          | 0/990 [00:00<?, ?it/s]

MATCHED  5563  OUT OF  540280
CALCULATING FOR BRAND =  ('sandisk',)
NUMBER OF COMPARISONS:  91
MATCHED  1  OUT OF  91
CALCULATING FOR BRAND =  ('sandisk', 'olympus')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sandisk', 'sony')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sanyo',)
NUMBER OF COMPARISONS:  990


100%|██████████| 990/990 [00:00<00:00, 4636.56it/s]
100%|██████████| 1/1 [00:00<00:00, 800.59it/s]
100%|██████████| 66/66 [00:00<00:00, 4468.51it/s]
100%|██████████| 1/1 [00:00<00:00, 666.71it/s]
100%|██████████| 45/45 [00:00<00:00, 3654.71it/s]
100%|██████████| 10/10 [00:00<00:00, 2724.81it/s]
100%|██████████| 1/1 [00:00<00:00, 659.59it/s]
100%|██████████| 1/1 [00:00<00:00, 414.05it/s]
100%|██████████| 1/1 [00:00<00:00, 651.90it/s]
100%|██████████| 3/3 [00:00<00:00, 1488.75it/s]


MATCHED  27  OUT OF  990
CALCULATING FOR BRAND =  ('sciwin',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sealife',)
NUMBER OF COMPARISONS:  66
MATCHED  14  OUT OF  66
CALCULATING FOR BRAND =  ('sealife', 'sealife reefmaster')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('sharp',)
NUMBER OF COMPARISONS:  45
MATCHED  4  OUT OF  45
CALCULATING FOR BRAND =  ('sharp', 'canon')
NUMBER OF COMPARISONS:  10
MATCHED  6  OUT OF  10
CALCULATING FOR BRAND =  ('sharp', 'olympus')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sharp', 'sharper image')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sharper', 'sharper image')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sharper', 'the sharper image')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3


100%|██████████| 3/3 [00:00<00:00, 1467.05it/s]
100%|██████████| 703/703 [00:00<00:00, 4702.63it/s]
100%|██████████| 1/1 [00:00<00:00, 696.27it/s]

CALCULATING FOR BRAND =  ('shimano',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('sigma',)
NUMBER OF COMPARISONS:  703
MATCHED  51  OUT OF  703
CALCULATING FOR BRAND =  ('sipix',)
NUMBER OF COMPARISONS:  1
MATCHED 


100%|██████████| 3/3 [00:00<00:00, 1342.32it/s]
100%|██████████| 1/1 [00:00<00:00, 557.68it/s]
100%|██████████| 1/1 [00:00<00:00, 513.32it/s]


 0  OUT OF  1
CALCULATING FOR BRAND =  ('sjcam',)
NUMBER OF COMPARISONS:  3
MATCHED  1  OUT OF  3
CALCULATING FOR BRAND =  ('skyworth',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('slick',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sony',)


  0%|          | 0/4915680 [00:00<?, ?it/s]

NUMBER OF COMPARISONS:  4915680


100%|██████████| 4915680/4915680 [16:35<00:00, 4937.33it/s]


MATCHED  27698  OUT OF  4915680


100%|██████████| 15/15 [00:00<00:00, 3020.38it/s]
100%|██████████| 3/3 [00:00<00:00, 1372.93it/s]
100%|██████████| 1/1 [00:00<00:00, 546.99it/s]
100%|██████████| 1/1 [00:00<00:00, 382.27it/s]
100%|██████████| 3/3 [00:00<00:00, 1283.32it/s]
100%|██████████| 1/1 [00:00<00:00, 512.81it/s]
  0%|          | 0/406 [00:00<?, ?it/s]

CALCULATING FOR BRAND =  ('sony', 'canon')
NUMBER OF COMPARISONS:  15
MATCHED  0  OUT OF  15
CALCULATING FOR BRAND =  ('sony', 'ebasket')
NUMBER OF COMPARISONS:  3
MATCHED  3  OUT OF  3
CALCULATING FOR BRAND =  ('sony', 'fuji')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sony', 'hp')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('sony', 'lowepro')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('sony', 'night')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('sony', 'nikon')
NUMBER OF COMPARISONS:  406


100%|██████████| 406/406 [00:00<00:00, 4412.37it/s]
100%|██████████| 630/630 [00:00<00:00, 4609.41it/s]
100%|██████████| 3/3 [00:00<00:00, 1451.32it/s]
100%|██████████| 45/45 [00:00<00:00, 4055.52it/s]

MATCHED  1  OUT OF  406
CALCULATING FOR BRAND =  ('sony', 'none')
NUMBER OF COMPARISONS:  630
MATCHED  33  OUT OF  630
CALCULATING FOR BRAND =  ('sony', 'olympus')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('sony', 'sigma')
NUMBER OF COMPARISONS:  45



100%|██████████| 10/10 [00:00<00:00, 2387.33it/s]
100%|██████████| 21/21 [00:00<00:00, 3141.46it/s]
100%|██████████| 1/1 [00:00<00:00, 695.92it/s]
100%|██████████| 15/15 [00:00<00:00, 3213.53it/s]
100%|██████████| 3/3 [00:00<00:00, 1180.61it/s]
  0%|          | 0/1275 [00:00<?, ?it/s]

MATCHED  2  OUT OF  45
CALCULATING FOR BRAND =  ('sony', 'tamron')
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('sony', 'zeiss')
NUMBER OF COMPARISONS:  21
MATCHED  4  OUT OF  21
CALCULATING FOR BRAND =  ('spongebob',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('superheadz',)
NUMBER OF COMPARISONS:  15
MATCHED  0  OUT OF  15
CALCULATING FOR BRAND =  ('supersonic',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('svp',)
NUMBER OF COMPARISONS:  1275


100%|██████████| 1275/1275 [00:00<00:00, 4958.06it/s]
100%|██████████| 6/6 [00:00<00:00, 1879.59it/s]
100%|██████████| 1/1 [00:00<00:00, 652.81it/s]
100%|██████████| 1/1 [00:00<00:00, 515.90it/s]
100%|██████████| 66/66 [00:00<00:00, 3862.59it/s]
100%|██████████| 10/10 [00:00<00:00, 3341.01it/s]
100%|██████████| 3/3 [00:00<00:00, 1660.45it/s]
100%|██████████| 10/10 [00:00<00:00, 2184.42it/s]
100%|██████████| 1/1 [00:00<00:00, 623.41it/s]
100%|██████████| 136/136 [00:00<00:00, 4669.23it/s]

MATCHED  1  OUT OF  1275
CALCULATING FOR BRAND =  ('sylvania',)
NUMBER OF COMPARISONS:  6
MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('tamrac', 'minolta')
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('tamron',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('tamron', 'nikon')
NUMBER OF COMPARISONS:  66
MATCHED  8  OUT OF  66
CALCULATING FOR BRAND =  ('toi kamera',)
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('tokina', 'canon')
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('tokina', 'nikon')
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('topixo',)
NUMBER OF COMPARISONS:  1
MATCHED  1  OUT OF  1
CALCULATING FOR BRAND =  ('toshiba',)
NUMBER OF COMPARISONS:  136



100%|██████████| 1/1 [00:00<00:00, 727.80it/s]
100%|██████████| 1/1 [00:00<00:00, 582.06it/s]
100%|██████████| 6/6 [00:00<00:00, 2365.88it/s]
100%|██████████| 55/55 [00:00<00:00, 3351.98it/s]
  0%|          | 0/17391 [00:00<?, ?it/s]

MATCHED  1  OUT OF  136
CALCULATING FOR BRAND =  ('traveler',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('tvc',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('vibe',)
NUMBER OF COMPARISONS:  6
MATCHED  0  OUT OF  6
CALCULATING FOR BRAND =  ('vistaquest',)
NUMBER OF COMPARISONS:  55
MATCHED  3  OUT OF  55
CALCULATING FOR BRAND =  ('vivitar',)
NUMBER OF COMPARISONS:  17391


100%|██████████| 17391/17391 [00:03<00:00, 5106.22it/s]
100%|██████████| 1/1 [00:00<00:00, 548.78it/s]
100%|██████████| 1/1 [00:00<00:00, 713.20it/s]
100%|██████████| 10/10 [00:00<00:00, 2905.45it/s]
100%|██████████| 496/496 [00:00<00:00, 4561.47it/s]
  0%|          | 0/45 [00:00<?, ?it/s]

MATCHED  385  OUT OF  17391
CALCULATING FOR BRAND =  ('vivitar', 'canon')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('vivitar', 'nikon')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('vivitar', 'sakar')
NUMBER OF COMPARISONS:  10
MATCHED  0  OUT OF  10
CALCULATING FOR BRAND =  ('vizio',)
NUMBER OF COMPARISONS:  496
MATCHED  78  OUT OF  496
CALCULATING FOR BRAND =  ('vtech',)
NUMBER OF COMPARISONS:  45


100%|██████████| 45/45 [00:00<00:00, 3953.99it/s]
100%|██████████| 1/1 [00:00<00:00, 394.76it/s]
100%|██████████| 3/3 [00:00<00:00, 1308.27it/s]
100%|██████████| 1/1 [00:00<00:00, 654.54it/s]
100%|██████████| 1/1 [00:00<00:00, 653.73it/s]
100%|██████████| 3/3 [00:00<00:00, 1441.67it/s]
100%|██████████| 36/36 [00:00<00:00, 3507.68it/s]
100%|██████████| 15/15 [00:00<00:00, 2831.82it/s]
100%|██████████| 1/1 [00:00<00:00, 487.03it/s]


MATCHED  0  OUT OF  45
CALCULATING FOR BRAND =  ('vupoint', 'vupoint solutions')
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('wespro',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('westinghouse',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('wingscapes',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1
CALCULATING FOR BRAND =  ('yashica',)
NUMBER OF COMPARISONS:  3
MATCHED  0  OUT OF  3
CALCULATING FOR BRAND =  ('yourdeal',)
NUMBER OF COMPARISONS:  36
MATCHED  14  OUT OF  36
CALCULATING FOR BRAND =  ('zeemo', 'canon')
NUMBER OF COMPARISONS:  15
MATCHED  11  OUT OF  15
CALCULATING FOR BRAND =  ('zenith',)
NUMBER OF COMPARISONS:  1
MATCHED  0  OUT OF  1


In [86]:
# count_title
# count_no_title
# matched_title
# matched_no_title

In [87]:
matched_no_title / count_no_title

0.02399022998450135

In [88]:
matched_title / count_title

0.009428436272976754

In [89]:
matched_no_title

129218

In [90]:
matched_title

272203

In [91]:
count_title

28870429

In [92]:
count_no_title

5386276

In [93]:
labels_unbranded = []
merged_unbranded = get_merged_df(unbranded_til_100)
labels_unbranded.append(list(merged_unbranded.progress_apply(determine_match, axis = 1)))
labels_unbranded = sum(labels_unbranded, [])
merged_unbranded["label"] = labels_unbranded
del labels_unbranded
merged_unbranded = merged_unbranded.loc[merged_unbranded['label'] == True]
cols = ["left_spec_id", "right_spec_id"]
merged_unbranded = merged_unbranded[cols]
merged_unbranded.to_csv("/Users/gfotiadis/programming/sigmod/datasets/created/with_details/unbranded_matches_labeled.csv", index = False)

100%|██████████| 38443296/38443296 [2:07:38<00:00, 5019.94it/s]  


In [94]:
len(merged_unbranded)

169982

In [95]:
import os
import glob

os.chdir("/Users/gfotiadis/programming/sigmod/datasets/created/with_details/")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f, header = 0) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "/Users/gfotiadis/programming/sigmod/datasets/created/with_details/combined_csv.csv", index=False, encoding='utf-8-sig')

In [96]:
len(combined_csv)

574100

In [None]:
# old 15100

In [None]:
# target 600,212