In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')

    progressive_id = 0
    progressive_id2row_df = {}
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            specification_data = json.load(specification_file)
            weight = specification_data.get("weight")
            title = specification_data.get("<page title>")
            row = (specification_id, title, weight)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.alibaba.com",["spec_id", "<page title>", "weight"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df = df.rename(columns = {"<page title>" : "page_title"})

### Weight

In [5]:
df[df['weight'].notnull()]

Unnamed: 0,spec_id,page_title,weight
2,www.alibaba.com//23684,Slr Portable Camera Inner Partition Padded Pro...,0.19KG
3,www.alibaba.com//24141,Cheap Outdoor Ip Camera Ds-2cd2632f-is Hikvisi...,1200g
4,www.alibaba.com//22996,Hikvision Ds-2cd2532f-is Ir 3mp Mini Dome Hd 1...,600 g (1.32 lbs)
5,www.alibaba.com//24511,2colors Fashion Abs Plastic Dslr Slr Hard Comp...,0.09Kg
6,www.alibaba.com//35300,Hikvision Ir Cctv Camera 3mp Bullet Ip Camera ...,1200g
...,...,...,...
7960,www.alibaba.com//34210,Mini Camera Ds-2cd2532f-i 3 Megapixel Dome Hik...,600 g (1.32 lbs)
7962,www.alibaba.com//29588,Ds-2cd2012-i Network Video Server Hikvision Ip...,500g (1.1 lbs)
7968,www.alibaba.com//35051,Genuine For Gopro Hero 3 3+plus Underwater Wat...,0.1KG/pc
7969,www.alibaba.com//24210,Ds-2cd3332-i Ip Network Camera Hikvision Weath...,670g (1.5 lbs)


In [6]:
def parse_weight(value):
    if (isinstance(value, list)):
        return float("NaN")
        

    if pd.isna(value):
        return value
    else:
        value = value.lower()
       
        match1 = re.search('(approx.|)(\d*\,\d+|\d*\.\d+|\d+)( g|g| kg|kg|lbs| lbs)', str(value))
        if match1 is None: 
            return float("NaN")


        to_convert= match1.group(2).replace(" ","").replace(",",".")
        metric= match1.group(3).replace(" ","")
        converted=float("NaN")
        if metric == "oz":
            converted= int(round(float(to_convert) * 28.35))
        elif metric == "lbs":
            converted= int(round(float(to_convert) * 454))
        elif metric == "kg":
            converted= int(round(float(to_convert) * 1000))
        else:
            converted= int(round(float(to_convert)))
        return converted
        

In [7]:
df["weight"] = df["weight"].apply(parse_weight)

## Title

In [8]:
df.head()

Unnamed: 0,spec_id,page_title,weight
0,www.alibaba.com//37297,New Arrival Metal Case 1.3mp Vandalproof Ip Ca...,
1,www.alibaba.com//29289,New Design In 2014 Camera Case For Iphone - Bu...,
2,www.alibaba.com//23684,Slr Portable Camera Inner Partition Padded Pro...,190.0
3,www.alibaba.com//24141,Cheap Outdoor Ip Camera Ds-2cd2632f-is Hikvisi...,1200.0
4,www.alibaba.com//22996,Hikvision Ds-2cd2532f-is Ir 3mp Mini Dome Hd 1...,600.0


In [9]:
stopWords = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])

In [10]:
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"

In [11]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [12]:
df["page_title"] = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

## Modelwords

In [13]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [14]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']

In [44]:
def clean_mp_mm_g_oz(value):
    if not isinstance(value, list) and pd.isna(value):
        return np.nan
    regex = r"[0-9]+mm(\n|)"
    regex1 = r"[0-9]+m$"
    regex2 = r"[0-9]+mp(\n|)"
    regex3 = r"[0-9]+oz"
    regex4 = r"[0-9]+g(\n|)$"
    repl = value
    for e in repl:
        if bool(re.match(regex, e)) or bool(re.match(regex2, e)) or bool(re.match(regex3, e)) or bool(re.match(regex4, e)) or bool(re.match(regex1, e)):
            repl.remove(e)
    return repl

In [45]:
df["page_title_model"] = df["page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)),line)))).apply(lambda row : clean_mp_mm_g_oz(row))

In [46]:
df.head()

Unnamed: 0,spec_id,page_title,weight,page_title_model
0,www.alibaba.com//37297,"[new, arrival, metal, case, 13mp, vandalproof,...",,[]
1,www.alibaba.com//29289,"[new, design, 2014, camera, case, iphone, buy,...",,[]
2,www.alibaba.com//23684,"[slr, portable, camera, inner, partition, padd...",190.0,[]
3,www.alibaba.com//24141,"[cheap, outdoor, ip, camera, ds2cd2632fis, hik...",1200.0,[ds2cd2632fis]
4,www.alibaba.com//22996,"[hikvision, ds2cd2532fis, ir, 3mp, mini, dome,...",600.0,"[1080p, ds2cd2532fis, ip66]"


In [47]:
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

In [48]:
def get_merged_df(dataframe):
    merged = dataframe
    merged = (merged.merge(merged, on=merged.assign(key_col=1)['key_col'], suffixes=('', '_right'))
 .query('spec_id < spec_id_right') # filter out joins on the same row and keep unique combinations
 .reset_index(drop=True))
    merged.drop(columns = ["key_0"], axis = 1, inplace=True)
    merged.rename(columns = {"spec_id" : "left_spec_id", "spec_id_right" : "right_spec_id"}, inplace=True)
    merged.reset_index(inplace=True, drop=True)
    return merged

In [49]:
merged = get_merged_df(df)

In [50]:
small_merged = merged#.sample(frac = 0.001)

In [51]:
len(small_merged)

317206

In [52]:
small_merged.head()

Unnamed: 0,left_spec_id,page_title,weight,page_title_model,right_spec_id,page_title_right,weight_right,page_title_model_right
0,www.alibaba.com//35400,"[cheap, best, selling, dslr, digital, camera, ...",,"[d5100, d7000, d700, d2000]",www.alibaba.com//37522,"[camera, pouch, bag, dslr, slr, compact, digit...",,[gx1]
1,www.alibaba.com//35400,"[cheap, best, selling, dslr, digital, camera, ...",,"[d5100, d7000, d700, d2000]",www.alibaba.com//37025,"[hikvision, camaras, cctv, 3, megapixel, mini,...",500.0,[ds2cd2132i]
2,www.alibaba.com//35400,"[cheap, best, selling, dslr, digital, camera, ...",,"[d5100, d7000, d700, d2000]",www.alibaba.com//37274,"[dslr, slr, underwater, camera, waterproof, ca...",,[nex5n]
3,www.alibaba.com//35400,"[cheap, best, selling, dslr, digital, camera, ...",,"[d5100, d7000, d700, d2000]",www.alibaba.com//37023,"[viewerframe, mode, ip, camera, ds2cd2632fis, ...",1200.0,[ds2cd2632fis]
4,www.alibaba.com//35400,"[cheap, best, selling, dslr, digital, camera, ...",,"[d5100, d7000, d700, d2000]",www.alibaba.com//6942,"[support, 32g, alarm, trigger, local, memory, ...",,[hfwo14]


In [53]:
def determine_match(row):
    score = 0
    
    if pd.isna(row["weight"]) or pd.isna(row["weight_right"]):
        target = 0.5
    else:
        target = 0.8
    
    
    
    weight_l = row["weight"]
    weight_r = row["weight_right"]
    page_title_l = row["page_title"]
    page_title_r = row["page_title_right"]
    model_l = row["page_title_model"]
    model_r = row["page_title_model_right"]
    
    weight_weight = 0.6
    
    if model_l != [] and model_r != []:
        model_count = 0
        for spec1 in model_l:
            for spec2 in model_r:
                if spec1 == spec2: 
                    return True
        return False
    
    if not pd.isna(weight_l) and not pd.isna(weight_r) and abs(weight_l - weight_r) <= 2:
        score += weight_weight


    if page_title_l != [] and page_title_r != []:
        score += jaccard_similarity(page_title_l, page_title_r)
                    
    return score > target
     

In [54]:
tqdm.pandas()

  from pandas import Panel


In [55]:
labels = []
labels.append(list(small_merged.progress_apply(determine_match, axis = 1)))
labels = sum(labels, [])
small_merged["label"] = labels


100%|██████████| 317206/317206 [00:50<00:00, 6225.04it/s]


In [56]:
small_merged[small_merged["label"] == True]

Unnamed: 0,left_spec_id,page_title,weight,page_title_model,right_spec_id,page_title_right,weight_right,page_title_model_right,label
416,www.alibaba.com//34221,"[top, 1, camera, china, hikvision, ds2cd2032i,...",500.0,[ds2cd2032i],www.alibaba.com//36126,"[surveillance, hikvision, ds2cd2032i, hikvisio...",500.0,"[ds2cd2032i, ip66]",True
427,www.alibaba.com//34221,"[top, 1, camera, china, hikvision, ds2cd2032i,...",500.0,[ds2cd2032i],www.alibaba.com//37832,"[1080p, exmor, cmos, ip, camera, hikvision, 3,...",500.0,"[1080p, ds2cd2032i, ip66]",True
610,www.alibaba.com//34221,"[top, 1, camera, china, hikvision, ds2cd2032i,...",500.0,[ds2cd2032i],www.alibaba.com//36396,"[ds2cd2012i, similiar, hikvision, 3mp, ir, bul...",500.0,"[ds2cd2032i, ds2cd2012i]",True
676,www.alibaba.com//37522,"[camera, pouch, bag, dslr, slr, compact, digit...",,[gx1],www.alibaba.com//37708,"[fashion, digital, camera, case, buy, digital,...",,[],True
1437,www.alibaba.com//35197,"[hikvision, cctv, camera, housing, mini, ir, b...",500.0,[ds2cd2012i],www.alibaba.com//37820,"[ds2cd2012i, best, cheap, hd, video, camera, i...",500.0,[ds2cd2012i],True
...,...,...,...,...,...,...,...,...,...
317162,www.alibaba.com//29494,"[fashion, camera, bags, buy, camera, case, wat...",,[],www.alibaba.com//29763,"[fashion, cool, camera, bag, waterproof, polye...",,[],True
317170,www.alibaba.com//29494,"[fashion, camera, bags, buy, camera, case, wat...",,[],www.alibaba.com//37708,"[fashion, digital, camera, case, buy, digital,...",,[],True
317176,www.alibaba.com//29494,"[fashion, camera, bags, buy, camera, case, wat...",,[],www.alibaba.com//37238,"[neoprene, digital, camera, pouch, buy, waterp...",,[],True
317191,www.alibaba.com//29494,"[fashion, camera, bags, buy, camera, case, wat...",,[],www.alibaba.com//29595,"[1112, waterproof, pu, digital, camera, case, ...",,[],True


In [58]:
len(small_merged[small_merged["label"] == True]) / len(small_merged)

0.02626368984193237

In [None]:
small_merged = small_merged[small_merged["label"] == True]
cols = ["left_spec_id", "right_spec_id"]
small_merged = small_merged[cols]
small_merged.to_csv("alibaba.csv", index = False)

  0%|          | 89506/31772406 [00:41<1:45:37, 4998.92it/s]

# Ebay

In [2]:
def create_dataframe_ebay(dataset_path, source, columns_df):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')

    progressive_id = 0
    progressive_id2row_df = {}
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            specification_data = json.load(specification_file)
            weight = specification_data.get("weight")
            brand = specification_data.get("brand")
            mp = specification_data.get("megapixels")
            scr = specification_data.get("screen size")
            title = specification_data.get("<page title>")
            row = (specification_id, title, brand, mp, scr, weight)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe_ebay('../datasets/unlabeled/2013_camera_specs', "www.ebay.com",["spec_id", "<page title>", "brand", "megapixels", "screen size", "weight"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df = df.rename(columns = {"<page title>" : "page_title"})

In [5]:
df.head()

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony Cyber Shot DSC S85 4 0 MP Digital Camera ...,Sony,4.0 MP,"1.8""",
1,www.ebay.com//24141,Olympus VH 210 14 0 MP Digital Camera Purple 4...,Olympus,14.0 MP,,
2,www.ebay.com//59471,New Canon PowerShot 130 Is ELPH 16MP Digital C...,Canon,16.0 MP,"3""",
3,www.ebay.com//47195,Canon EOS 10D 6 3MP Digital SLR Camera Body 10...,Canon,6.3 MP,"1.8""",
4,www.ebay.com//41942,Lumix Panasonic AVCHD Lite DMC TS1 12 MP Digit...,Panasonic,12.0 MP,,


In [6]:
df.isna().sum()

spec_id            0
page_title         0
brand            290
megapixels      1731
screen size     3030
weight         13719
dtype: int64

### megapixels

In [7]:
df[df['megapixels'].notnull()]

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony Cyber Shot DSC S85 4 0 MP Digital Camera ...,Sony,4.0 MP,"1.8""",
1,www.ebay.com//24141,Olympus VH 210 14 0 MP Digital Camera Purple 4...,Olympus,14.0 MP,,
2,www.ebay.com//59471,New Canon PowerShot 130 Is ELPH 16MP Digital C...,Canon,16.0 MP,"3""",
3,www.ebay.com//47195,Canon EOS 10D 6 3MP Digital SLR Camera Body 10...,Canon,6.3 MP,"1.8""",
4,www.ebay.com//41942,Lumix Panasonic AVCHD Lite DMC TS1 12 MP Digit...,Panasonic,12.0 MP,,
...,...,...,...,...,...,...
14268,www.ebay.com//58832,GE A830 8 0 MP Digital Camera Silver 081002701...,GE,8.0 MP,"2.5""",
14269,www.ebay.com//24210,Ximea USB 3 0 4 2 Mpix Colour Camera | eBay,Ximea,4.2,,32g
14271,www.ebay.com//44811,Nikon Coolpix P330 12 2 MP Digital Camera Blac...,Nikon,12.2 MP,"3""",
14272,www.ebay.com//48131,Olympus Camedia Camera C3000 C 3000 U1 05 0050...,Olympus,3.3 MP,"1.8""",


In [8]:
def parse_megapixels(value):
    if(isinstance(value, list)):
        
        value=value[0]
    if pd.isna(value):
        return value
    else:
        match1 = re.search('(\d*\,\d+|\d*\.\d+|\d+)( MP|)', str(value))
        if match1 is None: 
            return float("NaN")


        return (match1.group(1).replace(",","."))

        

In [9]:
df["megapixels"] = df["megapixels"].apply(parse_megapixels)

In [10]:
df.head()

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony Cyber Shot DSC S85 4 0 MP Digital Camera ...,Sony,4.0,"1.8""",
1,www.ebay.com//24141,Olympus VH 210 14 0 MP Digital Camera Purple 4...,Olympus,14.0,,
2,www.ebay.com//59471,New Canon PowerShot 130 Is ELPH 16MP Digital C...,Canon,16.0,"3""",
3,www.ebay.com//47195,Canon EOS 10D 6 3MP Digital SLR Camera Body 10...,Canon,6.3,"1.8""",
4,www.ebay.com//41942,Lumix Panasonic AVCHD Lite DMC TS1 12 MP Digit...,Panasonic,12.0,,


### Weight

In [11]:
df[df['weight'].notnull()]

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight
17,www.ebay.com//46784,Olympus Stylus 50x Zoom SP 100 HD Digital Came...,,16.0,,20.8 oz. (589g)
19,www.ebay.com//47896,Nikon D70 6 1 MP Digital SLR Camera Black Body...,Nikon,6.1,"2""",600 gr
60,www.ebay.com//60120,Canon EOS Rebel T3 SLR Digital Camera w 18 55m...,Canon,12.2,"2.7""",7.1 oz (201 g)
75,www.ebay.com//44886,Canon EOS Rebel T5 Digital SLR Camera Body EF ...,Canon,18.0,"3""","[15.3 oz. (434g) (body only), 7.1 oz. (200g)]"
129,www.ebay.com//53928,Nikon D60 with DX 18 55mm VR Lens Nikon SB 600...,Nikon,10.2,"2.5""",16.1 Oz.
...,...,...,...,...,...,...
14143,www.ebay.com//54846,Sony Alpha SLT A58 Digital SLR Camera Body 18 ...,Sony,20.1,"2.7""",7.8 oz.(222g)
14186,www.ebay.com//55979,Sony Cyber Shot 63X Super Zoom DSC H400 HD Dig...,,20.1,,22.2 oz. (628g)
14262,www.ebay.com//58998,Canon PowerShot S3 Is 6 0MP 12x Optical Zoom 2...,Canon,6.0,"2""",14.5 Oz.
14264,www.ebay.com//46986,Panasonic Lumix DMC GM1 Digital Camera with G ...,Panasonic,16.0,"3""",2.47 oz (70 g)


In [12]:
def parse_weight(value):
    if (isinstance(value, list)):
        value=value[0]
        

    if pd.isna(value):
        return value
    else:
        value = value.lower()
       
        match1 = re.search('()(\d*\,\d+|\d*\.\d+|\d+)( g|g| kg|kg|lbs| lbs| oz| ounce)', str(value))
        if match1 is None: 
            return float("NaN")




        to_convert= match1.group(2).replace(" ","").replace(",",".")
        metric= match1.group(3).replace(" ","")
        converted=float("NaN")
        if (metric == "oz" or metric== "ounce"):
            converted= int(round(float(to_convert) * 28.35))
        elif metric == "lbs":
            converted= int(round(float(to_convert) * 454))
        elif metric == "kg":
            converted= int(round(float(to_convert) * 1000))
        else:
            converted= int(round(float(to_convert)))

        return converted
        

In [13]:
df["weight"] = df["weight"].apply(parse_weight)

In [14]:
df.head()

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony Cyber Shot DSC S85 4 0 MP Digital Camera ...,Sony,4.0,"1.8""",
1,www.ebay.com//24141,Olympus VH 210 14 0 MP Digital Camera Purple 4...,Olympus,14.0,,
2,www.ebay.com//59471,New Canon PowerShot 130 Is ELPH 16MP Digital C...,Canon,16.0,"3""",
3,www.ebay.com//47195,Canon EOS 10D 6 3MP Digital SLR Camera Body 10...,Canon,6.3,"1.8""",
4,www.ebay.com//41942,Lumix Panasonic AVCHD Lite DMC TS1 12 MP Digit...,Panasonic,12.0,,


## screen size

In [15]:
df[df['screen size'].notnull()]

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony Cyber Shot DSC S85 4 0 MP Digital Camera ...,Sony,4.0,"1.8""",
2,www.ebay.com//59471,New Canon PowerShot 130 Is ELPH 16MP Digital C...,Canon,16.0,"3""",
3,www.ebay.com//47195,Canon EOS 10D 6 3MP Digital SLR Camera Body 10...,Canon,6.3,"1.8""",
5,www.ebay.com//54243,Canon EOS 5D Mark III 22 3MP Digital Camera Bo...,Canon,22.3,"3.2""",
8,www.ebay.com//45002,Canon PowerShot S5 Is 8 MP 12x Optical Zoom Di...,Canon,8.0,"2.5""",
...,...,...,...,...,...,...
14267,www.ebay.com//45703,Used Sony Alpha NEX F3 Digital Camera 16 1 MP ...,Sony,16.1,"3""",
14268,www.ebay.com//58832,GE A830 8 0 MP Digital Camera Silver 081002701...,GE,8.0,"2.5""",
14271,www.ebay.com//44811,Nikon Coolpix P330 12 2 MP Digital Camera Blac...,Nikon,12.2,"3""",
14272,www.ebay.com//48131,Olympus Camedia Camera C3000 C 3000 U1 05 0050...,Olympus,3.3,"1.8""",


In [16]:
def parse_screen_size(value):
    if (isinstance(value, list)):
        return float("NaN")
    if pd.isna(value):
        return value

    else:
        m = re.search('(\d*\.\d+|\d*\,\d+|\d+)(cm|\"| \"| in|)', str(value))
        if m is None:

            return float("NaN")

        to_convert= m.group(1).replace(" ","").replace(",",".")
        metric= m.group(2).replace(" ","")
        converted=float("NaN")
        if (metric == "cm"):

            converted= int(round(float(to_convert) / 2.54))
        else:
            converted= int(round(float(to_convert)))

        return converted

In [17]:
df["screen size"] = df["screen size"].apply(parse_screen_size)

In [18]:
df.head()

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony Cyber Shot DSC S85 4 0 MP Digital Camera ...,Sony,4.0,2.0,
1,www.ebay.com//24141,Olympus VH 210 14 0 MP Digital Camera Purple 4...,Olympus,14.0,,
2,www.ebay.com//59471,New Canon PowerShot 130 Is ELPH 16MP Digital C...,Canon,16.0,3.0,
3,www.ebay.com//47195,Canon EOS 10D 6 3MP Digital SLR Camera Body 10...,Canon,6.3,2.0,
4,www.ebay.com//41942,Lumix Panasonic AVCHD Lite DMC TS1 12 MP Digit...,Panasonic,12.0,,


## Brand

In [19]:
df['brand'] = df['brand'].apply(lambda x: str(x).lower())

In [20]:
pd.set_option('display.max_rows', 500)
print(df.brand.value_counts())

canon                                                           3580
nikon                                                           2959
sony                                                            1866
fujifilm                                                         835
olympus                                                          833
kodak                                                            646
samsung                                                          602
panasonic                                                        583
pentax                                                           301
none                                                             290
vivitar                                                          162
casio                                                            147
leica                                                            111
polaroid                                                          99
ge                                

In [21]:
df.loc[df['brand'] == 'blackmagicdesign', 'brand'] = "blackmagic"
df.loc[df['brand'] == 'lg electronics', 'brand'] = "lg"
df.loc[df['brand'] == 'new', 'brand'] = float("NaN")
df.loc[df['brand'] == 'panasonic limix', 'brand'] = "panasonic"
df.loc[df['brand'] == 'unbrand', 'brand'] = float("NaN")
df.loc[df['brand'] == 'dxg technology', 'brand'] = "dxg"
df.loc[df['brand'] == 'unknown', 'brand'] = float("NaN")
df.loc[df['brand'] == 'panasonic / lumix', 'brand'] = "panasonic"
df.loc[df['brand'] == str(['nikon\ntype:\ndigital slr', 'nikon']), 'brand'] = "nikon"
df.loc[df['brand'] == 'canon/japan', 'brand'] = "canon"
df.loc[df['brand'] == '2000 ixla', 'brand'] = float("NaN")
df.loc[df['brand'] == 'insigniaâ¢', 'brand'] = "insignia"
df.loc[df['brand'] == 'vizio, inc.', 'brand'] = "vizio"
df.loc[df['brand'] == str(['nikon megapixels: 12.1 mp', 'nikon\nmegapixels:\n12.1 mp']), 'brand'] = "nikon"
df.loc[df['brand'] == 'motorolathis is a nice, motorola', 'brand'] = "motorola"
df.loc[df['brand'] == str(['kodak', 'kodak'])  , 'brand'] = "kodak"
df.loc[df['brand'] == 'panasonic/lumix ', 'brand'] = "panasonic"
df.loc[df['brand'] == 'unbranded 252 generic', 'brand'] = float("NaN")
df.loc[df['brand'] == 'mamiya afd ii', 'brand'] = float("NaN")
df.loc[df['brand'] == 'unbranded/generic', 'brand'] = float("NaN")
df.loc[df['brand'] == 'sj4000', 'brand'] = float("NaN")
df.loc[df['brand'] == "akai (built by samsung)", 'brand'] = "akai"
df.loc[df['brand'] == 'vivitar & samsung', 'brand'] = "samsung"
df.loc[df['brand'] == '[\'pentax\', \'pentax\']', 'brand'] = "pentax"
df.loc[df['brand'] == '"easy shot" clip', 'brand'] = float("NaN")
df.loc[df['brand'] == 'jazz dv150', 'brand'] = "jazz"
df.loc[df['brand'] == 'spectra merchandising international', 'brand'] = "spectra"
df.loc[df['brand'] == 'vistaquestâ', 'brand'] = "pentax"
df.loc[df['brand'] == 'canon power shot sx130', 'brand'] = 'canon'
df.loc[df['brand'] == '6.0 mp', 'brand'] = float("NaN")
df.loc[df['brand'] == 'olympu', 'brand'] = "olympus"
df.loc[df['brand'] == 'blackmagic design', 'brand'] = "blackmagic"
df.loc[df['brand'] == 'vivitar, kodak, sanyo, nikon', 'brand'] = "vivitar"
df.loc[df['brand'] == 'pentax corporation', 'brand'] = "pentax"
df.loc[df['brand'] == 'kodak, samsung, vivitar, canon , olympus', 'brand'] = "kodak"
df.loc[df['brand'] == 'i_p.mium', 'brand'] = float("NaN")
df.loc[df['brand'] == str(['kodak optical zoom: 8x', 'kodak\noptical zoom:\n8x']), 'brand'] = "kodak"
df.loc[df['brand'] == 'kobian group', 'brand'] = "kobian"
df.loc[df['brand'] == 'fujifilm finepix telephoto digitalcamera', 'brand'] = "fujifilm"
df.loc[df['brand'] == 'hewlett packard', 'brand'] = "hp"
df.loc[df['brand'] == 'unbranded', 'brand'] = float("NaN")
df.loc[df['brand'] == 'pioneer research', 'brand'] = "pioneer"
df.loc[df['brand'] == 'kodak, samsung, vivitar, canon , olympus ', 'brand'] = "kodak "

In [22]:
df.head()

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony Cyber Shot DSC S85 4 0 MP Digital Camera ...,sony,4.0,2.0,
1,www.ebay.com//24141,Olympus VH 210 14 0 MP Digital Camera Purple 4...,olympus,14.0,,
2,www.ebay.com//59471,New Canon PowerShot 130 Is ELPH 16MP Digital C...,canon,16.0,3.0,
3,www.ebay.com//47195,Canon EOS 10D 6 3MP Digital SLR Camera Body 10...,canon,6.3,2.0,
4,www.ebay.com//41942,Lumix Panasonic AVCHD Lite DMC TS1 12 MP Digit...,panasonic,12.0,,


## Page title

In [23]:
stopWords = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])

In [24]:
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"

In [25]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [26]:
df["page_title"] = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

## Modelwords

In [27]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [28]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']

In [29]:
def clean_mp_mm_g_oz(value):
    if not isinstance(value, list) and pd.isna(value):
        return np.nan
    regex = r"[0-9]+mm(\n|)"
    regex2 = r"[0-9]+mp(\n|)"
    regex3 = r"[0-9]+oz"
    regex4 = r"[0-9]+g(\n|)$"
    repl = value
    for e in repl:
        if bool(re.match(regex, e)) or bool(re.match(regex2, e)) or bool(re.match(regex3, e)) or bool(re.match(regex4, e)):
            repl.remove(e)
    return repl

In [30]:
df["page_title_model"] = df["page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)),line)))).apply(lambda row : clean_mp_mm_g_oz(row))

In [31]:
df.head()

Unnamed: 0,spec_id,page_title,brand,megapixels,screen size,weight,page_title_model
0,www.ebay.com//56784,"[sony, cyber, shot, dsc, s85, 4, 0, mp, digita...",sony,4.0,2.0,,[s85]
1,www.ebay.com//24141,"[olympus, vh, 210, 14, 0, mp, digital, camera,...",olympus,14.0,,,[]
2,www.ebay.com//59471,"[new, canon, powershot, 130, elph, 16mp, digit...",canon,16.0,3.0,,[]
3,www.ebay.com//47195,"[canon, eos, 10d, 6, 3mp, digital, slr, camera...",canon,6.3,2.0,,[10d]
4,www.ebay.com//41942,"[lumix, panasonic, avchd, lite, dmc, ts1, 12, ...",panasonic,12.0,,,[ts1]


In [32]:
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

In [33]:
def get_merged_df(dataframe):
    merged = dataframe
    merged = (merged.merge(merged, on=merged.assign(key_col=1)['key_col'], suffixes=('', '_right'))
 .query('spec_id < spec_id_right') # filter out joins on the same row and keep unique combinations
 .reset_index(drop=True))
    merged.drop(columns = ["key_0"], axis = 1, inplace=True)
    merged.rename(columns = {"spec_id" : "left_spec_id", "spec_id_right" : "right_spec_id"}, inplace=True)
    merged.reset_index(inplace=True, drop=True)
    return merged

In [34]:
len(df)

14274

In [35]:
df_small = df#.sample(frac = 0.01)

In [36]:
merged = get_merged_df(df_small)

In [37]:
len(merged)

10153

In [38]:
small_merged = merged
#small_merged = merged.sample(frac = 0.001)

In [39]:
len(small_merged)

10153

In [40]:
small_merged.head()

Unnamed: 0,left_spec_id,page_title,brand,megapixels,screen size,weight,page_title_model,right_spec_id,page_title_right,brand_right,megapixels_right,screen size_right,weight_right,page_title_model_right
0,www.ebay.com//45689,"[nikon, series, d200, 10, 2, mp, digital, slr,...",nikon,10.2,2.0,,[d200],www.ebay.com//53575,"[samsung, st, series, st150f, 16, 2, mp, digit...",samsung,16.2,3.0,,[st150f]
1,www.ebay.com//45689,"[nikon, series, d200, 10, 2, mp, digital, slr,...",nikon,10.2,2.0,,[d200],www.ebay.com//60281,"[canon, eos, rebel, t1i, 500d, 15, 1, mp, digi...",canon,15.1,3.0,,"[500d, t1i]"
2,www.ebay.com//45689,"[nikon, series, d200, 10, 2, mp, digital, slr,...",nikon,10.2,2.0,,[d200],www.ebay.com//58563,"[hp, photosmart, m407, used, parts, repair, ebay]",hp,,,,[m407]
3,www.ebay.com//45689,"[nikon, series, d200, 10, 2, mp, digital, slr,...",nikon,10.2,2.0,,[d200],www.ebay.com//56381,"[car, camera, video, recorder, full, hd, dvr, ...",none,1200.0,960.0,,[1080p]
4,www.ebay.com//45689,"[nikon, series, d200, 10, 2, mp, digital, slr,...",nikon,10.2,2.0,,[d200],www.ebay.com//54776,"[canon, eos, 20d, 8, 2mp, digital, slr, camera...",canon,8.2,2.0,,[20d]


In [41]:
small_merged.isna().sum() / len(small_merged)

left_spec_id              0.000000
page_title                0.000000
brand                     0.000000
megapixels                0.104501
screen size               0.183788
weight                    0.925244
page_title_model          0.000000
right_spec_id             0.000000
page_title_right          0.000000
brand_right               0.000000
megapixels_right          0.091303
screen size_right         0.137890
weight_right              0.962868
page_title_model_right    0.000000
dtype: float64

In [69]:
def determine_match(row):
    score = 0
    
#     if pd.isna(row["weight"]) or pd.isna(row["weight_right"]):
#         target = 0.5
#     else:
    target = 1.2
    
    brand_l = row["brand"]
    brand_r = row["brand_right"]
    megapixels_l = row["megapixels"]
    megapixels_r = row["megapixels_right"]
    screen_size_l = row["screen size"]
    screen_size_r = row["screen size_right"]
    weight_l = row["weight"]
    weight_r = row["weight_right"]
    page_title_l = row["page_title"]
    page_title_r = row["page_title_right"]
    model_l = row["page_title_model"]
    model_r = row["page_title_model_right"]
    
    weight_weight = 0.6
    brand_weight = 0.3
    mp_weight = 0.3
    scr_weight = 0.3
    
    if model_l != [] and model_r != []:
        model_count = 0
        for spec1 in model_l:
            for spec2 in model_r:
                if spec1 == spec2: 
                    score += 0.8
                    model_count += 1
                if model_count >= 2:
                    return True
        if model_count == 0:
            return False
    
    if not pd.isna(weight_l) and not pd.isna(weight_r):
        if abs(weight_l - weight_r) <= 2:
            score += weight_weight
        else:
            score -= weight_weight
    
    
    if not pd.isna(megapixels_l) and not pd.isna(megapixels_r):
        if abs(float(megapixels_l.replace("mp", "")) - float(megapixels_r.replace("mp", ""))) == 0:
            score += mp_weight
        else:
            score -= mp_weight
    
    if not pd.isna(brand_l) and not pd.isna(brand_r):
        if brand_l == brand_r:
            score += brand_weight
        else:
            score -= brand_weight
    
    if not pd.isna(screen_size_l) and not pd.isna(screen_size_r):
        if abs(screen_size_l - screen_size_r) == 0:
            score += scr_weight
        else:
            score -= scr_weight

    if page_title_l != [] and page_title_r != []:
        score += jaccard_similarity(page_title_l, page_title_r) * 0.7
                    
    return score > target
     

In [70]:
tqdm.pandas()

  from pandas import Panel


In [71]:
labels = []
labels.append(list(small_merged.progress_apply(determine_match, axis = 1)))
labels = sum(labels, [])
small_merged["label"] = labels



  0%|          | 0/10153 [00:00<?, ?it/s][A
  4%|▎         | 363/10153 [00:00<00:02, 3623.80it/s][A
  7%|▋         | 666/10153 [00:00<00:02, 3422.14it/s][A
 10%|█         | 1025/10153 [00:00<00:02, 3467.05it/s][A
 13%|█▎        | 1305/10153 [00:00<00:02, 3233.93it/s][A
 16%|█▋        | 1664/10153 [00:00<00:02, 3332.22it/s][A
 20%|█▉        | 1994/10153 [00:00<00:02, 3321.55it/s][A
 23%|██▎       | 2300/10153 [00:00<00:02, 3235.62it/s][A
 26%|██▌       | 2597/10153 [00:00<00:02, 3089.07it/s][A
 29%|██▉       | 2933/10153 [00:00<00:02, 3161.89it/s][A
 33%|███▎      | 3321/10153 [00:01<00:02, 3347.64it/s][A
 36%|███▌      | 3651/10153 [00:01<00:02, 3235.33it/s][A
 39%|███▉      | 3972/10153 [00:01<00:01, 3207.13it/s][A
 42%|████▏     | 4298/10153 [00:01<00:01, 3213.75it/s][A
 45%|████▌     | 4618/10153 [00:01<00:01, 3207.19it/s][A
 49%|████▊     | 4938/10153 [00:01<00:01, 3083.53it/s][A
 52%|█████▏    | 5286/10153 [00:01<00:01, 3192.28it/s][A
 55%|█████▌    | 5630/10153 

In [72]:
len(small_merged)

10153

In [73]:
small_merged.loc[(small_merged["label"] == True)]

Unnamed: 0,left_spec_id,page_title,brand,megapixels,screen size,weight,page_title_model,right_spec_id,page_title_right,brand_right,megapixels_right,screen size_right,weight_right,page_title_model_right,label
42,www.ebay.com//45689,"[nikon, series, d200, 10, 2, mp, digital, slr,...",nikon,10.2,2.0,,[d200],www.ebay.com//48020,"[nikon, d200, 10, 2, mp, digital, slr, camera,...",nikon,10.2,2.0,,[d200],True
430,www.ebay.com//25187,"[canon, powershot, sx50, hs, 12, 1, mp, digita...",canon,12.1,3.0,,[sx50],www.ebay.com//45637,"[canon, powershot, elph, 100, hs, ixus, 115, h...",canon,12.1,3.0,,[],True
474,www.ebay.com//25187,"[canon, powershot, sx50, hs, 12, 1, mp, digita...",canon,12.1,3.0,,[sx50],www.ebay.com//42614,"[new, canon, powershot, n, 12, 1, mp, digital,...",canon,12.1,3.0,,[],True
561,www.ebay.com//58768,"[canon, eos, rebel, t1i, 500d, 15, 1, mp, digi...",canon,15.1,3.0,,"[500d, t1i]",www.ebay.com//60281,"[canon, eos, rebel, t1i, 500d, 15, 1, mp, digi...",canon,15.1,3.0,,"[500d, t1i]",True
2310,www.ebay.com//43436,"[canon, powershot, digital, elph, sd1300, ixus...",canon,12.1,3.0,,[sd1300],www.ebay.com//45637,"[canon, powershot, elph, 100, hs, ixus, 115, h...",canon,12.1,3.0,,[],True
4006,www.ebay.com//42092,"[canon, powershot, digital, elph, sd940, 12, 1...",canon,12.1,3.0,,[sd940],www.ebay.com//45637,"[canon, powershot, elph, 100, hs, ixus, 115, h...",canon,12.1,3.0,,[],True
4049,www.ebay.com//42092,"[canon, powershot, digital, elph, sd940, 12, 1...",canon,12.1,3.0,,[sd940],www.ebay.com//42614,"[new, canon, powershot, n, 12, 1, mp, digital,...",canon,12.1,3.0,,[],True
4161,www.ebay.com//45749,"[nikon, d80, 10, 2, mp, digital, slr, camera, ...",nikon,10.2,2.0,,[d80],www.ebay.com//47739,"[nikon, d80, 10, 2mp, digital, slr, camera, bo...",nikon,10.2,2.0,,[d80],True
4714,www.ebay.com//53451,"[nikon, d3100, 14, 2, mp, digital, slr, camera...",nikon,14.2,3.0,,[d3100],www.ebay.com//60477,"[nikon, d3100, 14, 2, mp, digital, slr, camera...",nikon,14.2,3.0,,[d3100],True
5137,www.ebay.com//45637,"[canon, powershot, elph, 100, hs, ixus, 115, h...",canon,12.1,3.0,,[],www.ebay.com//60852,"[brand, new, canon, powershot, sx130, 12, 1, m...",canon,12.1,3.0,,[sx130],True


In [74]:
small_merged["label"].sum() / len(df_small)

0.13986013986013987

In [None]:
small_merged = small_merged[small_merged["label"] == True]
cols = ["left_spec_id", "right_spec_id"]
small_merged = small_merged[cols]
small_merged.to_csv("ebay.csv", index = False)

  0%|          | 89506/31772406 [00:41<1:45:37, 4998.92it/s]