In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')

    progressive_id = 0
    progressive_id2row_df = {}
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            specification_data = json.load(specification_file)
            weight = specification_data.get("weight")
            title = specification_data.get("<page title>")
            row = (specification_id, title, weight)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.alibaba.com",["spec_id", "<page title>", "weight"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df = df.rename(columns = {"<page title>" : "page_title"})

### Weight

In [5]:
df[df['weight'].notnull()]

Unnamed: 0,spec_id,page_title,weight
2,www.alibaba.com//23684,Slr Portable Camera Inner Partition Padded Pro...,0.19KG
3,www.alibaba.com//24141,Cheap Outdoor Ip Camera Ds-2cd2632f-is Hikvisi...,1200g
4,www.alibaba.com//22996,Hikvision Ds-2cd2532f-is Ir 3mp Mini Dome Hd 1...,600 g (1.32 lbs)
5,www.alibaba.com//24511,2colors Fashion Abs Plastic Dslr Slr Hard Comp...,0.09Kg
6,www.alibaba.com//35300,Hikvision Ir Cctv Camera 3mp Bullet Ip Camera ...,1200g
...,...,...,...
7960,www.alibaba.com//34210,Mini Camera Ds-2cd2532f-i 3 Megapixel Dome Hik...,600 g (1.32 lbs)
7962,www.alibaba.com//29588,Ds-2cd2012-i Network Video Server Hikvision Ip...,500g (1.1 lbs)
7968,www.alibaba.com//35051,Genuine For Gopro Hero 3 3+plus Underwater Wat...,0.1KG/pc
7969,www.alibaba.com//24210,Ds-2cd3332-i Ip Network Camera Hikvision Weath...,670g (1.5 lbs)


In [6]:
def parse_weight(value):
    if (isinstance(value, list)):
        return float("NaN")
        

    if pd.isna(value):
        return value
    else:
        value = value.lower()
       
        match1 = re.search('(approx.|)(\d*\,\d+|\d*\.\d+|\d+)( g|g| kg|kg|lbs| lbs)', str(value))
        if match1 is None: 
            return float("NaN")


        to_convert= match1.group(2).replace(" ","").replace(",",".")
        metric= match1.group(3).replace(" ","")
        converted=float("NaN")
        if metric == "oz":
            converted= int(round(float(to_convert) * 28.35))
        elif metric == "lbs":
            converted= int(round(float(to_convert) * 454))
        elif metric == "kg":
            converted= int(round(float(to_convert) * 1000))
        else:
            converted= int(round(float(to_convert)))
        return converted
        

In [7]:
df["weight"] = df["weight"].apply(parse_weight)

## Title

In [8]:
df.head()

Unnamed: 0,spec_id,page_title,weight
0,www.alibaba.com//37297,New Arrival Metal Case 1.3mp Vandalproof Ip Ca...,
1,www.alibaba.com//29289,New Design In 2014 Camera Case For Iphone - Bu...,
2,www.alibaba.com//23684,Slr Portable Camera Inner Partition Padded Pro...,190.0
3,www.alibaba.com//24141,Cheap Outdoor Ip Camera Ds-2cd2632f-is Hikvisi...,1200.0
4,www.alibaba.com//22996,Hikvision Ds-2cd2532f-is Ir 3mp Mini Dome Hd 1...,600.0


In [9]:
stopWords = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])

In [10]:
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"

In [11]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [12]:
df["page_title"] = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

## Modelwords

In [13]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [14]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']

In [15]:
df["page_title_model"] = df["page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [16]:
df.head()

Unnamed: 0,spec_id,page_title,weight,page_title_model
0,www.alibaba.com//37297,"[new, arrival, metal, case, 13mp, vandalproof,...",,[13mp]
1,www.alibaba.com//29289,"[new, design, 2014, camera, case, iphone, buy,...",,[]
2,www.alibaba.com//23684,"[slr, portable, camera, inner, partition, padd...",190.0,[]
3,www.alibaba.com//24141,"[cheap, outdoor, ip, camera, ds2cd2632fis, hik...",1200.0,"[ds2cd2632fis, 3mp]"
4,www.alibaba.com//22996,"[hikvision, ds2cd2532fis, ir, 3mp, mini, dome,...",600.0,"[ds2cd2532fis, 1080p, 3mp, ip66]"


In [17]:
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

In [18]:
def get_merged_df(dataframe):
    merged = dataframe
    merged = (merged.merge(merged, on=merged.assign(key_col=1)['key_col'], suffixes=('', '_right'))
 .query('spec_id < spec_id_right') # filter out joins on the same row and keep unique combinations
 .reset_index(drop=True))
    merged.drop(columns = ["key_0"], axis = 1, inplace=True)
    merged.rename(columns = {"spec_id" : "left_spec_id", "spec_id_right" : "right_spec_id"}, inplace=True)
    merged.reset_index(inplace=True, drop=True)
    return merged

In [19]:
merged = get_merged_df(df)

In [30]:
small_merged = merged#.sample(frac = 0.001)

In [31]:
small_merged.head()

Unnamed: 0,left_spec_id,page_title,weight,page_title_model,right_spec_id,page_title_right,weight_right,page_title_model_right
0,www.alibaba.com//37297,"[new, arrival, metal, case, 13mp, vandalproof,...",,[13mp],www.alibaba.com//7298,"[motion, sensor, 720p, full, hd, ip, cctv, lon...",,[720p]
1,www.alibaba.com//37297,"[new, arrival, metal, case, 13mp, vandalproof,...",,[13mp],www.alibaba.com//37628,"[waterproof, hikvision, ir, bullet, cctv, came...",700.0,[ds2cd2212i5]
2,www.alibaba.com//37297,"[new, arrival, metal, case, 13mp, vandalproof,...",,[13mp],www.alibaba.com//5533,"[hot, croco, chocolate, camera, case, buy, cam...",,[]
3,www.alibaba.com//37297,"[new, arrival, metal, case, 13mp, vandalproof,...",,[13mp],www.alibaba.com//5499,"[waterproof, outdoor, 13, color, ccd, sony, di...",,"[mpeg4, sony]"
4,www.alibaba.com//37297,"[new, arrival, metal, case, 13mp, vandalproof,...",,[13mp],www.alibaba.com//37501,"[2mp, dhsd59220shn, dahua, outdoor, ip, ptz, c...",3500.0,"[dhsd59220shn, 16x, 2mp]"


In [32]:
def determine_match(row):
    score = 0
    
    if pd.isna(row["weight"]) or pd.isna(row["weight_right"]):
        target = 0.5
    else:
        target = 0.8
    
    
    
    weight_l = row["weight"]
    weight_r = row["weight_right"]
    page_title_l = row["page_title"]
    page_title_r = row["page_title_right"]
    model_l = row["page_title_model"]
    model_r = row["page_title_model_right"]
    
    weight_weight = 0.6
    
    if model_l != [] and model_r != []:
        model_count = 0
        for spec1 in model_l:
            for spec2 in model_r:
                if spec1 == spec2: 
                    score += 0.4
                    model_count += 1
                if model_count >= 2:
                    return True
    
    if not pd.isna(weight_l) and not pd.isna(weight_r) and abs(weight_l - weight_r) <= 2:
        score += weight_weight


    if page_title_l != [] and page_title_r != []:
        score += jaccard_similarity(page_title_l, page_title_r)
                    
    return score > target
     

In [33]:
tqdm.pandas()

  from pandas import Panel


In [34]:
labels = []
labels.append(list(small_merged.progress_apply(determine_match, axis = 1)))
labels = sum(labels, [])
small_merged["label"] = labels


  0%|          | 89441/31772406 [00:27<1:45:37, 4998.92it/s] 

SystemError: <built-in function is_scalar> returned a result with an error set

In [None]:
small_merged[small_merged["label"] == True]
cols = ["left_spec_id", "right_spec_id"]
small_merged = small_merged[cols]
merged.to_csv(".csv", index = False)

  0%|          | 89506/31772406 [00:41<1:45:37, 4998.92it/s]