In [1]:
# OPTIONAL: load the "autoreload" extension so edited source modules are picked up
# without restarting the kernel
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
# (mode 2 = reload all modules before every cell execution)
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
np.random.seed(42)
import random
random.seed(42)

from pathlib import Path
import shutil
import os

from copy import deepcopy

import re

from transformers import AutoTokenizer, AutoConfig

from pdb import set_trace

In [3]:
# Shared tokenizer used by preprocess_input; the brand/product names below are passed to it
# as additional special tokens.
tokenizer = AutoTokenizer.from_pretrained(
    'microsoft/xtremedistil-l6-h256-uncased',
    additional_special_tokens=(
        'lenovo', 'thinkpad', 'elitebook', 'toshiba', 'asus', 'acer',
        'lexar', 'sandisk', 'tesco', 'intenso', 'transcend',
    ),
)

def assign_clusterid(identifier, cluster_id_dict, cluster_id_amount):
    """Return the cluster id stored for ``identifier``.

    The lookup key is ``str(identifier)``. When the key is not present in
    ``cluster_id_dict`` the fallback value ``cluster_id_amount`` is returned.
    """
    return cluster_id_dict.get(str(identifier), cluster_id_amount)

def load_normalization(path='../../normalization.txt'):
    """Load the normalization table - especially relevant for D2.

    Each usable line of the file has the form ``<term>,<replacement>``; only the
    first two comma-separated fields of a line are used.

    Args:
        path: Location of the normalization file. Defaults to the location the
            notebook has always read from, so existing calls keep working.

    Returns:
        dict mapping every term to its replacement, in file order.
    """
    normalizations = {}
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            line_values = line.replace('\n', '').split(',')
            # Lines without a comma (blank lines, a trailing newline at the end of the
            # file, ...) used to raise IndexError - skip them.
            if len(line_values) < 2:
                continue
            # An empty term must never be stored: str.replace('', x) inserts x between
            # all characters of the text it is later applied to.
            if line_values[0] == '':
                continue
            normalizations[line_values[0]] = line_values[1]

    return normalizations

# Substrings blanked out with plain str.replace. Order matters (e.g. 'cheapest' before 'cheap').
# Matching is purely substring based, so a stop word is also removed inside longer words.
_STOP_WORDS = (
    'ebay', 'google', 'vology', 'buy', 'cheapest', 'foto de angelis', 'cheap', 'core',
    'refurbished', 'wifi', 'best', 'wholesale', 'price', 'hot', '\'\'', '"', '\\\\n',
    'tesco direct', 'color', ' y ', ' et ', 'tipo a', 'type-a', 'type a', 'informática', ' de ',
    ' con ', 'newest', ' new', ' ram ', '64-bit', '32-bit', 'accessories', 'series', 'touchscreen',
    'product', 'customized',
)

# HTML entities and punctuation that are replaced by a blank.
_STOP_SIGNS = ('&nbsp;', '&quot;', '&amp;', ',', ';', '-', ':', '|', '/', '(', ')', '&')

# Patterns that are deleted outright (domain names, "<n> gb hdd/ssd", a literal backslash + 'n').
# NOTE(review): '^dell*' matches 'del' followed by any number of 'l' at the start of the text;
# presumably it is meant to strip a leading 'dell' - confirm.
_REMOVAL_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'^dell*', r'[\d\w]*\.com', r'[\d\w]*\.ca', r'[\d\w]*\.fr', r'[\d\w]*\.de', r'[\d\w]*\.es',
    r'(\d+\s*gb\s*hdd|\d+\s*gb\s*ssd)', r'\\n',
))

# Storage capacity mentions. The first alternative used to read 'd+' (missing backslash), so
# '64 gbbeuk' was only matched as '64 gbbeu' and a stray 'k' was left behind in the text.
_CAPACITY_PATTERN = re.compile(r'(\d+\s*gbbeuk|\d+\s*gbbeu|\d+\s*gb|\d+\s*go|\d+\s*bbeu|\d+\s*gabeu)')

# Repetitions that the normalization step can introduce, collapsed in exactly this order.
_NORMALIZATION_CLEANUP = (
    ('usb stick usb stick', 'usb stick'),
    ('usb stick usb', 'usb stick'),
    ('usb usb', 'usb'),
    ('memory card memory card', 'memory card'),
    ('memory card memory', 'memory card'),
    ('memory memory', 'memory'),
    ('card card', 'card'),
    ('windows windows', 'windows'),
    ('laptop laptop', 'laptop'),
    ('hp hp', 'hp'),
)


def preprocess_input(docs, normalizations, seq_length, text_tokenizer=None):
    """Turn the text fields of one record into a cleaned, truncated feature string.

    Steps: join the usable values and lower-case them, blank out stop words and stop
    signs, delete the removal patterns, move one capacity mention ("64 gb" -> "64gb")
    to the front, apply the normalization table, collapse whitespace and finally keep
    only the first ``seq_length`` tokens.

    Args:
        docs: Sized iterable of field values (e.g. a DataFrame row). Only ``str`` values
            and non-NaN ``float`` values are used; everything else is ignored.
        normalizations: dict of ``term -> replacement`` applied with ``str.replace``,
            or ``None`` to skip normalization.
        seq_length: Maximum number of tokens kept.
        text_tokenizer: Optional object providing ``tokenize(str)`` and
            ``convert_tokens_to_string(list)``. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        The preprocessed string; ``''`` when there is no input or nothing is left
        after cleaning.
    """
    if len(docs) == 0:
        return ''

    # Exact type checks on purpose: this mirrors which values were accepted so far.
    doc = ' '.join(
        str(value) for value in docs
        if type(value) is str or (type(value) is float and not np.isnan(value))
    ).lower()

    for stop_word in _STOP_WORDS:
        doc = doc.replace(stop_word, ' ')

    for stop_sign in _STOP_SIGNS:
        doc = doc.replace(stop_sign, ' ')

    for removal_pattern in _REMOVAL_PATTERNS:
        doc = removal_pattern.sub('', doc)

    # Move GB pattern to beginning of doc. Candidates are ordered lexically and those
    # starting with '0' are discarded.
    capacities = sorted(found for found in _CAPACITY_PATTERN.findall(doc) if found[0] != '0')
    if capacities:
        doc = _CAPACITY_PATTERN.sub(' ', doc)
        # Only one pattern is kept --> might lead to problems, but we need to focus on the first tokens.
        # NOTE(review): 'gabeu' is matched above but not rewritten to 'gb' here - confirm intent.
        capacity = (capacities[0]
                    .replace(' ', '')
                    .replace('go', 'gb')
                    .replace('gbbeuk', 'gb')
                    .replace('gbbeu', 'gb')
                    .replace('bbeu', 'gb'))
        doc = '{} {}'.format(capacity, doc)

    doc = re.sub(r'\s\s+', ' ', doc)

    if normalizations is not None:
        for key, replacement in normalizations.items():
            doc = doc.replace(key, replacement)
        doc = re.sub(r'\s\s+', ' ', doc)
        # Clean up normalization
        for repeated, single in _NORMALIZATION_CLEANUP:
            doc = doc.replace(repeated, single)

    doc = re.sub(r'\s\s+', ' ', doc)
    doc = re.sub(r'\s*$', '', doc)
    doc = re.sub(r'^\s*', '', doc)

    if len(doc) == 0:
        return ''

    # Fall back to the notebook-level tokenizer when none is injected.
    active_tokenizer = text_tokenizer if text_tokenizer is not None else tokenizer
    tokens = active_tokenizer.tokenize(doc)
    return active_tokenizer.convert_tokens_to_string(tokens[:seq_length])
    
def build_buckets(pairs):
    """Group the ids of matching pairs into buckets (connected components).

    Two ids end up in the same bucket when they are linked directly or transitively
    through rows of ``pairs``. Implemented with a union-find structure, which replaces
    the former scan over all buckets per row plus repeated merge passes that removed
    list entries while iterating over the list.

    Args:
        pairs: DataFrame with the columns ``lid`` and ``rid``; one row per matching pair.

    Returns:
        list of sets of ids (as strings). Buckets are ordered by the first row in which
        one of their ids occurs, as before.
    """
    parent = {}  # id -> parent id; insertion order = order of first appearance

    def find(node):
        """Return the representative of ``node`` and compress the path walked."""
        root = node
        while parent[root] != root:
            root = parent[root]
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    for left, right in zip(pairs['lid'], pairs['rid']):
        left, right = str(left), str(right)
        parent.setdefault(left, left)
        parent.setdefault(right, right)
        left_root, right_root = find(left), find(right)
        if left_root != right_root:
            parent[right_root] = left_root

    # Collect the members per representative; dict order keeps the buckets in the
    # order in which their first id appeared.
    buckets = {}
    for node in parent:
        buckets.setdefault(find(node), set()).add(node)

    return list(buckets.values())

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Dataset 1: derive cluster ids from the labelled pairs, build the text features and
# store the result as the blocking-sigmod-1 training set.
data_1 = pd.read_csv('../../X1.csv')
labels_1 = pd.read_csv('../../Y1.csv')

# Every bucket of transitively linked ids becomes one cluster.
bucket_list = build_buckets(labels_1)
cluster_id_amount = len(bucket_list)
cluster_id_dict = {
    record_id: cluster_id
    for cluster_id, id_set in enumerate(bucket_list)
    for record_id in id_set
}

# Ids that occur in no labelled pair receive the placeholder id `cluster_id_amount` for now.
data_1['cluster_id'] = data_1['id'].apply(assign_clusterid, args=(cluster_id_dict, cluster_id_amount))

normalizations_x1 = load_normalization()

seq_length = 32
data_1['features'] = data_1[['title']].apply(preprocess_input, normalizations=normalizations_x1, seq_length=seq_length, axis=1)

# Give every record without a labelled match its own cluster id:
# cluster_id_amount, cluster_id_amount + 1, ...
is_single = data_1['cluster_id'] == cluster_id_amount
single_entities = data_1[is_single].reset_index(drop=True)
single_entities['cluster_id'] = single_entities['cluster_id'] + single_entities.index

# DataFrame.append was removed in pandas 2.0 - pd.concat is the supported replacement.
# Row order stays: labelled records first, single records last.
data_1 = pd.concat([data_1[~is_single], single_entities], ignore_index=True)

data_1 = data_1[['id', 'features', 'cluster_id']]

os.makedirs('../../data/processed/blocking-sigmod-1', exist_ok=True)
data_1.to_pickle('../../data/processed/blocking-sigmod-1/blocking-sigmod-1-train.pkl.gz', compression='gzip')

In [5]:
# Sanity check: both records of every labelled pair must carry the same cluster id.
# A vectorised lookup replaces the per-pair scan of data_1, and a plain assert replaces
# the leftover debugger call, which would block a non-interactive "Run All".
first_cluster_of = data_1.drop_duplicates(subset='id').set_index('id')['cluster_id']
left_clusters = labels_1['lid'].map(first_cluster_of)
right_clusters = labels_1['rid'].map(first_cluster_of)

# Ids missing from data_1 map to NaN; NaN != NaN, so such pairs are reported as well.
inconsistent_pairs = labels_1[left_clusters != right_clusters]
assert inconsistent_pairs.empty, (
    f'{len(inconsistent_pairs)} labelled pair(s) are not in one cluster:\n{inconsistent_pairs.head()}'
)

In [6]:
# Read the X2 records and the Y2 labelled pairs, then preview the records.
data_2, labels_2 = (pd.read_csv(csv_path) for csv_path in ('../../X2.csv', '../../Y2.csv'))
data_2.head(n=5)

Unnamed: 0,id,name,price,brand,description,category
0,139267,SONY CARTE LINE USB EXTREME 64 + C10 (1353969)...,52.99,SONY,,
1,270354,"Toshiba SDHC EXCERIA PRO 64GB, silber",59.95,,,
2,303124,"SANDISK SF32UX, USB USB m3.0 USB bis de GB MIC...",59.0,SANDISK,,
3,647192,Brand micro Pro microSDHC 16gb FLAIR 16GB Spee...,5290.0,,,
4,49177,Toshiba Exceria Pro N101 64GB SD Memory Card 2...,209.99,,,


In [7]:
# Preview the labelled pairs (lid / rid) of dataset 2.
labels_2.head(n=5)

Unnamed: 0,lid,rid
0,403101,443604
1,64846,860539
2,172465,1013447
3,609334,1210449
4,64790,634606


In [8]:
# Dataset 2: derive cluster ids from the labelled pairs, build the text features and
# store the result as the blocking-sigmod-2 training set.

# Every bucket of transitively linked ids becomes one cluster.
bucket_list = build_buckets(labels_2)
cluster_id_amount = len(bucket_list)
cluster_id_dict = {
    record_id: cluster_id
    for cluster_id, id_set in enumerate(bucket_list)
    for record_id in id_set
}

# Ids that occur in no labelled pair receive the placeholder id `cluster_id_amount` for now.
data_2['cluster_id'] = data_2['id'].apply(assign_clusterid, args=(cluster_id_dict, cluster_id_amount))

normalizations_x2 = load_normalization()

seq_length = 24
data_2['features'] = data_2[['name']].apply(preprocess_input, normalizations=normalizations_x2, seq_length=seq_length, axis=1)

# Give every record without a labelled match its own cluster id:
# cluster_id_amount, cluster_id_amount + 1, ...
is_single = data_2['cluster_id'] == cluster_id_amount
single_entities = data_2[is_single].reset_index(drop=True)
single_entities['cluster_id'] = single_entities['cluster_id'] + single_entities.index

# DataFrame.append was removed in pandas 2.0 - pd.concat is the supported replacement.
# Row order stays: labelled records first, single records last.
data_2 = pd.concat([data_2[~is_single], single_entities], ignore_index=True)

data_2 = data_2[['id', 'features', 'cluster_id']]

os.makedirs('../../data/processed/blocking-sigmod-2', exist_ok=True)
data_2.to_pickle('../../data/processed/blocking-sigmod-2/blocking-sigmod-2-train.pkl.gz', compression='gzip')

In [9]:
# Preview the processed dataset 2 (id, features, cluster_id).
data_2.head(n=5)

Unnamed: 0,id,features,cluster_id
0,139267,sony card line usb extreme 64 + c10 1353969 12...,442
1,270354,64gb toshiba sdhc exceria pro silver,59
2,303124,sandisk sf32ux usb m3. 0 usb bis gb micro 48mb s,660
3,647192,16gb brand micro pro microsdhc flair speed sdhc,218
4,49177,64gb toshiba exceria pro n101 sd memory card 2...,57


In [10]:
# Preview the processed dataset 1 (id, features, cluster_id).
data_1.head(n=5)

Unnamed: 0,id,features,cluster_id
0,270345,500gb aspire cube intel 6885 intel frame quad ...,628
1,163850,160gb panasonic latitude 14 b5232 solid duo bx...,143
2,180242,500gb panasonic 667374 b21 toshiba 2325 pc amd...,150
3,712728,8gb t540p n7110 us hd laptop notebook 4600u mi...,121
4,729116,128gb 1737 7000 chromebook i5 67y2625 android ...,456


In [11]:
# Load the preprocessed English WDC-LSPC corpus and preview its first rows.
data = pd.read_pickle(
    '../../data/interim/wdc-lspc/preprocessed_english_corpus.pkl.gz'
)
data.head(n=5)

Unnamed: 0,id,cluster_id,category,identifiers,title,description,brand,price,keyValuePairs,specTableContent
0,11920489,2533918,Musical_Instruments,[{'/productID': '[ritrgp5dbsg]'}],Ritter RGP5-D/BSG Performance 5 Series Bag: Dr...,The Ritter RGP5-D padded gigbag offers stylish...,,,,
1,12648455,11167803,Tools_and_Home_Improvement,[{'/sku': '[kro14802l]'}],Krowne - 14-802L 8 in Royal Series Wall Mount ...,The 14-802L Royal Series Wall Mount Faucet w/8...,Krowne,,,
2,7634831,11621476,Jewelry,[{'/mpn': '[me2105q163]'}],A. Jaffe Art Deco ME2105Q-163 Shop A. MES652-2...,"<p> An everlasting symbol of love, model numbe...",,,,
3,16519583,8824768,Sports_and_Outdoors,[{'/sku': '[135964829]'}],Gore bike wear Element Lady 2in1 Shorts Shorts...,Produktbeskrivning Gore bike wear Element Lady...,Gore bike wear,,,
4,3362858,7523117,Shoes,[{'/mpn': '[52853none8]'}],,,,USD,,


In [12]:
# Build one training file per (category, sequence length) from the WDC corpus.
relevant_cols = ['id', 'features', 'cluster_id']
categories = ['computers_only_new_15']

# NOTE(review): the output path climbs three directory levels ('../../../') while the input
# read in this cell uses two ('../../') - confirm which depth is intended.
out_path = '../../../data/processed/wdc-lspc/'
Path(out_path).mkdir(parents=True, exist_ok=True)

seq_lengths = [24, 28]

# The normalization table depends on neither category nor sequence length: load it once
# instead of re-reading the file in every iteration.
normalizations_x2 = load_normalization()

for category in categories:
    # The pair file and the record selection only depend on the category, so they are
    # computed once per category rather than once per (sequence length, category).
    ids = pd.read_pickle(f'../../data/raw/wdc-lspc/pre-training_{category}.pkl.gz')

    relevant_ids = set()
    relevant_ids.update(ids['id_left'])
    relevant_ids.update(ids['id_right'])

    category_records = data[data['id'].isin(relevant_ids)]

    for seq_length in seq_lengths:
        # Work on a copy so that each sequence length starts from the untouched selection.
        data_selection = category_records.copy()
        data_selection['features'] = data_selection[['brand', 'title']].apply(preprocess_input, normalizations=normalizations_x2, seq_length=seq_length, axis=1)
        data_selection = data_selection[relevant_cols].reset_index(drop=True)
        data_selection.to_pickle(f'{out_path}{category}_train_sigmod_{seq_length}.pkl.gz')

# Preview of the last selection written (last category, last sequence length).
data_selection.head()

Unnamed: 0,id,features,cluster_id
0,7634831,a. jaffe art deco me2105q 163 shop a. mes652 2...,11621476
1,3597309,8gb acer acer aspire 5600u ur308 i3 3110m 2. 4...,5572300
2,12905275,classy sandals melissa classy sandals shopbop,7503205
3,13035679,airsal pochette joints airsal sym 125cc 4 temp...,12521967
4,6685653,450gb hp enterprise bd450dajzh hp 10k fc al hd...,736725


In [3]:
# Read both pre-training pair sets (computers-only and 4-category).
ids_1, ids_2 = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../../../data/raw/wdc-lspc/pre-training-set/pre-training_computers_only_new_15.pkl.gz',
        '../../../data/raw/wdc-lspc/pre-training-set/pre-training_4cat_new_5.pkl.gz',
    )
)

In [5]:
# Re-export both pair sets as line-delimited JSON (one record per line) next to the pickles.
ids_1.to_json(
    '../../../data/raw/wdc-lspc/pre-training-set/pre-training_computers_only_new_15.json.gz',
    orient='records',
    lines=True,
)
ids_2.to_json(
    '../../../data/raw/wdc-lspc/pre-training-set/pre-training_4cat_new_5.json.gz',
    orient='records',
    lines=True,
)

In [3]:
# Load the de-duplicated, preprocessed English corpus. It is kept under the `_safe` name
# so the cells below can work on a copy.
deduped_corpus_safe = pd.read_pickle('../../../data/interim/wdc-lspc/corpus/dedup_preprocessed_english_corpus.pkl.gz')

In [4]:
# Preview the first rows of the loaded corpus.
deduped_corpus_safe.head(n=5)

Unnamed: 0,id,cluster_id,category,identifiers,title,description,brand,price,keyValuePairs,specTableContent
0,11920489,2533918,Musical_Instruments,[{'/productID': '[ritrgp5dbsg]'}],Ritter RGP5-D/BSG Performance 5 Series Bag: Dr...,The Ritter RGP5-D padded gigbag offers stylish...,,,,
1,12648455,11167803,Tools_and_Home_Improvement,[{'/sku': '[kro14802l]'}],Krowne - 14-802L 8 in Royal Series Wall Mount ...,The 14-802L Royal Series Wall Mount Faucet w/8...,Krowne,,,
2,7634831,11621476,Jewelry,[{'/mpn': '[me2105q163]'}],A. Jaffe Art Deco ME2105Q-163 Shop A. MES652-2...,"<p> An everlasting symbol of love, model numbe...",,,,
3,16519583,8824768,Sports_and_Outdoors,[{'/sku': '[135964829]'}],Gore bike wear Element Lady 2in1 Shorts Shorts...,Produktbeskrivning Gore bike wear Element Lady...,Gore bike wear,,,
5,12983937,3315185,Office_Products,[{'/productID': '[17001506]'}],Calendar for 2018 Vector Image Royalty-Free Im...,Calendar for 2018 on white background Vector I...,,,,


In [46]:
# Work on an independent (deep) copy so the loaded corpus stays untouched.
deduped_corpus = deduped_corpus_safe.copy(deep=True)

In [47]:
# Keep only the columns needed below.
deduped_corpus = deduped_corpus.loc[:, ['id', 'title', 'cluster_id']]

In [48]:
# First 64 characters of every title, later used as the de-duplication key.
# `.str[:64]` is vectorised and leaves missing titles as NaN, whereas `x[:64]` raises a
# TypeError on a float NaN. `assign` returns a new frame instead of writing a column into
# a frame that was derived from another one.
deduped_corpus = deduped_corpus.assign(title_short=deduped_corpus['title'].str[:64])

In [49]:
# Number of records before removing near-duplicate titles.
deduped_corpus.shape[0]

11480025

In [50]:
# Keep only the first record for every distinct title prefix.
deduped_corpus = deduped_corpus[~deduped_corpus.duplicated(subset=['title_short'], keep='first')]

In [51]:
# The helper column is no longer needed once the duplicates are gone.
deduped_corpus = deduped_corpus.drop('title_short', axis=1)

In [52]:
# Number of records left after removing near-duplicate titles.
deduped_corpus.shape[0]

10242009

In [53]:
# Number of records per cluster id, largest cluster first.
counts = deduped_corpus['cluster_id'].value_counts(sort=True, ascending=False)

In [54]:
# Keep clusters that have more than 1 and fewer than 26 records.
counts = counts[(counts > 1) & (counts < 26)]

In [55]:
# Restrict the corpus to records whose cluster passed the size filter.
deduped_corpus = deduped_corpus.loc[deduped_corpus['cluster_id'].isin(counts.index)]

In [56]:
# Number of records left after the cluster-size filter.
deduped_corpus.shape[0]

2092123

In [57]:
# Draw two disjoint samples of one million records each.
# Fixed seeds make the samples reproducible on every run.
SAMPLE_SIZE = 1_000_000
X_1 = deduped_corpus.sample(SAMPLE_SIZE, random_state=42)

# X_2 is drawn from the records not used by X_1. `deduped_corpus` itself is no longer
# overwritten, so re-running this cell yields the same two frames instead of shrinking
# the corpus a little more each time (a second re-run used to fail).
X_2 = deduped_corpus.drop(X_1.index).sample(SAMPLE_SIZE, random_state=43)

In [58]:
# Write both samples as CSV. The target directory is created first, as the other export
# cells of this notebook do, so the export cannot fail on a missing folder.
new_blocking_dir = Path('../../../data/raw/blocking-sigmod')
new_blocking_dir.mkdir(parents=True, exist_ok=True)

X_1.to_csv(new_blocking_dir / 'X1_new.csv', index=False)
X_2.to_csv(new_blocking_dir / 'X2_new.csv', index=False)