In [1]:
import pandas as pd
import numpy as np
import os
import itertools
import json
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Training

In [2]:
df = pd.read_csv('../datasets/labeled/labeled_with_titles_large.csv')
df.head()

Unnamed: 0,left_spec_id,right_spec_id,label,left_page_title,right_page_title
0,www.ebay.com//53278,www.garricks.com.au//31,1,nikon d3200 24 2 mp digital slr camera black k...,nikon d3200 black w/ 18-55mm vr lens
1,www.ebay.com//53278,www.priceme.co.nz//2246,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 1200d + 18-55/3.5-5.6 new zealand p...
2,www.ebay.com//53278,www.shopbot.com.au//1376,0,nikon d3200 24 2 mp digital slr camera black k...,nikon d7000 / 18-105mm vr kit - price comparis...
3,www.ebay.com//53278,www.flipkart.com//2193,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 1200d kit (ef s18-55 is ii + 55-250 ...
4,www.ebay.com//53278,www.ebay.com//58781,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 5d digital slr camera black body onl...


In [3]:
y = df['label'].copy()
y

0         1
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         1
10        0
11        0
12        1
13        0
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        1
25        0
26        0
27        0
28        0
29        0
         ..
303147    0
303148    0
303149    0
303150    0
303151    0
303152    0
303153    0
303154    0
303155    0
303156    0
303157    0
303158    0
303159    0
303160    0
303161    0
303162    0
303163    0
303164    0
303165    0
303166    0
303167    0
303168    0
303169    0
303170    0
303171    0
303172    0
303173    0
303174    0
303175    0
303176    0
Name: label, Length: 303177, dtype: int64

In [4]:
titles = np.hstack([df['left_page_title'].values, df['right_page_title'].values])
titles.shape

(606354,)

In [5]:
vect = TfidfVectorizer()
vect.fit(titles)
left = vect.transform(df['left_page_title']).toarray()
right = vect.transform(df['right_page_title']).toarray()

In [6]:
rows = []
for l, r in zip(left, right):
    elem = {'left_page_title': l, 'right_page_title': r}
    rows.append(elem)
    
X = pd.DataFrame(rows)
X

Unnamed: 0,left_page_title,right_page_title
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.5585104430195296, 0.0, ..."
5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [7]:
X = X.to_numpy()
X = np.array([np.concatenate(x) for x in X])
X.shape

(303177, 1582)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Naive Bayes

In [9]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
nb.score(X_test, y_test)

0.9002407810541593

### Random forests

In [11]:
rf = RandomForestClassifier(random_state=42, n_jobs=2)
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
                       oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
rf.score(X_test, y_test)

0.9985652087868593

# Testing

In [13]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [14]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  8%|▊         | 2/24 [00:00<00:01, 11.15it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:13<00:00,  2.52it/s]

>>> Dataframe created successfully!






In [15]:
df = df[['spec_id', 'page_title']]
df.head()

Unnamed: 0,spec_id,page_title
0,www.wexphotographic.com//154,nikon coolpix aw120 digital camera - camouflag...
1,www.wexphotographic.com//553,canon ixus 150 digital camera - red (9148b007a...
2,www.wexphotographic.com//601,fuji finepix s1 digital camera (p10nc12730a) -...
3,www.wexphotographic.com//197,nikon coolpix s5300 digital camera - black (vn...
4,www.wexphotographic.com//178,fuji finepix s8600 digital camera - red (p10nc...


In [16]:
chunk_size = 100
number_chunks = len(df) // chunk_size + 1
for i in tqdm(range(number_chunks)):
    # Takes a small chunk
    df_small = df[i * chunk_size:(i + 1) * chunk_size].copy()
    
    # Computes the numerical representation of the title
    df_small['page_title'] = df_small['page_title'].apply(lambda x: vect.transform([x]).toarray().flatten())
    
    # Computes all pairs
    merged = (df_small.merge(df_small, on=df_small.assign(key_col=1)['key_col'], suffixes=('', '_right'))
    .query('spec_id < spec_id_right') # filter out joins on the same row
    .reset_index(drop=True))
    merged.drop(columns = ["key_0"], axis = 1, inplace=True)

    merged.rename(columns = {"spec_id" : "left_spec_id", "spec_id_right" : "right_spec_id"}, inplace=True)
    merged.reset_index(inplace=True)
    merged.drop(columns=['index'], inplace=True)
    merged.rename(columns = {"page_title" : "left_page_title", "page_title_right" : "right_page_title"}, inplace=True)
    
    # Flattens the embedding, generates one matrix with a column per feature
    X = merged[['left_page_title', 'right_page_title']]
    X = X.to_numpy()
    X = np.array([np.concatenate(x) for x in X])
    
    # Predicts the labels using random forests, change 'rf' to 'nb' for naive Bayes
    merged['label'] = rf.predict(X)
    
    # Filters the result, keeping only the matches
    matches = merged.query('label == 1')
    
    # Saves the matches to a CSV file
    output = matches[['left_spec_id', 'right_spec_id']]
    output.to_csv('submission.csv', index=False, mode='a')

100%|██████████| 298/298 [01:07<00:00,  4.54it/s]
