In [1]:
import pandas as pd
import numpy as np
import os
import itertools
import json
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Training

In [2]:
df = pd.read_csv('../datasets/labeled/labeled_with_titles_large.csv')
df.head()

Unnamed: 0,left_spec_id,right_spec_id,label,left_page_title,right_page_title
0,www.ebay.com//53278,www.garricks.com.au//31,1,nikon d3200 24 2 mp digital slr camera black k...,nikon d3200 black w/ 18-55mm vr lens
1,www.ebay.com//53278,www.priceme.co.nz//2246,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 1200d + 18-55/3.5-5.6 new zealand p...
2,www.ebay.com//53278,www.shopbot.com.au//1376,0,nikon d3200 24 2 mp digital slr camera black k...,nikon d7000 / 18-105mm vr kit - price comparis...
3,www.ebay.com//53278,www.flipkart.com//2193,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 1200d kit (ef s18-55 is ii + 55-250 ...
4,www.ebay.com//53278,www.ebay.com//58781,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 5d digital slr camera black body onl...


In [3]:
y = df['label'].copy()
y

0         1
1         0
2         0
3         0
4         0
         ..
303172    0
303173    0
303174    0
303175    0
303176    0
Name: label, Length: 303177, dtype: int64

In [4]:
titles = np.hstack([df['left_page_title'].values, df['right_page_title'].values])
titles.shape

(606354,)

In [5]:
vect = TfidfVectorizer()
vect.fit(titles)
left = vect.transform(df['left_page_title']).toarray()
right = vect.transform(df['right_page_title']).toarray()

In [6]:
rows = []
for l, r in zip(left, right):
    elem = {'left_page_title': l, 'right_page_title': r}
    rows.append(elem)
    
X = pd.DataFrame(rows)
X.head()

Unnamed: 0,left_page_title,right_page_title
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.5585104430195296, 0.0, ..."


In [7]:
X = X.to_numpy()
X = np.array([np.concatenate(x) for x in X])
X.shape

(303177, 1582)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Naive Bayes

In [None]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [None]:
nb.score(X_test, y_test)

### Random forests

In [9]:
rf = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=2)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=2, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [10]:
rf.score(X_test, y_test)

0.999076456230622

### SVM

In [None]:
svc = svm.LinearSVC()
svc.fit(X_train, y_train)

In [None]:
svc.score(X_test, y_test)

### Gradient boosted trees

In [None]:
gbt = GradientBoostingClassifier()
gbt.fit(X_train, y_train)

In [None]:
gbt.score(X_test, y_test)

# Testing

In [21]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [22]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')


  0%|          | 0/24 [00:00<?, ?it/s][A
 17%|█▋        | 4/24 [00:00<00:00, 30.40it/s][A

>>> Creating dataframe...




 33%|███▎      | 8/24 [00:00<00:01, 11.86it/s][A
 58%|█████▊    | 14/24 [00:01<00:00, 15.46it/s][A
 71%|███████   | 17/24 [00:01<00:00, 10.27it/s][A
 88%|████████▊ | 21/24 [00:01<00:00, 12.62it/s][A
100%|██████████| 24/24 [00:01<00:00, 13.27it/s][A

>>> Dataframe created successfully!



In [23]:
df = df[['spec_id', 'page_title']]
df.head()

Unnamed: 0,spec_id,page_title
0,www.mypriceindia.com//50,"panasonic hc v130 price in india, bangalore, h..."
1,www.mypriceindia.com//34,canon eos 1100d (ef-s 18-55 mm is ii) price in...
2,www.mypriceindia.com//47,"panasonic lumix dmc tz30 price in india, banga..."
3,www.mypriceindia.com//40,"sony alpha ilce 7s (body only) price in india,..."
4,www.mypriceindia.com//726,"samsung st72 price in india, bangalore, hydera..."


In [24]:
with open('submission.csv', 'w') as file:
    file.write('left_spec_id,right_spec_id\n')

In [25]:
chunk_size = 100
number_chunks = len(df) // chunk_size + 1
for i in tqdm(range(number_chunks)):
    # Takes a small chunk
    df_small = df[i * chunk_size:(i + 1) * chunk_size].copy()
    
    # Computes the numerical representation of the title
    df_small['page_title'] = df_small['page_title'].apply(lambda x: vect.transform([x]).toarray().flatten())
    
    # Computes all pairs
    merged = (df_small.merge(df_small, on=df_small.assign(key_col=1)['key_col'], suffixes=('', '_right'))
    .query('spec_id < spec_id_right') # filter out joins on the same row
    .reset_index(drop=True))
    merged.drop(columns = ["key_0"], axis = 1, inplace=True)

    merged.rename(columns = {"spec_id" : "left_spec_id", "spec_id_right" : "right_spec_id"}, inplace=True)
    merged.reset_index(inplace=True)
    merged.drop(columns=['index'], inplace=True)
    merged.rename(columns = {"page_title" : "left_page_title", "page_title_right" : "right_page_title"}, inplace=True)
    
    # Flattens the embedding, generates one matrix with a column per feature
    X = merged[['left_page_title', 'right_page_title']]
    X = X.to_numpy()
    X = np.array([np.concatenate(x) for x in X])
    
    # Predicts the labels using random forests, change 'rf' to 'nb' for naive Bayes
    merged['label'] = rf.predict(X)
    
    # Filters the result, keeping only the matches
    matches = merged.query('label == 1')
    
    # Saves the matches to a CSV file
    output = matches[['left_spec_id', 'right_spec_id']]
    output.to_csv('submission.csv', index=False, header=False, mode='a')


  0%|          | 0/298 [00:00<?, ?it/s][A
  0%|          | 1/298 [00:01<09:19,  1.88s/it][A
  1%|          | 2/298 [00:03<09:24,  1.91s/it][A
  1%|          | 3/298 [00:05<09:02,  1.84s/it][A
  1%|▏         | 4/298 [00:07<09:00,  1.84s/it][A
  2%|▏         | 5/298 [00:09<09:05,  1.86s/it][A
  2%|▏         | 6/298 [00:11<09:37,  1.98s/it][A
  2%|▏         | 7/298 [00:13<09:33,  1.97s/it][A
  3%|▎         | 8/298 [00:16<10:28,  2.17s/it][A
  3%|▎         | 9/298 [00:18<10:19,  2.14s/it][A
  3%|▎         | 10/298 [00:20<09:56,  2.07s/it][A
  4%|▎         | 11/298 [00:22<09:56,  2.08s/it][A
  4%|▍         | 12/298 [00:24<09:31,  2.00s/it][A
  4%|▍         | 13/298 [00:25<09:13,  1.94s/it][A
  5%|▍         | 14/298 [00:28<09:45,  2.06s/it][A
  5%|▌         | 15/298 [00:30<09:30,  2.02s/it][A
  5%|▌         | 16/298 [00:32<09:40,  2.06s/it][A
  6%|▌         | 17/298 [00:34<09:38,  2.06s/it][A
  6%|▌         | 18/298 [00:36<09:09,  1.96s/it][A
  6%|▋         | 19/298 [00:3