In [63]:
import pandas as pd
import numpy as np
import os
import itertools
import json
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Training

In [64]:
df = pd.read_csv('../datasets/labeled/labeled_with_titles_medium.csv')
df.head()

Unnamed: 0,left_spec_id,right_spec_id,label,left_page_title,right_page_title
0,www.garricks.com.au//31,www.ebay.com//53278,1,nikon d3200 black w/ 18-55mm vr lens,nikon d3200 24 2 mp digital slr camera black k...
1,www.garricks.com.au//31,www.ebay.com//48947,0,nikon d3200 black w/ 18-55mm vr lens,canon eos 5d 12 8mp digital slr with battery g...
2,www.garricks.com.au//31,www.ebay.com//42569,0,nikon d3200 black w/ 18-55mm vr lens,nikon d800 36 3 mp mb d12 multi power battery ...
3,www.garricks.com.au//31,www.shopbot.com.au//1376,0,nikon d3200 black w/ 18-55mm vr lens,nikon d7000 / 18-105mm vr kit - price comparis...
4,www.garricks.com.au//31,www.ebay.com//55623,0,nikon d3200 black w/ 18-55mm vr lens,nikon d7000 16 2 mp digital slr camera black k...


In [65]:
y = df['label'].copy()
y

0        1
1        0
2        0
3        0
4        0
        ..
46660    1
46661    1
46662    1
46663    1
46664    1
Name: label, Length: 46665, dtype: int64

In [66]:
titles = np.hstack([df['left_page_title'].values, df['right_page_title'].values])
titles.shape

(93330,)

In [67]:
vect = TfidfVectorizer()
vect.fit(titles)
left = vect.transform(df['left_page_title']).toarray()
right = vect.transform(df['right_page_title']).toarray()

In [68]:
rows = []
for l, r in zip(left, right):
    elem = {'left_page_title': l, 'right_page_title': r}
    rows.append(elem)
    
X = pd.DataFrame(rows)
X

Unnamed: 0,left_page_title,right_page_title
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.2362400321865185, 0.0, 0.0, 0.0, ..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
46660,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46661,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46662,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46663,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [69]:
X = X.to_numpy()
X = np.array([np.concatenate(x) for x in X])
X.shape

(46665, 980)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [71]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [72]:
clf.score(X_test, y_test)

0.9390335369120326

# Testing

In [76]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [77]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  4%|▍         | 1/24 [00:00<00:02,  8.54it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:10<00:00,  3.63it/s]

>>> Dataframe created successfully!






In [78]:
df = df[['spec_id', 'page_title']]
df.head()

Unnamed: 0,spec_id,page_title
0,www.mypriceindia.com//50,"panasonic hc v130 price in india, bangalore, h..."
1,www.mypriceindia.com//34,canon eos 1100d (ef-s 18-55 mm is ii) price in...
2,www.mypriceindia.com//47,"panasonic lumix dmc tz30 price in india, banga..."
3,www.mypriceindia.com//40,"sony alpha ilce 7s (body only) price in india,..."
4,www.mypriceindia.com//726,"samsung st72 price in india, bangalore, hydera..."


In [79]:
df_small = df.sample(frac=0.001)
df_small.head()

Unnamed: 0,spec_id,page_title
162,www.mypriceindia.com//10,"nikon d4s (body only) price in india, bangalor..."
11445,www.ebay.com//43022,canon powershot g12 digital camera | ebay
19600,www.alibaba.com//25236,hikvision ip camera dome 3 megapixel mini came...
8929,www.ebay.com//24368,kodak easyshare m590 14 0 mp digital camera si...
825,www.shopmania.in//1629,"casio exilim ex-s12 digital camera prices, sho..."


In [80]:
df_small['page_title'] = df_small['page_title'].apply(lambda x: vect.transform([x]).toarray().flatten())

In [81]:
merged = (df_small.merge(df_small, on=df_small.assign(key_col=1)['key_col'], suffixes=('', '_right'))
.query('spec_id < spec_id_right') # filter out joins on the same row
.reset_index(drop=True))
merged.drop(columns = ["key_0"], axis = 1, inplace=True)
merged["zipped"] = tuple(map(lambda line : sorted(line), list(zip(merged["spec_id"], merged["spec_id_right"]))))
merged.drop_duplicates("zipped", inplace=True)
merged.drop(columns=["zipped"], inplace=True)
merged.rename(columns = {"spec_id" : "left_spec_id", "spec_id_right" : "right_spec_id"}, inplace=True)
merged.reset_index(inplace=True)
merged.drop(columns=['index'], inplace=True)
merged.rename(columns = {"page_title" : "left_page_title", "page_title_right" : "right_page_title"}, inplace=True)
merged.head()

Unnamed: 0,left_spec_id,left_page_title,right_spec_id,right_page_title
0,www.mypriceindia.com//10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.shopmania.in//1629,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,www.mypriceindia.com//10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.walmart.com//692,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,www.mypriceindia.com//10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.mypriceindia.com//726,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,www.ebay.com//43022,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.mypriceindia.com//10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,www.ebay.com//43022,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.shopmania.in//1629,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [82]:
X = merged[['left_page_title', 'right_page_title']]
X.head()

Unnamed: 0,left_page_title,right_page_title
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [83]:
X = X.to_numpy()
X = np.array([np.concatenate(x) for x in X])
X.shape

(435, 980)

In [85]:
labels = clf.predict(X)

In [86]:
merged['label'] = labels
merged.head()

Unnamed: 0,left_spec_id,left_page_title,right_spec_id,right_page_title,label
0,www.mypriceindia.com//10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.shopmania.in//1629,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1,www.mypriceindia.com//10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.walmart.com//692,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2,www.mypriceindia.com//10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.mypriceindia.com//726,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
3,www.ebay.com//43022,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.mypriceindia.com//10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
4,www.ebay.com//43022,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.shopmania.in//1629,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [87]:
matches = merged.query('label == 1')
matches

Unnamed: 0,left_spec_id,left_page_title,right_spec_id,right_page_title,label
332,www.ebay.com//57363,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.ebay.com//58183,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
423,www.ebay.com//24752,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",www.ebay.com//53340,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


In [90]:
output = matches[['left_spec_id', 'right_spec_id']]
output.to_csv('submission.csv', index=False)