In [25]:
import pandas as pd
import numpy as np
import os
import itertools
import json
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Training

In [26]:
df = pd.read_csv('../datasets/labeled/labeled_with_titles_large.csv')
df.head()

Unnamed: 0,left_spec_id,right_spec_id,label,left_page_title,right_page_title
0,www.garricks.com.au//31,www.ebay.com//53278,1,nikon d3200 black w/ 18-55mm vr lens,nikon d3200 24 2 mp digital slr camera black k...
1,www.garricks.com.au//31,www.ebay.com//48947,0,nikon d3200 black w/ 18-55mm vr lens,canon eos 5d 12 8mp digital slr with battery g...
2,www.garricks.com.au//31,www.ebay.com//42569,0,nikon d3200 black w/ 18-55mm vr lens,nikon d800 36 3 mp mb d12 multi power battery ...
3,www.garricks.com.au//31,www.shopbot.com.au//1376,0,nikon d3200 black w/ 18-55mm vr lens,nikon d7000 / 18-105mm vr kit - price comparis...
4,www.garricks.com.au//31,www.ebay.com//55623,0,nikon d3200 black w/ 18-55mm vr lens,nikon d7000 16 2 mp digital slr camera black k...


In [27]:
y = df['label'].copy()
y

0        1
1        0
2        0
3        0
4        0
        ..
46660    1
46661    1
46662    1
46663    1
46664    1
Name: label, Length: 46665, dtype: int64

In [28]:
titles = np.hstack([df['left_page_title'].values, df['right_page_title'].values])
titles.shape

(93330,)

In [29]:
vect = TfidfVectorizer()
vect.fit(titles)
left = vect.transform(df['left_page_title']).toarray()
right = vect.transform(df['right_page_title']).toarray()

In [30]:
rows = []
for l, r in zip(left, right):
    elem = {'left_page_title': l, 'right_page_title': r}
    rows.append(elem)
    
X = pd.DataFrame(rows)
X

Unnamed: 0,left_page_title,right_page_title
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.2362400321865185, 0.0, 0.0, 0.0, ..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
46660,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46661,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46662,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46663,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [31]:
X = X.to_numpy()
X = np.array([np.concatenate(x) for x in X])
X.shape

(46665, 980)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Naive Bayes

In [33]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
nb.score(X_test, y_test)

0.9390335369120326

### Random forests

In [56]:
rf = RandomForestClassifier(random_state=42, n_jobs=2)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [57]:
rf.score(X_test, y_test)

0.9950712525447337

# Testing

In [63]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [64]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')



  0%|          | 0/24 [00:00<?, ?it/s][A[A

  4%|▍         | 1/24 [00:00<00:03,  7.37it/s][A[A

>>> Creating dataframe...





 12%|█▎        | 3/24 [00:00<00:02,  8.96it/s][A[A

 17%|█▋        | 4/24 [00:00<00:03,  6.25it/s][A[A

 21%|██        | 5/24 [00:00<00:02,  6.45it/s][A[A

 25%|██▌       | 6/24 [00:00<00:02,  6.96it/s][A[A

 33%|███▎      | 8/24 [00:04<00:10,  1.46it/s][A[A

 46%|████▌     | 11/24 [00:04<00:06,  2.02it/s][A[A

 50%|█████     | 12/24 [00:04<00:04,  2.63it/s][A[A

 62%|██████▎   | 15/24 [00:05<00:02,  3.55it/s][A[A

 71%|███████   | 17/24 [00:07<00:04,  1.68it/s][A[A

 75%|███████▌  | 18/24 [00:08<00:02,  2.00it/s][A[A

 79%|███████▉  | 19/24 [00:09<00:03,  1.50it/s][A[A

 83%|████████▎ | 20/24 [00:09<00:02,  1.86it/s][A[A

 88%|████████▊ | 21/24 [00:09<00:01,  2.25it/s][A[A

 96%|█████████▌| 23/24 [00:09<00:00,  2.98it/s][A[A

100%|██████████| 24/24 [00:09<00:00,  3.44it/s][A[A

>>> Dataframe created successfully!



In [65]:
df = df[['spec_id', 'page_title']]
df.head()

Unnamed: 0,spec_id,page_title
0,www.mypriceindia.com//50,"panasonic hc v130 price in india, bangalore, h..."
1,www.mypriceindia.com//34,canon eos 1100d (ef-s 18-55 mm is ii) price in...
2,www.mypriceindia.com//47,"panasonic lumix dmc tz30 price in india, banga..."
3,www.mypriceindia.com//40,"sony alpha ilce 7s (body only) price in india,..."
4,www.mypriceindia.com//726,"samsung st72 price in india, bangalore, hydera..."


In [66]:
chunk_size = 100
number_chunks = len(df) // chunk_size + 1
for i in tqdm(range(number_chunks)):
    # Takes a small chunk
    df_small = df[i * chunk_size:(i + 1) * chunk_size].copy()
    
    # Computes the numerical representation of the title
    df_small['page_title'] = df_small['page_title'].apply(lambda x: vect.transform([x]).toarray().flatten())
    
    # Computes all pairs
    merged = (df_small.merge(df_small, on=df_small.assign(key_col=1)['key_col'], suffixes=('', '_right'))
    .query('spec_id < spec_id_right') # filter out joins on the same row
    .reset_index(drop=True))
    merged.drop(columns = ["key_0"], axis = 1, inplace=True)
    merged["zipped"] = tuple(map(lambda line : sorted(line), list(zip(merged["spec_id"], merged["spec_id_right"]))))
    merged.drop_duplicates("zipped", inplace=True)
    merged.drop(columns=["zipped"], inplace=True)
    merged.rename(columns = {"spec_id" : "left_spec_id", "spec_id_right" : "right_spec_id"}, inplace=True)
    merged.reset_index(inplace=True)
    merged.drop(columns=['index'], inplace=True)
    merged.rename(columns = {"page_title" : "left_page_title", "page_title_right" : "right_page_title"}, inplace=True)
    
    # Flattens the embedding, generates one matrix with a column per feature
    X = merged[['left_page_title', 'right_page_title']]
    X = X.to_numpy()
    X = np.array([np.concatenate(x) for x in X])
    
    # Predicts the labels using random forests, change 'rf' to 'nb' for naive Bayes
    merged['label'] = rf.predict(X)
    
    # Filters the result, keeping only the matches
    matches = merged.query('label == 1')
    
    # Saves the matches to a CSV file
    output = matches[['left_spec_id', 'right_spec_id']]
    output.to_csv('submission.csv', index=False, mode='a')



  0%|          | 0/298 [00:00<?, ?it/s][A[A

  0%|          | 1/298 [00:00<02:03,  2.40it/s][A[A

  1%|          | 2/298 [00:00<01:59,  2.48it/s][A[A

  1%|          | 3/298 [00:01<01:57,  2.50it/s][A[A

  1%|▏         | 4/298 [00:01<01:56,  2.51it/s][A[A

  2%|▏         | 5/298 [00:01<01:54,  2.56it/s][A[A

  2%|▏         | 6/298 [00:02<01:55,  2.53it/s][A[A

  2%|▏         | 7/298 [00:02<02:01,  2.40it/s][A[A

  3%|▎         | 8/298 [00:03<02:00,  2.41it/s][A[A

  3%|▎         | 9/298 [00:03<01:56,  2.47it/s][A[A

  3%|▎         | 10/298 [00:04<01:55,  2.50it/s][A[A

  4%|▎         | 11/298 [00:04<01:53,  2.54it/s][A[A

  4%|▍         | 12/298 [00:04<01:54,  2.51it/s][A[A

  4%|▍         | 13/298 [00:05<01:54,  2.49it/s][A[A

  5%|▍         | 14/298 [00:05<01:52,  2.51it/s][A[A

KeyboardInterrupt: 