In [1]:
import os
import sys
import random
import json
import collections

import pandas as pd
import numpy as np
import scipy
import statsmodels

from tqdm import trange, tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

plt.rcParams["figure.figsize"] = (10,7)
pd.options.display.max_columns = 999

sns.set()

In [2]:
from annoy import AnnoyIndex

In [3]:
os.getcwd()

'/home/boris/study/UT study/ACM Sigmod/sigmod/notebooks'

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
from sigmod_src.utils import read_json, pprint_json, path_from_spec_id, get_vector_for_spec_id
from sigmod_src.data.make_dataset import make_specs_dataset, preprocess_specs_dataset, make_labelled_dataset, make_classes_df
from sigmod_src.features.build_features import make_features
from sigmod_src.data.embedding import Embedder

[nltk_data] Downloading package punkt to /home/boris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load raw data

In [6]:
SPECS_PATH = '../data/raw/2013_camera_specs/'

In [7]:
MED_LABELS_PATH = '../data/raw/sigmod_medium_labelled_dataset.csv'
LG_LABELS_PATH = '../data/raw/sigmod_large_labelled_dataset.csv'

# Make disjoint labelled datasets

In [8]:
med_labels_df = pd.read_csv(MED_LABELS_PATH)
lg_labels_df = pd.read_csv(LG_LABELS_PATH)

med_labels_df.shape, lg_labels_df.shape

((46665, 3), (297651, 3))

In [9]:
lg_no_overlap_df = lg_labels_df[~lg_labels_df.isin(med_labels_df)].dropna()
lg_no_overlap_df.shape

(250986, 3)

In [10]:
lg_no_overlap_df.to_csv('../data/interim/large_labelled_dataset_disjoint.csv', index=None)

# Make specs dataset

Extract fields from json, create one csv file

In [11]:
specs_df = make_specs_dataset(SPECS_PATH)

In [12]:
specs_df.shape

(29787, 3)

In [16]:
specs_df.head()

[nltk_data] Downloading package punkt to /home/boris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,spec_id,page_title,all_text,page_title_stem,all_text_stem,brand,site
0,www.ebay.com//57656,canon powershot elph 110 hs 16 1 mp,canon powershot elph 110 hs 16 1 mp canon manu...,canon powershot elph 110 hs 16 1 mp,canon powershot elph 110 hs 16 1 mp canon manu...,canon,www.ebay.com
1,www.ebay.com//60583,canon rebel 2000 35 mm great case instruction ...,canon rebel 2000 35 mm great case instruction ...,canon rebel 2000 35 mm great case instruct boo...,canon rebel 2000 35 mm great case instruct boo...,canon,www.ebay.com
2,www.ebay.com//60440,canon eos rebel t3i 18 55mm 75 300mm iii lens ...,canon eos rebel t3i 18 55mm 75 300mm iii lens ...,canon eo rebel t3i 18 55mm 75 300mm iii len ki...,canon eo rebel t3i 18 55mm 75 300mm iii len ki...,canon,www.ebay.com
3,www.ebay.com//24139,ge c1033 10 1 mp 3x zoom 2 4 lcd,ge c1033 10 1 mp 3x zoom 2 4 lcd ge unused uno...,ge c1033 10 1 mp 3x zoom 2 4 lcd,ge c1033 10 1 mp 3x zoom 2 4 lcd ge unus unope...,,www.ebay.com
4,www.ebay.com//54903,vivitar clip shot 1 1 mp,vivitar clip shot 1 1 mp vivitar unused unopen...,vivitar clip shot 1 1 mp,vivitar clip shot 1 1 mp vivitar unus unopen u...,vivitar,www.ebay.com


In [22]:
specs_df = preprocess_specs_dataset(specs_df)
specs_df.head()

[nltk_data] Downloading package punkt to /home/boris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,spec_id,page_title,all_text,page_title_stem,all_text_stem,brand,site
0,www.ebay.com//57656,canon powershot elph 110 hs 16 1 mp,canon powershot elph 110 hs 16 1 mp canon manu...,canon powershot elph 110 hs 16 1 mp,canon powershot elph 110 hs 16 1 mp canon manu...,canon,www.ebay.com
1,www.ebay.com//60583,canon rebel 2000 35 mm great case instruction ...,canon rebel 2000 35 mm great case instruction ...,canon rebel 2000 35 mm great case instruct boo...,canon rebel 2000 35 mm great case instruct boo...,canon,www.ebay.com
2,www.ebay.com//60440,canon eos rebel t3i 18 55mm 75 300mm iii lens ...,canon eos rebel t3i 18 55mm 75 300mm iii lens ...,canon eo rebel t3i 18 55mm 75 300mm iii len ki...,canon eo rebel t3i 18 55mm 75 300mm iii len ki...,canon,www.ebay.com
3,www.ebay.com//24139,ge c1033 10 1 mp 3x zoom 2 4 lcd,ge c1033 10 1 mp 3x zoom 2 4 lcd ge unused uno...,ge c1033 10 1 mp 3x zoom 2 4 lcd,ge c1033 10 1 mp 3x zoom 2 4 lcd ge unus unope...,,www.ebay.com
4,www.ebay.com//54903,vivitar clip shot 1 1 mp,vivitar clip shot 1 1 mp vivitar unused unopen...,vivitar clip shot 1 1 mp,vivitar clip shot 1 1 mp vivitar unus unopen u...,vivitar,www.ebay.com


In [23]:
specs_df.sample(5)[['page_title', 'all_text']].values

array([['2015 waterproof case sale waterproof case waterproof shockproof case waterproof case nikon',
        '2015 waterproof case sale waterproof case waterproof shockproof case waterproof case nikon us piece get latest 1000 western union shenzhen 30000 per month'],
       ['nikon d5100 18 55mm vr 3 lens 16gb case filters battery',
        'delivers lifelike images vivid colors reduced noise smooth tonal gradations enables capture action four frames per second nikon d5100 18 55mm vr 3 lens 16gb case filters battery included 1 76 28 maximum minimum 22 video mono yes 1x rechargeable battery pack nikon case bag extra battery flash lens memory card dx crop factor interchangeable lenses unused unopened undamaged original packaging packaging applicable packaging found retail store unless handmade packaged manufacturer packaging unprinted box plastic bag seller full details opens window tab read moreabout microphone av output hdmi c mini usb 4 fps discovered defective upon receipt replaced 

In [24]:
specs_df.to_csv('../data/processed/specs_preprocessed.csv', index=None)

# Make classes from labels

Each class - camera model. All specs that are duplicates of eachother consitute the same class. In a graph one class = one connected component.

In [88]:
classes_df = make_classes_df(pd.concat([med_labels_df, lg_no_overlap_df], axis=0))
classes_df.shape

(908, 2)

In [93]:
classes_df.class_.value_counts()

6     178
2     130
11     95
4      91
13     80
7      78
9      57
1      55
12     16
24     14
10     14
19     11
20      9
5       9
23      9
3       9
17      7
0       6
14      5
16      5
18      5
15      4
21      4
26      4
25      3
28      2
8       2
22      2
27      2
29      2
Name: class_, dtype: int64

In [77]:
classes_df.head()

Unnamed: 0,spec_id,class_
0,buy.net//5641,0
1,www.ebay.com//58588,0
2,www.gosale.com//849,0
3,www.price-hunt.com//9794,0
4,buy.net//5698,1


In [94]:
classes_df.drop_duplicates().shape

(908, 2)

In [95]:
classes_df.to_csv('../data/interim/classes.csv', index=None)

# Make embedding

In [133]:
emb = Embedder(vector_size=100,
               train_epochs=100,
               index_trees=1000)

[nltk_data] Downloading package punkt to /home/boris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
emb.fit(specs_df.page_title_stem, specs_df.spec_id)

Epoch #10
Epoch #20
Epoch #30
Epoch #40
Epoch #50
Epoch #60
Epoch #70
Epoch #80
Epoch #90
Epoch #100


  index = AnnoyIndex(num_features)


Quick test by eye

In [None]:
random_row = specs_df.sample(1).iloc[0]
text, spec_id = random_row.page_title_stem, random_row.spec_id

nns = emb_lookup(text, emb, indexer)

nn_pairs = []
for nn in nns:
    nn_spec_id, dist = nn
    nn_row = specs_df[specs_df.spec_id == nn_spec_id].iloc[0]
    nn_pairs.append((nn_row.spec_id, nn_row.page_title_stem))
print(f'[{spec_id}] ', text)
print('')
print('Most similar')
for pair in nn_pairs:
    print(f'[{pair[0]}] ', pair[1])

In [None]:
emb.save('../models/embedder')

# Make features for classifier

In [None]:
spec_features_df, vectorizers, site_le = make_features(specs_df)

In [None]:
spec_features_df.shape

In [None]:
spec_features_df.head()

In [None]:
spec_features_df.info()

In [None]:
spec_features_df = specs_df.merge(spec_features_df, on='spec_id', how='left')

In [None]:
spec_features_df.head()

In [None]:
spec_features_df.to_csv('../data/processed/specs_features.csv', index=None)

# Make labelled dataset

In [None]:
labels_df = make_labelled_dataset(LABELS_PATH, spec_features_df)
labels_df.shape

In [None]:
labels_df.head()

In [None]:
labels_df.info()

In [None]:
labels_df.to_csv('../data/processed/labelled_features.csv', index=None)