In [1]:
import os
import sys
import random
import json
import collections

import pandas as pd
import numpy as np
import scipy
import statsmodels

from tqdm import trange, tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

# Notebook-wide display defaults.
plt.rcParams["figure.figsize"] = (10, 7)  # default figure size in inches (width, height)
pd.set_option("display.max_columns", 999)  # show up to 999 columns before pandas elides wide frames

sns.set()  # apply seaborn's default theme to matplotlib

In [2]:
from annoy import AnnoyIndex

In [3]:
# Show the working directory: every '../data' and '../models' path used in this
# notebook is relative and is resolved against it.
os.getcwd()

'/home/boris/study/UT study/ACM Sigmod/sigmod/notebooks'

In [4]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell runs, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [7]:
from sigmod_src.utils import read_json, pprint_json, path_from_spec_id, get_vector_for_spec_id
from sigmod_src.data.make_dataset import make_specs_dataset, preprocess_specs_dataset, make_labelled_dataset
from sigmod_src.features.build_features import make_features
from sigmod_src.data.embedding import Embedder

# Load raw data

In [8]:
# Directory holding the raw camera spec files, relative to the notebook's working directory.
SPECS_PATH = '../data/raw/2013_camera_specs/'

In [9]:
# CSV with the labelled examples, relative to the notebook's working directory.
# NOTE(review): presumably labelled pairs of spec ids — confirm the schema in make_labelled_dataset.
LABELS_PATH = '../data/raw/sigmod_medium_labelled_dataset.csv'

# Make specs dataset

Extract the fields of interest from the JSON spec files and collect them into a single CSV file.

In [110]:
# Build the raw specs table from the spec files under SPECS_PATH.
# The result is used below with `spec_id` and `page_title` columns.
specs_df = make_specs_dataset(SPECS_PATH)

In [111]:
# (rows, columns) of the raw specs table.
specs_df.shape

(29787, 2)

In [112]:
# Peek at the first rows to check the extracted spec_id / page_title values.
specs_df.head()

Unnamed: 0,spec_id,page_title
0,www.ebay.com//57656,Canon PowerShot ELPH 110 HS 16 1 MP Digital Ca...
1,www.ebay.com//60583,Canon Rebel 2000 35 mm Camera Great Condition ...
2,www.ebay.com//60440,Canon EOS Rebel T3i Digital SLR Camera 18 55mm...
3,www.ebay.com//24139,"GE C1033 10 1 MP Digital Camera 3X Zoom 2 4"" L..."
4,www.ebay.com//54903,Vivitar Clip Shot Digital Camera 1 1 MP | eBay


In [113]:
# Clean the specs table. The preview shows lower-cased, shortened page titles plus
# new page_title_stem, brand and site columns.
# NOTE(review): specs_df is rebound to its own preprocessed version, so re-running this
# cell feeds already-preprocessed data back into preprocess_specs_dataset — confirm that
# is harmless, or re-run the make_specs_dataset cell first.
specs_df = preprocess_specs_dataset(specs_df)
specs_df.head()



Unnamed: 0,spec_id,page_title,page_title_stem,brand,site
0,www.ebay.com//57656,canon powershot elph 110 hs 16 1 mp,canon powershot elph 110 hs 16 1 mp,canon,www.ebay.com
1,www.ebay.com//60583,canon rebel 2000 35 mm great condition case,canon rebel 2000 35 mm great condit case,canon,www.ebay.com
2,www.ebay.com//60440,canon eos rebel t3i 18 55mm,canon eo rebel t3i 18 55mm,canon,www.ebay.com
3,www.ebay.com//24139,ge c1033 10 1 mp 3x zoom 2,ge c1033 10 1 mp 3x zoom 2,ge,www.ebay.com
4,www.ebay.com//54903,vivitar clip shot 1 1 mp,vivitar clip shot 1 1 mp,vivitar,www.ebay.com


In [120]:
# Eyeball a handful of randomly drawn cleaned titles (unseeded, so the rows differ per run).
specs_df.sample(n=5)["page_title"].values

array(['polaroid z2300 instant print zink zero ink',
       'nikon d80 10 2 mp w 18', 'pentax k50 mm india',
       'canon eos rebel t2i 18mp kit 18 55mm',
       '720p cube hikvision wifi wireless webcam night vision led ir'],
      dtype=object)

In [121]:
# Persist the preprocessed specs table.
# index=False: `index` is a boolean flag in DataFrame.to_csv; the row index is not
# written, so the CSV contains only the DataFrame's columns.
specs_df.to_csv('../data/processed/specs_preprocessed.csv', index=False)

# Make embedding

In [133]:
# Embedder hyper-parameters gathered in one place so they are easy to spot and tune.
# NOTE(review): presumably embedding dimensionality, number of training epochs and number
# of trees in the nearest-neighbour index — confirm in sigmod_src.data.embedding.Embedder.
embedder_params = {
    "vector_size": 100,
    "train_epochs": 100,
    "index_trees": 1000,
}
emb = Embedder(**embedder_params)

[nltk_data] Downloading package punkt to /home/boris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Train the embedder on the stemmed titles. spec_id is passed alongside, presumably
# as the key for each title — TODO confirm in Embedder.fit.
emb.fit(specs_df["page_title_stem"], specs_df["spec_id"])

Epoch #10
Epoch #20
Epoch #30
Epoch #40
Epoch #50
Epoch #60
Epoch #70
Epoch #80
Epoch #90
Epoch #100


  index = AnnoyIndex(num_features)


Quick sanity check by eye: pick a random spec and print its nearest neighbours from the embedding.

In [None]:
# Draw one spec at random and print the titles of its nearest neighbours for a visual check.
# NOTE(review): `emb_lookup` and `indexer` are not imported or defined in any cell visible
# in this notebook — on a fresh kernel this cell would presumably raise NameError. Confirm
# where they come from (an Embedder method or a sigmod_src helper?) and import them explicitly.
query_row = specs_df.sample(1).iloc[0]
text = query_row["page_title_stem"]
spec_id = query_row["spec_id"]

nns = emb_lookup(text, emb, indexer)

# Each neighbour unpacks into exactly two items (spec id, distance); only the id is
# needed to fetch the matching row from specs_df.
nn_rows = [
    specs_df[specs_df["spec_id"] == nn_spec_id].iloc[0]
    for nn_spec_id, _dist in nns
]
nn_pairs = [(nn_row["spec_id"], nn_row["page_title_stem"]) for nn_row in nn_rows]

print(f'[{spec_id}] ', text)
print('')
print('Most similar')
for nn_id, nn_title in nn_pairs:
    print(f'[{nn_id}] ', nn_title)

In [None]:
# Persist the fitted embedder under ../models/embedder.
# NOTE(review): what exactly is written (model, index, or both) is decided by Embedder.save — confirm.
emb.save('../models/embedder')

# Make features for classifier

In [None]:
# Build classifier features from the preprocessed specs. Besides the feature frame the
# call also returns `vectorizers` and `site_le` (presumably fitted text vectorizers and a
# label encoder for `site` — TODO confirm in build_features); neither is used again in
# the cells shown in this notebook.
spec_features_df, vectorizers, site_le = make_features(specs_df)

In [None]:
# (rows, columns) of the feature table.
spec_features_df.shape

In [None]:
# Preview the first rows of the generated features.
spec_features_df.head()

In [None]:
# Column dtypes and non-null counts: a quick check for unexpected missing values.
spec_features_df.info()

In [None]:
# Attach the generated features to the preprocessed specs, keeping every specs_df row
# (left join on spec_id).
# NOTE(review): spec_features_df is rebound to the merged result, so running this cell a
# second time merges specs_df with a frame that already contains its columns, and pandas
# will suffix the overlapping ones (_x / _y). Re-run the make_features cell first.
spec_features_df = pd.merge(
    specs_df,
    spec_features_df,
    how='left',
    on='spec_id',
)

In [None]:
# Preview the merged specs + features table.
spec_features_df.head()

In [None]:
# Persist the merged specs + features table.
# index=False: `index` is a boolean flag in DataFrame.to_csv; the row index is not
# written, so the CSV contains only the DataFrame's columns.
spec_features_df.to_csv('../data/processed/specs_features.csv', index=False)

# Make labelled dataset

In [None]:
# Build the labelled dataset from the labels CSV and the per-spec feature table, then
# show its (rows, columns).
# NOTE(review): how labels are joined to the features is defined in make_labelled_dataset — confirm there.
labels_df = make_labelled_dataset(LABELS_PATH, spec_features_df)
labels_df.shape

In [None]:
# Preview the first rows of the labelled dataset.
labels_df.head()

In [None]:
# Column dtypes and non-null counts: a quick check for unexpected missing values.
labels_df.info()

In [None]:
# Persist the labelled feature table.
# index=False: `index` is a boolean flag in DataFrame.to_csv; the row index is not
# written, so the CSV contains only the DataFrame's columns.
labels_df.to_csv('../data/processed/labelled_features.csv', index=False)