In [1]:
import os
import sys
import random
import json
import collections
import re
import itertools
from itertools import combinations

import pandas as pd
import numpy as np
import scipy
import statsmodels
from tqdm import trange, tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from lightgbm import LGBMClassifier

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline


# Show wide frames almost in full (up to 999 columns), but cap printed rows at 100.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 100)

# Apply the seaborn defaults first and the figure size second, so the size
# set here is the one left in effect.
sns.set()
plt.rcParams.update({"figure.figsize": (10, 7)})

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from sigmod_src.make_dataset import make_specs_dataset
from sigmod_src.preprocessing import preprocess_specs_dataset
from sigmod_src.pipeline import LGBMPipeline
from sigmod_src.utils import get_additional_labels, make_classes_df

In [4]:
# Raw-data locations, relative to the notebook's working directory.
# LG_LABELS_PATH is a single CSV file; SPECS_PATH is a directory.
LG_LABELS_PATH = '../data/raw/sigmod_large_labelled_dataset.csv'
SPECS_PATH = '../data/raw/2013_camera_specs/'

In [5]:
# Load the processed specs table and preview its first rows.
specs_df = pd.read_csv('../data/processed/specs.csv')
specs_df.head()

Unnamed: 0,spec_id,page_title,brand,model,all_text,page_title_stem,all_text_stem,site
0,www.ebay.com//57656,canon powershot elph 110 hs 16-1-mp digital,canon,,canon powershot 110 hs 16-1-mp digital canon r...,canon powershot elph 110 hs 16-1-mp digit,canon powershot 110 hs 16-1-mp digit canon ref...,www.ebay.com
1,www.ebay.com//60583,canon rebel 2000 35mm great case instruction b...,canon,,canon rebel 2000 35mm great case instruction b...,canon rebel 2000 35mm great case instruct booklet,canon rebel 2000 35mm great case instruct book...,www.ebay.com
2,www.ebay.com//60440,canon eos rebel t3i digital slr 18 55mm 75 300...,canon,t3i,canon eos rebel t3i digital slr 18 55mm 75 300...,canon eo rebel t3i digit slr 18 55mm 75 300mm ...,canon eo rebel t3i digit slr 18 55mm 75 300mm ...,www.ebay.com
3,www.ebay.com//24139,ge c1033 10-1-mp digital 3x zoom 2 4 lcd,,c1033,ge c1033 10-1-mp digital 3x zoom 2 4 ge brand-...,ge c1033 10-1-mp digit 3x zoom 2 4 lcd,ge c1033 10-1-mp digit 3x zoom 2 4 ge brand-ne...,www.ebay.com
4,www.ebay.com//54903,vivitar clip shot digital 1-1-mp,vivitar,,vivitar clip shot digital 1-1-mp vivitar brand...,vivitar clip shot digit 1-1-mp,vivitar clip shot digit 1-1-mp vivitar brand-n...,www.ebay.com


In [6]:
specs_df.shape

(29694, 8)

In [11]:
# Positional integer id (0..n-1) for every spec. It is used further down both
# as the index into `page_titles` and as the node id in the similarity graph,
# so it must stay aligned with the row order of `specs_df`.
specs_df['spec_idx'] = range(len(specs_df))

In [25]:
# Give specs without a brand the placeholder 'missing' so that they form their
# own group in the per-brand groupby below (groupby drops NaN keys by default).
specs_df['brand'] = specs_df.brand.fillna('missing')

In [26]:
import Levenshtein as lev

In [27]:
# Stemmed page titles as a plain array; position i holds the title of the spec
# whose spec_idx is i.
page_titles = specs_df.page_title_stem.values

In [28]:
# Minimum Levenshtein ratio for two page titles to be linked by an edge.
# NOTE: the edge-building cell below assigns the same value again.
threshold = 0.9

In [29]:
# Map every brand to the list of spec_idx values of the specs carrying it.
brand_groups = {
    brand_name: member_idx.tolist()
    for brand_name, member_idx in specs_df.groupby('brand')['spec_idx']
}

In [31]:
# Number of specs per brand, followed by the 30 largest brand groups.
brand_group_sizes = dict(zip(brand_groups, map(len, brand_groups.values())))
collections.Counter(brand_group_sizes).most_common(30)

[('missing', 8713),
 ('canon', 5589),
 ('sony', 3407),
 ('nikon', 3361),
 ('fujifilm', 1516),
 ('olympus', 1457),
 ('panasonic', 1353),
 ('samsung', 1071),
 ('kodak', 763),
 ('pentax', 622),
 ('casio', 221),
 ('gopro', 215),
 ('vivitar', 194),
 ('leica', 174),
 ('polaroid', 117),
 ('ricoh', 97),
 ('sigma', 65),
 ('konica', 59),
 ('svp', 51),
 ('sanyo', 46),
 ('coleman', 41),
 ('bell', 38),
 ('vizio', 37),
 ('sakar', 32),
 ('minolta', 27),
 ('digital', 25),
 ('fvanor', 24),
 ('hasselblad', 21),
 ('lytro', 20),
 ('toshiba', 19)]

In [35]:
# Build the similarity edge list: within each brand group, link every pair of
# specs whose stemmed page titles have a Levenshtein ratio >= `threshold`.
# Each edge is stored as (left spec_idx, right spec_idx, ratio).
edge_list = []
threshold = 0.9
for brand, group_specs in tqdm(brand_groups.items()):
    n_specs = len(group_specs)
    # Walk the pairs lazily. Materialising them first (list + np.array) needs
    # memory for all n*(n-1)/2 pairs at once -- about 38M pairs for the
    # 8713-spec 'missing' group alone -- before a single comparison runs.
    # The generator has no len(), so tqdm is given the pair count explicitly;
    # leave=False keeps one finished bar per brand from piling up in the output.
    for left, right in tqdm(combinations(group_specs, 2),
                            total=n_specs * (n_specs - 1) // 2,
                            leave=False):
        ratio = lev.ratio(page_titles[left], page_titles[right])
        if ratio >= threshold:
            edge_list.append((left, right, ratio))

In [37]:
len(edge_list)

93444

In [40]:
# Persist the edges as whitespace-separated "left right weight" lines,
# one edge per line.
with open('../data/processed/graph_edgelist.txt', 'w') as edgelist_file:
    edgelist_file.writelines(
        f'{row[0]} {row[1]} {row[2]}\n' for row in edge_list
    )

In [41]:
import networkx as nx

In [42]:
# Reload the saved edge list as a weighted graph. Node ids come back as
# strings here (e.g. '0', '580'), not as the integers that were written.
graph = nx.read_weighted_edgelist('../data/processed/graph_edgelist.txt')

In [46]:
graph.adj['0']

AtlasView({'580': {'weight': 0.9512195121951219}, '2616': {'weight': 0.925}, '3180': {'weight': 0.9512195121951219}, '7095': {'weight': 0.926829268292683}, '7260': {'weight': 0.926829268292683}, '7292': {'weight': 0.9512195121951219}, '8380': {'weight': 1.0}, '11757': {'weight': 0.9425287356321839}})

In [47]:
graph.adj['580']

AtlasView({'0': {'weight': 0.9512195121951219}, '1673': {'weight': 0.9213483146067416}, '1770': {'weight': 0.9425287356321839}, '2616': {'weight': 0.95}, '3180': {'weight': 1.0}, '7095': {'weight': 0.975609756097561}, '7260': {'weight': 0.975609756097561}, '7292': {'weight': 1.0}, '8380': {'weight': 0.9512195121951219}, '12379': {'weight': 0.9210526315789473}})

In [53]:
def n_graph_common_neighboors(left_idx, right_idx, graph_adj):
    """Count the neighbours shared by each (left, right) pair of nodes.

    Parameters
    ----------
    left_idx, right_idx : sequences indexable by position
        Node ids of the pairs; pair ``i`` is ``(left_idx[i], right_idx[i])``.
        The number of pairs is taken from ``left_idx``.
    graph_adj : mapping
        Maps a node id to a container of its neighbours, e.g. ``graph.adj``.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per pair: the number of nodes adjacent to
        both members. The entry is 0 when either member is not a key of
        ``graph_adj``.
    """
    known_nodes = graph_adj.keys()
    n_pairs = len(left_idx)
    shared_counts = np.zeros(n_pairs)
    for pos in range(n_pairs):
        node_a = left_idx[pos]
        node_b = right_idx[pos]
        # Pairs with a node missing from the graph keep the initial 0.
        if node_a in known_nodes and node_b in known_nodes:
            shared = set(graph_adj[node_a]) & set(graph_adj[node_b])
            shared_counts[pos] = len(shared)
    return shared_counts

n_graph_common_neighboors(['0'], ['580'], graph.adj)

array([6.])

In [55]:
graph.nodes()

NodeView(('14297', '14444', '14499', '3833', '11211', '4285', '7782', '6817', '9604', '16969', '16997', '17007', '17045', '588', '10169', '16021', '16164', '16221', '16279', '19332', '19433', '19435', '19583', '0', '580', '2616', '3180', '7095', '7260', '7292', '8380', '11757', '2', '4594', '5223', '6499', '7198', '13832', '9', '14173', '11', '140', '260', '659', '755', '1026', '1077', '1206', '1208', '1319', '1399', '1526', '1553', '1726', '1902', '2161', '2364', '2558', '2708', '3072', '3403', '3614', '3907', '3959', '4467', '4705', '4799', '5689', '5760', '5808', '5881', '6217', '6223', '6279', '6706', '6760', '6894', '6940', '7509', '7924', '8495', '8666', '8725', '8783', '9576', '9647', '9702', '9908', '9945', '9977', '10047', '10115', '10152', '10966', '11400', '11652', '11689', '11756', '11956', '12013', '12065', '12086', '12158', '12225', '12262', '12398', '12425', '12601', '12727', '12759', '12793', '12923', '13004', '13061', '13112', '13133', '13194', '13426', '13829', '13997

In [57]:
# Mapping from each string node id to the integer it represents.
relabel = {node_label: int(node_label) for node_label in graph.nodes()}

In [59]:
# Copy of the graph keyed by integer node ids; `graph` itself is left untouched.
graph1 = nx.relabel_nodes(graph, mapping=relabel, copy=True)

In [61]:
graph1.adj[0]

AtlasView({580: {'weight': 0.9512195121951219}, 2616: {'weight': 0.925}, 3180: {'weight': 0.9512195121951219}, 7095: {'weight': 0.926829268292683}, 7260: {'weight': 0.926829268292683}, 7292: {'weight': 0.9512195121951219}, 8380: {'weight': 1.0}, 11757: {'weight': 0.9425287356321839}})