In [1]:
import os
import sys
import random
import json
import collections

import pandas as pd
import numpy as np
import scipy
import statsmodels
from tqdm import trange, tqdm_notebook as tqdm

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline


pd.options.display.max_columns = 999

sns.set()
plt.rcParams["figure.figsize"] = (10,7)

In [2]:
LG_LABELS_PATH = '../data/raw/sigmod_large_labelled_dataset.csv'

In [3]:
lg_labels_df = pd.read_csv(LG_LABELS_PATH)

### Get Brands for all products

In [4]:
from tqdm import tqdm_notebook
import re

In [5]:
SPECS_PATH = '../data/raw/2013_camera_specs/'

In [6]:
def read_json(path):
    with open(path, 'r') as f:
        return json.load(f)
    
def pprint_json(json_):
    print(json.dumps(json_, indent=4, sort_keys=True))

getting all titles and brands form specs

In [7]:
site_folders = os.listdir(SPECS_PATH)

In [8]:
from collections import defaultdict
brands = defaultdict(list)
titles = []
sps = []
title_name = []
for site in site_folders:
    specs = os.path.join(SPECS_PATH, site)
    for i, spec in enumerate(os.listdir(specs)):
        spec_path = os.path.join(specs, spec)
        js = read_json(spec_path)
        brand = js.get('brand')
        title = js.get('<page title>')
        if isinstance(brand, list):
            continue
        if brand:
            brand = re.sub('[^a-z0-9] ', '', brand.lower())
            brands[brand].append(len(titles))
        title = re.sub('[^a-z0-9]', ' ', title.lower())
        title = re.sub(' +', ' ', title)
        titles.append(title)
        title_name.append(site+'//'+spec[:-5])
        sps.append(js)

add some brands from internal knowledges

In [9]:
list_of_brands = list(brands.keys())
list_of_brands.append('cctv')
list_of_brands.append('ip')
list_of_brands.append('binoculars')
list_of_brands.append('sekonic')
list_of_brands.append('samyang')
list_of_brands.append('lexar')
list_of_brands.append('ksm')
list_of_brands.append('uv')
list_of_brands.append('hoya')
list_of_brands.append('dahua')
list_of_brands.append('hikvision')
list_of_brands.append('colorpix')
list_of_brands.append('onvif')

drop all brands with less then 10 examples

In [10]:
br_in = defaultdict(int)
br_num = defaultdict(list)
emp = []
for i, title in tqdm_notebook(enumerate(titles)):
    list_title = title.split(' ')
    z = 0
    for word in list_title:
        if word in list_of_brands:
            br_in[word] += 1
            br_num[word].append(i)
            z = 1
    if z == 0:
        emp.append(i)
        
for key, val in tuple(br_in.items()):
    if val < 10:
        br_in.pop(key)
        br_num.pop(key)
        emp.append(key)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




for each title get it's brand number

In [11]:
br_num['nonbrand'] = 0
not_brand = set(('eos', 'action', 'shoot', 'new', 'class', 'neopine', 'ion', 'sharp', 'fuji', 'tamron'))
brands = set(br_num.keys()) - not_brand

In [12]:
brand_to_num = {brand:i for i, brand in enumerate(brands)} 
num_to_brand = {i:brand for i, brand in enumerate(brands)} 

In [13]:
br_finish = defaultdict(list)
title_labels = []
for i, title in tqdm_notebook(enumerate(titles)):
    title = re.sub('[^a-z0-9]', ' ', title.lower())
    title = re.sub(' +', ' ', title)
    title = title.replace('cannon', 'canon')

    if 'fuji' in title:
        title_labels.append(brand_to_num['fujifilm'])
        br_finish['fujifilm'].append(i)
        continue

    if 'eos' in title:
        title_labels.append(brand_to_num['canon'])
        br_finish['canon'].append(i)
        continue

    for br in brands:
        if br + ' ' in title or ' ' + br in title:
            title_labels.append(brand_to_num[br])
            br_finish[br].append(i)
            break
    else:
        title_labels.append(brand_to_num['nonbrand'])
        br_finish['nonbrand'].append(i)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




len(set(tuple(br_num['telesin'])) & set(tuple(br_num['gopro']))) # not shure merge or not

len(set(tuple(br_num['dv'])) & set(tuple(br_num['sony'])))

Check whether are dublicates from labels in the came class 

In [14]:
LG_LABELS_PATH = '../data/raw/sigmod_large_labelled_dataset.csv'
labels_df = pd.read_csv(LG_LABELS_PATH)
title_name = np.array(title_name)

In [None]:

bad_examples = []
for i, (first_it, second_it, cl) in tqdm_notebook(enumerate(np.array(labels_df))):
    id_f = np.where(title_name == first_it)[0][0]
    id_sec = np.where(title_name == second_it)[0][0]

    if title_labels[id_f] != title_labels[id_sec] and cl == '1':
        print('ALERT')
        bad_examples.append([first_it, second_it, cl, id_f, id_sec])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

### Get model for all products

In [None]:
from collections import defaultdict
models = defaultdict(list)
z = 0
for site in site_folders:
    specs = os.path.join(SPECS_PATH, site)
    for i, spec in enumerate(os.listdir(specs)):
        spec_path = os.path.join(specs, spec)
        js = read_json(spec_path)
        model = js.get('model')
        if isinstance(model, list):
            model = ' '.join(model)
        if model:
            model = re.sub('[^a-z0-9] ', '', model.lower())
            models[model].append(z)
        z +=1

In [25]:
from sigmod_src.data.make_dataset import make_specs_dataset, preprocess_specs_dataset, make_labelled_dataset 
from sigmod_src.features.build_features import make_features

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
SPECS_PATH = '../data/raw/2013_camera_specs/'
specs_df = make_specs_dataset(SPECS_PATH)
specs_df = preprocess_specs_dataset(specs_df)



In [46]:
spec_features_df, vectorizers, site_le = make_features(specs_df)
spec_features_df = specs_df.merge(spec_features_df, on='spec_id', how='left')

In [47]:
spec_features_df.to_csv('../data/processed/specs_df.csv', index=None)

In [35]:
spec_features_df = pd.read_csv('../data/processed/specs_df.csv')

In [None]:
labels_df = make_labelled_dataset(LG_LABELS_PATH, spec_features_df)

In [944]:
labels_df.to_csv('../data/processed/labelled_features.csv', index=None)

In [16]:
labelled_dataset_path = '../data/processed/labelled_features.csv'
df = pd.read_csv(labelled_dataset_path)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = df.drop(['label', 'left_spec_id', 'right_spec_id'], axis=1)
Y = df.label

In [19]:
X = df.drop(['label', 'left_spec_id', 'right_spec_id', 
             'left_all_text', 'left_all_text_stem', 
             'right_all_text', 'right_all_text_stem'], axis=1)

In [20]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, stratify=Y)
train_Y.shape, test_Y.shape

((223238,), (74413,))

In [21]:
from sklearn.metrics import f1_score, classification_report

In [22]:
from lightgbm import LGBMClassifier

In [23]:
train_Y.value_counts()

0    190209
1     33029
Name: label, dtype: int64

In [24]:
test_Y.value_counts()

0    63403
1    11010
Name: label, dtype: int64

In [25]:
sample_pos_weight = Y.value_counts()[0] / Y.value_counts()[1]
sample_pos_weight

5.75880469583778

In [65]:
clf = LGBMClassifier(sample_pos_weight=sample_pos_weight)

In [66]:
clf.fit(train_X, train_Y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               sample_pos_weight=5.75880469583778, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [70]:
pred_train_Y = clf.predict(test_X)

CPU times: user 2.19 s, sys: 828 ms, total: 3.02 s
Wall time: 2.36 s


In [29]:
print('Train F1', f1_score(test_Y, pred_train_Y))
print(classification_report(test_Y, pred_train_Y))

Train F1 0.9974056711119201
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     63403
           1       1.00      1.00      1.00     11010

    accuracy                           1.00     74413
   macro avg       1.00      1.00      1.00     74413
weighted avg       1.00      1.00      1.00     74413



### Combine all with all

In [30]:
from itertools import combinations

In [176]:
dframe = np.array(spec_features_df.drop(['spec_id', 'page_title', 'all_text',
                                'page_title_stem','all_text_stem','brand','site'], axis=1))

sorted_finish = sorted(br_finish.items(), key=lambda a:len(a[1]), reverse=True)
comb = [list(combinations(ids, 2)) for _, ids in sorted_finish]

In [242]:
df = pd.DataFrame(columns=['left_spec_id', 'right_spec_id'])
df.to_csv('predict.csv', index=False, columns=None, mode='w')

In [243]:
# @njit
same_specs = []
step = 10000
i = 0
for c in comb:
    id_f, id_s = np.array(c).T
    for ix in tqdm_notebook(np.arange(0, len(c)-step, step)):
        crop_id_f = id_f[ix:ix+step]
        crop_id_s = id_s[ix:ix+step]
        spec_f = dframe[crop_id_f]
        spec_s = dframe[crop_id_s]
        data = np.hstack((spec_f, spec_s))
        prs = clf.predict(data, n_jobs=10)
        ixs = np.where(prs == 1)[0]
        first_column = title_name[crop_id_f[ixs]]
        second_column = title_name[crop_id_s[ixs]]
        up = pd.DataFrame(np.array([first_column, second_column]).T)
        up.to_csv('predict.csv', index=False,  header=False, mode='a')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=1442.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=1134.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=544.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=484.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=144.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=97.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=75.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=52.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


