In [1]:
import os
import sys
import random
import json
import collections
import re
from itertools import combinations

import pandas as pd
import numpy as np
import scipy
import statsmodels
from tqdm import trange, tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from lightgbm import LGBMClassifier

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline


pd.options.display.max_columns = 999

sns.set()
plt.rcParams["figure.figsize"] = (10,7)

In [2]:
%load_ext autoreload
%autoreload 2

In [16]:
from sigmod_src.utils import read_json, pprint_json, get_known_brands
from sigmod_src.data.make_dataset import make_specs_dataset, preprocess_specs_dataset, join_labels_specs, make_classes_df
from sigmod_src.features.build_features import make_features

In [4]:
LG_LABELS_PATH = '../data/raw/sigmod_large_labelled_dataset.csv'
SPECS_PATH = '../data/raw/2013_camera_specs/'

# Load labels df

In [5]:
labels_df = pd.read_csv(LG_LABELS_PATH)
labels_df.shape

(297651, 3)

# Load specs df

In [6]:
specs_dataset_src = make_specs_dataset(SPECS_PATH)

In [9]:
brand_blacklist = ['shoot', 'unbranded/generic', 'as', 
            'eos', 'action', 'new', 'class', 'neopine', 'sharp',
            'telesin']
known_brand_cutoff = 10
brand_cutoff = 5 # Remove brands appear in < brand_cutoff specs
cutoff = 1
max_words = 500

In [10]:
known_brands = get_known_brands(specs_dataset_src, known_brand_cutoff, brand_blacklist)
len(known_brands)

37

In [13]:
specs_df = preprocess_specs_dataset(specs_dataset_src,
                                         max_words=max_words,
                                         cutoff=cutoff,
                                         known_brands=known_brands,
                                        brand_blacklist=brand_blacklist,
                                        brand_cutoff=brand_cutoff)

Conflict. Found: sigma , brand field: nikon  Will use brand field
Conflict. Found: sony , brand field: olympus  Will use brand field
Conflict. Found: sony , brand field: kodak  Will use brand field
Conflict. Found: sony , brand field: olympus  Will use brand field
Conflict. Found: samsung , brand field: vivitar  Will use brand field
Conflict. Found: olympus , brand field: panasonic  Will use brand field
Conflict. Found: canon , brand field: fujifilm  Will use brand field
Conflict. Found: sigma , brand field: nikon  Will use brand field
Conflict. Found: sony , brand field: olympus  Will use brand field
Conflict. Found: gopro , brand field: ion  Will use brand field
Conflict. Found: konica , brand field: minolta  Will use brand field
Conflict. Found: canon , brand field: casio  Will use brand field
Conflict. Found: kodak , brand field: vivitar  Will use brand field
Conflict. Found: sigma , brand field: nikon  Will use brand field
Conflict. Found: canon , brand field: olympus  Will use br

# Test pipeline

In [189]:
from sigmod_src.pipeline import LGBMPipeline

In [190]:
model = LGBMPipeline(specs_df, labels_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs_df['spec_idx'] = range(len(self.specs_df))


In [191]:
model.precompute()

In [199]:
model.tfidf.head()

Unnamed: 0,page_title_stem__10,page_title_stem__10 cctv,page_title_stem__10 mp,page_title_stem__1080p,page_title_stem__1080p hd,page_title_stem__1080p ip,page_title_stem__10x,page_title_stem__12,page_title_stem__12 mp,page_title_stem__12mp,page_title_stem__14,page_title_stem__14 mp,page_title_stem__14mp,page_title_stem__16,page_title_stem__16 mp,page_title_stem__16gb,page_title_stem__16mp,page_title_stem__18,page_title_stem__18 55,page_title_stem__18 55mm,page_title_stem__18 mp,page_title_stem__1mp,page_title_stem__20,page_title_stem__20 mp,page_title_stem__2014,page_title_stem__2014 spec,page_title_stem__20x,page_title_stem__24,page_title_stem__24 mp,page_title_stem__2mp,page_title_stem__300,page_title_stem__32gb,page_title_stem__3mp,page_title_stem__3x,page_title_stem__4x,page_title_stem__50mm,page_title_stem__55,page_title_stem__55mm,page_title_stem__55mm len,page_title_stem__5d,page_title_stem__5d mark,page_title_stem__5mp,page_title_stem__5x,page_title_stem__600d,page_title_stem__60d,page_title_stem__700tvl,page_title_stem__70d,page_title_stem__720p,page_title_stem__75,page_title_stem__7d,page_title_stem__accessori,page_title_stem__action,page_title_stem__af,page_title_stem__af dx,page_title_stem__ahd,page_title_stem__ahmedabad,page_title_stem__ahmedabad surat,page_title_stem__alarm,page_title_stem__alpha,page_title_stem__angl,page_title_stem__australia,page_title_stem__bag,page_title_stem__bag bag,page_title_stem__bag case,page_title_stem__bag waterproof,page_title_stem__bangalor,page_title_stem__bangalor hyderabad,page_title_stem__batteri,page_title_stem__best,page_title_stem__best india,page_title_stem__bodi,page_title_stem__box,page_title_stem__bullet,page_title_stem__bullet hikvis,page_title_stem__bullet ip,page_title_stem__bundl,page_title_stem__camcord,page_title_stem__camera,page_title_stem__camera australia,page_title_stem__camera canon,page_title_stem__camera point,page_title_stem__cammarkt,page_title_stem__canon,page_title_stem__canon eo,page_title_stem__canon europ,page_title_stem__canon ixus,page_title_stem__canon powershot,page_title_stem__car,page_title_stem__card,page_title_stem__card slot,page_title_stem__carri,page_title_stem__carri case,page_title_stem__case,page_title_stem__case bag,page_title_stem__case canon,page_title_stem__case case,page_title_stem__case eva,page_title_stem__case gopro,page_title_stem__case hard,page_title_stem__case waterproof,page_title_stem__casio,page_title_stem__casio exilim,page_title_stem__ccd,page_title_stem__cctv,page_title_stem__cctv camera,page_title_stem__cctv cctv,page_title_stem__cctv hikvis,page_title_stem__cctv ip,page_title_stem__cctv secur,page_title_stem__charger,page_title_stem__cheap,page_title_stem__chennai,page_title_stem__chennai kolkata,page_title_stem__chennai mumbai,page_title_stem__china,page_title_stem__cmos,page_title_stem__color,page_title_stem__color ccd,page_title_stem__compact,page_title_stem__compact camera,page_title_stem__comparison,page_title_stem__comparison camera,page_title_stem__comparison nikon,page_title_stem__connect,page_title_stem__coolpix,page_title_stem__cover,page_title_stem__custom,page_title_stem__cyber,page_title_stem__cyber shot,page_title_stem__cybershot,page_title_stem__cybershot dsc,page_title_stem__d3100,page_title_stem__d3200,page_title_stem__d5100,page_title_stem__d5200,page_title_stem__d7000,page_title_stem__dahua,page_title_stem__dahua ip,page_title_stem__dahua ptz,page_title_stem__delhi,page_title_stem__delhi chennai,page_title_stem__delhi mumbai,page_title_stem__design,page_title_stem__detect,page_title_stem__dive,page_title_stem__dmc,page_title_stem__dome,page_title_stem__dome cctv,page_title_stem__dome dahua,page_title_stem__dome hikvis,page_title_stem__dome ip,page_title_stem__dome ir,page_title_stem__dsc,page_title_stem__dual,page_title_stem__dv,page_title_stem__dvr,page_title_stem__dx,page_title_stem__easyshar,page_title_stem__ed,page_title_stem__ef,page_title_stem__ef 18,page_title_stem__elph,page_title_stem__eo,page_title_stem__eo 5d,page_title_stem__eo 60d,page_title_stem__eo 70d,page_title_stem__eo 7d,page_title_stem__eo rebel,page_title_stem__europ,page_title_stem__eva,page_title_stem__eva case,page_title_stem__ex,page_title_stem__excel,page_title_stem__exilim,page_title_stem__extra,page_title_stem__factori,page_title_stem__fashion,page_title_stem__fashion bag,page_title_stem__fi,page_title_stem__finepix,page_title_stem__free,page_title_stem__fujifilm,page_title_stem__fujifilm finepix,page_title_stem__full,page_title_stem__full hd,page_title_stem__full specif,page_title_stem__galaxi,page_title_stem__good,page_title_stem__gopro,page_title_stem__gopro hero,page_title_stem__great,page_title_stem__hard,page_title_stem__hd,page_title_stem__hd 1080p,page_title_stem__hd 720p,page_title_stem__hd ip,page_title_stem__hd network,page_title_stem__hd video,page_title_stem__hdmi,page_title_stem__hero,page_title_stem__hidden,page_title_stem__high,page_title_stem__high qualiti,page_title_stem__high speed,page_title_stem__hikvis,page_title_stem__hikvis 1080p,page_title_stem__hikvis 3mp,page_title_stem__hikvis cctv,page_title_stem__hikvis hikvis,page_title_stem__hikvis ip,page_title_stem__hikvis ip66,page_title_stem__hikvis ir,page_title_stem__hikvis megapixel,page_title_stem__hikvis mini,page_title_stem__hikvis network,page_title_stem__hikvis weatherproof,page_title_stem__home,page_title_stem__hot,page_title_stem__hot sale,page_title_stem__hot sell,page_title_stem__hous,page_title_stem__hous case,page_title_stem__hs,page_title_stem__hyderabad,page_title_stem__hyderabad chennai,page_title_stem__hyderabad delhi,page_title_stem__ii,page_title_stem__iii,page_title_stem__imag,page_title_stem__inch,page_title_stem__india,page_title_stem__india 2014,page_title_stem__india bangalor,page_title_stem__india offer,page_title_stem__india shopmania,page_title_stem__indoor,page_title_stem__infrar,page_title_stem__inspect,page_title_stem__ip,page_title_stem__ip cctv,page_title_stem__ip dahua,page_title_stem__ip dome,page_title_stem__ip hd,page_title_stem__ip hikvis,page_title_stem__ip ip,page_title_stem__ip ir,page_title_stem__ip megapixel,page_title_stem__ip network,page_title_stem__ip outdoor,page_title_stem__ip poe,page_title_stem__ip ptz,page_title_stem__ip secur,page_title_stem__ip wireless,page_title_stem__ip66,page_title_stem__ir,page_title_stem__ir bullet,page_title_stem__ir dome,page_title_stem__ir hikvis,page_title_stem__ir ip,page_title_stem__ir network,page_title_stem__ir ptz,page_title_stem__ixus,page_title_stem__ixus compact,page_title_stem__japan,page_title_stem__kit,page_title_stem__kit 18,page_title_stem__kit af,page_title_stem__kit comparison,page_title_stem__kit ef,page_title_stem__kodak,page_title_stem__kodak easyshar,page_title_stem__kolkata,page_title_stem__kolkata ahmedabad,page_title_stem__kolkatta,page_title_stem__lcd,page_title_stem__leather,page_title_stem__leather bag,page_title_stem__leather case,page_title_stem__led,page_title_stem__leica,page_title_stem__len,page_title_stem__len kit,page_title_stem__lens,page_title_stem__low,page_title_stem__lumix,page_title_stem__lumix dmc,page_title_stem__mark,page_title_stem__mark ii,page_title_stem__mark iii,page_title_stem__mavica,page_title_stem__mega,page_title_stem__mega pixel,page_title_stem__megapixel,page_title_stem__megapixel ip,page_title_stem__memori,page_title_stem__memori card,page_title_stem__metal,page_title_stem__micro,page_title_stem__mini,page_title_stem__mini dome,page_title_stem__mint,page_title_stem__mirrorless,page_title_stem__mm,page_title_stem__mode,page_title_stem__model,page_title_stem__monopod,page_title_stem__motion,page_title_stem__mount,page_title_stem__movi,page_title_stem__mp,page_title_stem__mp bodi,page_title_stem__mp cmos,page_title_stem__mp kit,page_title_stem__mumbai,page_title_stem__mumbai bangalor,page_title_stem__mumbai pune,page_title_stem__mvc,page_title_stem__neopren,page_title_stem__network,page_title_stem__network hikvis,page_title_stem__network ip,page_title_stem__nex,page_title_stem__night,page_title_stem__night vision,page_title_stem__nikkor,page_title_stem__nikon,page_title_stem__nikon coolpix,page_title_stem__nikon d3100,page_title_stem__nikon d3200,page_title_stem__nikon d5100,page_title_stem__nikon d5200,page_title_stem__nikon d7000,page_title_stem__nylon,page_title_stem__offer,page_title_stem__offer full,page_title_stem__olympus,page_title_stem__olympus pen,page_title_stem__olympus stylus,page_title_stem__onlin,page_title_stem__onlin india,page_title_stem__onvif,page_title_stem__opt,page_title_stem__optic,page_title_stem__optic zoom,page_title_stem__optio,page_title_stem__orang,page_title_stem__outdoor,page_title_stem__outdoor ip,page_title_stem__p2p,page_title_stem__p2p ip,page_title_stem__panason,page_title_stem__panason lumix,page_title_stem__part,page_title_stem__pc,page_title_stem__pc connect,page_title_stem__pen,page_title_stem__pentax,page_title_stem__pentax optio,page_title_stem__phone,page_title_stem__photo,page_title_stem__photograph,page_title_stem__pink,page_title_stem__pipe,page_title_stem__pipe inspect,page_title_stem__pixel,page_title_stem__plastic,page_title_stem__poe,page_title_stem__poe ip,page_title_stem__point,page_title_stem__point shoot,page_title_stem__portabl,page_title_stem__pouch,page_title_stem__power,page_title_stem__powershot,page_title_stem__powershot elph,page_title_stem__powershot ixus,page_title_stem__price,page_title_stem__price pricem,page_title_stem__price shop,page_title_stem__pricem,page_title_stem__pro,page_title_stem__product,page_title_stem__profession,page_title_stem__proof,page_title_stem__protect,page_title_stem__provid,page_title_stem__ptz,page_title_stem__ptz dahua,page_title_stem__ptz dome,page_title_stem__ptz ip,page_title_stem__pu,page_title_stem__pu leather,page_title_stem__pune,page_title_stem__pune kolkatta,page_title_stem__qualiti,page_title_stem__rebel,page_title_stem__rebel t3,page_title_stem__rebel t3i,page_title_stem__record,page_title_stem__remot,page_title_stem__repair,page_title_stem__review,page_title_stem__review valid,page_title_stem__sale,page_title_stem__samsung,page_title_stem__screen,page_title_stem__sd,page_title_stem__sd card,page_title_stem__secur,page_title_stem__secur cctv,page_title_stem__secur system,page_title_stem__sell,page_title_stem__sensor,page_title_stem__seri,page_title_stem__ship,page_title_stem__shockproof,page_title_stem__shoe,page_title_stem__shoot,page_title_stem__shoot best,page_title_stem__shoot india,page_title_stem__shop,page_title_stem__shop comparison,page_title_stem__shopmania,page_title_stem__shot,page_title_stem__shot dsc,page_title_stem__shoulder,page_title_stem__sigma,page_title_stem__silicon,page_title_stem__slot,page_title_stem__slot comparison,page_title_stem__small,page_title_stem__smart,page_title_stem__soni,page_title_stem__soni alpha,page_title_stem__soni ccd,page_title_stem__soni cyber,page_title_stem__soni cybershot,page_title_stem__soni dsc,page_title_stem__spec,page_title_stem__spec review,page_title_stem__specif,page_title_stem__speed,page_title_stem__speed dome,page_title_stem__sport,page_title_stem__stm,page_title_stem__style,page_title_stem__stylus,page_title_stem__surat,page_title_stem__surveil,page_title_stem__system,page_title_stem__t3,page_title_stem__t3i,page_title_stem__tablet,page_title_stem__today,page_title_stem__today pc,page_title_stem__top,page_title_stem__top 10,page_title_stem__touch,page_title_stem__tough,page_title_stem__tripod,page_title_stem__tv,page_title_stem__underwat,page_title_stem__univers,page_title_stem__usa,page_title_stem__usb,page_title_stem__valid,page_title_stem__valid delhi,page_title_stem__varifoc,page_title_stem__vf,page_title_stem__video,page_title_stem__vintag,page_title_stem__vision,page_title_stem__vivicam,page_title_stem__vivitar,page_title_stem__vr,page_title_stem__vr 18,page_title_stem__vr len,page_title_stem__warranti,page_title_stem__watch,page_title_stem__waterproof,page_title_stem__waterproof bag,page_title_stem__waterproof case,page_title_stem__weatherproof,page_title_stem__weatherproof ir,page_title_stem__wex,page_title_stem__wex photograph,page_title_stem__wholesal,page_title_stem__wi,page_title_stem__wi fi,page_title_stem__wide,page_title_stem__wide angl,page_title_stem__wifi,page_title_stem__wifi ip,page_title_stem__wireless,page_title_stem__wireless ip,page_title_stem__work,page_title_stem__zealand,page_title_stem__zealand price,page_title_stem__zoom,page_title_stem__zoom len
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311794,0.334985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21946,0.0,0.0,0.0,0.295779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.414391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292293,0.432542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.259115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.539219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19679,0.0,0.233676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.232184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141902,0.186923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18549,0.0,0.0,0.0,0.0,0.237193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.314231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.184756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176165,0.29019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230479,0.0,0.322516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.314729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.30057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.384768,0.0,0.415802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.545994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.454603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35821,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.289519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.561443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.775217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
x = model.make_X([0, 2], [1, 3])
x.shape

(2, 9)

In [206]:
model.labels

array([1, 0, 0, ..., 0, 0, 0])

In [208]:
model.train()

In [209]:
train_left_spec_idx = model.specs_id_to_idx[labels_df['left_spec_id']]
train_right_spec_idx = model.specs_id_to_idx[labels_df['right_spec_id']]

train_X = model.make_X(train_left_spec_idx, train_right_spec_idx)

assert train_X.shape[0] == labels_df.shape[0]

pred_train = model.clf.predict(train_X)

print('Train F1', f1_score(labels_df.label, pred_train))
print(classification_report(labels_df.label, pred_train))

Train F1 0.7562692111415721
              precision    recall  f1-score   support

           0       0.95      0.97      0.96    253612
           1       0.79      0.72      0.76     44039

    accuracy                           0.93    297651
   macro avg       0.87      0.85      0.86    297651
weighted avg       0.93      0.93      0.93    297651



Sanity check passed

In [210]:
model.make_submission()

KeyboardInterrupt: 

# CV

In [174]:
def get_split(classes_df, labels_df, test_classes=5):
    classes = classes_df.class_.unique()
    test_classes = np.random.choice(classes, test_classes)
    
    test_spec_ids = classes_df[classes_df.class_.isin(test_classes)].spec_id
    
    test_label_mask = labels_df.left_spec_id.isin(test_spec_ids) | labels_df.right_spec_id.isin(test_spec_ids)
    
    test_idx = np.array(labels_df[test_label_mask].index)
    train_idx = np.array(labels_df[~test_label_mask].index)
    
    return train_idx, test_idx

def evaluate_submit(submit_df, test_labels_df):
    submit_df = submit_df.copy()
    submit_df['label'] = 1
    
    merged = test_labels_df.merge(submit_df, on=['left_spec_id', 'right_spec_id'], how='left', suffixes = ('_true', '_pred'))
    merged['label_pred'] = merged['label_pred'].fillna(0)
    
    tp = merged[(merged.label_true == 1) & (merged.label_pred == 1)]
    tn = merged[(merged.label_true == 0) & (merged.label_pred == 0)]
    
    fp = merged[(merged.label_true == 0) & (merged.label_pred == 1)]
    fn = merged[(merged.label_true == 1) & (merged.label_pred == 0)]
    
    precision = tp.shape[0]/(tp.shape[0]+fp.shape[0])
    recall = tp.shape[0]/(tp.shape[0]+fn.shape[0])
    
    f1 = 2 * (precision * recall)/(precision+recall)
    
    return f1, tp, tn, fp, fn

def cross_validate_pipeline(pipeline_class, specs_df, labels_df, n_folds=5, test_classes=5):
    # Obtain triangles from label and number them
    classes_df = make_classes_df(labels_df)
    folds = []
    submit_fpaths = []
    scores = []
    for i_fold in tqdm(range(n_folds)):
        print('Fold', i_fold)
        labels_train_idx, labels_test_idx = get_split(classes_df, labels_df, test_classes=test_classes)
        folds.append( (labels_train_idx, labels_test_idx) )
        
        train_labels = labels_df.iloc[labels_train_idx].copy()
        test_labels = labels_df.iloc[labels_test_idx].copy()
        
        submit_fpath = f'../data/interim/cv_submit_fold{i_fold}.csv'
        submit_fpaths.append(submit_fpath)
        
        model = pipeline_class(specs_df, train_labels, submit_fpath=submit_fpath)
        print('training model')
        model.train()
        
        print('generating submit')
        model.make_submission()
        
        print('evaluating submit')
        f1score, _, _, _, _ = evaluate_submit(pd.read_csv(model.submit_fpath), test_labels)
        scores.append(f1score)
    return folds, submit_fpaths, scores

In [212]:
# np.random.seed(42)
np.random.seed(None)
folds, submit_fpaths, scores = cross_validate_pipeline(LGBMPipeline, specs_df, labels_df, n_folds=3)

Fold 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs_df['spec_idx'] = range(len(self.specs_df))


training model
generating submit


evaluating submit
Fold 1
training model


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs_df['spec_idx'] = range(len(self.specs_df))


generating submit


evaluating submit
Fold 2
training model


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs_df['spec_idx'] = range(len(self.specs_df))


generating submit


evaluating submit


In [213]:
np.mean(scores)

0.5089669513023917

## Test evaluation scheme

In [173]:

    
    
test_labels_df = labels_df.iloc[folds[0][1]]
submit_df = pd.read_csv(submit_fpaths[0])

score, tp, tn, fp, fn = evaluate_submit(submit_df, test_labels_df)
score

0.009749390663083559

Test one true positive

In [157]:
tp.head()

Unnamed: 0,left_spec_id,right_spec_id,label_true,label_pred
327,www.ebay.com//24206,www.priceme.co.nz//54,1,1.0
568,www.ebay.com//24016,www.ebay.com//42074,1,1.0
1056,www.ebay.com//54776,www.ebay.com//55526,1,1.0
1070,www.ebay.com//44615,www.ebay.com//54040,1,1.0
1291,www.ebay.com//24016,www.ebay.com//44028,1,1.0


In [154]:
test_labels_df[test_labels_df.label == 1][test_labels_df.left_spec_id == 'www.ebay.com//24206']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id,label
1562,www.ebay.com//24206,www.priceme.co.nz//54,1
8908,www.ebay.com//24206,www.ebay.com//44615,1
11184,www.ebay.com//24206,www.ebay.com//54040,1
13743,www.ebay.com//24206,www.ebay.com//54457,1


In [156]:
submit_df[(submit_df.left_spec_id == 'www.ebay.com//24206') & (submit_df.right_spec_id == 'www.priceme.co.nz//54')]

Unnamed: 0,left_spec_id,right_spec_id
433177,www.ebay.com//24206,www.priceme.co.nz//54


Pass

Check one tn

In [158]:
tn.head()

Unnamed: 0,left_spec_id,right_spec_id,label_true,label_pred
0,www.ebay.com//42074,www.ebay.com//47107,0,0.0
1,www.garricks.com.au//31,www.mypriceindia.com//211,0,0.0
2,www.ebay.com//45946,www.ebay.com//54776,0,0.0
3,www.ebay.com//43019,www.mypriceindia.com//211,0,0.0
4,www.ebay.com//41954,www.ukdigitalcameras.co.uk//130,0,0.0


In [161]:
test_labels_df[test_labels_df.label == 0][test_labels_df.left_spec_id == 'www.ebay.com//42074'][test_labels_df.right_spec_id == 'www.ebay.com//47107']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id,label
5,www.ebay.com//42074,www.ebay.com//47107,0


In [162]:
submit_df[submit_df.left_spec_id == 'www.ebay.com//42074'][submit_df.right_spec_id == 'www.ebay.com//47107']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id


Pass

Check one fp

In [163]:
fp.head()

Unnamed: 0,left_spec_id,right_spec_id,label_true,label_pred
396,www.ebay.com//42074,www.ebay.com//54040,0,1.0
1214,www.ebay.com//42668,www.ebay.com//54040,0,1.0
1856,www.ebay.com//24608,www.priceme.co.nz//54,0,1.0
1900,www.ebay.com//42074,www.priceme.co.nz//54,0,1.0
1989,www.ebay.com//24608,www.ebay.com//54040,0,1.0


In [164]:
test_labels_df[test_labels_df.label == 0][test_labels_df.left_spec_id == 'www.ebay.com//42074'][test_labels_df.right_spec_id == 'www.ebay.com//54040']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id,label
1908,www.ebay.com//42074,www.ebay.com//54040,0


In [165]:
submit_df[submit_df.left_spec_id == 'www.ebay.com//42074'][submit_df.right_spec_id == 'www.ebay.com//54040']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id
360471,www.ebay.com//42074,www.ebay.com//54040


Pass

Check one fn

In [166]:
fn.head()

Unnamed: 0,left_spec_id,right_spec_id,label_true,label_pred
12,www.ebay.com//54040,www.ebay.com//54457,1,0.0
117,www.ebay.com//44670,www.ebay.com//56033,1,0.0
135,www.ebay.com//42074,www.ebay.com//42668,1,0.0
140,www.ebay.com//24280,www.ebay.com//56033,1,0.0
229,www.ebay.com//24608,www.ebay.com//48059,1,0.0


In [168]:
test_labels_df[test_labels_df.label == 1][test_labels_df.left_spec_id == 'www.ebay.com//54040'][test_labels_df.right_spec_id == 'www.ebay.com//54457']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id,label
73,www.ebay.com//54040,www.ebay.com//54457,1


In [169]:
submit_df[submit_df.left_spec_id == 'www.ebay.com//54040'][submit_df.right_spec_id == 'www.ebay.com//54457']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id


pass

# Submit

In [214]:
model = LGBMPipeline(specs_df, labels_df, submit_fpath='../data/submit/submit.csv')
model.train()
model.make_submission()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs_df['spec_idx'] = range(len(self.specs_df))
