In [121]:
import os
import sys
import random
import json
import collections
import re
from itertools import combinations

import pandas as pd
import numpy as np
import scipy
import statsmodels
from tqdm import trange, tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from lightgbm import LGBMClassifier

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline


pd.options.display.max_columns = 999

sns.set()
plt.rcParams["figure.figsize"] = (10,7)

In [122]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [123]:
from sigmod_src.utils import read_json, pprint_json, get_known_brands
from sigmod_src.data.make_dataset import make_specs_dataset, preprocess_specs_dataset, join_labels_specs, make_classes_df
from sigmod_src.features.build_features import make_features

In [124]:
LG_LABELS_PATH = '../data/raw/sigmod_large_labelled_dataset.csv'
SPECS_PATH = '../data/raw/2013_camera_specs/'

# Load labels df

In [125]:
labels_df = pd.read_csv(LG_LABELS_PATH)
labels_df.shape

(297651, 3)

# Load specs df

In [168]:
specs_dataset_src = make_specs_dataset(SPECS_PATH)

In [181]:
brand_blacklist = ['shoot', 'unbranded/generic', 'as', 
            'eos', 'action', 'new', 'class', 'neopine', 'sharp', 'digital', 
            'telesin']
known_brand_cutoff = 10
brand_cutoff = 5 # Remove brands appear in < brand_cutoff specs
cutoff = 1
max_words = 500

In [182]:
known_brands = get_known_brands(specs_dataset_src, known_brand_cutoff, brand_blacklist)
len(known_brands)

37

In [183]:
specs_df = preprocess_specs_dataset(specs_dataset_src,
                                         max_words=max_words,
                                         cutoff=cutoff,
                                         known_brands=known_brands,
                                        brand_blacklist=brand_blacklist,
                                        brand_cutoff=brand_cutoff)

Conflict. Found: sigma , brand field: nikon  Will use brand field
Conflict. Found: sony , brand field: olympus  Will use brand field
Conflict. Found: sony , brand field: kodak  Will use brand field
Conflict. Found: sony , brand field: olympus  Will use brand field
Conflict. Found: bell , brand field: digital  Will use brand field
Conflict. Found: samsung , brand field: vivitar  Will use brand field
Conflict. Found: olympus , brand field: panasonic  Will use brand field
Conflict. Found: canon , brand field: fujifilm  Will use brand field
Conflict. Found: sigma , brand field: nikon  Will use brand field
Conflict. Found: sony , brand field: olympus  Will use brand field
Conflict. Found: gopro , brand field: ion  Will use brand field
Conflict. Found: konica , brand field: minolta  Will use brand field
Conflict. Found: canon , brand field: casio  Will use brand field
Conflict. Found: kodak , brand field: vivitar  Will use brand field
Conflict. Found: sigma , brand field: nikon  Will use bra

In [184]:
specs_df.head()

Unnamed: 0,spec_id,page_title,brand,all_text,page_title_stem,all_text_stem,site
0,www.ebay.com//57656,canon powershot elph 110 hs 16 1 mp digital,canon,canon powershot 110 hs 16 1 mp digital canon r...,canon powershot elph 110 hs 16 1 mp digit,canon powershot 110 hs 16 1 mp digit canon ref...,www.ebay.com
1,www.ebay.com//60583,canon rebel 2000 35 mm great case instruction ...,canon,canon rebel 2000 35 mm great case instruction ...,canon rebel 2000 35 mm great case instruct boo...,canon rebel 2000 35 mm great case instruct boo...,www.ebay.com
2,www.ebay.com//60440,canon eos rebel t3i digital slr 18 55mm 75 300...,canon,canon eos rebel t3i digital slr 18 55mm 75 300...,canon eo rebel t3i digit slr 18 55mm 75 300mm ...,canon eo rebel t3i digit slr 18 55mm 75 300mm ...,www.ebay.com
3,www.ebay.com//24139,ge c1033 10 1 mp digital 3x zoom 2 4 lcd,ge,ge c1033 10 1 mp digital 3x zoom 2 4 ge brand-...,ge c1033 10 1 mp digit 3x zoom 2 4 lcd,ge c1033 10 1 mp digit 3x zoom 2 4 ge brand-ne...,www.ebay.com
4,www.ebay.com//54903,vivitar clip shot digital 1 1 mp,vivitar,vivitar clip shot digital 1 1 mp vivitar brand...,vivitar clip shot digit 1 1 mp,vivitar clip shot digit 1 1 mp vivitar brand-n...,www.ebay.com


In [188]:
specs_df.sample(5).page_title_stem.values

array(['panason hc v750 india bangalor hyderabad delhi chennai mumbai pune kolkatta',
       'oem protect eva bag buckl portabl eva case eva bag oem protect eva bag buckl portabl eva case alibaba com',
       'telesin gopro silicon case go pro hero4 3 green go pro gopro silicon case gopro silicon case hero 3 4 gopro silicon case hero 4 alibaba com',
       'soni cyber shot dsc rx100 iii rx100m3 carl zeiss vario sonnar digit',
       'canon eo 60d digit slr 2 pack batteri sale 30 90'], dtype=object)

# Test pipeline

In [189]:
from sigmod_src.pipeline import LGBMPipeline

In [190]:
model = LGBMPipeline(specs_df, labels_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs = specs_df.values


In [191]:
model.precompute()

In [192]:
model.tfidf.head()

Unnamed: 0,page_title_stem__00,page_title_stem__0mp,page_title_stem__10,page_title_stem__10 mp,page_title_stem__1080p,page_title_stem__1080p hd,page_title_stem__1080p ip,page_title_stem__12,page_title_stem__12 mp,page_title_stem__12mp,page_title_stem__135mm,page_title_stem__14,page_title_stem__14 42mm,page_title_stem__14 mp,page_title_stem__15,page_title_stem__16,page_title_stem__16 megapixel,page_title_stem__16 mp,page_title_stem__16gb,page_title_stem__16mp,page_title_stem__18,page_title_stem__18 55,page_title_stem__18 55mm,page_title_stem__18 mp,page_title_stem__1mp,page_title_stem__1mp digit,page_title_stem__20,page_title_stem__20 mp,page_title_stem__200mm,page_title_stem__2014,page_title_stem__2014 spec,page_title_stem__20x,page_title_stem__24,page_title_stem__24 mp,page_title_stem__264,page_title_stem__2cd2112,page_title_stem__2cd2532f,page_title_stem__2cd2632f,page_title_stem__2cd2632f is,page_title_stem__2cd3332,page_title_stem__2mp,page_title_stem__2mp digit,page_title_stem__300,page_title_stem__300mm,page_title_stem__3megapixel,page_title_stem__3mp,page_title_stem__3x,page_title_stem__42mm,page_title_stem__4x,page_title_stem__50,page_title_stem__50mm,page_title_stem__55,page_title_stem__55mm,page_title_stem__55mm len,page_title_stem__5d,page_title_stem__5d mark,page_title_stem__5mp,page_title_stem__5x,page_title_stem__600d,page_title_stem__60d,page_title_stem__70,page_title_stem__700tvl,page_title_stem__720p,page_title_stem__75,page_title_stem__7d,page_title_stem__99,page_title_stem__accessori,page_title_stem__action,page_title_stem__af,page_title_stem__af dx,page_title_stem__ahd,page_title_stem__ahmedabad,page_title_stem__ahmedabad surat,page_title_stem__alarm,page_title_stem__alibaba,page_title_stem__alibaba com,page_title_stem__alpha,page_title_stem__alpha nex,page_title_stem__australia,page_title_stem__bag,page_title_stem__bag alibaba,page_title_stem__bag bag,page_title_stem__bag case,page_title_stem__bag digit,page_title_stem__bag waterproof,page_title_stem__bangalor,page_title_stem__bangalor hyderabad,page_title_stem__batteri,page_title_stem__best,page_title_stem__best india,page_title_stem__bodi,page_title_stem__box,page_title_stem__bullet,page_title_stem__bundl,page_title_stem__buzzillion,page_title_stem__buzzillion com,page_title_stem__camcord,page_title_stem__camera,page_title_stem__camera australia,page_title_stem__camera canon,page_title_stem__camera digit,page_title_stem__cammarkt,page_title_stem__canon,page_title_stem__canon eo,page_title_stem__canon powershot,page_title_stem__car,page_title_stem__card,page_title_stem__card slot,page_title_stem__carri,page_title_stem__carri case,page_title_stem__case,page_title_stem__case alibaba,page_title_stem__case bag,page_title_stem__case canon,page_title_stem__case case,page_title_stem__case digit,page_title_stem__case eva,page_title_stem__case gopro,page_title_stem__case hard,page_title_stem__case waterproof,page_title_stem__casio,page_title_stem__casio exilim,page_title_stem__ccd,page_title_stem__cctv,page_title_stem__cctv alibaba,page_title_stem__cctv camera,page_title_stem__cctv hikvis,page_title_stem__cctv ip,page_title_stem__charger,page_title_stem__cheap,page_title_stem__chennai,page_title_stem__chennai kolkata,page_title_stem__chennai mumbai,page_title_stem__china,page_title_stem__cmos,page_title_stem__color,page_title_stem__com,page_title_stem__compact,page_title_stem__compact digit,page_title_stem__comparison,page_title_stem__comparison digit,page_title_stem__comparison net,page_title_stem__connect,page_title_stem__coolpix,page_title_stem__cover,page_title_stem__custom,page_title_stem__cyber,page_title_stem__cyber shot,page_title_stem__cybershot,page_title_stem__cybershot dsc,page_title_stem__d3100,page_title_stem__d3200,page_title_stem__dahua,page_title_stem__dahua ip,page_title_stem__dahua ptz,page_title_stem__delhi,page_title_stem__delhi chennai,page_title_stem__delhi mumbai,page_title_stem__design,page_title_stem__dh,page_title_stem__digit,page_title_stem__digit 16,page_title_stem__digit alibaba,page_title_stem__digit bag,page_title_stem__digit bodi,page_title_stem__digit buzzillion,page_title_stem__digit camera,page_title_stem__digit case,page_title_stem__digit compact,page_title_stem__digit elph,page_title_stem__digit ixus,page_title_stem__digit kit,page_title_stem__digit price,page_title_stem__digit sale,page_title_stem__digit slr,page_title_stem__digit video,page_title_stem__digit waterproof,page_title_stem__digit zoom,page_title_stem__dmc,page_title_stem__dome,page_title_stem__dome alibaba,page_title_stem__dome hikvis,page_title_stem__dome ip,page_title_stem__ds,page_title_stem__ds 2cd2532f,page_title_stem__ds 2cd2632f,page_title_stem__ds 2cd3332,page_title_stem__dsc,page_title_stem__dslr,page_title_stem__dslr bag,page_title_stem__dslr bodi,page_title_stem__dslr slr,page_title_stem__dual,page_title_stem__dvr,page_title_stem__dx,page_title_stem__easyshar,page_title_stem__ef,page_title_stem__ef 18,page_title_stem__elph,page_title_stem__eo,page_title_stem__eo 5d,page_title_stem__eo 7d,page_title_stem__eo rebel,page_title_stem__europ,page_title_stem__eva,page_title_stem__eva case,page_title_stem__ex,page_title_stem__excel,page_title_stem__exilim,page_title_stem__extra,page_title_stem__factori,page_title_stem__fashion,page_title_stem__finepix,page_title_stem__free,page_title_stem__fujifilm,page_title_stem__fujifilm finepix,page_title_stem__full,page_title_stem__full hd,page_title_stem__full specif,page_title_stem__gopro,page_title_stem__hard,page_title_stem__hd,page_title_stem__hd 1080p,page_title_stem__hd ip,page_title_stem__hero,page_title_stem__hidden,page_title_stem__high,page_title_stem__high qualiti,page_title_stem__hikvis,page_title_stem__hikvis 1080p,page_title_stem__hikvis 3mp,page_title_stem__hikvis alibaba,page_title_stem__hikvis cctv,page_title_stem__hikvis ds,page_title_stem__hikvis hikvis,page_title_stem__hikvis ip,page_title_stem__hikvis ip66,page_title_stem__hikvis ir,page_title_stem__hikvis network,page_title_stem__hikvis weatherproof,page_title_stem__hn,page_title_stem__home,page_title_stem__hot,page_title_stem__hous,page_title_stem__hs,page_title_stem__hunt,page_title_stem__hyderabad,page_title_stem__hyderabad chennai,page_title_stem__hyderabad delhi,page_title_stem__i5,page_title_stem__ii,page_title_stem__iii,page_title_stem__imag,page_title_stem__inch,page_title_stem__india,page_title_stem__india 2014,page_title_stem__india bangalor,page_title_stem__india offer,page_title_stem__india shopmania,page_title_stem__indoor,page_title_stem__infrar,page_title_stem__inspect,page_title_stem__ip,page_title_stem__ip alibaba,page_title_stem__ip cctv,page_title_stem__ip dahua,page_title_stem__ip dome,page_title_stem__ip ds,page_title_stem__ip hikvis,page_title_stem__ip ip,page_title_stem__ip network,page_title_stem__ip poe,page_title_stem__ip ptz,page_title_stem__ip66,page_title_stem__ipc,page_title_stem__ir,page_title_stem__ir alibaba,page_title_stem__ir bullet,page_title_stem__ir dome,page_title_stem__ir ip,page_title_stem__ir network,page_title_stem__ir ptz,page_title_stem__is,page_title_stem__ixus,page_title_stem__japan,page_title_stem__kit,page_title_stem__kit 18,page_title_stem__kit af,page_title_stem__kit comparison,page_title_stem__kit ef,page_title_stem__kodak,page_title_stem__kodak easyshar,page_title_stem__kolkata,page_title_stem__kolkata ahmedabad,page_title_stem__kolkatta,page_title_stem__lcd,page_title_stem__leather,page_title_stem__leather bag,page_title_stem__leather case,page_title_stem__led,page_title_stem__leica,page_title_stem__len,page_title_stem__len kit,page_title_stem__lens,page_title_stem__low,page_title_stem__lumix,page_title_stem__lumix dmc,page_title_stem__mark,page_title_stem__mark ii,page_title_stem__mega,page_title_stem__mega pixel,page_title_stem__megapixel,page_title_stem__megapixel digit,page_title_stem__megapixel ip,page_title_stem__memori,page_title_stem__memori card,page_title_stem__metal,page_title_stem__micro,page_title_stem__mini,page_title_stem__mini dome,page_title_stem__mint,page_title_stem__mirrorless,page_title_stem__mm,page_title_stem__mode,page_title_stem__model,page_title_stem__monopod,page_title_stem__motion,page_title_stem__mount,page_title_stem__mp,page_title_stem__mp digit,page_title_stem__mumbai,page_title_stem__mumbai bangalor,page_title_stem__mumbai pune,page_title_stem__neopren,page_title_stem__net,page_title_stem__network,page_title_stem__network alibaba,page_title_stem__network hikvis,page_title_stem__network ip,page_title_stem__nex,page_title_stem__night,page_title_stem__night vision,page_title_stem__nikkor,page_title_stem__nikon,page_title_stem__nikon coolpix,page_title_stem__nikon d3200,page_title_stem__nylon,page_title_stem__offer,page_title_stem__offer full,page_title_stem__olympus,page_title_stem__olympus pen,page_title_stem__olympus stylus,page_title_stem__onlin,page_title_stem__onlin india,page_title_stem__onvif,page_title_stem__optic,page_title_stem__optic zoom,page_title_stem__outdoor,page_title_stem__outdoor ip,page_title_stem__p2p,page_title_stem__panason,page_title_stem__panason lumix,page_title_stem__part,page_title_stem__pc,page_title_stem__pc connect,page_title_stem__pen,page_title_stem__pentax,page_title_stem__phone,page_title_stem__photo,page_title_stem__pink,page_title_stem__pipe,page_title_stem__pixel,page_title_stem__plastic,page_title_stem__poe,page_title_stem__point,page_title_stem__point shoot,page_title_stem__portabl,page_title_stem__pouch,page_title_stem__power,page_title_stem__powershot,page_title_stem__powershot digit,page_title_stem__powershot ixus,page_title_stem__price,page_title_stem__price hunt,page_title_stem__price pricem,page_title_stem__price shop,page_title_stem__pricedekho,page_title_stem__pricedekho com,page_title_stem__pricem,page_title_stem__pro,page_title_stem__product,page_title_stem__profession,page_title_stem__proof,page_title_stem__protect,page_title_stem__provid,page_title_stem__ptz,page_title_stem__ptz dahua,page_title_stem__ptz dome,page_title_stem__ptz ip,page_title_stem__pu,page_title_stem__pune,page_title_stem__pune kolkatta,page_title_stem__qualiti,page_title_stem__rebel,page_title_stem__rebel t3i,page_title_stem__record,page_title_stem__review,page_title_stem__review valid,page_title_stem__sale,page_title_stem__samsung,page_title_stem__screen,page_title_stem__sd,page_title_stem__sd card,page_title_stem__sdhc,page_title_stem__sdxc,page_title_stem__secur,page_title_stem__secur system,page_title_stem__sell,page_title_stem__sensor,page_title_stem__seri,page_title_stem__shockproof,page_title_stem__shoot,page_title_stem__shoot digit,page_title_stem__shop,page_title_stem__shop comparison,page_title_stem__shopmania,page_title_stem__shot,page_title_stem__shot dsc,page_title_stem__shoulder,page_title_stem__slot,page_title_stem__slot comparison,page_title_stem__slr,page_title_stem__slr 18,page_title_stem__slr bag,page_title_stem__slr bodi,page_title_stem__slr digit,page_title_stem__slr kit,page_title_stem__slt,page_title_stem__small,page_title_stem__smart,page_title_stem__soni,page_title_stem__soni alpha,page_title_stem__soni cyber,page_title_stem__soni cybershot,page_title_stem__soni dsc,page_title_stem__spec,page_title_stem__spec review,page_title_stem__specif,page_title_stem__specif pricedekho,page_title_stem__speed,page_title_stem__speed dome,page_title_stem__sport,page_title_stem__stm,page_title_stem__stylus,page_title_stem__surat,page_title_stem__surat price,page_title_stem__surveil,page_title_stem__system,page_title_stem__t3,page_title_stem__t3i,page_title_stem__tablet,page_title_stem__today,page_title_stem__today pc,page_title_stem__top,page_title_stem__top 10,page_title_stem__tv,page_title_stem__underwat,page_title_stem__usa,page_title_stem__usb,page_title_stem__valid,page_title_stem__valid delhi,page_title_stem__vandal,page_title_stem__vandal proof,page_title_stem__vf,page_title_stem__video,page_title_stem__vintag,page_title_stem__vision,page_title_stem__vivitar,page_title_stem__vr,page_title_stem__walmart,page_title_stem__walmart com,page_title_stem__watch,page_title_stem__waterproof,page_title_stem__waterproof bag,page_title_stem__waterproof case,page_title_stem__waterproof digit,page_title_stem__weatherproof,page_title_stem__weatherproof ir,page_title_stem__wholesal,page_title_stem__wide,page_title_stem__wifi,page_title_stem__wifi ip,page_title_stem__wireless,page_title_stem__wireless ip,page_title_stem__work,page_title_stem__zealand,page_title_stem__zealand price,page_title_stem__zoom,page_title_stem__zoom digit,page_title_stem__zoom len
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320089,0.0,0.355527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237495,0.0,0.320332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.448669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.45401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19959,0.216759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.381781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.531997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161578,0.0,0.198561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127672,0.168405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166955,0.0,0.0,0.220921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158466,0.261501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20747,0.290315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14246,0.273214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.283305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.270561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.361063,0.401264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208428,0.226357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34762,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.772321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [193]:
x = model.make_X([0, 2], [1, 3])
x.shape

(2, 15)

In [194]:
model.labels

array([1, 0, 0, ..., 0, 0, 0])

In [195]:
model.train()

Precomputing
Making features
Fitting model


In [196]:
train_left_spec_idx = model.specs_id_to_idx[labels_df['left_spec_id']]
train_right_spec_idx = model.specs_id_to_idx[labels_df['right_spec_id']]

train_X = model.make_X(train_left_spec_idx, train_right_spec_idx)

assert train_X.shape[0] == labels_df.shape[0]

pred_train = model.clf.predict(train_X)

print('Train F1', f1_score(labels_df.label, pred_train))
print(classification_report(labels_df.label, pred_train))

Train F1 0.9999886462981254
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    253612
           1       1.00      1.00      1.00     44039

    accuracy                           1.00    297651
   macro avg       1.00      1.00      1.00    297651
weighted avg       1.00      1.00      1.00    297651



Sanity check passed

In [83]:
model.make_submission()

In [84]:
submit_df = pd.read_csv('../data/submit/submit.csv')
submit_df.shape

(3635519, 2)

In [87]:
specs_df.index = specs_df.spec_id

In [103]:
row = submit_df.sample(1)

print('Left\n', specs_df.loc[row.left_spec_id].all_text.values[0])
print('')
print('Right\n', specs_df.loc[row.right_spec_id].all_text.values[0])

Left
 canon powershot s5 8 0 mp canon may signs cosmetic wear fully operational functions intended may floor model return full description imperfections mp s5 2077b005 12x point shoot

Right
 canon powershot sx50 hs 12 1 mp w 4gb wifi sd card 2 batteries canon extra battery memory card memory reader strap neck wrist may signs cosmetic wear fully operational functions intended may floor model return full description imperfections mp sx50 hs 6352b001 50x bridge


# CV

In [197]:
def get_split(classes_df, labels_df, test_classes=5):
    classes = classes_df.class_.unique()
    test_classes = np.random.choice(classes, test_classes)
    
    test_spec_ids = classes_df[classes_df.class_.isin(test_classes)].spec_id
    
    test_label_mask = labels_df.left_spec_id.isin(test_spec_ids) | labels_df.right_spec_id.isin(test_spec_ids)
    
    test_idx = np.array(labels_df[test_label_mask].index)
    train_idx = np.array(labels_df[~test_label_mask].index)
    
    return train_idx, test_idx

def evaluate_submit(submit_df, test_labels_df):
    submit_df = submit_df.copy()
    submit_df['label'] = 1
    
    merged = test_labels_df.merge(submit_df, on=['left_spec_id', 'right_spec_id'], how='left', suffixes = ('_true', '_pred'))
    merged['label_pred'] = merged['label_pred'].fillna(0)
    
    tp = merged[(merged.label_true == 1) & (merged.label_pred == 1)]
    tn = merged[(merged.label_true == 0) & (merged.label_pred == 0)]
    
    fp = merged[(merged.label_true == 0) & (merged.label_pred == 1)]
    fn = merged[(merged.label_true == 1) & (merged.label_pred == 0)]
    
    precision = tp.shape[0]/(tp.shape[0]+fp.shape[0])
    recall = tp.shape[0]/(tp.shape[0]+fn.shape[0])
    
    f1 = 2 * (precision * recall)/(precision+recall)
    
    return f1, tp, tn, fp, fn

def cross_validate_pipeline(pipeline_class, specs_df, labels_df, n_folds=5, test_classes=5):
    # Obtain triangles from label and number them
    classes_df = make_classes_df(labels_df)
    folds = []
    submit_fpaths = []
    scores = []
    for i_fold in tqdm(range(n_folds)):
        print('Fold', i_fold)
        labels_train_idx, labels_test_idx = get_split(classes_df, labels_df, test_classes=test_classes)
        folds.append( (labels_train_idx, labels_test_idx) )
        
        train_labels = labels_df.iloc[labels_train_idx].copy()
        test_labels = labels_df.iloc[labels_test_idx].copy()
        
        submit_fpath = f'../data/interim/cv_submit_fold{i_fold}.csv'
        submit_fpaths.append(submit_fpath)
        
        model = pipeline_class(specs_df, train_labels, submit_fpath=submit_fpath)
        print('training model')
        model.train()
        
        print('generating submit')
        model.make_submission()
        
        print('evaluating submit')
        f1score, _, _, _, _ = evaluate_submit(pd.read_csv(model.submit_fpath), test_labels)
        scores.append(f1score)
        print('Score', f1score)
    return folds, submit_fpaths, scores

In [198]:
# np.random.seed(42)
np.random.seed(None)
folds, submit_fpaths, scores = cross_validate_pipeline(LGBMPipeline, specs_df, labels_df, n_folds=3)

Fold 0
training model
Precomputing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs = specs_df.values


Making features
Fitting model
generating submit


evaluating submit
Fold 1
training model
Precomputing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs = specs_df.values


Making features
Fitting model
generating submit


evaluating submit
Fold 2
training model
Precomputing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs = specs_df.values


Making features
Fitting model
generating submit


evaluating submit


In [199]:
scores

[0.7228915662650602, 0.6289752650176679, 0.4685714285714285]

In [200]:
np.mean(scores)

0.6068127532847188

## Test evaluation scheme

In [107]:

    
    
test_labels_df = labels_df.iloc[folds[0][1]]
submit_df = pd.read_csv(submit_fpaths[0])

score, tp, tn, fp, fn = evaluate_submit(submit_df, test_labels_df)
score

0.5037468776019983

Test one true positive

In [108]:
tp.head()

Unnamed: 0,left_spec_id,right_spec_id,label_true,label_pred
137,www.ebay.com//54255,www.ebay.com//57722,1,1.0
191,www.ebay.com//45184,www.ebay.com//57071,1,1.0
202,www.ebay.com//54892,www.ebay.com//55380,1,1.0
361,www.shopbot.com.au//388,www.shopmania.in//1311,1,1.0
437,www.ebay.com//46185,www.ebay.com//56851,1,1.0


In [109]:
test_labels_df[test_labels_df.label == 1][test_labels_df.left_spec_id == 'www.ebay.com//24206']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id,label


In [156]:
submit_df[(submit_df.left_spec_id == 'www.ebay.com//24206') & (submit_df.right_spec_id == 'www.priceme.co.nz//54')]

Unnamed: 0,left_spec_id,right_spec_id
433177,www.ebay.com//24206,www.priceme.co.nz//54


Pass

Check one tn

In [158]:
tn.head()

Unnamed: 0,left_spec_id,right_spec_id,label_true,label_pred
0,www.ebay.com//42074,www.ebay.com//47107,0,0.0
1,www.garricks.com.au//31,www.mypriceindia.com//211,0,0.0
2,www.ebay.com//45946,www.ebay.com//54776,0,0.0
3,www.ebay.com//43019,www.mypriceindia.com//211,0,0.0
4,www.ebay.com//41954,www.ukdigitalcameras.co.uk//130,0,0.0


In [161]:
test_labels_df[test_labels_df.label == 0][test_labels_df.left_spec_id == 'www.ebay.com//42074'][test_labels_df.right_spec_id == 'www.ebay.com//47107']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id,label
5,www.ebay.com//42074,www.ebay.com//47107,0


In [162]:
submit_df[submit_df.left_spec_id == 'www.ebay.com//42074'][submit_df.right_spec_id == 'www.ebay.com//47107']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id


Pass

Check one fp

In [163]:
fp.head()

Unnamed: 0,left_spec_id,right_spec_id,label_true,label_pred
396,www.ebay.com//42074,www.ebay.com//54040,0,1.0
1214,www.ebay.com//42668,www.ebay.com//54040,0,1.0
1856,www.ebay.com//24608,www.priceme.co.nz//54,0,1.0
1900,www.ebay.com//42074,www.priceme.co.nz//54,0,1.0
1989,www.ebay.com//24608,www.ebay.com//54040,0,1.0


In [164]:
test_labels_df[test_labels_df.label == 0][test_labels_df.left_spec_id == 'www.ebay.com//42074'][test_labels_df.right_spec_id == 'www.ebay.com//54040']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id,label
1908,www.ebay.com//42074,www.ebay.com//54040,0


In [165]:
submit_df[submit_df.left_spec_id == 'www.ebay.com//42074'][submit_df.right_spec_id == 'www.ebay.com//54040']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id
360471,www.ebay.com//42074,www.ebay.com//54040


Pass

Check one fn

In [166]:
fn.head()

Unnamed: 0,left_spec_id,right_spec_id,label_true,label_pred
12,www.ebay.com//54040,www.ebay.com//54457,1,0.0
117,www.ebay.com//44670,www.ebay.com//56033,1,0.0
135,www.ebay.com//42074,www.ebay.com//42668,1,0.0
140,www.ebay.com//24280,www.ebay.com//56033,1,0.0
229,www.ebay.com//24608,www.ebay.com//48059,1,0.0


In [168]:
test_labels_df[test_labels_df.label == 1][test_labels_df.left_spec_id == 'www.ebay.com//54040'][test_labels_df.right_spec_id == 'www.ebay.com//54457']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id,label
73,www.ebay.com//54040,www.ebay.com//54457,1


In [169]:
submit_df[submit_df.left_spec_id == 'www.ebay.com//54040'][submit_df.right_spec_id == 'www.ebay.com//54457']

  """Entry point for launching an IPython kernel.


Unnamed: 0,left_spec_id,right_spec_id


pass

# Submit

In [202]:
model = LGBMPipeline(specs_df, labels_df, submit_fpath='../data/submit/submit.csv')
model.train()
model.make_submission()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.specs_df['spec_idx'] = range(len(self.specs_df))


Precomputing
Making features
Fitting model


In [224]:
submit_df = pd.read_csv('../data/submit/submit.csv')
submit_df.shape

(360185, 2)

In [225]:
specs_df.index = specs_df.spec_id

In [226]:
row = submit_df.sample(1)

print('Left\n', specs_df.loc[row.left_spec_id].all_text.values[0])
print('')
print('Right\n', specs_df.loc[row.right_spec_id].all_text.values[0])

Left
 d300 12 3 mp digital slr body extra battery extra battery charger may signs cosmetic wear fully operational functions intended may floor model return full description imperfections definitions 12 3 mp d300 25432 3 digital slr

Right
 gorgeous d300 12 3mp bundle 2batts charger strap 10 actuations extra battery memory card extra battery charger strap neck wrist may signs cosmetic wear fully operational functions intended may floor model return full description imperfections definitions 12 3 mp d300 25432 3 digital slr
