In [1]:
import math
import re
import os
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
from urlparse import urlparse
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
from sklearn.preprocessing import StandardScaler

# Only surface TensorFlow messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Keep rendered DataFrames compact: at most 10 rows, floats with one decimal.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

# Labelled links; later cells read the domain, url and label columns.
links_df = pd.read_csv("product_links_df.csv")

In [2]:
from time import time
from random import randint
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score
import pickle


In [3]:
# Number of labelled links loaded from the CSV.
links_df.shape[0]

714

In [4]:
# Preview the first five rows of the raw link table.
links_df.head(n=5)

Unnamed: 0,domain,url,label
0,printpapa.com,http://www.printpapa.com/eshop/pc/Flyer-Half-P...,1
1,printpapa.com,http://www.printpapa.com/eshop/pc/16-Page-Book...,1
2,printpapa.com,http://www.printpapa.com/eshop/pc/28-Page-Book...,1
3,printpapa.com,http://www.printpapa.com/eshop/pc/Flyer-Half-P...,1
4,printpapa.com,http://www.printpapa.com/eshop/pc/28-Page-Book...,1


In [6]:
PATH_LEN_CAP=100

def preprocess_features(df, load_scaler_from_file=False):
    processed_features = df[["url"]].copy()
    processed_features["path"] = processed_features["url"].map(lambda x: urlparse(x).path + urlparse(x).params + urlparse(x).query + urlparse(x).fragment)
    processed_features["path_len"] = processed_features["path"].map(lambda x: min(len(x), PATH_LEN_CAP))
    processed_features["num_hyphen"] = processed_features["path"].map(lambda x: x.count("-") + x.rstrip("/").count("/"))
    #processed_features["num_slash"] = processed_features["path"].map(lambda x: x.rstrip("/").count("/"))
    processed_features["contains_product"] = processed_features["path"].map(lambda x: 1 if "product" in x else 0)
    processed_features["contains_category"] = processed_features["path"].map(lambda x: 1 if "category" in x else 0)
    processed_features["longest_num"] = processed_features["path"].map(lambda x: len(max(re.findall(r'[0-9]+', x), key=len)) if re.search(r'\d', x) else 0)
    cols_to_drop = ['url', 'path']
    processed_features.drop(cols_to_drop, axis=1, inplace=True)
    scaled_features = processed_features.copy()
    col_names = [col for col in processed_features if col not in cols_to_drop and not "contains" in col]
    features = scaled_features[col_names]
    scaler_filename = 'StandardScaler.est'
    if load_scaler_from_file and os.path.isfile(scaler_filename):
        scaler = StandardScaler()
        scaler = StandardScaler().fit(features.values)
        pickle.dump(scaler, open(scaler_filename, 'wb'))
    else:
        scaler = pickle.load(open(scaler_filename, 'rb'))

    features = scaler.transform(features.values)
    scaled_features[col_names] = features
    return scaled_features

def preprocess_targets(df):
  """Prepares target features (i.e., labels) from California housing data set.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A DataFrame that contains the target feature.
  """
  output_targets = pd.DataFrame()
  # Create a boolean categorical feature representing whether the
  # median_house_value is above a set threshold.
  output_targets["label"] = df["label"].astype(int)
  #output_targets["median_house_value_is_high"] = (
  #  california_housing_dataframe["median_house_value"] > 265000).astype(float)
  return output_targets

def get_groups(df):
  return df["domain"].values

In [7]:
# Choose the first 90% of the examples for training.
n_links = len(links_df)
train_len = int(math.floor(0.9*n_links))
validation_len = int(n_links - train_len)
print "train_len", train_len, "validation_len", validation_len

training_input = links_df.head(train_len)
validation_input = links_df.tail(validation_len)

training_input = training_input.reindex(
    np.random.permutation(training_input.index))

validation_input = validation_input.reindex(
    np.random.permutation(validation_input.index))

training_examples = preprocess_features(training_input)
training_targets = preprocess_targets(training_input)
training_groups = get_groups(training_input)

# Choose the last 30% of the examples for validation.
validation_examples = preprocess_features(validation_input)
validation_targets = preprocess_targets(validation_input)

print("Training examples summary:")
display.display(training_examples.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

train_len 642 validation_len 72
Training examples summary:




Unnamed: 0,path_len,num_hyphen,contains_product,contains_category,longest_num
count,642.0,642.0,642.0,642.0,642.0
mean,-0.7,-0.7,0.3,0.1,-0.6
std,1.0,1.0,0.4,0.2,0.7
min,-2.4,-2.1,0.0,0.0,-1.2
25%,-1.5,-1.3,0.0,0.0,-1.2
50%,-1.0,-1.1,0.0,0.0,-0.9
75%,-0.1,-0.2,1.0,0.0,-0.1
max,1.7,4.4,1.0,1.0,2.4


Validation examples summary:


Unnamed: 0,path_len,num_hyphen,contains_product,contains_category,longest_num
count,72.0,72.0,72.0,72.0,72.0
mean,-1.3,-1.1,0.3,0.0,-0.8
std,0.9,0.7,0.5,0.0,0.7
min,-2.4,-2.1,0.0,0.0,-1.2
25%,-1.8,-1.6,0.0,0.0,-1.2
50%,-1.5,-1.3,0.0,0.0,-1.2
75%,-0.8,-0.5,1.0,0.0,-0.8
max,1.7,0.6,1.0,0.0,2.4


Training targets summary:


Unnamed: 0,label
count,642.0
mean,0.5
std,0.5
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Validation targets summary:


Unnamed: 0,label
count,72.0
mean,0.4
std,0.5
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [8]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier


In [9]:
# Preview the first five rows of the shuffled training split.
training_input.head(n=5)

Unnamed: 0,domain,url,label
624,lacelab.com,https://www.lacelab.com/,0
306,darkknightarmoury.com,http://www.darkknightarmoury.com/p-32783-72-in...,1
637,jjill.com,https://www.jjill.com/product/gentle-souls-for...,1
413,seasalt.com,https://www.seasalt.com/recipes/roast-chicken-...,0
410,seasalt.com,https://www.seasalt.com/saltworks-reviews/,0


In [10]:
# Group-aware folds: rows sharing a domain never land in both the train and
# test part of a fold.
gkf = GroupKFold(n_splits=5)

# scikit-learn expects a 1-D label vector; passing the (n, 1) column from the
# targets DataFrame triggers a DataConversionWarning on every fit.
train_labels = training_targets.values.ravel()
validation_labels = validation_targets.values.ravel()

clf = LogisticRegression(solver="lbfgs", C=0.05, penalty="l2").fit(training_examples, train_labels)
sgd_clf = SGDClassifier(loss="log", max_iter=10000, alpha=0.01,
                        learning_rate="optimal").\
                        fit(training_examples, train_labels)

# cross_val_score fits clones, so the estimators fitted above on the full
# training split are left untouched for the validation scores below.
logit_scores = cross_val_score(clf, training_examples,
                               train_labels,
                               cv=gkf, groups=training_groups)
sgd_scores = cross_val_score(sgd_clf, training_examples,
                             train_labels,
                             cv=gkf, groups=training_groups)

print("Logit Accuracy: %0.2f (+/- %0.2f)" % (logit_scores.mean(), logit_scores.std() * 2))
print("SGD Accuracy: %0.2f (+/- %0.2f)" % (sgd_scores.mean(), sgd_scores.std() * 2))

# Accuracy on the held-out validation split.
print("Logit %0.2f" % clf.score(validation_examples, validation_labels))
print("SGD %0.2f" % sgd_clf.score(validation_examples, validation_labels))


  y = column_or_1d(y, warn=True)


Logit Accuracy: 0.80 (+/- 0.08)
SGD Accuracy: 0.82 (+/- 0.09)
Logit 0.88
SGD 0.86


In [11]:
# Persist the fitted SGD classifier; later cells reload it from this path.
model_filename = 'SGDClassifier.est'
# The context manager closes (and flushes) the file even if pickling fails.
with open(model_filename, 'wb') as model_file:
    pickle.dump(sgd_clf, model_file)


In [12]:
# Round-trip check: reload the pickled classifier and score it on the validation split.
# NOTE: unpickling can execute arbitrary code; only load model files written by this notebook.
with open(model_filename, 'rb') as model_file:
    sgd_est = pickle.load(model_file)
sgd_est.score(validation_examples, validation_targets)

0.8611111111111112

In [None]:
# Sample URLs to score with the saved classifier.
# NOTE(review): these look like product pages, presumably expected to be
# predicted as class 1 — confirm against how the labels were assigned.
urls= ["https://www.bobswatches.com/used-rolex-daytona-116500-black-ceramic-bezel-white-dial.html",
"https://www.ebay.com/itm/Samsung-QN75Q8FN-75-Smart-QLED-4K-Ultra-HD-TV-with-HDR/273174766800",
"https://www.etsy.com/listing/581066860/personalized-unique-best-friends-forever",
"https://loveandlinen.co/collections/womens-graphic-tees/products/zihuatanejo-mexico-womens-fit-t-shirt",
"https://24-style.com/products/marine-shark-socks",
"https://www.maxshop.com/shop/essentials/cami-singlets/essential-reversible-cami/blush?refSrc=230128GIF&nosto=productpage-nosto-2",
"https://teeherivar.com/product/i-find-myself-to-be-exorbitantly-superannuated-for-this-feculence",
"https://shop.tilleyangling.com/products/warm-winter-double-fleece-bally",
"https://amzerprint.com/products/hearts-within-a-heart-slim-designer-cover-for-huawei-honor-6a",
"https://www.xfyro.com/products/xfyro-xs2-2-pack-bundle",
"https://www.macys.com/shop/product/circus-by-sam-edelman-kirby-booties-created-for-macys?ID=6636316&CategoryID=13616",
"https://www.lordandtaylor.com/jones-new-york-textured-plaid-coat/product/0500088736033",
"https://www.amazon.com/Linksys-Tri-Band-Intelligent-bedrooms-Multi-Story/dp/B01N2NLNEH?ref_=Oct_DLandingS_PC_NA_NA&smid=ATVPDKIKX0DER",
"https://www.forever21.com/eu/shop/catalog/product/f21/women-new-arrivals/2000291064",
"https://www.target.com/p/58-barn-door-tv-stand-with-side-doors-saracina-home/-/A-53151115?preselect=52182076#lnk=sametab",
"https://jet.com/product/Samsung-Galaxy-Kids-Tab-E-Lite-7-Inch-8GB-Wi-Fi-Tablet-Cream-White/8c0932f035b7495bb7fefcd4c77d19bf?beaconId=e24d0655-917a-448e-8daa-7047326ec99e%2F2%2Fx~8c0932f035b7495bb7fefcd4c77d19bf&experienceId=26",
"https://www.zaful.com/coffee-letter-graphic-sweatshirt-p_617651.html",
"https://www.bedbathandbeyond.com/store/product/crosley-lydia-bath-cabinet/3259113?poc=215225",
"https://www.aeropostale.com/low-rise-bootcut-jean/87084550.html?dwvar_87084550_color=189&cgid=jeans-girls#start=1",
"https://www.customink.com/products/styles/bella-+-canvas-tri-blend-t-shirt/242000",
"https://tepui.com/products/explorer-series-kukenam-3",
"https://voe21.com/collections/all-products-1/products/the-void",
"https://www.boohoo.com/mid-rise-marble-wash-mom-jeans/DZZ66784.html",
"https://faradayscienceshop.com/collections/frontpage/products/air-swimmer-the-remote-controlled-fish-blimp",
"https://lebrontshirtsla.com/products/lebron-james-lakers-t-shirt-witness",
"https://www.mightyape.com.au/product/2tb-wd-elements-portable-harddrive/26826365",
"https://www.the-house.com/vn3voss04fd18zz-vans-t-shirts.html",
"https://www.jcpenney.com/p/xersion-long-sleeve-performance-tee/ppr5007697784?pTmplType=regular&cm_re=ZH-_-hotdeals-_-WOMEN-ACTIVE-DEALZONE%7C5%7C%26rrec%3Dtrue%26rrplacementtype%3Dnorecs&",
"https://throwbackjerseys.com/collections/bryant/products/light-blue-los-angeles-bryant-8-basketball-throwback-jersey",
"http://www.darkknightarmoury.com/p-11380-nobles-leather-arm-bracers.aspx",
"http://www.printpapa.com/eshop/pc/16-Page-Booklet-5-5x8-5-338p1662.htm",
"http://www.robertgraham.us/men/accessories/sunglasses/rob-sebastian-sunglasses-robsebasd570.html",
"https://buffusa.com/buff-products/hats/knitted-polar-hat/agna-sand/117849.302",
"https://elixinol.com/?p=793",
"https://forbiddenplanet.com/107164-very-naughty-boys-amazing-true-story-of-handmade-films/",
"https://gearclub.vitalmtb.com/product/gear-club-box-4-june/",
"https://glorycycles.com/cane-creek-eewings-titanium-crank/",
"https://shop.mochithings.com/products/15768",
"https://telescopes.net/store/baader-planetarium-100mm-aperture-astrosolar-spotter-filter.html",
"https://tkbtrading.com/collections/all-colors/products/passion-orange",
"https://www.6ku.com/collections/parts-accessories/products/6ku-pedal-straps",
"https://www.alexandermcqueen.com/Item/index?cod10=34854999UH&siteCode=ALEXANDERMCQUEEN_US",
"https://www.bestbuyautoequipment.com/Chassis-Liner-16-Revolution-p/cl832000.htm",
"https://www.clarks.in/Amali-Ice-8298.html",
"https://www.deliciousseeds.com/del_en/amnesia-haze-fem-5-seeds.html",
"https://www.dickssportinggoods.com/p/neosport-womens-neoprene-5mm-jumpsuit-16hndwwn5mmnprnjmwst/16hndwwn5mmnprnjmwst",
"https://www.holabirdsports.com/collections/brand-new-babolat-sfx3/products/babolat-sfx3-all-court-mens-black-silver",
"https://www.jjill.com/ClickInfo?URL=%2fproduct%2fpure-jill-peplum-top%3fevtype%3d%26mpe_id%3d%26intv_id%3d%26storeId%3d%26catalogId%3d%26langId%3d%26experimentId%3d%26testElementId%3d%26controlElement%3d%26expDataType%3d%26expDataUniqueID%3d&evtype=CpgnClick&mpe_id=715854148&intv_id=715851192&storeId=10101&catalogId=10051&langId=-1&expDataType=CatalogEntryId&expDataUniqueID=221289",
"https://www.lacelab.com/collections/3m-reflective-rope-laces/products/cove-blue-3m-reflective-rope-laces",
"https://www.livingsocial.com/deals/4-elements-wellness-center-4",
"https://www.melbournesnowboard.com.au/collections/all/products/burton-moto-boa-2019?variant=12657538465853",
"https://www.pckuwait.com/product/intel-i3-8100-8th-generation-core-i3-processor-3-60ghz-6mb-cache/",
"https://www.proclipusa.com/product/835232-proclip-console-mount",
"https://www.rpphobby.com/product_p/asc7430.htm",
"https://www.seasalt.com/alaea-hawaiian-sea-salt-coarse-grain-grinder-jar.html",
"https://www.shopzerouv.com/collections/the-90s/products/retro-geometric-diamond-shape-sunglasses-c748",
"https://www.swimsuitsforall.com/Aquabelle-Medley-Capri#rrec=true",
"https://www.yoogiscloset.com/handbags/3-1-phillip-lim-black-leather-soleil-large-bucket-drawstring-bag-97627.html",
"https://www.spanx.com/leggings/faux-leather-leggings",
"https://usa.tommy.com/en/men/men-shirts/lewis-hamilton-logo-shirt-mw08299",
"https://www.calvinklein.us/en/mens-clothing/mens-featured-shops-calvin-klein-jeans/slim-fit-archive-western-shirt-22705235",
"http://www2.hm.com/en_us/productpage.0476583002.html"]

# URLs whose predicted class-1 probability falls below this value are printed.
LOW_CONFIDENCE_THRESHOLD = 0.55

df = pd.DataFrame.from_records([(url,) for url in urls], columns=["url"])
X = preprocess_features(df, load_scaler_from_file=True)
# NOTE: unpickling can execute arbitrary code; only load model files written by this notebook.
with open(model_filename, 'rb') as model_file:
    sgd_est = pickle.load(model_file)
probs = sgd_est.predict_proba(X.values)
# A distinct loop name keeps `probs` bound to the full probability matrix
# instead of being overwritten by its own rows.
for url, url_probs in zip(urls, probs):
    if url_probs[1] < LOW_CONFIDENCE_THRESHOLD:
        print("%s %s" % (url, url_probs[1]))
