In [1]:
%matplotlib inline
# Importing libraries
import matplotlib.pyplot as plt
import pandas as pd

# Directory and file name of the dataset CSV (relative path).
FILEPATH_PREFIX = '../../../datasets'
FILENAME = 'bookoutlet.csv'
# Join both parts with a single forward slash; the bare name on the last
# line echoes the resulting path as the cell output.
FILEPATH = '/'.join((FILEPATH_PREFIX, FILENAME))
FILEPATH

'../../../datasets/bookoutlet.csv'

In [2]:
# Load the CSV located at FILEPATH into the working DataFrame.
df = pd.read_csv(FILEPATH)

## Data analysis
Some preliminary analysis of the dataset

In [3]:
# Banner followed by a preview of the first five rows (rich cell output).
print("First 5 rows", "------------", sep="\n")
df.head(5)

First 5 rows
------------


Unnamed: 0,url,referer_url,src,shingle_vector
0,https://bookoutlet.com/,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 4, 0, 1, 3, 0)"
1,https://bookoutlet.com/Store/Sale,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 0, 0, 1, 3, 0)"
2,https://bookoutlet.com/Store/OtherBrowsing,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 8, 1, 1, 1, 0)"
3,https://bookoutlet.com/Store/Browse?N=isTopTen...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)"
4,https://bookoutlet.com/Store/Browse?N=isGiftCe...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)"


In [4]:
# Banner followed by the (rows, columns) tuple of the frame.
print("No. of rows and columns", "-----------------------", sep="\n")
df.shape

No. of rows and columns
-----------------------


(16387, 4)

In [5]:
# True if at least one cell anywhere in the frame is missing.
print("Check null values", "-----------------", sep="\n")
df.isna().any().any()

Check null values
-----------------


False

In [6]:
# True if some url occurs more than once, i.e. the number of distinct urls
# differs from the number of rows.
print("Check duplicate values", "----------------------", sep="\n")
df['url'].nunique(dropna=False) != len(df)

Check duplicate values
----------------------


False

In [7]:
# Banner followed by the per-column dtype / non-null summary.
print("DataFrame column types", "----------------------", sep="\n")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16387 entries, 0 to 16386
Data columns (total 4 columns):
url               16387 non-null object
referer_url       16387 non-null object
src               16387 non-null object
shingle_vector    16387 non-null object
dtypes: object(4)
memory usage: 512.2+ KB


In [8]:
# Banner followed by the descriptive statistics of every column.
print("Some stats", "----------------", sep="\n")
df.describe()

Some stats
----------------


Unnamed: 0,url,referer_url,src,shingle_vector
count,16387,16387,16387,16387
unique,16387,8050,16387,11
top,https://bookoutlet.com/Store/Browse?Na=140870&...,https://bookoutlet.com/Store/Browse?Npb=2279,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)"
freq,1,31,1,6138


## Page labelling
Label each page, where possible, with the template that generated it. Let's start with product (book) pages.
The following URL links to a page which shows info regarding a book:

```
https://bookoutlet.com/Store/Details/9781426310492B
```

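From this we can infer that each product page URL follows the schema below (as the sample URLs tested further down show, the identifier carries a trailing `B` and may be followed by an optional title slug):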

```
https://bookoutlet.com/Store/Details/<ISBN-13>
```

In [10]:
import re

# URLs that point at product (book detail) pages.
testStrings1 = [
    'https://bookoutlet.com/Store/Details/9781426310492B',
    'https://bookoutlet.com/Store/Details/9780789331106B/1000-football-clubs-teams-stadiums-and-legend',
    'https://bookoutlet.com/Store/Details/9781481470780B/little-bigfoot-big-city-the-littlest-bigfoot',
    'https://bookoutlet.com/Store/Details/9780465050659B/the-design-of-everyday-things-revised-and-exp',
    'https://bookoutlet.com/Store/Details/9781401301958B/jamies-italy',
    'https://bookoutlet.com/Store/Details/9780306820748B/devoted-the-story-of-a-fathers-love-for-his-s',
]

# Assorted URLs that are NOT product pages.
testStrings2 = [
    'https://bookoutlet.com/Store/Browse?N=isGiftCertificate',
    'https://bookoutlet.com/Store/Browse?N=isTopTwoHundred&Nq=0',
    'https://bookoutlet.com/Store/Browse?N=isTopTwoHundred&Nq=0&size=24&sort=arrival_1&Nc=22',
    'https://bookoutlet.com/Store/Social/#Instagram',
    'https://bookoutlet.com/Loyalty',
    'https://bookoutlet.com/Store/Search?qf=All&q=game+of+thrones',
    'https://bookoutlet.com/Store/Search?qf=All&q=',
    'https://bookoutlet.com/Store/Browse?Na=176493',
    'https://bookoutlet.com/Account/Login',
    'https://bookoutlet.com/Store/Browse',
    'https://bookoutlet.com/Help/OurProduct',
    'https://bookoutlet.com/Cart/Details',
    'https://bookoutlet.com/Store/Social/#Reviews',
]

# Product URLs first, so that "the first 6" in the banner below holds.
testStrings = testStrings1 + testStrings2

# A product page is anything with at least one character after /Store/Details/.
productRegex = r'^https\:\/\/bookoutlet\.com\/Store\/Details\/.+'

# simple tests
# NOTE(review): the banner says "article regex" although the pattern under
# test is productRegex; the text is kept as-is so it matches the recorded output.
print("Testing article regex:")
print("Only the first 6 urls should be matching with the article regex")
print("---------------------------------------------------------------")

for uri in testStrings:
    print(re.match(productRegex, uri))

print('---------------------------------------------------------------')

# Enforce the expectation stated in the banner so that a regression in the
# pattern fails the cell instead of relying on a visual check of the output.
assert all(re.match(productRegex, uri) for uri in testStrings1), \
    'productRegex must match every product url'
assert not any(re.match(productRegex, uri) for uri in testStrings2), \
    'productRegex must not match any non-product url'

Testing article regex:
Only the first 6 urls should be matching with the article regex
---------------------------------------------------------------
<re.Match object; span=(0, 51), match='https://bookoutlet.com/Store/Details/978142631049>
<re.Match object; span=(0, 97), match='https://bookoutlet.com/Store/Details/978078933110>
<re.Match object; span=(0, 96), match='https://bookoutlet.com/Store/Details/978148147078>
<re.Match object; span=(0, 97), match='https://bookoutlet.com/Store/Details/978046505065>
<re.Match object; span=(0, 64), match='https://bookoutlet.com/Store/Details/978140130195>
<re.Match object; span=(0, 97), match='https://bookoutlet.com/Store/Details/978030682074>
None
None
None
None
None
None
None
None
None
None
None
None
None
---------------------------------------------------------------


Next we have to label the pages that show lists of books. Books on bookoutlet.com are grouped by category, so books that share a category are listed together on the same paginated page, for example:

```
https://bookoutlet.com/Store/Browse?Nc=5

```

So URLs referring to categories seem to have the following schema:

```
https://bookoutlet.com/Store/Browse<? symbol or / symbol or nothing><something else>
```

Note that search results are also paginated:

```
https://bookoutlet.com/Store/Search?qf=All&q=
```

In that case the url schema follows this pattern:
```
https://bookoutlet.com/Store/Search?<something else>
```

In [12]:
# Catalog (list) pages: /Store/Browse followed by '?', '/' or end of string,
# or /Store/Search followed by '?'.
catalogRegex = r'^https\:\/\/bookoutlet\.com\/Store\/(Browse(\?|\/|$)|Search\?)'

# NOTE(review): the banner says "bestsellers regex" although the pattern under
# test is catalogRegex; the text is kept as-is so it matches the recorded output.
print("Testing catalog regex:")
print("Only 7 urls should be matching with the bestsellers regex")
print("-----------------------------------------------------------")

for uri in testStrings:
    print(re.match(catalogRegex, uri))
print('---------------------------------------------------------------')

# Enforce the expectation stated in the banner: no product url matches and
# exactly 7 of the non-product urls are recognised as catalog pages.
assert not any(re.match(catalogRegex, uri) for uri in testStrings1), \
    'catalogRegex must not match product urls'
assert sum(1 for uri in testStrings2 if re.match(catalogRegex, uri)) == 7, \
    'catalogRegex must match exactly 7 of the non-product test urls'

Testing catalog regex:
Only 7 urls should be matching with the bestsellers regex
-----------------------------------------------------------
None
None
None
None
None
None
<re.Match object; span=(0, 36), match='https://bookoutlet.com/Store/Browse?'>
<re.Match object; span=(0, 36), match='https://bookoutlet.com/Store/Browse?'>
<re.Match object; span=(0, 36), match='https://bookoutlet.com/Store/Browse?'>
None
None
<re.Match object; span=(0, 36), match='https://bookoutlet.com/Store/Search?'>
<re.Match object; span=(0, 36), match='https://bookoutlet.com/Store/Search?'>
<re.Match object; span=(0, 36), match='https://bookoutlet.com/Store/Browse?'>
None
<re.Match object; span=(0, 35), match='https://bookoutlet.com/Store/Browse'>
None
None
None
---------------------------------------------------------------


## Add `label` column
Labelling each possible row of the dataframe

In [13]:
# Create the `label` column with every row initially unlabelled (None);
# the labelling cells further down fill it in one template at a time.
df['label'] = None
df.head()

Unnamed: 0,url,referer_url,src,shingle_vector,label
0,https://bookoutlet.com/,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 4, 0, 1, 3, 0)",
1,https://bookoutlet.com/Store/Sale,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 0, 0, 1, 3, 0)",
2,https://bookoutlet.com/Store/OtherBrowsing,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 8, 1, 1, 1, 0)",
3,https://bookoutlet.com/Store/Browse?N=isTopTen...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",
4,https://bookoutlet.com/Store/Browse?N=isGiftCe...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",


In [14]:
def set_label(url, regex, old_label, new_label):
    """Return the label a row should carry after testing `url` against `regex`.

    Parameters
    ----------
    url : str
        Page url to classify.
    regex : str
        Pattern passed to ``re.match`` (anchored at the start of `url`).
    old_label : str or None
        Label currently stored for the row; None or NaN means "unlabelled".
    new_label : str
        Label to assign when `url` matches `regex`.

    Returns
    -------
    `old_label` unchanged when `url` does not match; `new_label` when it
    matches and the row is unlabelled or already carries `new_label`;
    the string 'collision' when it matches but the row already carries a
    different label.
    """
    if not re.match(regex, url):
        return old_label

    # A missing label is None in a freshly created column but NaN once the
    # frame has been read back from CSV, so both are treated as "unlabelled".
    if pd.isna(old_label):
        return new_label

    # Re-applying the same label (e.g. when a labelling cell is run twice)
    # is not a conflict; only a different pre-existing label is.
    if old_label == new_label:
        return new_label

    return 'collision'

Add product label

In [15]:
# Tag every row whose url matches productRegex as a 'product' page.
df['label'] = df.apply(lambda x: set_label(x['url'], productRegex, x['label'], 'product'), axis=1)
print("There should be no row with 'collision' label")
print("---------------------------------------------")
# Enforce the invariant announced above instead of relying on an empty
# preview; the (expected to be empty) preview is still shown as cell output.
collisions = df[df['label']=='collision']
assert collisions.empty, "{} rows ended up with the 'collision' label".format(len(collisions))
collisions.head()

There should be no row with 'collision' label
---------------------------------------------


Unnamed: 0,url,referer_url,src,shingle_vector,label


Add list label for paginated pages

In [16]:
# Tag every row whose url matches catalogRegex as a 'list' (paginated) page.
df['label'] = df.apply(lambda x: set_label(x['url'], catalogRegex, x['label'], 'list'), axis=1)
print("There should be no row with 'collision' label")
print("---------------------------------------------")
# Enforce the invariant announced above instead of relying on an empty
# preview; the (expected to be empty) preview is still shown as cell output.
collisions = df[df['label']=='collision']
assert collisions.empty, "{} rows ended up with the 'collision' label".format(len(collisions))
collisions.head()

There should be no row with 'collision' label
---------------------------------------------


Unnamed: 0,url,referer_url,src,shingle_vector,label


Summary:

In [17]:
# Report how many rows carry each label (and how many carry none).
fmt_string = 'There are {} row with {} label'
unlabelled_count = df['label'].isnull().sum()
product_count = (df['label'] == 'product').sum()
list_count = (df['label'] == 'list').sum()
print(fmt_string.format(unlabelled_count, 'no'))
print(fmt_string.format(product_count, 'product'))
print(fmt_string.format(list_count, 'list'))

There are 6 row with no label
There are 4801 row with product label
There are 11580 row with list label


In [18]:
# Show (up to 20 of) the rows that received no label, with just their url
# and the page they were reached from.
df.loc[df['label'].isnull(), ['url', 'referer_url']].head(20)

Unnamed: 0,url,referer_url
0,https://bookoutlet.com/,https://bookoutlet.com/
1,https://bookoutlet.com/Store/Sale,https://bookoutlet.com/
2,https://bookoutlet.com/Store/OtherBrowsing,https://bookoutlet.com/
9,https://bookoutlet.com/Store/Categories,https://bookoutlet.com/
10,https://bookoutlet.com/Loyalty/ReferAFriend,https://bookoutlet.com/
17,https://bookoutlet.com/landing/student-discount,https://bookoutlet.com/


Note that some rows are not labelled. That is because these pages are generated by neither the 'product' template nor the 'list' template.

## Write dataset to a file

In [20]:
# Persist the labelled frame as UTF-8 CSV without the index column.
# NOTE(review): FILEPATH is the same file the frame was loaded from, so the
# original input is overwritten and a later Run All starts from a CSV that
# already contains a `label` column — confirm this is intended.
df.to_csv(FILEPATH, encoding='utf-8', index=False)

In [21]:
# Read the file that was just written back in, to verify the round trip.
test = pd.read_csv(FILEPATH)

In [22]:
# Preview the reloaded frame; it should now include the `label` column.
test.head()

Unnamed: 0,url,referer_url,src,shingle_vector,label
0,https://bookoutlet.com/,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 4, 0, 1, 3, 0)",
1,https://bookoutlet.com/Store/Sale,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 0, 0, 1, 3, 0)",
2,https://bookoutlet.com/Store/OtherBrowsing,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 8, 1, 1, 1, 0)",
3,https://bookoutlet.com/Store/Browse?N=isTopTen...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",list
4,https://bookoutlet.com/Store/Browse?N=isGiftCe...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",list


In [23]:
# (rows, columns) of the reloaded frame.
test.shape

(16387, 5)

In [24]:
# Recompute the label summary from the frame that was read back from disk.
# All three counts must come from `test`; counting 'list' rows from the
# in-memory `df` would not verify what was actually written to the file.
print(fmt_string.format(len(test[test['label'].isnull()]),'no'))
print(fmt_string.format(len(test[test['label']=='product']), 'product'))
print(fmt_string.format(len(test[test['label']=='list']), 'list'))

There are 6 row with no label
There are 4801 row with product label
There are 11580 row with list label
