In [3]:
%matplotlib inline
# Importing libraries
import matplotlib.pyplot as plt
import pandas as pd

# Location of the crawl dump: <prefix>/<spider name>/<timestamped file>.
FILEPATH_PREFIX = '../../book_crawler/data'
SPIDERNAME = 'bookdepository'
FILENAME = '2019-05-05T14-31-04.csv'
FILEPATH = '/'.join((FILEPATH_PREFIX, SPIDERNAME, FILENAME))
# Bare expression so the notebook echoes the resolved path.
FILEPATH

'../../book_crawler/data/bookdepository/2019-05-05T14-31-04.csv'

In [4]:
# Number of rows to read from the crawl dump; None makes pandas read the whole
# file.
# NOTE(review): the outputs saved near the end of this notebook report 25549
# rows, so those cells were presumably run without this 100-row limit --
# confirm which setting is intended before a Restart & Run All.
SAMPLE_ROWS = 100
df = pd.read_csv(FILEPATH, nrows=SAMPLE_ROWS)

## Data analysis
Some preliminary analysis of the dataset

In [5]:
# Heading for the preview, then the first five rows as the cell's rich output.
print("First 5 rows", "------------", sep="\n")
df.head(n=5)

First 5 rows
------------


Unnamed: 0,url,referer_url,src
0,https://www.bookdepository.com/,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ..."
1,https://www.bookdepository.com/author/J-K-Rowling,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ..."
2,https://www.bookdepository.com/category/3098/T...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ..."
3,https://www.bookdepository.com/category/3392/B...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ..."
4,https://www.bookdepository.com/category/2967/T...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ..."


In [6]:
# Heading lines first; the (rows, columns) tuple is the cell's output.
for heading_line in ("No. of rows and columns", "-----------------------"):
    print(heading_line)
df.shape

No. of rows and columns
-----------------------


(100, 3)

In [7]:
# True if any cell in any column is missing.
print("Check null values\n-----------------")
df.isna().any().any()

Check null values
-----------------


False

In [8]:
print("Check duplicate values", "----------------------", sep="\n")
# True when at least one url occurs more than once.
not df['url'].is_unique

Check duplicate values
----------------------


False

In [9]:
# df.info() prints its own report (dtypes, non-null counts, memory usage).
print("DataFrame column types\n----------------------")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
url            100 non-null object
referer_url    100 non-null object
src            100 non-null object
dtypes: object(3)
memory usage: 2.4+ KB


In [10]:
# Heading lines first; the describe() table is the cell's output.
for heading_line in ("Some stats", "----------------"):
    print(heading_line)
df.describe()

Some stats
----------------


Unnamed: 0,url,referer_url,src
count,100,100,100
unique,100,19,100
top,https://www.bookdepository.com/roald-dahl,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ..."
freq,1,27,1


## Data cleaning
During the crawling phase (using Scrapy), we restricted the crawl to web pages on the `https://www.bookdepository.com/` domain. The cells below check whether any page from another domain slipped through.

In [11]:
import re

In [12]:
# Matches URLs whose host is exactly www.bookdepository.com. The trailing group
# requires the host to end right there (path, query, fragment, port or end of
# string), so look-alike hosts such as www.bookdepository.com.example.org are
# rejected. ':' and '/' need no escaping in a Python regex.
bookdepositoryRegex = r'^https://www\.bookdepository\.com(?:[/?#:]|$)'

In [13]:
# Rows whose url is NOT on bookdepository.com. `~` is the operator for
# negating a boolean mask; unary minus on booleans is rejected by NumPy and
# should not be relied on.
otherDomainsDataFrame = df[~df['url'].str.match(bookdepositoryRegex)]

In [14]:
otherDomainsDataFrame.head()

Unnamed: 0,url,referer_url,src
97,https://www.goodreads.com/book/show/29744417-t...,https://www.bookdepository.com/Bee-Book-DK/978...,"<!DOCTYPE html>\n<html class=""desktop\n"">\n<he..."


In [15]:
otherDomainsDataFrame.shape

(1, 3)

Let's check whether all of these pages belong to the same domain:

In [16]:
# Prefix pattern for goodreads.com pages. The dots are escaped so they match a
# literal '.' only, consistent with how bookdepositoryRegex is written.
goodreadsRegex = r'^https://www\.goodreads\.com/'

In [17]:
# Off-domain rows whose url matches the goodreads pattern; the shape of that
# subset is the cell's output.
isGoodreads = otherDomainsDataFrame['url'].str.match(goodreadsRegex)
otherDomainsDataFrame[isGoodreads].shape

(1, 3)

So let's delete these pages:

In [18]:
# Keep only bookdepository.com rows. .copy() makes the result an independent
# frame, so adding columns to it later (e.g. df['label'] = None) does not
# trigger pandas' SettingWithCopyWarning.
df = df[df['url'].str.match(bookdepositoryRegex)].copy()

In [19]:
df.shape

(99, 3)

## Page labelling
Label each page, where possible, with the kind of template it uses. Let's start with product (book) pages. 
The following URL links to a page that shows information about a book:

```
https://www.bookdepository.com/This-is-Going-Hurt-Adam-Kay/9781509858637?ref=grid-view
```

So we could infer that each product page has the following url schema:

```
https://www.bookdepository.com/<bookName>/<ISBN-13>
```

where an ISBN-13 is 13 digits long and starts with the 978 or 979 prefix (the EAN prefix)

In [61]:


# URLs of product pages: every one of these must match productRegex.
testStrings1 = [
    'https://www.bookdepository.com/Definitive-Book-Body-Language-Allan-Pease/9781409168508',
    'https://www.bookdepository.com/Magnolia-Kitchen-Bernadette-Gee/9781988547008',
    'https://www.bookdepository.com/This-is-Going-Hurt-Adam-Kay/9781509858637',
    'https://www.bookdepository.com/Power-Now-Eckhart-Tolle/9780340733509',
    'https://www.bookdepository.com/Educated-Tara-Westover/9780099511021',
    'https://www.bookdepository.com/Dungeons-Dragons-Players-Handbook-Dungeons-Dragons-Core-Rulebooks-Wizards-Coast/9780786965601?ref=grid-view',
]

# URLs of other kinds of pages: none of these may match productRegex.
testStrings2 = [
    'https://www.bookdepository.com/account/login',
    'https://www.bookdepository.com/contactus',
    'https://www.bookdepository.com/help',
    'https://www.bookdepository.com/julia-donaldson',
    'https://www.bookdepository.com/category/2/Art-Photography',
    'https://www.bookdepository.com/category/8/Art-History',
    'https://www.bookdepository.com/category/20/Art-History-c-1600-c-1800',
    'https://www.bookdepository.com/category/21/Styles-Baroque',
    'https://www.bookdepository.com/bestsellers',
    'https://www.bookdepository.com/author/J-K-Rowling',
    'https://www.bookdepository.com/publishers/Penguin-Books-Australia',
    'https://www.bookdepository.com/search?searchTerm=Cornelia+Funke&amp;search=search',
    'https://www.bookdepository.com/search/advanced?seriesId=48384',
]

testStrings = testStrings1 + testStrings2

# <host>/<bookName>/<ISBN-13>. The lookahead requires the 13-digit ISBN to end
# there (followed by '/', '?', '#' or end of string), so longer digit runs are
# not accepted as an ISBN-13. Being a lookahead, it does not change the span
# of a successful match.
productRegex = r'^https://www\.bookdepository\.com/[^/]+/97[89][0-9]{10}(?=[/?#]|$)'

# simple tests
print("Testing article regex:")
print("Only the first 6 urls should be matching with the article regex")
print("---------------------------------------------------------------")

for uri in testStrings:
    print(re.match(productRegex, uri))

print('---------------------------------------------------------------')

# Fail loudly instead of relying on a visual check of the printed matches.
assert all(re.match(productRegex, uri) for uri in testStrings1)
assert not any(re.match(productRegex, uri) for uri in testStrings2)

Testing article regex:
Only the first 6 urls should be matching with the article regex
---------------------------------------------------------------
<re.Match object; span=(0, 86), match='https://www.bookdepository.com/Definitive-Book-Bo>
<re.Match object; span=(0, 76), match='https://www.bookdepository.com/Magnolia-Kitchen-B>
<re.Match object; span=(0, 72), match='https://www.bookdepository.com/This-is-Going-Hurt>
<re.Match object; span=(0, 68), match='https://www.bookdepository.com/Power-Now-Eckhart->
<re.Match object; span=(0, 67), match='https://www.bookdepository.com/Educated-Tara-West>
<re.Match object; span=(0, 124), match='https://www.bookdepository.com/Dungeons-Dragons-P>
None
None
None
None
None
None
None
None
None
None
None
None
None
---------------------------------------------------------------


Next we have to label the pages that show a list of books. We noted that books on bookdepository.com are grouped by category, so books that share a category are listed on the same page. 
We also noted that bookdepository.com displays a paginated list of books only if a main (top-level) category has already been selected. As an example, consider the following URL:

```
https://www.bookdepository.com/category/107/Architecture
```

The above page shows a list of books from the "Architecture" category, which is in turn a subcategory of "Art & Photography". So we select a URL x only when both x and the URL that referred to x contain the string "category/".

So URLs referring to categories seem to have the following schema:

```
https://www.bookdepository.com/category/<somethingElse>
```

bookdepository.com also shows a paginated list of books grouped by author, so we should consider these links as well, but only when the URL contains the string "author/" (for some authors the books aren't displayed in a paginated fashion; in that case the URL doesn't contain the string "author/"):

```
https://www.bookdepository.com/author/J-K-Rowling
```
Note that search results are also paginated, as are books grouped by publisher:

```
https://www.bookdepository.com/search/advanced?seriesId=48384
https://www.bookdepository.com/search?searchTerm=Cornelia+Funke&amp;search=search
https://www.bookdepository.com/publishers/Penguin-Books-Australia
```

Finally, bestseller books are also listed in a paginated fashion:

```
https://www.bookdepository.com/bestsellers
```

In [62]:
# NOTE(review): everything below is wrapped in a bare triple-quoted string, so
# none of it is executed: the cell only echoes the string, and
# bestsellersRegex, authorRegex, searchRegex, publisherRegex and categoryRegex
# are NOT defined by this cell.
'''

bestsellersRegex = r'^https\:\/\/www\.bookdepository\.com\/bestsellers$'
authorRegex = r'^https\:\/\/www\.bookdepository\.com\/author\/'
searchRegex = r'^https\:\/\/www\.bookdepository\.com\/search(\?|\/)'
publisherRegex = r'^https\:\/\/www\.bookdepository\.com\/publishers\/'
categoryRegex = r'^https\:\/\/www\.bookdepository\.com\/category\/'

print("Testing bestsellers regex:")
print("Only 1 url should be matching with the bestsellers regex")
print("-----------------------------------------------------------")

for uri in testStrings:
    print(re.match(bestsellersRegex, uri))
print('---------------------------------------------------------------')


print()


print("Testing author regex:")
print("Only 1 url should be matching with the author regex")
print("-----------------------------------------------------------")

for uri in testStrings:
    print(re.match(authorRegex, uri))
print('---------------------------------------------------------------')


print()


print("Testing search regex:")
print("Only 2 url should be matching with the search regex")
print("-----------------------------------------------------------")

for uri in testStrings:
    print(re.match(searchRegex, uri))
print('---------------------------------------------------------------')


print()




print("Testing publisher regex:")
print("Only 1 url should be matching with the publisher regex")
print("-----------------------------------------------------------")

for uri in testStrings:
    print(re.match(publisherRegex, uri))
print('---------------------------------------------------------------')


print()


print("Testing category regex:")
print("Only 4 url should be matching with the category regex")
print("-----------------------------------------------------------")

for uri in testStrings:
    print(re.match(categoryRegex, uri))
print('---------------------------------------------------------------')
'''

'\n\nbestsellersRegex = r\'^https\\:\\/\\/www\\.bookdepository\\.com\\/bestsellers$\'\nauthorRegex = r\'^https\\:\\/\\/www\\.bookdepository\\.com\\/author\\/\'\nsearchRegex = r\'^https\\:\\/\\/www\\.bookdepository\\.com\\/search(\\?|\\/)\'\npublisherRegex = r\'^https\\:\\/\\/www\\.bookdepository\\.com\\/publishers\\/\'\ncategoryRegex = r\'^https\\:\\/\\/www\\.bookdepository\\.com\\/category\\/\'\n\nprint("Testing bestsellers regex:")\nprint("Only 1 url should be matching with the bestsellers regex")\nprint("-----------------------------------------------------------")\n\nfor uri in testStrings:\n    print(re.match(bestsellersRegex, uri))\nprint(\'---------------------------------------------------------------\')\n\n\nprint()\n\n\nprint("Testing author regex:")\nprint("Only 1 url should be matching with the author regex")\nprint("-----------------------------------------------------------")\n\nfor uri in testStrings:\n    print(re.match(authorRegex, uri))\nprint(\'----------------------

## Add `label` column
Labelling each possible row of the dataframe

In [63]:
# Start every row as unlabelled. None is the sentinel that set_label tests for
# (`old_label is None`) before it assigns a label.
df['label'] = None
df.head()

Unnamed: 0,url,referer_url,src,label
0,https://www.bookdepository.com/,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",
1,https://www.bookdepository.com/author/J-K-Rowling,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",
2,https://www.bookdepository.com/category/3098/T...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",
3,https://www.bookdepository.com/category/3392/B...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",
4,https://www.bookdepository.com/category/2967/T...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",


In [64]:
def set_label(url, regex, old_label, new_label):
    """Return the label a row should carry after testing ``url`` against ``regex``.

    Parameters
    ----------
    url : str
        URL of the page being labelled.
    regex : str
        Pattern tried with ``re.match`` (i.e. anchored at the start of ``url``).
    old_label : str or None
        Label the row currently has; ``None`` or NaN means "not labelled yet".
    new_label : str
        Label to assign when ``url`` matches.

    Returns
    -------
    str or None
        ``old_label`` unchanged when ``url`` does not match. When it matches:
        ``new_label`` if the row is unlabelled or already carries
        ``new_label``, otherwise ``'collision'`` (a different label already
        claimed this row).
    """
    if not re.match(regex, url):
        return old_label
    # pd.isna covers both None and the NaN that a missing label becomes after a
    # CSV round-trip. Accepting an identical label keeps a labelling cell
    # re-runnable: applying the same label twice is not a collision.
    if pd.isna(old_label) or old_label == new_label:
        return new_label
    return 'collision'

Add product label

In [65]:
# Run set_label over every (url, current label) pair with the product pattern
# and store the outcome back in the label column.
df['label'] = [
    set_label(page_url, productRegex, current_label, 'product')
    for page_url, current_label in zip(df['url'], df['label'])
]
print("There should be no row with 'collision' label",
      "---------------------------------------------",
      sep="\n")
# Any row shown here matched the pattern while already carrying a label.
df[df['label']=='collision'].head()

There should be no row with 'collision' label
---------------------------------------------


Unnamed: 0,url,referer_url,src,label


Add list label for paginated pages

In [66]:
# Disabled step: the cell body is a bare string literal, so nothing here runs.
# The code inside would label URLs matching bestsellersRegex as 'list'.
'''
df['label'] = df.apply(lambda x: set_label(x['url'], bestsellersRegex, x['label'], 'list'), axis=1)
print("There should be no row with 'collision' label")
print("---------------------------------------------")
df[df['label']=='collision'].head()
'''

'\ndf[\'label\'] = df.apply(lambda x: set_label(x[\'url\'], bestsellersRegex, x[\'label\'], \'list\'), axis=1)\nprint("There should be no row with \'collision\' label")\nprint("---------------------------------------------")\ndf[df[\'label\']==\'collision\'].head()\n'

In [67]:
# Disabled step: the cell body is a bare string literal, so nothing here runs.
# The code inside would label URLs matching authorRegex as 'list'.
'''
df['label'] = df.apply(lambda x: set_label(x['url'], authorRegex, x['label'], 'list'), axis=1)
print("There should be no row with 'collision' label")
print("---------------------------------------------")
df[df['label']=='collision'].head()
'''

'\ndf[\'label\'] = df.apply(lambda x: set_label(x[\'url\'], authorRegex, x[\'label\'], \'list\'), axis=1)\nprint("There should be no row with \'collision\' label")\nprint("---------------------------------------------")\ndf[df[\'label\']==\'collision\'].head()\n'

In [68]:
# Disabled step: the cell body is a bare string literal, so nothing here runs.
# The code inside would label URLs matching searchRegex as 'list'.
'''
df['label'] = df.apply(lambda x: set_label(x['url'], searchRegex, x['label'], 'list'), axis=1)
print("There should be no row with 'collision' label")
print("---------------------------------------------")
df[df['label']=='collision'].head()
'''

'\ndf[\'label\'] = df.apply(lambda x: set_label(x[\'url\'], searchRegex, x[\'label\'], \'list\'), axis=1)\nprint("There should be no row with \'collision\' label")\nprint("---------------------------------------------")\ndf[df[\'label\']==\'collision\'].head()\n'

In [69]:
# Disabled step: the cell body is a bare string literal, so nothing here runs.
# The code inside would label URLs matching publisherRegex as 'list'.
'''
df['label'] = df.apply(lambda x: set_label(x['url'], publisherRegex, x['label'], 'list'), axis=1)
print("There should be no row with 'collision' label")
print("---------------------------------------------")
df[df['label']=='collision'].head()
'''

'\ndf[\'label\'] = df.apply(lambda x: set_label(x[\'url\'], publisherRegex, x[\'label\'], \'list\'), axis=1)\nprint("There should be no row with \'collision\' label")\nprint("---------------------------------------------")\ndf[df[\'label\']==\'collision\'].head()\n'

In [70]:
# Disabled helper: kept inside a string literal, so set_label_based_on_referer
# is NOT defined at runtime. As written, it would behave like set_label but
# require BOTH url and referer_url to match regex before assigning new_label
# (or 'collision' when the row already has a label).
'''
def set_label_based_on_referer(url, regex, old_label, new_label, referer_url):
    if re.match(regex, url) and re.match(regex, referer_url):
        if old_label is None:
            return new_label
        else:
            return 'collision'
    else:
        return old_label
'''

"\ndef set_label_based_on_referer(url, regex, old_label, new_label, referer_url):\n    if re.match(regex, url) and re.match(regex, referer_url):\n        if old_label is None:\n            return new_label\n        else:\n            return 'collision'\n    else:\n        return old_label\n"

In [71]:
# Disabled step: the cell body is a bare string literal, so nothing here runs.
# The code inside would label a row as 'list' when both its url and its
# referer_url match categoryRegex (via set_label_based_on_referer).
'''
df['label'] = df.apply(lambda x: set_label_based_on_referer(x['url'], categoryRegex, x['label'], 'list', x['referer_url']), axis=1)
print("There should be no row with 'collision' label")
print("---------------------------------------------")
df[df['label']=='collision'].head()
'''

'\ndf[\'label\'] = df.apply(lambda x: set_label_based_on_referer(x[\'url\'], categoryRegex, x[\'label\'], \'list\', x[\'referer_url\']), axis=1)\nprint("There should be no row with \'collision\' label")\nprint("---------------------------------------------")\ndf[df[\'label\']==\'collision\'].head()\n'

Summary:

In [72]:
# Row counts per label. The 'list' count stays disabled inside the trailing
# string literal, which is also what the cell echoes as its output.
fmt_string = 'There are {} row with {} label'
unlabelled_rows = df[df['label'].isnull()]
product_rows = df[df['label']=='product']
print(fmt_string.format(len(unlabelled_rows), 'no'))
print(fmt_string.format(len(product_rows), 'product'))
'''
print(fmt_string.format(len(df[df['label']=='list']), 'list'))
'''

There are 9742 row with no label
There are 15807 row with product label


"\nprint(fmt_string.format(len(df[df['label']=='list']), 'list'))\n"

In [73]:
# Disabled inspection cell: as a string literal it only echoes its own text.
# The expression inside would preview url/referer_url of the unlabelled rows.
'''
df[df['label'].isnull()][['url', 'referer_url']].head()
'''

"\ndf[df['label'].isnull()][['url', 'referer_url']].head()\n"

## Add `shingle_vector` column
Compute the shingle vector for each page of the dataframe

In [32]:
#add top level folder to sys.path
import sys
sys.path.append('../../')

from foxlink_clustering.clustering.shingler import compute_shingle_vector

In [33]:
# HTML source of the first row, used as a sample input for the shingler.
src = df['src'].iloc[0]

The default value for window size in Foxlink is 3

In [35]:
DEFAULT_WINDOW_SIZE = 3
result = compute_shingle_vector(src, DEFAULT_WINDOW_SIZE)

In [36]:
result

(0, 3, 2, 0, 5, 1, 1, 1)

Set the `shingle_vector` column for each row

In [None]:
# One shingle vector per page, computed from that page's HTML source.
# NOTE(review): this cell has no execution count and the CSV preview further
# down shows no shingle_vector column, so it was presumably never run before
# the export -- confirm.
df['shingle_vector'] = df['src'].apply(
    lambda page_src: compute_shingle_vector(page_src, DEFAULT_WINDOW_SIZE)
)

## Write dataset to a file

In [74]:
# Persist the labelled frame as UTF-8 CSV; index=False keeps the row index out
# of the file.
# NOTE(review): rows labelled None show an empty label when the file is read
# back below and are counted there with isnull(), so presumably they come back
# as NaN rather than None -- keep that in mind if labelling continues on the
# reloaded frame.
df.to_csv('../datasets/bookdepository.csv', encoding='utf-8', index=False)

In [75]:
test = pd.read_csv('../datasets/bookdepository.csv')

In [76]:
test.head()

Unnamed: 0,url,referer_url,src,label
0,https://www.bookdepository.com/,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",
1,https://www.bookdepository.com/author/J-K-Rowling,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",
2,https://www.bookdepository.com/category/3098/T...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",
3,https://www.bookdepository.com/category/3392/B...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",
4,https://www.bookdepository.com/category/2967/T...,https://www.bookdepository.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\n ...",


In [77]:
test.shape

(25549, 4)

In [79]:
# Same per-label counts as before the export, computed on the reloaded frame.
is_unlabelled = test['label'].isnull()
is_product = test['label']=='product'
print(fmt_string.format(len(test[is_unlabelled]),'no'))
print(fmt_string.format(len(test[is_product]), 'product'))

There are 9742 row with no label
There are 15807 row with product label
