In [1]:
%matplotlib inline
# Importing libraries
import matplotlib.pyplot as plt
import pandas as pd

# Location of the dataset CSV, expressed relative to the notebook's working directory.
FILEPATH_PREFIX = '../../../datasets'
FILENAME = 'bookswagon.csv'
FILEPATH = '/'.join((FILEPATH_PREFIX, FILENAME))
# Echo the resolved path as the cell output.
FILEPATH

'../../../datasets/bookswagon.csv'

In [2]:
# Load the dataset from FILEPATH into the working DataFrame used by every later cell.
df = pd.read_csv(FILEPATH)

## Data analysis
Some preliminary analysis of the dataset

In [3]:
# Banner, then the first five rows as the cell's rich output.
print("First 5 rows\n------------")
df.head(5)

First 5 rows
------------


Unnamed: 0,url,referer_url,src,shingle_vector
0,https://www.bookswagon.com/,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(0, 3, 6, 4, 0, 2, 10, 1)"
1,https://www.bookswagon.com/view-books/0/new-ar...,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)"
2,https://www.bookswagon.com/travel-holiday-books,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)"
3,https://www.bookswagon.com/all-categories/1000-0,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(7, 3, 3, 4, 0, 2, 10, 1)"
4,https://www.bookswagon.com/view-books/4/textbook,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)"


In [4]:
# Banner, then the (rows, columns) tuple of the frame.
print("No. of rows and columns\n-----------------------")
(len(df.index), len(df.columns))

No. of rows and columns
-----------------------


(4447, 4)

In [5]:
# Banner, then True if any cell in any column is missing.
print("Check null values\n-----------------")
missing_per_column = df.isna().any()
missing_per_column.any()

Check null values
-----------------


False

In [6]:
# Banner, then True if at least one URL appears more than once.
print("Check duplicate values\n----------------------")
not df['url'].is_unique

Check duplicate values
----------------------


False

In [7]:
# Banner, then the per-column dtype / non-null summary printed by pandas.
print("DataFrame column types\n----------------------")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4447 entries, 0 to 4446
Data columns (total 4 columns):
url               4447 non-null object
referer_url       4447 non-null object
src               4447 non-null object
shingle_vector    4447 non-null object
dtypes: object(4)
memory usage: 139.0+ KB


In [8]:
# Banner, then the summary statistics (count / unique / top / freq for object columns).
print("Some stats\n----------------")
df.describe()

Some stats
----------------


Unnamed: 0,url,referer_url,src,shingle_vector
count,4447,4447,4447,4447
unique,4447,2192,4447,26
top,https://www.bookswagon.com/book/vorontsovy-ih-...,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(0, 2, 5, 0, 0, 0, 6, 1)"
freq,1,18,1,1762


## Page labelling
Label each page, where possible, with the template that generated it. Let's start with product (book) pages.
The following URL links to a page which shows info regarding a book:

```
https://www.bookswagon.com/book/hunger-games-suzanne-collins/9781407132082
```

So we could infer that each product page has the following url schema:

```
https://www.bookswagon.com/book/<somethingElse>
```

In [9]:
import re

# URLs that point at product (book) pages.
testStrings1 = [
    'https://www.bookswagon.com/book/hunger-games-suzanne-collins/9781407132082',
    'https://www.bookswagon.com/book/boy-in-dress/9780007516643',
    'https://www.bookswagon.com/book/subtle-art-not-giving-a/9780062641540',
    'https://www.bookswagon.com/book/poor-little-rich-slum-rashmi/9789381626184',
    'https://www.bookswagon.com/book/essential-english-grammar/9788175960299',
    'https://www.bookswagon.com/book/oxford-word-skills-gairns/9780194620123',
]

# Assorted URLs that are NOT product pages.
testStrings2 = [
    'https://www.bookswagon.com',
    'https://www.bookswagon.com/environment-geography-books',
    'https://www.bookswagon.com/environment-geography-books?sid=13592',
    'https://www.bookswagon.com/view-books/0/new-arrivals',
    'https://www.bookswagon.com/author/william-wordsworth',
    'https://www.bookswagon.com/login',
    'https://www.bookswagon.com/signup',
    'https://www.bookswagon.com/english-language-teaching-books',
    'https://www.bookswagon.com/view-books/5/award-winners',
    'https://www.bookswagon.com/paymenthelp',
    # Scheme typo ('hhttps') fixed so this URL is rejected because of its
    # path, not because of a malformed scheme.
    'https://www.bookswagon.com/aboutus',
    'https://www.bookswagon.com/returnhelp',
    'https://www.bookswagon.com/requestbook',
]

testStrings = testStrings1 + testStrings2

# A product page is any URL with at least one character after the /book/ prefix.
productRegex = r'^https\:\/\/www\.bookswagon\.com\/book\/.+'

# simple tests
print("Testing product regex:")
print("Only the first 6 urls should be matching with the product regex")
print("---------------------------------------------------------------")

for uri in testStrings:
    print(re.match(productRegex, uri))

print('---------------------------------------------------------------')

# Make the expectation printed above executable, so a regression in the regex
# or in the fixtures fails the cell instead of relying on visual inspection.
assert all(re.match(productRegex, uri) for uri in testStrings1)
assert not any(re.match(productRegex, uri) for uri in testStrings2)

Testing article regex:
Only the first 6 urls should be matching with the article regex
---------------------------------------------------------------
<re.Match object; span=(0, 74), match='https://www.bookswagon.com/book/hunger-games-suza>
<re.Match object; span=(0, 58), match='https://www.bookswagon.com/book/boy-in-dress/9780>
<re.Match object; span=(0, 69), match='https://www.bookswagon.com/book/subtle-art-not-gi>
<re.Match object; span=(0, 74), match='https://www.bookswagon.com/book/poor-little-rich->
<re.Match object; span=(0, 71), match='https://www.bookswagon.com/book/essential-english>
<re.Match object; span=(0, 71), match='https://www.bookswagon.com/book/oxford-word-skill>
None
None
None
None
None
None
None
None
None
None
None
None
None
---------------------------------------------------------------


Next we have to label pages which show lists of books. We noted that books on bookswagon.com are grouped by category, so that books which share the same category are listed on the same page in a paginated way.

```
https://www.bookswagon.com/environment-geography-books
```

So URLs referring to categories seem to have the following schema:

```
https://www.bookswagon.com/<somethingElse>-books
```

In [10]:
# A catalog (list) page is either an /author/<name> URL, or a URL whose first
# path segment ends in "-books", optionally followed by a sub-path or a query string.
catalogRegex = r'^https\:\/\/www\.bookswagon\.com\/(author\/.+|[^/]+\-books($|\/.+|\?.+))'
print("Testing catalog regex:")
print("Only 6 urls should be matching with the catalog regex")
print("-----------------------------------------------------------")

catalogMatches = [re.match(catalogRegex, uri) for uri in testStrings]
for match in catalogMatches:
    print(match)
print('---------------------------------------------------------------')

# Make the expectation printed above executable rather than relying on
# counting the non-None lines by eye.
assert sum(match is not None for match in catalogMatches) == 6

Testing catalog regex:
Only 6 urls should be matching with the bestsellers regex
-----------------------------------------------------------
None
None
None
None
None
None
None
<re.Match object; span=(0, 54), match='https://www.bookswagon.com/environment-geography->
<re.Match object; span=(0, 64), match='https://www.bookswagon.com/environment-geography->
<re.Match object; span=(0, 52), match='https://www.bookswagon.com/view-books/0/new-arriv>
<re.Match object; span=(0, 52), match='https://www.bookswagon.com/author/william-wordswo>
None
None
<re.Match object; span=(0, 58), match='https://www.bookswagon.com/english-language-teach>
<re.Match object; span=(0, 53), match='https://www.bookswagon.com/view-books/5/award-win>
None
None
None
None
---------------------------------------------------------------


## Add `label` column
Labelling each possible row of the dataframe

In [11]:
# Start every row off unlabelled; the labelling cells fill this column in.
df = df.assign(label=None)
df.head()

Unnamed: 0,url,referer_url,src,shingle_vector,label
0,https://www.bookswagon.com/,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(0, 3, 6, 4, 0, 2, 10, 1)",
1,https://www.bookswagon.com/view-books/0/new-ar...,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)",
2,https://www.bookswagon.com/travel-holiday-books,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)",
3,https://www.bookswagon.com/all-categories/1000-0,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(7, 3, 3, 4, 0, 2, 10, 1)",
4,https://www.bookswagon.com/view-books/4/textbook,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)",


In [12]:
def set_label(url, regex, old_label, new_label):
    """Return the label a row should carry after testing its URL against ``regex``.

    Parameters
    ----------
    url : str
        URL of the page being labelled.
    regex : str
        Pattern applied with ``re.match`` (i.e. anchored at the start of ``url``).
    old_label : object
        Label currently stored for the row. ``None`` and NaN both mean
        "not labelled yet".
    new_label : str
        Label to assign when ``url`` matches and the row is still unlabelled.

    Returns
    -------
    object
        ``old_label`` unchanged when ``url`` does not match; ``new_label`` when
        it matches and the row was unlabelled; the string ``'collision'`` when
        it matches but the row already carries a label.
    """
    if not re.match(regex, url):
        return old_label
    # pd.isna treats None and NaN alike. An empty label read back from a CSV is
    # NaN, which `old_label is None` would misreport as an existing label.
    if pd.isna(old_label):
        return new_label
    return 'collision'

Add product label

In [13]:
# Row-wise pass: rows whose URL matches productRegex become 'product'
# (or 'collision' if they already carried a label).
product_labels = df.apply(
    lambda row: set_label(row['url'], productRegex, row['label'], 'product'),
    axis=1,
)
df['label'] = product_labels

print("There should be no row with 'collision' label\n---------------------------------------------")
collision_rows = df[df['label'] == 'collision']
collision_rows.head()

There should be no row with 'collision' label
---------------------------------------------


Unnamed: 0,url,referer_url,src,shingle_vector,label


Add list label for paginated pages

In [14]:
# Row-wise pass: rows whose URL matches catalogRegex become 'list'
# (or 'collision' if they already carried a label, e.g. 'product').
list_labels = df.apply(
    lambda row: set_label(row['url'], catalogRegex, row['label'], 'list'),
    axis=1,
)
df['label'] = list_labels

print("There should be no row with 'collision' label\n---------------------------------------------")
collision_rows = df[df['label'] == 'collision']
collision_rows.head()

There should be no row with 'collision' label
---------------------------------------------


Unnamed: 0,url,referer_url,src,shingle_vector,label


Summary:

In [15]:
# Report how many rows fall into each label bucket (unlabelled first).
fmt_string = 'There are {} row with {} label'
print(fmt_string.format(df['label'].isnull().sum(), 'no'))
for label_name in ('product', 'list'):
    print(fmt_string.format((df['label'] == label_name).sum(), label_name))

There are 1605 row with no label
There are 2323 row with product label
There are 519 row with list label


In [17]:
# Temporarily widen pandas' display limits so unlabelled URLs are shown in full.
# These are global options; a later cell resets the display.* settings.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# NOTE(review): -1 is deprecated in newer pandas releases in favour of None;
# confirm against the installed version before changing it.
pd.set_option('max_colwidth', -1)

# Show only the URL columns of the rows that received no label.
unlabelled = df['label'].isnull()
df.loc[unlabelled, ['url', 'referer_url']].head(2000)

Unnamed: 0,url,referer_url
0,https://www.bookswagon.com/,https://www.bookswagon.com/
3,https://www.bookswagon.com/all-categories/1000-0,https://www.bookswagon.com/
7,https://www.bookswagon.com/featured-authors,https://www.bookswagon.com/
9,https://www.bookswagon.com/BookMap.aspx,https://www.bookswagon.com/
15,https://www.bookswagon.com/affiliate/login,https://www.bookswagon.com/
16,https://www.bookswagon.com/shoppingcart.aspx?pid=26677150&vid=11&ptype=1,https://www.bookswagon.com/view-books/3/coming-soon-pre-order-now
17,https://www.bookswagon.com/requestbook,https://www.bookswagon.com/
18,https://www.bookswagon.com/shoppingcart.aspx?&pid=10341972&vid=51&ptype=1,https://www.bookswagon.com/science-mathematics-books
19,https://www.bookswagon.com/shoppingcart.aspx?&pid=12363363&vid=179&ptype=1,https://www.bookswagon.com/view-books/5/award-winners
22,https://www.bookswagon.com/affiliate/forgetpassword,https://www.bookswagon.com/affiliate/login


In [18]:
# Restore every pandas option whose name matches '^display.' to its default,
# undoing the display settings changed for the listing above.
pd.reset_option('^display.', silent=True)

Note that some rows remain unlabeled. That is because these pages are generated by neither the 'product' template nor the 'list' template.

## Write dataset to a file

In [19]:
# NOTE: FILEPATH is the same file the dataset was read from, so this overwrites
# the input CSV in place, now including the `label` column. The index is not written.
df.to_csv(FILEPATH, encoding='utf-8', index=False)

In [20]:
# Read the file back to verify the write round-trips.
test = pd.read_csv(FILEPATH)

In [21]:
# Preview the reloaded frame; the `label` column should now be present.
test.head()

Unnamed: 0,url,referer_url,src,shingle_vector,label
0,https://www.bookswagon.com/,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(0, 3, 6, 4, 0, 2, 10, 1)",
1,https://www.bookswagon.com/view-books/0/new-ar...,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)",list
2,https://www.bookswagon.com/travel-holiday-books,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)",list
3,https://www.bookswagon.com/all-categories/1000-0,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(7, 3, 3, 4, 0, 2, 10, 1)",
4,https://www.bookswagon.com/view-books/4/textbook,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)",list


In [22]:
# (rows, columns) of the reloaded frame.
test.shape

(4447, 5)

In [23]:
# Recompute the label counts on the reloaded frame. All three lines must query
# `test`; counting on `df` would not check what was actually written to disk.
print(fmt_string.format(len(test[test['label'].isnull()]),'no'))
print(fmt_string.format(len(test[test['label']=='product']), 'product'))
print(fmt_string.format(len(test[test['label']=='list']), 'list'))

There are 1605 row with no label
There are 2323 row with product label
There are 519 row with list label
