In [1]:
%matplotlib inline
# Importing libraries
import matplotlib.pyplot as plt
import pandas as pd

# Location of the dataset, expressed relative to this notebook.
FILEPATH_PREFIX = '../../../datasets'
FILENAME = 'powells.csv'
# Join directory and file name with a forward slash.
FILEPATH = '/'.join((FILEPATH_PREFIX, FILENAME))
FILEPATH

'../../../datasets/powells.csv'

In [2]:
# Load the CSV found at FILEPATH into the working DataFrame.
df = pd.read_csv(FILEPATH)

## Data analysis
Some preliminary analysis of the dataset

In [3]:
# Banner, then a rich preview of the first five rows.
print("First 5 rows", "------------", sep="\n")
df.head(n=5)

First 5 rows
------------


Unnamed: 0,url,referer_url,src,shingle_vector
0,https://www.powells.com/blog/author/kristen-ar...,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(7, 2, 1, 8, 3, 10, 0, 5)"
1,https://www.powells.com/blog/category/interviews,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 1, 1, 0, 3, 5, 0, 1)"
2,https://www.powells.com/nonfiction-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 3, 0, 0, 0)"
3,https://www.powells.com/powells-presents,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 1, 1, 0, 0)"
4,https://www.powells.com/locations,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 0, 0, 4, 2, 2, 0, 0)"


In [4]:
# Banner, then the (rows, columns) pair of the loaded frame.
print("No. of rows and columns", "-----------------------", sep="\n")
(len(df.index), len(df.columns))

No. of rows and columns
-----------------------


(10571, 4)

In [5]:
# True if at least one cell anywhere in the frame is missing.
print("Check null values", "-----------------", sep="\n")
df.isnull().values.any()

Check null values
-----------------


False

In [6]:
# True if some url occurs more than once in the frame.
print("Check duplicate values", "----------------------", sep="\n")
not df['url'].is_unique

Check duplicate values
----------------------


False

In [7]:
# Banner, then the per-column dtype / non-null summary printed by pandas.
print("DataFrame column types", "----------------------", sep="\n")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10571 entries, 0 to 10570
Data columns (total 4 columns):
url               10571 non-null object
referer_url       10571 non-null object
src               10571 non-null object
shingle_vector    10571 non-null object
dtypes: object(4)
memory usage: 330.4+ KB


In [8]:
# Banner, then the descriptive statistics table for every column.
print("Some stats", "----------------", sep="\n")
df.describe()

Some stats
----------------


Unnamed: 0,url,referer_url,src,shingle_vector
count,10571,10571,10571,10571
unique,10571,6556,10571,30
top,https://www.powells.com/searchresults?keyword=...,https://www.powells.com/ProductMoreIsbn?produc...,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(7, 3, 3, 0, 0, 0, 0, 0)"
freq,1,28,1,8081


## Page labelling
Label (possibly) each page with its own template. Let's start with product (books) pages. 
The following URL links to a page which shows info regarding a book:

```
https://www.powells.com/book/-9780553805444
```

So we could infer that each product page has the following url schema:


```
https://www.powells.com/book/<somethingElse>
```

Similarly, some gift pages share the same template as book pages

```
https://www.powells.com/gift/powells-rainbow-mug-1110000285182/18-0
```

So in that case the schema is like the following:

```
https://www.powells.com/gift/<somethingElse>
```



In [9]:
import re

# Urls associated with product pages; the /book/ and /gift/ path segment
# appears in both lower and upper case.
testStrings1 = [
    'https://www.powells.com/book/-4004117328190',
    'https://www.powells.com/GIFT/powells-rainbow-mug-1110000285182/18-0',
    'https://www.powells.com/BOOK/read-rise-resist-yellow-blue-mug-1110000260113/18-0',
    'https://www.powells.com/book/-0092657025390',
    'https://www.powells.com/gift/powells-rainbow-mug-1110000285182/18-0',
    'https://www.powells.com/book/read-rise-resist-blue-pink-sticker-1110000258565/18-0',
]

# Various urls which aren't product pages (search results, listings, static pages).
testStrings2 = [
    'https://www.powells.com/SEARCHRESULTS?keyword=Martin+George+R+R',
    'https://www.powells.com/SearchResults?keyword=Martin+George+R+R',
    'https://www.powells.com/searchresults?keyword=game+of+thrones',
    'https://www.powells.com/used',
    'https://www.powells.com/new-arrivals',
    'https://www.powells.com/category/childrens-books?category=Children%27s&binding=Trade%20Paperback&mpp=50',
    'https://www.powells.com/category/calendars-notebooks-and-gifts/journals-and-notebooks',
    'https://www.powells.com/category',
    'https://www.powells.com/staff-picks',
    'http://www.powells.com/calendar',
    'http://www.powells.com/sell-books',
    'https://www.powells.com/digital-audio-books',
    'http://www.powells.com/calendar',
]

testStrings = testStrings1 + testStrings2

# https://www.powells.com/ followed by "book" or "gift" in any letter case,
# a slash, and at least one more character.
productRegex = r'^https\:\/\/www\.powells\.com\/([bB][oO][oO][kK]|[gG][iI][fF][tT])\/.+'

# simple tests
# NOTE: the banner says "article regex", but the pattern exercised is productRegex.
print("Testing article regex:")
print("Only the first 6 urls should be matching with the article regex")
print("---------------------------------------------------------------")

for uri in testStrings:
    print(re.match(productRegex, uri))

print('---------------------------------------------------------------')

# Enforce the expectation printed above instead of relying on a visual check
# of the output: every product url matches and no other url does.
assert all(re.match(productRegex, candidate) for candidate in testStrings1), \
    'a product url was not matched by productRegex'
assert not any(re.match(productRegex, candidate) for candidate in testStrings2), \
    'a non-product url was matched by productRegex'

Testing article regex:
Only the first 6 urls should be matching with the article regex
---------------------------------------------------------------
<re.Match object; span=(0, 43), match='https://www.powells.com/book/-4004117328190'>
<re.Match object; span=(0, 67), match='https://www.powells.com/GIFT/powells-rainbow-mug->
<re.Match object; span=(0, 80), match='https://www.powells.com/BOOK/read-rise-resist-yel>
<re.Match object; span=(0, 43), match='https://www.powells.com/book/-0092657025390'>
<re.Match object; span=(0, 67), match='https://www.powells.com/gift/powells-rainbow-mug->
<re.Match object; span=(0, 82), match='https://www.powells.com/book/read-rise-resist-blu>
None
None
None
None
None
None
None
None
None
None
None
None
None
---------------------------------------------------------------


Next we have to label pages which show lists of books. We noted that books on powells.com are grouped by category, so that books sharing the same category are listed on the same page in a paginated way.

```
https://www.powells.com/category/business
```

So urls referring to categories seem to have the following schema:

```
https://www.powells.com/category/<somethingElse>
```

Note that search results are also paginated:

```
https://www.powells.com/searchresults?keyword=game+of+thrones
https://www.powells.com/SearchResults?keyword=Martin+George+R+R
https://www.powells.com/SEARCHRESULTS?keyword=Martin+George+R+R
```

In that case the url schema follows this pattern:
```
https://www.powells.com/searchresults?<somethingElse>
```

Finally, the following pages also show books in a paginated fashion:

```
https://www.powells.com/new-arrivals
https://www.powells.com/used
```

In [10]:
# Listing pages: /category/<...> and /searchresults?<...> (both in any letter
# case), plus the exact /new-arrivals and /used pages.
catalogRegex = r'^https\:\/\/www\.powells\.com\/([cC][aA][tT][eE][gG][oO][rR][yY]\/.+|[sS][eE][aA][rR][cC][hH][rR][eE][sS][uU][lL][tT][sS]\?.+|new\-arrivals$|used$)'

# NOTE: the banner says "bestsellers regex", but the pattern exercised is catalogRegex.
print("Testing catalog regex:")
print("Only 7 urls should be matching with the bestsellers regex")
print("-----------------------------------------------------------")

for uri in testStrings:
    print(re.match(catalogRegex, uri))
print('---------------------------------------------------------------')

# Enforce the expectation printed above: the only matches are the seven listing
# urls, which are the first seven entries of testStrings2.
assert [candidate for candidate in testStrings if re.match(catalogRegex, candidate)] == testStrings2[:7], \
    'catalogRegex did not match exactly the seven listing urls'

Testing catalog regex:
Only 7 urls should be matching with the bestsellers regex
-----------------------------------------------------------
None
None
None
None
None
None
<re.Match object; span=(0, 63), match='https://www.powells.com/SEARCHRESULTS?keyword=Mar>
<re.Match object; span=(0, 63), match='https://www.powells.com/SearchResults?keyword=Mar>
<re.Match object; span=(0, 61), match='https://www.powells.com/searchresults?keyword=gam>
<re.Match object; span=(0, 28), match='https://www.powells.com/used'>
<re.Match object; span=(0, 36), match='https://www.powells.com/new-arrivals'>
<re.Match object; span=(0, 103), match='https://www.powells.com/category/childrens-books?>
<re.Match object; span=(0, 85), match='https://www.powells.com/category/calendars-notebo>
None
None
None
None
None
None
---------------------------------------------------------------


## Add `label` column
Labelling each possible row of the dataframe

In [11]:
# Start with every row unlabelled; the labelling cells fill this column in.
df['label'] = None
df.head()

Unnamed: 0,url,referer_url,src,shingle_vector,label
0,https://www.powells.com/blog/author/kristen-ar...,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(7, 2, 1, 8, 3, 10, 0, 5)",
1,https://www.powells.com/blog/category/interviews,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 1, 1, 0, 3, 5, 0, 1)",
2,https://www.powells.com/nonfiction-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 3, 0, 0, 0)",
3,https://www.powells.com/powells-presents,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 1, 1, 0, 0)",
4,https://www.powells.com/locations,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 0, 0, 4, 2, 2, 0, 0)",


In [12]:
def set_label(url, regex, old_label, new_label):
    """Decide which label a row carries after testing its url against one regex.

    Parameters
    ----------
    url : str
        Address of the page being labelled.
    regex : str
        Pattern handed to ``re.match``, so it is tested from the start of ``url``.
    old_label : object
        Label currently stored on the row. ``None`` and NaN/NA both mean
        "not labelled yet".
    new_label : object
        Label to assign when ``url`` matches ``regex``.

    Returns
    -------
    object
        ``old_label`` unchanged when the url does not match, ``new_label`` when
        it matches and the row was unlabelled, and the string ``'collision'``
        when it matches but the row already carries a label.
    """
    if not re.match(regex, url):
        return old_label
    # pandas represents a missing label as NaN once a frame has been reloaded
    # from CSV, so an identity check against None alone would flag every such
    # row as a collision.
    if old_label is None or pd.isna(old_label):
        return new_label
    return 'collision'

Add product label

In [13]:
# Run set_label over every row with productRegex / 'product', then show any
# row that ended up flagged as 'collision'.
df['label'] = df.apply(
    lambda row: set_label(row['url'], productRegex, row['label'], 'product'),
    axis=1,
)
print("There should be no row with 'collision' label", "---------------------------------------------", sep="\n")
df.loc[df['label'] == 'collision'].head()

There should be no row with 'collision' label
---------------------------------------------


Unnamed: 0,url,referer_url,src,shingle_vector,label


Add list label for paginated pages

In [14]:
# Run set_label over every row with catalogRegex / 'list', then show any
# row that ended up flagged as 'collision'.
df['label'] = df.apply(
    lambda row: set_label(row['url'], catalogRegex, row['label'], 'list'),
    axis=1,
)
print("There should be no row with 'collision' label", "---------------------------------------------", sep="\n")
df.loc[df['label'] == 'collision'].head()

There should be no row with 'collision' label
---------------------------------------------


Unnamed: 0,url,referer_url,src,shingle_vector,label


Summary:

In [15]:
# Report how many rows are unlabelled, then how many carry each assigned label.
fmt_string = 'There are {} row with {} label'
print(fmt_string.format(int(df['label'].isnull().sum()), 'no'))
for label_name in ('product', 'list'):
    print(fmt_string.format(int((df['label'] == label_name).sum()), label_name))

There are 1609 row with no label
There are 417 row with product label
There are 8545 row with list label


In [16]:
# Widen pandas' display limits so the urls below are shown in full.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None switches off column-width truncation. The former sentinel -1 has been
# deprecated since pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
# Preview the url / referer of rows that received no label.
df.loc[df['label'].isnull(), ['url', 'referer_url']].head()

Unnamed: 0,url,referer_url
0,https://www.powells.com/blog/author/kristen-arnett,https://www.powells.com/
1,https://www.powells.com/blog/category/interviews,https://www.powells.com/
2,https://www.powells.com/nonfiction-sale,https://www.powells.com/
3,https://www.powells.com/powells-presents,https://www.powells.com/
4,https://www.powells.com/locations,https://www.powells.com/


In [17]:
# Put the pandas display options changed for the preview back to their defaults.
pd.reset_option('^display.', silent=True)

Note that some rows remain unlabelled. That is because these pages are generated by neither the 'product' template nor the 'list' template.

## Write dataset to a file

In [18]:
# Persist the labelled frame as UTF-8 CSV without the index column.
# NOTE: FILEPATH is the same path the data was read from, so the input file
# is overwritten in place.
df.to_csv(FILEPATH, encoding='utf-8', index=False)

In [19]:
# Read the file just written back into a separate frame to check the round trip.
test = pd.read_csv(FILEPATH)

In [24]:
# Preview the first 20 rows of the reloaded frame.
# NOTE(review): the saved output lists only 10 rows and this cell's execution
# counter is out of sequence — looks stale; re-run after a kernel restart to confirm.
test.head(20)

Unnamed: 0,url,referer_url,src,shingle_vector,label
0,https://www.powells.com/blog/author/kristen-ar...,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(7, 2, 1, 8, 3, 10, 0, 5)",
1,https://www.powells.com/blog/category/interviews,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 1, 1, 0, 3, 5, 0, 1)",
2,https://www.powells.com/nonfiction-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 3, 0, 0, 0)",
3,https://www.powells.com/powells-presents,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 1, 1, 0, 0)",
4,https://www.powells.com/locations,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 0, 0, 4, 2, 2, 0, 0)",
5,https://www.powells.com/post/interviews/powell...,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 1, 0, 0, 2, 3, 0, 1)",
6,https://www.powells.com/little-golden-books-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 4, 2, 0, 0, 0)",
7,https://www.powells.com/sell-books-stores,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 2, 1, 3, 2, 10, 0, 0)",
8,https://www.powells.com/blog,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 1, 1, 0, 3, 3, 0, 4)",
9,https://www.powells.com/nature-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 3, 0, 0, 0)",


In [21]:
# (rows, columns) of the reloaded frame.
test.shape

(10571, 5)

In [22]:
# Repeat the label tally on the frame read back from disk. All three counts
# must come from `test`; the 'list' count was previously taken from the
# in-memory `df`, so the reloaded frame was never checked for that label.
print(fmt_string.format(len(test[test['label'].isnull()]),'no'))
print(fmt_string.format(len(test[test['label']=='product']), 'product'))
print(fmt_string.format(len(test[test['label']=='list']), 'list'))

There are 1609 row with no label
There are 417 row with product label
There are 8545 row with list label
