In [1]:
%matplotlib inline
# Importing libraries
import matplotlib.pyplot as plt
import pandas as pd

# Location of the crawl dump to analyse: <prefix>/<spider name>/<file name>.
FILEPATH_PREFIX = '../../../book_crawler/data'
SPIDERNAME = 'powells'
FILENAME = '2019-05-14T20-37-16.csv'
# Join the three path components with forward slashes.
FILEPATH = '/'.join((FILEPATH_PREFIX, SPIDERNAME, FILENAME))
FILEPATH

'../../../book_crawler/data/powells/2019-05-14T20-37-16.csv'

In [2]:
# Load the crawl dump selected by FILEPATH into a DataFrame.
df = pd.read_csv(filepath_or_buffer=FILEPATH)

## Data analysis
Some preliminary analysis of the dataset

In [3]:
# Heading plus underline, then the first five records as the cell output.
print("First 5 rows", "------------", sep="\n")
df.head(5)

First 5 rows
------------


Unnamed: 0,url,referer_url,src
0,https://www.powells.com/blog/author/kristen-ar...,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
1,https://www.powells.com/blog/category/interviews,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
2,https://www.powells.com/nonfiction-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
3,https://www.powells.com/powells-presents,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
4,https://www.powells.com/locations,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."


In [4]:
# Heading plus underline, then the (rows, columns) tuple as the cell output.
print("No. of rows and columns", "-----------------------", sep="\n")
df.shape

No. of rows and columns
-----------------------


(10571, 3)

In [5]:
# True if at least one cell anywhere in the frame is missing.
print("Check null values", "-----------------", sep="\n")
df.isna().any().any()

Check null values
-----------------


False

In [6]:
# True if some URL occurs more than once, i.e. there are fewer distinct URLs
# than rows. dropna=False counts a missing value as a distinct entry, matching
# what len(unique()) would report.
print("Check duplicate values", "----------------------", sep="\n")
df['url'].nunique(dropna=False) != len(df)

Check duplicate values
----------------------


False

In [7]:
# Heading plus underline, then the per-column dtype / non-null summary.
print("DataFrame column types", "----------------------", sep="\n")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10571 entries, 0 to 10570
Data columns (total 3 columns):
url            10571 non-null object
referer_url    10571 non-null object
src            10571 non-null object
dtypes: object(3)
memory usage: 247.8+ KB


In [8]:
# Heading plus underline, then summary statistics for every column.
print("Some stats", "----------------", sep="\n")
df.describe()

Some stats
----------------


Unnamed: 0,url,referer_url,src
count,10571,10571,10571
unique,10571,6556,10571
top,https://www.powells.com/login?returnurl=%2fboo...,https://www.powells.com/ProductMoreIsbn?produc...,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
freq,1,28,1


## Data cleaning
During the crawling phase (using Scrapy), we restricted the crawl to web pages belonging to the `www.powells.com` domain. Here we check that every crawled URL actually belongs to that domain.

In [13]:
import re

In [14]:
# Pattern anchored at the start of the string that matches URLs beginning with
# the https://www.powells.com origin.
# NOTE(review): the name says "blackwells" but the pattern targets powells —
# presumably a leftover from a sibling notebook. The name is kept because a
# later cell references it.
blackwellsRegex = r'^https\:\/\/www\.powells\.com'

In [15]:
# Keep only the rows whose URL does NOT start with the expected origin.
# `~` is the element-wise negation operator for boolean masks; unary `-` is
# arithmetic negation and is not the documented way to invert a mask.
otherDomainsDataFrame = df[~df['url'].str.match(blackwellsRegex)]

In [16]:
# Show up to five off-domain rows; an empty table means there are none.
otherDomainsDataFrame.head(5)

Unnamed: 0,url,referer_url,src


So Scrapy did indeed crawl only pages from the `www.powells.com` domain.

## Add `shingle_vector` label
Compute the shingle vector for each page in the DataFrame.

In [17]:
# Append the folder three levels up to sys.path so that packages located there
# can be imported (presumably this is where `foxlink_clustering` lives — confirm).
# NOTE(review): the path is relative, so it depends on the kernel's working
# directory being this notebook's folder.
import sys
sys.path.append('../../../')

from foxlink_clustering.clustering.shingler import compute_shingle_vector

In [18]:
# Page source of the first row, used below as a sample input for the shingler.
src = df['src'].iloc[0]

The default value for window size in Foxlink is 3

In [19]:
# Window size handed to the shingler; 3 is the default value used in Foxlink.
DEFAULT_WINDOW_SIZE = 3
# Try the shingler on the single sample page before applying it to every row.
result = compute_shingle_vector(src, DEFAULT_WINDOW_SIZE)

In [20]:
# Display the shingle vector computed for the sample page.
result

(7, 2, 1, 8, 3, 10, 0, 5)

Set `shingle_vector` label for each row    

In [21]:
# Compute the shingle vector of every page and store it in a new column.
# Only the `src` column is needed, so apply the function to that Series
# directly instead of iterating over whole rows with DataFrame.apply(axis=1),
# which builds a Series object per row just to read one field.
# Note: this adds a column to `df` in place; the cells below rely on that.
df['shingle_vector'] = df['src'].apply(
    lambda page_src: compute_shingle_vector(page_src, DEFAULT_WINDOW_SIZE)
)

In [22]:
# First five rows, now including the shingle_vector column.
df.head(5)

Unnamed: 0,url,referer_url,src,shingle_vector
0,https://www.powells.com/blog/author/kristen-ar...,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(7, 2, 1, 8, 3, 10, 0, 5)"
1,https://www.powells.com/blog/category/interviews,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 1, 1, 0, 3, 5, 0, 1)"
2,https://www.powells.com/nonfiction-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 3, 0, 0, 0)"
3,https://www.powells.com/powells-presents,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 1, 1, 0, 0)"
4,https://www.powells.com/locations,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 0, 0, 4, 2, 2, 0, 0)"


## Write dataset to a file

In [23]:
# Persist the frame, including the shingle_vector column, as a UTF-8 CSV
# without the row index.
df.to_csv(
    path_or_buf='../../../datasets/powells.csv',
    index=False,
    encoding='utf-8',
)

In [24]:
# Read the file back to sanity-check what was written.
# NOTE(review): after a CSV round trip the shingle_vector values presumably
# come back as strings rather than tuples — confirm before consuming them.
test = pd.read_csv(filepath_or_buffer='../../../datasets/powells.csv')

In [25]:
# First five rows of the re-loaded dataset.
test.head(5)

Unnamed: 0,url,referer_url,src,shingle_vector
0,https://www.powells.com/blog/author/kristen-ar...,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(7, 2, 1, 8, 3, 10, 0, 5)"
1,https://www.powells.com/blog/category/interviews,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 1, 1, 0, 3, 5, 0, 1)"
2,https://www.powells.com/nonfiction-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 3, 0, 0, 0)"
3,https://www.powells.com/powells-presents,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 1, 1, 0, 0)"
4,https://www.powells.com/locations,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 0, 0, 4, 2, 2, 0, 0)"


In [26]:
# (rows, columns) of the re-loaded dataset; compare with df.shape above
# (same row count, one extra column for shingle_vector).
test.shape

(10571, 4)