In [1]:
%matplotlib inline
# Importing libraries
import matplotlib.pyplot as plt
import pandas as pd

# Location of the crawl dump: <prefix>/<spider name>/<timestamped file>
FILEPATH_PREFIX = '../../../book_crawler/data'
SPIDERNAME = 'hive'
FILENAME = '2019-05-12T14-35-06.csv'
FILEPATH = '/'.join([FILEPATH_PREFIX, SPIDERNAME, FILENAME])
FILEPATH

'../../../book_crawler/data/hive/2019-05-12T14-35-06.csv'

In [2]:
# Load the crawl CSV located at FILEPATH into a DataFrame
df = pd.read_csv(FILEPATH)

## Data analysis
Some preliminary analysis of the dataset

In [3]:
# Heading and underline in a single call, then preview the top of the frame
print("First 5 rows", "------------", sep="\n")
df.head()

First 5 rows
------------


Unnamed: 0,url,referer_url,src
0,https://www.hive.co.uk/,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d..."
1,https://www.hive.co.uk/books/study-zone,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d..."
2,https://www.hive.co.uk/search/books?fq=01120-1144,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d..."
3,https://www.hive.co.uk/books/young-adult,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d..."
4,https://www.hive.co.uk/books/travel,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d..."


In [4]:
# Heading and underline in a single call, then the (rows, columns) tuple
print("No. of rows and columns", "-----------------------", sep="\n")
df.shape

No. of rows and columns
-----------------------


(10824, 3)

In [5]:
# True if any cell in any column is missing (isna is the alias of isnull)
print("Check null values", "-----------------", sep="\n")
df.isna().any().any()

Check null values
-----------------


False

In [6]:
# True if at least one URL occurs more than once (only the `url` column is checked)
print("Check duplicate values", "----------------------", sep="\n")
not df['url'].is_unique

Check duplicate values
----------------------


False

In [7]:
# Heading and underline in a single call, then the per-column dtype/non-null summary
print("DataFrame column types", "----------------------", sep="\n")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10824 entries, 0 to 10823
Data columns (total 3 columns):
url            10824 non-null object
referer_url    10824 non-null object
src            10824 non-null object
dtypes: object(3)
memory usage: 253.8+ KB


In [8]:
# Heading and underline in a single call, then the descriptive statistics table
print("Some stats", "----------------", sep="\n")
df.describe()

Some stats
----------------


Unnamed: 0,url,referer_url,src
count,10824,10824,10824
unique,10824,4363,10801
top,https://www.hive.co.uk/Search/eBooks/Politics-...,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d..."
freq,1,18,2


## Data cleaning
During the crawling phase (using Scrapy), we restricted the crawl to pages whose URL starts with `https://www.hive.co.uk/`. Let's check that this restriction held.

In [18]:
import re

In [19]:
# Regex for URLs that begin with https://www.hive.co.uk/
hiveRegex = r'^https\:\/\/www\.hive\.co\.uk\/'

In [20]:
# Keep only the rows whose URL does NOT match the hive prefix.
# `~` is the inversion operator for boolean masks; unary minus is arithmetic
# negation, which NumPy rejects for boolean arrays.
otherDomainsDataFrame = df[~df['url'].str.match(hiveRegex)]

In [21]:
# Show the first rows that fall outside the hive domain; an empty frame means there are none.
# NOTE(review): the recorded output already lists a shingle_vector column, so this
# cell was last executed after that column was added further down. Restart the kernel
# and run all cells top to bottom to refresh the outputs.
otherDomainsDataFrame.head()

Unnamed: 0,url,referer_url,src,shingle_vector


The result is empty, so Scrapy did indeed crawl only pages from the `www.hive.co.uk` domain.

## Add `shingle_vector` column
Compute the shingle vector for each page in the DataFrame.

In [12]:
#add top level folder to sys.path
import sys
sys.path.append('../../../')

from foxlink_clustering.clustering.shingler import compute_shingle_vector

In [13]:
# Page source of the first row, used to try the shingler on a single input
src = df['src'].iloc[0]

The default value for window size in Foxlink is 3

In [14]:
# Window size passed as the second argument to compute_shingle_vector
DEFAULT_WINDOW_SIZE = 3
# Try the shingler on the single page source held in `src`
result = compute_shingle_vector(src, DEFAULT_WINDOW_SIZE)

In [15]:
# Display the value returned by compute_shingle_vector for the single page
result

(0, 1, 2, 0, 0, 2, 1, 4)

Set the `shingle_vector` column for each row

In [16]:
# Compute one shingle vector per page. Only the `src` column is read, so apply
# over that column directly instead of using DataFrame.apply(axis=1), which
# builds a full row object for every record.
df['shingle_vector'] = df['src'].apply(
    lambda page_src: compute_shingle_vector(page_src, DEFAULT_WINDOW_SIZE)
)

In [17]:
# Preview the frame, now including the shingle_vector column
df.head()

Unnamed: 0,url,referer_url,src,shingle_vector
0,https://www.hive.co.uk/,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 2, 0, 0, 2, 1, 4)"
1,https://www.hive.co.uk/books/study-zone,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 2, 0, 0, 2, 1, 4)"
2,https://www.hive.co.uk/search/books?fq=01120-1144,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 0, 2, 5, 0, 1, 4)"
3,https://www.hive.co.uk/books/young-adult,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 2, 2, 1, 1, 2, 0)"
4,https://www.hive.co.uk/books/travel,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 2, 0, 1, 2, 1, 4)"


## Write dataset to a file

In [22]:
# Export the frame as UTF-8 CSV; index=False leaves the RangeIndex out of the file.
# NOTE(review): shingle_vector appears to hold Python tuples, which CSV can only store
# as text, so readers of this file will presumably get strings like "(0, 1, 2, ...)"
# back. Confirm that downstream consumers parse them.
df.to_csv('../../../datasets/hive.csv', encoding='utf-8', index=False)

In [23]:
# Read the exported file back in as a sanity check of the export
test = pd.read_csv('../../../datasets/hive.csv')

In [24]:
# Preview the reloaded file
test.head()

Unnamed: 0,url,referer_url,src,shingle_vector
0,https://www.hive.co.uk/,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 2, 0, 0, 2, 1, 4)"
1,https://www.hive.co.uk/books/study-zone,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 2, 0, 0, 2, 1, 4)"
2,https://www.hive.co.uk/search/books?fq=01120-1144,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 0, 2, 5, 0, 1, 4)"
3,https://www.hive.co.uk/books/young-adult,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 2, 2, 1, 1, 2, 0)"
4,https://www.hive.co.uk/books/travel,https://www.hive.co.uk/,"<!DOCTYPE html>\r\n<html lang=""en-gb"" class=""d...","(0, 1, 2, 0, 1, 2, 1, 4)"


In [25]:
# (rows, columns) of the reloaded file
test.shape

(10824, 4)