In [1]:
%matplotlib inline
# Importing libraries
import matplotlib.pyplot as plt
import pandas as pd

# Location of the crawl dump to analyse: <data root>/<spider name>/<timestamped file>.
FILEPATH_PREFIX = '../../../book_crawler/data'
SPIDERNAME = 'bookswagon'
FILENAME = '2019-05-12T17-30-22.csv'
# Join the three components with forward slashes, then echo the result.
FILEPATH = '/'.join((FILEPATH_PREFIX, SPIDERNAME, FILENAME))
FILEPATH

'../../../book_crawler/data/bookswagon/2019-05-12T17-30-22.csv'

In [2]:
# Load the raw crawl dump produced by the spider into a DataFrame.
df = pd.read_csv(FILEPATH)

## Data analysis
Some preliminary analysis of the dataset.

In [3]:
print("First 5 rows")
print("------------")
# Row count spelled out so it visibly matches the heading printed above.
df.head(n=5)

First 5 rows
------------


Unnamed: 0,url,referer_url,src
0,https://www.bookswagon.com/,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1..."
1,https://www.bookswagon.com/view-books/0/new-ar...,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1..."
2,https://www.bookswagon.com/travel-holiday-books,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1..."
3,https://www.bookswagon.com/all-categories/1000-0,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1..."
4,https://www.bookswagon.com/view-books/4/textbook,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1..."


In [4]:
# Dimensions of the loaded table as a (rows, columns) tuple.
print("No. of rows and columns")
print("-----------------------")
df.shape

No. of rows and columns
-----------------------


(4447, 3)

In [5]:
print("Check null values")
print("-----------------")
# Reduce over both axes at once: True if any cell in the frame is missing.
df.isnull().any(axis=None)

Check null values
-----------------


False

In [6]:
print("Check duplicate values")
print("----------------------")
# True when at least one URL occurs more than once in the crawl.
not df['url'].is_unique

Check duplicate values
----------------------


False

In [7]:
# Per-column dtype and non-null counts; df.info() prints its report itself.
print("DataFrame column types")
print("----------------------")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4447 entries, 0 to 4446
Data columns (total 3 columns):
url            4447 non-null object
referer_url    4447 non-null object
src            4447 non-null object
dtypes: object(3)
memory usage: 104.3+ KB


In [8]:
# Summary statistics; for these object-typed columns describe() reports
# count / unique / top / freq rather than numeric aggregates.
print("Some stats")
print("----------------")
df.describe()

Some stats
----------------


Unnamed: 0,url,referer_url,src
count,4447,4447,4447
unique,4447,2192,4447
top,https://www.bookswagon.com/review/summer-dodo-...,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1..."
freq,1,18,1


## Data cleaning
During the crawling phase (using Scrapy), we restricted the crawl to pages whose URL starts with `https://www.bookswagon.com/`. Let's verify that no page from another domain slipped through.

In [9]:
import re

In [10]:
# Pattern for URLs that belong to the crawled site: scheme, host and leading slash,
# anchored at the start of the string. The backslashes before ':' and '/' are not
# required by Python's `re` syntax but are harmless.
bookswagonRegex = r'^https\:\/\/www\.bookswagon\.com\/'

In [11]:
# Keep only the rows whose URL does NOT start with the bookswagon prefix.
# `~` is the documented operator for inverting a boolean mask; unary `-` only
# works on boolean Series through a pandas special case and is rejected by NumPy
# on plain boolean arrays.
otherDomainsDataFrame = df[~df['url'].str.match(bookswagonRegex)]

In [12]:
# Any row displayed here is a page whose URL did not match the bookswagon prefix;
# an empty table means every crawled page is on the expected domain.
otherDomainsDataFrame.head()

Unnamed: 0,url,referer_url,src


The result is empty, so Scrapy did indeed crawl only pages from the `www.bookswagon.com` domain.

## Add `shingle_vector` label
Compute the shingle vector for each page in the DataFrame.

In [13]:
# Add the top-level folder to sys.path so that the `foxlink_clustering`
# package can be imported from this notebook's directory.
import sys
sys.path.append('../../../')

from foxlink_clustering.clustering.shingler import compute_shingle_vector

In [14]:
# Raw page source of the first crawled row, used to try the shingler on one document.
src = df['src'].iloc[0]

The default window size in Foxlink is 3.

In [15]:
# Window size passed to the shingler for every page in this notebook.
DEFAULT_WINDOW_SIZE = 3
# Trial run on the single page source selected earlier.
# NOTE(review): the exact meaning of the returned values depends on
# compute_shingle_vector, which is defined outside this notebook.
result = compute_shingle_vector(src, DEFAULT_WINDOW_SIZE)

In [16]:
# Display the shingle vector computed for the sample page.
result

(0, 3, 6, 4, 0, 2, 10, 1)

Set the `shingle_vector` column for each row.

In [17]:
# Compute one shingle vector per page from the `src` column alone.
# Only `src` is needed, so map over that single Series instead of calling
# `df.apply(..., axis=1)`: the row-wise form materialises a full row object for
# every page and, on an empty frame, returns a DataFrame rather than a Series.
df['shingle_vector'] = df['src'].apply(
    lambda page_src: compute_shingle_vector(page_src, DEFAULT_WINDOW_SIZE)
)

In [18]:
# Preview the table now that the `shingle_vector` column has been added.
df.head()

Unnamed: 0,url,referer_url,src,shingle_vector
0,https://www.bookswagon.com/,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(0, 3, 6, 4, 0, 2, 10, 1)"
1,https://www.bookswagon.com/view-books/0/new-ar...,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)"
2,https://www.bookswagon.com/travel-holiday-books,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)"
3,https://www.bookswagon.com/all-categories/1000-0,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(7, 3, 3, 4, 0, 2, 10, 1)"
4,https://www.bookswagon.com/view-books/4/textbook,https://www.bookswagon.com/,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1...","(1, 3, 6, 4, 0, 2, 5, 1)"


## Write dataset to a file

In [19]:
# Persist the enriched table (original columns plus `shingle_vector`) as UTF-8 CSV,
# leaving out the positional index.
df.to_csv(
    path_or_buf='../../../datasets/bookswagon.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
# Read the file back to check that the export can be loaded.
# NOTE(review): CSV stores the `shingle_vector` tuples as text, so they will
# presumably come back as strings here, not tuples. Confirm how downstream
# consumers parse this column.
test = pd.read_csv('../../../datasets/bookswagon.csv')

In [None]:
# Preview the re-loaded dataset.
test.head()

In [None]:
# Dimensions of the re-loaded dataset, for comparison with the frame that was written.
test.shape