## Our clustering algorithm evaluation
Evaluating our clustering algorithm on bookswagon.com pages. The aim is to calculate precision and recall for "book details" cluster and the "catalog" cluster in bookswagon.com.

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import ast
import time
from sklearn.cluster import MeanShift, estimate_bandwidth, DBSCAN
FILEPATH = '../input/blackwells.csv'
FILEPATH

'../input/blackwells.csv'

In [2]:
df = pd.read_csv(FILEPATH, converters={'bitset': ast.literal_eval, 'tag_count': ast.literal_eval})

## Data analisys
Some preliminary analisys of the dataset

In [3]:
print("First 5 rows")
print("------------")
df.head()

First 5 rows
------------


Unnamed: 0,url,referer_url,src,shingle_vector,label,tag_count,bitset
0,https://blackwells.co.uk/bookshop/basket,https://blackwells.co.uk/bookshop/home,"\n\n\n \n<!DOCTYPE html>\n<html lang=""e...","(0, 1, 5, 1, 1, 6, 3, 1)",,"[0.0019569471624266144, 0.0019569471624266144,...","[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, ..."
1,https://blackwells.co.uk/bookshop/search/,https://blackwells.co.uk/bookshop/home,"\n\n\n \n<!DOCTYPE html>\n<html lang=""e...","(0, 1, 5, 1, 1, 0, 3, 0)",list,"[0.0012970168612191958, 0.0012970168612191958,...","[0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, ..."
2,https://blackwells.co.uk/bookshop/home,https://blackwells.co.uk/bookshop/home,"\n\n\n \n<!DOCTYPE html>\n<html lang=""e...","(0, 1, 0, 1, 0, 0, 3, 1)",,"[0.0011655011655011655, 0.0011655011655011655,...","[1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, ..."
3,https://blackwells.co.uk/bookshop/product/9781...,https://blackwells.co.uk/bookshop/home,"\n\n\n \n<!DOCTYPE html>\n<html lang=""e...","(0, 1, 1, 1, 1, 0, 0, 1)",product,"[0.0008116883116883117, 0.0008116883116883117,...","[1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, ..."
4,https://blackwells.co.uk/bookshop/mapping,https://blackwells.co.uk/bookshop/basket,"\n\n\n\n\n\n<!DOCTYPE html>\n<html lang=""en"" c...","(2, 22, 1, 1, 7, 15, 7, 5)",,"[0.008333333333333333, 0.008333333333333333, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [4]:
print("No. of rows and columns")
print("-----------------------")
df.shape

No. of rows and columns
-----------------------


(10919, 7)

In [5]:
print("Check null values")
print("-----------------")
df.isnull().any().any()

Check null values
-----------------


True

In [6]:
print("Check duplicate values")
print("----------------------")
len(df['url'].unique()) != df.shape[0]

Check duplicate values
----------------------


False

In [7]:
print("DataFrame column types")
print("----------------------")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10919 entries, 0 to 10918
Data columns (total 7 columns):
url               10919 non-null object
referer_url       10919 non-null object
src               10919 non-null object
shingle_vector    10919 non-null object
label             10899 non-null object
tag_count         10919 non-null object
bitset            10919 non-null object
dtypes: object(7)
memory usage: 597.2+ KB


In [8]:
fmt_string = 'There are {} row with {} label'
print(fmt_string.format(len(df[df['label'].isnull()]),'no'))
print(fmt_string.format(len(df[df['label']=='product']), 'product'))
print(fmt_string.format(len(df[df['label']=='list']), 'list'))

There are 20 row with no label
There are 10405 row with product label
There are 494 row with list label


## Run MeanShift clustering algorithm
### Import all necessary functions

In [9]:
#from astarwars_clustering.utils import utility

def pad_vector(vec, n):
    for i in range(n):
        vec.append(0)

def pad_matrix_elem(matrix, lastvec):
    maxlen=len(lastvec)
    matrixlen=len(matrix)
    for i in range(matrixlen):
        elem_topad=maxlen-len(matrix[i])
        pad_vector(matrix[i], elem_topad)

#cluster di cataloghi e di prodotti
def count_occurrences(vec,n):
    occ=0
    for el in vec:
        if el==n:
            occ=occ+1
    return occ

In [10]:
#from astarwars_clustering.clustering.structural_clustering import dbscanclustering, meanshiftclustering
def meanshiftclustering(featurematrix,bandwidth=None):
    start = time.time()
    clustering=None
    if bandwidth is not None:
        clustering = MeanShift(bandwidth=bandwidth).fit(featurematrix)
    else:
        clustering = MeanShift().fit(featurematrix)
    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Elapsed time to calculate MeanShift clustering:{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    return clustering


#if eps is specified also min_samples will be not null for convention
def dbscanclustering(featurematrix,epsValue=None,min_samplesValue=None):
    start = time.time()
    clustering=None
    if epsValue is not None:
        clustering = DBSCAN(eps=epsValue, min_samples=min_samplesValue).fit(featurematrix)
    else:
        clustering = DBSCAN().fit(featurematrix)
    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Elapsed time to calculate DBSCAN clustering:{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    return clustering

In [11]:
#from astarwars_clustering.clustering import clusteringevaluation

#predictedlabel rappresenta l'etichetta del clustering su cui vogliamo calcolare precision e recall
def calculate_precision_and_recall(df, clustering, selectedlabel, predictedlabel):

    labels = clustering.labels_
    df['predicted_labels'] = labels
    selectedelements = count_occurrences(labels, predictedlabel)

    truepositive = 0
    allpositives = len(df[df['label'] == selectedlabel])

    for index, row in df.iterrows():
        if row['label'] == selectedlabel and row['predicted_labels'] == predictedlabel:
            truepositive += 1

    fmt_string='{} is {}'
    recall = truepositive/allpositives
    precision = truepositive/selectedelements

    print(fmt_string.format('Recall', recall))
    print(fmt_string.format('Precision', precision))

    return precision, recall

In [12]:
sample=df
bitsetmat=sample['bitset'].tolist()
tagcountmat=sample['tag_count'].tolist()

In [13]:
clustering = meanshiftclustering(tagcountmat,0.07)

Elapsed time to calculate MeanShift clustering:00:17:06.90


In [14]:
print(fmt_string.format(len(sample[sample['label'].isnull()]),'no'))
print(fmt_string.format(len(sample[sample['label']=='product']), 'product'))
print(fmt_string.format(len(sample[sample['label']=='list']), 'list'))

There are 20 row with no label
There are 10405 row with product label
There are 494 row with list label


In [15]:
predictedLabels = clustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(len(noOfClusters)) + ' clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are 12 clusters


Cluster labels:


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [16]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,count_occurrences(predictedLabels,el)))

cluster n. 0 has 10411 pages
cluster n. 1 has 319 pages
cluster n. 2 has 180 pages
cluster n. 3 has 1 pages
cluster n. 4 has 1 pages
cluster n. 5 has 1 pages
cluster n. 6 has 1 pages
cluster n. 7 has 1 pages
cluster n. 8 has 1 pages
cluster n. 9 has 1 pages
cluster n. 10 has 1 pages
cluster n. 11 has 1 pages


In [17]:
sample[sample['predicted_label'] == 0]['url'].head(20)

0              https://blackwells.co.uk/bookshop/basket
2                https://blackwells.co.uk/bookshop/home
3     https://blackwells.co.uk/bookshop/product/9781...
6     https://blackwells.co.uk/bookshop/editorial/wi...
7     https://blackwells.co.uk/bookshop/editorial/co...
12    https://blackwells.co.uk/bookshop/editorial/Ki...
16           https://blackwells.co.uk/bookshop/Students
21    https://blackwells.co.uk/bookshop/product/Kudo...
23    https://blackwells.co.uk/bookshop/product/Will...
24    https://blackwells.co.uk/bookshop/product/Will...
26    https://blackwells.co.uk/bookshop/product/Rele...
35    https://blackwells.co.uk/bookshop/product/Game...
38    https://blackwells.co.uk/bookshop/product/The-...
40    https://blackwells.co.uk/bookshop/product/The-...
41    https://blackwells.co.uk/bookshop/product/Mr-S...
42    https://blackwells.co.uk/bookshop/product/Our-...
43    https://blackwells.co.uk/bookshop/product/Pete...
44    https://blackwells.co.uk/bookshop/product/

In [18]:
sample[sample['predicted_label'] == 1]['url'].head(10)

13      https://blackwells.co.uk/bookshop/category/_top
15    https://blackwells.co.uk/bookshop/category/_bi...
17    https://blackwells.co.uk/bookshop/category/_ar...
22      https://blackwells.co.uk/bookshop/wellcomeshop/
33        https://blackwells.co.uk/bookshop/bestsellers
34    https://blackwells.co.uk/bookshop/category/_bi...
37    https://blackwells.co.uk/bookshop/category/_ar...
39    https://blackwells.co.uk/bookshop/collection/O...
58    https://blackwells.co.uk/bookshop/category/_ar...
82    https://blackwells.co.uk/bookshop/category/_ar...
Name: url, dtype: object

In [19]:
sample[sample['predicted_label'] == 2]['url'].head(30)

1               https://blackwells.co.uk/bookshop/search/
18      https://blackwells.co.uk/bookshop/editorial/sl...
20      https://blackwells.co.uk/bookshop/category/978...
27      https://blackwells.co.uk/bookshop/category/nul...
28      https://blackwells.co.uk/bookshop/category/nul...
29        https://blackwells.co.uk/bookshop/category/null
30      https://blackwells.co.uk/bookshop/category/nul...
31      https://blackwells.co.uk/bookshop/category/nul...
32      https://blackwells.co.uk/bookshop/category/nul...
36      https://blackwells.co.uk/bookshop/search/autho...
236     https://blackwells.co.uk/bookshop/search/autho...
348     https://blackwells.co.uk/bookshop/search/autho...
349     https://blackwells.co.uk/bookshop/search/autho...
362     https://blackwells.co.uk/bookshop/search?autho...
469     https://blackwells.co.uk/bookshop/search/autho...
506     https://blackwells.co.uk/bookshop/search/autho...
557     https://blackwells.co.uk/bookshop/search/autho...
735     https:

## Evaluate recall and precision

In [20]:
p1,r1=calculate_precision_and_recall(sample,clustering,'list',1)

Recall is 0.6376518218623481
Precision is 0.987460815047022


In [21]:
p1,r1=calculate_precision_and_recall(sample,clustering,'product',0)

Recall is 1.0
Precision is 0.9994236864854481


# DBSCAN algorithm

In [22]:
dbsclustering=dbscanclustering(tagcountmat,0.055,20)

Elapsed time to calculate DBSCAN clustering:00:00:48.13


In [23]:
predictedLabels = dbsclustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(len(noOfClusters)) + ' clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are 3 clusters


Cluster labels:


array([-1,  0,  1])

In [24]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,count_occurrences(predictedLabels,el)))

cluster n. 0 has 15 pages
cluster n. 1 has 10409 pages
cluster n. 2 has 495 pages


In [25]:
sample[sample['predicted_label'] == 0]['url'].head(20)

2                https://blackwells.co.uk/bookshop/home
3     https://blackwells.co.uk/bookshop/product/9781...
6     https://blackwells.co.uk/bookshop/editorial/wi...
12    https://blackwells.co.uk/bookshop/editorial/Ki...
16           https://blackwells.co.uk/bookshop/Students
21    https://blackwells.co.uk/bookshop/product/Kudo...
23    https://blackwells.co.uk/bookshop/product/Will...
24    https://blackwells.co.uk/bookshop/product/Will...
26    https://blackwells.co.uk/bookshop/product/Rele...
35    https://blackwells.co.uk/bookshop/product/Game...
38    https://blackwells.co.uk/bookshop/product/The-...
40    https://blackwells.co.uk/bookshop/product/The-...
41    https://blackwells.co.uk/bookshop/product/Mr-S...
42    https://blackwells.co.uk/bookshop/product/Our-...
43    https://blackwells.co.uk/bookshop/product/Pete...
44    https://blackwells.co.uk/bookshop/product/Roof...
45    https://blackwells.co.uk/bookshop/product/Harr...
46    https://blackwells.co.uk/bookshop/product/

In [26]:
sample[sample['predicted_label'] == 1]['url'].head(20)

13       https://blackwells.co.uk/bookshop/category/_top
15     https://blackwells.co.uk/bookshop/category/_bi...
17     https://blackwells.co.uk/bookshop/category/_ar...
20     https://blackwells.co.uk/bookshop/category/978...
27     https://blackwells.co.uk/bookshop/category/nul...
28     https://blackwells.co.uk/bookshop/category/nul...
29       https://blackwells.co.uk/bookshop/category/null
30     https://blackwells.co.uk/bookshop/category/nul...
31     https://blackwells.co.uk/bookshop/category/nul...
32     https://blackwells.co.uk/bookshop/category/nul...
33         https://blackwells.co.uk/bookshop/bestsellers
34     https://blackwells.co.uk/bookshop/category/_bi...
36     https://blackwells.co.uk/bookshop/search/autho...
37     https://blackwells.co.uk/bookshop/category/_ar...
39     https://blackwells.co.uk/bookshop/collection/O...
58     https://blackwells.co.uk/bookshop/category/_ar...
82     https://blackwells.co.uk/bookshop/category/_ar...
108    https://blackwells.co.uk

In [27]:
sample[sample['predicted_label'] == 2]['url'].head(20)

Series([], Name: url, dtype: object)

In [28]:
p1,r1=calculate_precision_and_recall(sample,dbsclustering,'list',1)

Recall is 0.9959514170040485
Precision is 0.9939393939393939


In [29]:
p1,r1=calculate_precision_and_recall(sample,dbsclustering,'product',0)

Recall is 1.0
Precision is 0.9996157171678355
