## Our clustering algorithm evaluation
Evaluating our clustering algorithm on bookswagon.com pages. The aim is to calculate precision and recall for "book details" cluster and the "catalog" cluster in bookswagon.com.

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import ast
import time
from sklearn.cluster import MeanShift, estimate_bandwidth, DBSCAN
FILEPATH = '../input/bookoutlet.csv'
FILEPATH

'../input/bookoutlet.csv'

In [2]:
df = pd.read_csv(FILEPATH, converters={'bitset': ast.literal_eval, 'tag_count': ast.literal_eval})

## Data analisys
Some preliminary analisys of the dataset

In [3]:
print("First 5 rows")
print("------------")
df.head()

First 5 rows
------------


Unnamed: 0,url,referer_url,src,shingle_vector,label,tag_count,bitset
0,https://bookoutlet.com/,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 4, 0, 1, 3, 0)",,"[0.001607717041800643, 0.001607717041800643, 0...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,https://bookoutlet.com/Store/Sale,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 0, 0, 1, 3, 0)",,"[0.002036659877800407, 0.002036659877800407, 0...","[0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, ..."
2,https://bookoutlet.com/Store/OtherBrowsing,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 8, 1, 1, 1, 0)",,"[0.0019455252918287938, 0.0019455252918287938,...","[0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ..."
3,https://bookoutlet.com/Store/Browse?N=isTopTen...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",list,"[0.000998003992015968, 0.000998003992015968, 0...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,https://bookoutlet.com/Store/Browse?N=isGiftCe...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",list,"[0.001445086705202312, 0.001445086705202312, 0...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [4]:
print("No. of rows and columns")
print("-----------------------")
df.shape

No. of rows and columns
-----------------------


(16387, 7)

In [5]:
print("Check null values")
print("-----------------")
df.isnull().any().any()

Check null values
-----------------


True

In [6]:
print("Check duplicate values")
print("----------------------")
len(df['url'].unique()) != df.shape[0]

Check duplicate values
----------------------


False

In [7]:
print("DataFrame column types")
print("----------------------")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16387 entries, 0 to 16386
Data columns (total 7 columns):
url               16387 non-null object
referer_url       16387 non-null object
src               16387 non-null object
shingle_vector    16387 non-null object
label             16381 non-null object
tag_count         16387 non-null object
bitset            16387 non-null object
dtypes: object(7)
memory usage: 896.2+ KB


In [8]:
fmt_string = 'There are {} row with {} label'
print(fmt_string.format(len(df[df['label'].isnull()]),'no'))
print(fmt_string.format(len(df[df['label']=='product']), 'product'))
print(fmt_string.format(len(df[df['label']=='list']), 'list'))

There are 6 row with no label
There are 4801 row with product label
There are 11580 row with list label


## Run MeanShift clustering algorithm
### Import all necessary functions

In [9]:
#from astarwars_clustering.utils import utility

def pad_vector(vec, n):
    for i in range(n):
        vec.append(0)

def pad_matrix_elem(matrix, lastvec):
    maxlen=len(lastvec)
    matrixlen=len(matrix)
    for i in range(matrixlen):
        elem_topad=maxlen-len(matrix[i])
        pad_vector(matrix[i], elem_topad)

#cluster di cataloghi e di prodotti
def count_occurrences(vec,n):
    occ=0
    for el in vec:
        if el==n:
            occ=occ+1
    return occ

In [10]:
#from astarwars_clustering.clustering.structural_clustering import dbscanclustering, meanshiftclustering
def meanshiftclustering(featurematrix,bandwidth=None):
    start = time.time()
    clustering=None
    if bandwidth is not None:
        clustering = MeanShift(bandwidth=bandwidth).fit(featurematrix)
    else:
        clustering = MeanShift().fit(featurematrix)
    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Elapsed time to calculate MeanShift clustering:{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    return clustering


#if eps is specified also min_samples will be not null for convention
def dbscanclustering(featurematrix,epsValue=None,min_samplesValue=None):
    start = time.time()
    clustering=None
    if epsValue is not None:
        clustering = DBSCAN(eps=epsValue, min_samples=min_samplesValue).fit(featurematrix)
    else:
        clustering = DBSCAN().fit(featurematrix)
    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Elapsed time to calculate DBSCAN clustering:{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    return clustering

In [11]:
#from astarwars_clustering.clustering import clusteringevaluation

#predictedlabel rappresenta l'etichetta del clustering su cui vogliamo calcolare precision e recall
def calculate_precision_and_recall(df, clustering, selectedlabel, predictedlabel):

    labels = clustering.labels_
    df['predicted_labels'] = labels
    selectedelements = count_occurrences(labels, predictedlabel)

    truepositive = 0
    allpositives = len(df[df['label'] == selectedlabel])

    for index, row in df.iterrows():
        if row['label'] == selectedlabel and row['predicted_labels'] == predictedlabel:
            truepositive += 1

    fmt_string='{} is {}'
    recall = truepositive/allpositives
    precision = truepositive/selectedelements

    print(fmt_string.format('Recall', recall))
    print(fmt_string.format('Precision', precision))

    return precision, recall

In [12]:
sample=df
bitsetmat=sample['bitset'].tolist()
tagcountmat=sample['tag_count'].tolist()

In [13]:
clustering = meanshiftclustering(tagcountmat,0.07)

Elapsed time to calculate MeanShift clustering:00:23:11.78


In [14]:
print(fmt_string.format(len(sample[sample['label'].isnull()]),'no'))
print(fmt_string.format(len(sample[sample['label']=='product']), 'product'))
print(fmt_string.format(len(sample[sample['label']=='list']), 'list'))

There are 6 row with no label
There are 4801 row with product label
There are 11580 row with list label


In [15]:
predictedLabels = clustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(noOfClusters) + 'clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are [0 1 2 3 4]clusters


Cluster labels:


array([0, 1, 2, 3, 4])

In [16]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,count_occurrences(predictedLabels,el)))

cluster n. 0 has 11227 pages
cluster n. 1 has 4823 pages
cluster n. 2 has 129 pages
cluster n. 3 has 207 pages
cluster n. 4 has 1 pages


In [17]:
sample[sample['predicted_label'] == 0]['url'].head(20)

0                               https://bookoutlet.com/
1                     https://bookoutlet.com/Store/Sale
2            https://bookoutlet.com/Store/OtherBrowsing
3     https://bookoutlet.com/Store/Browse?N=isTopTen...
4     https://bookoutlet.com/Store/Browse?N=isGiftCe...
6     https://bookoutlet.com/Store/Browse?N=isRetail...
10          https://bookoutlet.com/Loyalty/ReferAFriend
17      https://bookoutlet.com/landing/student-discount
18            https://bookoutlet.com/Store/Browse?Nce=6
24      https://bookoutlet.com/Store/Browse?Nc=2&Ns=421
38         https://bookoutlet.com/Store/Browse?Npb=2445
42        https://bookoutlet.com/Store/Browse?Na=321769
54    https://bookoutlet.com/Store/Browse?Npb=2445&s...
58    https://bookoutlet.com/Store/Browse?Na=321769&...
60    https://bookoutlet.com/Store/Browse?Na=321769&...
61    https://bookoutlet.com/Store/Browse?Na=321769&...
63        https://bookoutlet.com/Store/Browse?Na=329917
66        https://bookoutlet.com/Store/Browse?Na

In [18]:
sample[sample['predicted_label'] == 1]['url'].head(10)

22    https://bookoutlet.com/Store/Details/978006220...
23    https://bookoutlet.com/Store/Details/978152474...
25    https://bookoutlet.com/Store/Details/978147677...
26    https://bookoutlet.com/Store/Details/978067163...
31    https://bookoutlet.com/Store/Details/978125010...
34    https://bookoutlet.com/Store/Details/978031033...
36    https://bookoutlet.com/Store/Details/978125006...
39    https://bookoutlet.com/Store/Details/978031230...
41    https://bookoutlet.com/Store/Details/978006443...
43    https://bookoutlet.com/Store/Details/978140884...
Name: url, dtype: object

In [19]:
sample[sample['predicted_label'] == 2]['url'].head(30)

8                     https://bookoutlet.com/Store/Browse
14              https://bookoutlet.com/Store/Browse?Nc=71
16              https://bookoutlet.com/Store/Browse?Nc=31
1085          https://bookoutlet.com/Store/Browse?Nse=622
1102    https://bookoutlet.com/Store/Browse?Nse=622&si...
1104    https://bookoutlet.com/Store/Browse?Nse=622&si...
1105    https://bookoutlet.com/Store/Browse?Nse=622&pa...
1110    https://bookoutlet.com/Store/Browse?Nse=622&si...
1120    https://bookoutlet.com/Store/Browse?Nse=622&pa...
1122    https://bookoutlet.com/Store/Browse?Nse=622&pa...
1127    https://bookoutlet.com/Store/Browse?Nse=622&pa...
2123         https://bookoutlet.com/Store/Browse?Nse=1330
2140          https://bookoutlet.com/Store/Browse?Nse=575
2154    https://bookoutlet.com/Store/Browse?Nse=575&si...
2174    https://bookoutlet.com/Store/Browse?Nse=575&pa...
2506         https://bookoutlet.com/Store/Browse?Nse=1724
2524    https://bookoutlet.com/Store/Browse?Nse=1724&s...
2525    https:

## Evaluate recall and precision

In [20]:
p1,r1=calculate_precision_and_recall(sample,clustering,'list',0)

Recall is 0.9686528497409327
Precision is 0.999109290104213


In [21]:
p1,r1=calculate_precision_and_recall(sample,clustering,'product',1)

Recall is 0.9989585503020204
Precision is 0.9944018245905039


# DBSCAN algorithm

In [22]:
dbsclustering=dbscanclustering(tagcountmat,0.055,20)

Elapsed time to calculate DBSCAN clustering:00:00:22.98


In [23]:
predictedLabels = dbsclustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(noOfClusters) + 'clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are [-1  0  1  2]clusters


Cluster labels:


array([-1,  0,  1,  2])

In [24]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,count_occurrences(predictedLabels,el)))

cluster n. 0 has 8 pages
cluster n. 1 has 11450 pages
cluster n. 2 has 4796 pages
cluster n. 3 has 133 pages


In [25]:
p1,r1=calculate_precision_and_recall(sample,dbsclustering,'list',0)

Recall is 0.9879101899827288
Precision is 0.9991266375545852


In [26]:
p1,r1=calculate_precision_and_recall(sample,dbsclustering,'product',1)

Recall is 0.9989585503020204
Precision is 1.0
