## Our clustering algorithm evaluation
Evaluating our clustering algorithm on bookswagon.com pages. The aim is to calculate precision and recall for "book details" cluster and the "catalog" cluster in bookswagon.com.

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import ast
import time
from sklearn.cluster import MeanShift, estimate_bandwidth, DBSCAN
FILEPATH = '../input/powells.csv'
FILEPATH

'../input/powells.csv'

In [2]:
df = pd.read_csv(FILEPATH, converters={'bitset': ast.literal_eval, 'tag_count': ast.literal_eval})

## Data analisys
Some preliminary analisys of the dataset

In [3]:
print("First 5 rows")
print("------------")
df.head()

First 5 rows
------------


Unnamed: 0,url,referer_url,src,shingle_vector,label,tag_count,bitset
0,https://www.powells.com/blog/author/kristen-ar...,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(7, 2, 1, 8, 3, 10, 0, 5)",,"[0.002680965147453083, 0.002680965147453083, 0...","[0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, ..."
1,https://www.powells.com/blog/category/interviews,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 1, 1, 0, 3, 5, 0, 1)",,"[0.0011467889908256881, 0.0011467889908256881,...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ..."
2,https://www.powells.com/nonfiction-sale,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 3, 0, 0, 0)",,"[0.0013054830287206266, 0.0013054830287206266,...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, ..."
3,https://www.powells.com/powells-presents,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(0, 2, 2, 8, 1, 1, 0, 0)",,"[0.0022026431718061676, 0.0022026431718061676,...","[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, ..."
4,https://www.powells.com/locations,https://www.powells.com/,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...","(2, 0, 0, 4, 2, 2, 0, 0)",,"[0.001976284584980237, 0.001976284584980237, 0...","[0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, ..."


In [4]:
print("No. of rows and columns")
print("-----------------------")
df.shape

No. of rows and columns
-----------------------


(10571, 7)

In [5]:
print("Check null values")
print("-----------------")
df.isnull().any().any()

Check null values
-----------------


True

In [6]:
print("Check duplicate values")
print("----------------------")
len(df['url'].unique()) != df.shape[0]

Check duplicate values
----------------------


False

In [7]:
print("DataFrame column types")
print("----------------------")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10571 entries, 0 to 10570
Data columns (total 7 columns):
url               10571 non-null object
referer_url       10571 non-null object
src               10571 non-null object
shingle_vector    10571 non-null object
label             8962 non-null object
tag_count         10571 non-null object
bitset            10571 non-null object
dtypes: object(7)
memory usage: 578.2+ KB


In [8]:
fmt_string = 'There are {} row with {} label'
print(fmt_string.format(len(df[df['label'].isnull()]),'no'))
print(fmt_string.format(len(df[df['label']=='product']), 'product'))
print(fmt_string.format(len(df[df['label']=='list']), 'list'))

There are 1609 row with no label
There are 417 row with product label
There are 8545 row with list label


## Run MeanShift clustering algorithm
### Import all necessary functions

In [9]:
#from astarwars_clustering.utils import utility

def pad_vector(vec, n):
    for i in range(n):
        vec.append(0)

def pad_matrix_elem(matrix, lastvec):
    maxlen=len(lastvec)
    matrixlen=len(matrix)
    for i in range(matrixlen):
        elem_topad=maxlen-len(matrix[i])
        pad_vector(matrix[i], elem_topad)

#cluster di cataloghi e di prodotti
def count_occurrences(vec,n):
    occ=0
    for el in vec:
        if el==n:
            occ=occ+1
    return occ

In [10]:
#from astarwars_clustering.clustering.structural_clustering import dbscanclustering, meanshiftclustering
def meanshiftclustering(featurematrix,bandwidth=None):
    start = time.time()
    clustering=None
    if bandwidth is not None:
        clustering = MeanShift(bandwidth=bandwidth).fit(featurematrix)
    else:
        clustering = MeanShift().fit(featurematrix)
    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Elapsed time to calculate MeanShift clustering:{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    return clustering


#if eps is specified also min_samples will be not null for convention
def dbscanclustering(featurematrix,epsValue=None,min_samplesValue=None):
    start = time.time()
    clustering=None
    if epsValue is not None:
        clustering = DBSCAN(eps=epsValue, min_samples=min_samplesValue).fit(featurematrix)
    else:
        clustering = DBSCAN().fit(featurematrix)
    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Elapsed time to calculate DBSCAN clustering:{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    return clustering

In [11]:
#from astarwars_clustering.clustering import clusteringevaluation

#predictedlabel rappresenta l'etichetta del clustering su cui vogliamo calcolare precision e recall
def calculate_precision_and_recall(df, clustering, selectedlabel, predictedlabel):

    labels = clustering.labels_
    df['predicted_labels'] = labels
    selectedelements = count_occurrences(labels, predictedlabel)

    truepositive = 0
    allpositives = len(df[df['label'] == selectedlabel])

    for index, row in df.iterrows():
        if row['label'] == selectedlabel and row['predicted_labels'] == predictedlabel:
            truepositive += 1

    fmt_string='{} is {}'
    recall = truepositive/allpositives
    precision = truepositive/selectedelements

    print(fmt_string.format('Recall', recall))
    print(fmt_string.format('Precision', precision))

    return precision, recall

In [12]:
sample=df
bitsetmat=sample['bitset'].tolist()
tagcountmat=sample['tag_count'].tolist()

In [13]:
clustering = meanshiftclustering(bitsetmat,10)

Elapsed time to calculate MeanShift clustering:02:09:59.82


In [14]:
print(fmt_string.format(len(sample[sample['label'].isnull()]),'no'))
print(fmt_string.format(len(sample[sample['label']=='product']), 'product'))
print(fmt_string.format(len(sample[sample['label']=='list']), 'list'))

There are 1609 row with no label
There are 417 row with product label
There are 8545 row with list label


In [15]:
predictedLabels = clustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(noOfClusters) + 'clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]clusters


Cluster labels:


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25])

In [16]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,count_occurrences(predictedLabels,el)))

cluster n. 0 has 8084 pages
cluster n. 1 has 1332 pages
cluster n. 2 has 447 pages
cluster n. 3 has 413 pages
cluster n. 4 has 137 pages
cluster n. 5 has 120 pages
cluster n. 6 has 13 pages
cluster n. 7 has 4 pages
cluster n. 8 has 3 pages
cluster n. 9 has 2 pages
cluster n. 10 has 1 pages
cluster n. 11 has 1 pages
cluster n. 12 has 1 pages
cluster n. 13 has 1 pages
cluster n. 14 has 1 pages
cluster n. 15 has 1 pages
cluster n. 16 has 1 pages
cluster n. 17 has 1 pages
cluster n. 18 has 1 pages
cluster n. 19 has 1 pages
cluster n. 20 has 1 pages
cluster n. 21 has 1 pages
cluster n. 22 has 1 pages
cluster n. 23 has 1 pages
cluster n. 24 has 1 pages
cluster n. 25 has 1 pages


In [17]:
sample[sample['predicted_label'] == 0]['url'].head(20)

88     https://www.powells.com/SearchResults?keyword=...
198    https://www.powells.com/SearchResults?keyword=...
208    https://www.powells.com/searchresults?keyword=...
210    https://www.powells.com/searchresults?keyword=...
212    https://www.powells.com/searchresults?keyword=...
213    https://www.powells.com/searchresults?keyword=...
218    https://www.powells.com/searchresults?keyword=...
219    https://www.powells.com/searchresults?keyword=...
220    https://www.powells.com/searchresults?keyword=...
222    https://www.powells.com/searchresults?keyword=...
224    https://www.powells.com/searchresults?keyword=...
225    https://www.powells.com/searchresults?keyword=...
231    https://www.powells.com/searchresults?keyword=...
238    https://www.powells.com/searchresults?keyword=...
247    https://www.powells.com/searchresults?keyword=...
257    https://www.powells.com/searchresults?keyword=...
272    https://www.powells.com/searchresults?keyword=...
278    https://www.powells.com/

In [18]:
sample[sample['predicted_label'] == 1]['url'].head(10)

16    https://www.powells.com/login?returnurl=%2fpos...
17    https://www.powells.com/login?returnurl=%2finf...
18    https://www.powells.com/login?returnurl=%2flit...
31     https://www.powells.com/login?returnurl=%2flogin
34    https://www.powells.com/login?returnurl=%2finf...
35    https://www.powells.com/login?returnurl=%2fpos...
38    https://www.powells.com/login?returnurl=%2f%e2...
41    https://www.powells.com/login?returnurl=%2fpos...
42    https://www.powells.com/login?returnurl=%2fsho...
44    https://www.powells.com/login?returnurl=%2fpos...
Name: url, dtype: object

In [19]:
sample[sample['predicted_label'] == 2]['url'].head(30)

1391    https://www.powells.com/searchresults?keyword=...
1411    https://www.powells.com/searchresults?keyword=...
1422    https://www.powells.com/searchresults?keyword=...
1425    https://www.powells.com/searchresults?keyword=...
1434    https://www.powells.com/searchresults?keyword=...
1438    https://www.powells.com/searchresults?keyword=...
1439    https://www.powells.com/searchresults?keyword=...
1522    https://www.powells.com/searchresults?keyword=...
1554    https://www.powells.com/searchresults?keyword=...
1557    https://www.powells.com/searchresults?keyword=...
1567    https://www.powells.com/searchresults?keyword=...
1591    https://www.powells.com/searchresults?keyword=...
1597    https://www.powells.com/searchresults?keyword=...
1608    https://www.powells.com/searchresults?keyword=...
1612    https://www.powells.com/searchresults?keyword=...
1638    https://www.powells.com/searchresults?keyword=...
1650    https://www.powells.com/searchresults?keyword=...
1655    https:

## Evaluate recall and precision

In [20]:
p1,r1=calculate_precision_and_recall(sample,clustering,'list',0)

Recall is 0.946050321825629
Precision is 1.0


In [21]:
p1,r1=calculate_precision_and_recall(sample,clustering,'product',3)

Recall is 0.9904076738609112
Precision is 1.0


# DBSCAN algorithm

In [22]:
dbsclustering=dbscanclustering(bitsetmat,10,20)

Elapsed time to calculate DBSCAN clustering:00:08:39.99


In [23]:
predictedLabels = dbsclustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(noOfClusters) + 'clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are [-1  0  1  2  3  4  5]clusters


Cluster labels:


array([-1,  0,  1,  2,  3,  4,  5])

In [24]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,count_occurrences(predictedLabels,el)))

cluster n. 0 has 38 pages
cluster n. 1 has 413 pages
cluster n. 2 has 1332 pages
cluster n. 3 has 120 pages
cluster n. 4 has 137 pages
cluster n. 5 has 8084 pages
cluster n. 6 has 447 pages


In [25]:
p1,r1=calculate_precision_and_recall(sample,dbsclustering,'list',4)

Recall is 0.946050321825629
Precision is 1.0


In [26]:
p1,r1=calculate_precision_and_recall(sample,dbsclustering,'product',0)

Recall is 0.9904076738609112
Precision is 1.0
