## Our clustering algorithm evaluation
Evaluating our clustering algorithm on bookswagon.com pages. The aim is to calculate precision and recall for "book details" cluster and the "catalog" cluster in bookswagon.com.

In [3]:
# Importing libraries
import numpy as np
import pandas as pd
import ast
FILEPATH = '../../../datasets/bookoutlet.csv'
FILEPATH

'../../../datasets/bookoutlet.csv'

In [4]:
df = pd.read_csv(FILEPATH, converters={'bitset': ast.literal_eval, 'tag_count': ast.literal_eval})

## Data analisys
Some preliminary analisys of the dataset

In [5]:
print("First 5 rows")
print("------------")
df.head()

First 5 rows
------------


Unnamed: 0,url,referer_url,src,shingle_vector,label,tag_count,bitset
0,https://bookoutlet.com/,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 4, 0, 1, 3, 0)",,"[0.001607717041800643, 0.001607717041800643, 0...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,https://bookoutlet.com/Store/Sale,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 0, 0, 1, 3, 0)",,"[0.002036659877800407, 0.002036659877800407, 0...","[0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, ..."
2,https://bookoutlet.com/Store/OtherBrowsing,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 8, 1, 1, 1, 0)",,"[0.0019455252918287938, 0.0019455252918287938,...","[0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ..."
3,https://bookoutlet.com/Store/Browse?N=isTopTen...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",list,"[0.000998003992015968, 0.000998003992015968, 0...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,https://bookoutlet.com/Store/Browse?N=isGiftCe...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",list,"[0.001445086705202312, 0.001445086705202312, 0...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [6]:
print("No. of rows and columns")
print("-----------------------")
df.shape

No. of rows and columns
-----------------------


(16387, 7)

In [7]:
print("Check null values")
print("-----------------")
df.isnull().any().any()

Check null values
-----------------


True

In [8]:
print("Check duplicate values")
print("----------------------")
len(df['url'].unique()) != df.shape[0]

Check duplicate values
----------------------


False

In [9]:
print("DataFrame column types")
print("----------------------")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16387 entries, 0 to 16386
Data columns (total 7 columns):
url               16387 non-null object
referer_url       16387 non-null object
src               16387 non-null object
shingle_vector    16387 non-null object
label             16381 non-null object
tag_count         16387 non-null object
bitset            16387 non-null object
dtypes: object(7)
memory usage: 896.2+ KB


In [11]:
fmt_string = 'There are {} row with {} label'
print(fmt_string.format(len(df[df['label'].isnull()]),'no'))
print(fmt_string.format(len(df[df['label']=='product']), 'product'))
print(fmt_string.format(len(df[df['label']=='list']), 'list'))

There are 6 row with no label
There are 4801 row with product label
There are 11580 row with list label


## Run MeanShift clustering algorithm

In [12]:
#add top level folder to sys.path
import sys
sys.path.append('../../../')

In [14]:
from astarwars_clustering.clustering import clusteringevaluation
from astarwars_clustering.utils import utility
from astarwars_clustering.clustering.structural_clustering import dbscanclustering, meanshiftclustering

In [15]:
sample=df.sample(3000)
print(fmt_string.format(len(sample[sample['label'].isnull()]),'no'))
print(fmt_string.format(len(sample[sample['label']=='product']), 'product'))
print(fmt_string.format(len(sample[sample['label']=='list']), 'list'))

There are 3 row with no label
There are 889 row with product label
There are 2108 row with list label


In [29]:
bitsetmat=sample['bitset'].tolist()
tagcountmat=sample['tag_count'].tolist()
msclustering = meanshiftclustering(bitsetmat,10)

Elapsed time to calculate MeanShift clustering:00:04:23.88


In [31]:
predictedLabels = msclustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(noOfClusters) + 'clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are [0 1 2 3 4]clusters


Cluster labels:


array([0, 1, 2, 3, 4])

In [32]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,utility.count_occurrences(predictedLabels,el)))

cluster n. 0 has 2108 pages
cluster n. 1 has 889 pages
cluster n. 2 has 1 pages
cluster n. 3 has 1 pages
cluster n. 4 has 1 pages


In [34]:
sample[sample['predicted_label'] == 0]['url'].head(10)

1738     https://bookoutlet.com/Store/Browse?Na=323404&...
2865         https://bookoutlet.com/Store/Browse?Na=225976
3224     https://bookoutlet.com/Store/Browse?Nse=12982&...
2691     https://bookoutlet.com/Store/Browse?Na=56186&p...
4594     https://bookoutlet.com/Store/Browse?Npb=5140&p...
6363          https://bookoutlet.com/Store/Browse?Na=11283
12874        https://bookoutlet.com/Store/Browse?Nse=12433
15187    https://bookoutlet.com/Store/Browse?Na=22657&s...
2514     https://bookoutlet.com/Store/Browse?Na=305999&...
9810     https://bookoutlet.com/Store/Browse?Na=266623&...
Name: url, dtype: object

In [42]:
sample[sample['predicted_label'] == 1][['url','label']].head(20)

Unnamed: 0,url,label
1275,https://bookoutlet.com/Store/Details/978014319...,product
6038,https://bookoutlet.com/Store/Details/978068986...,product
4925,https://bookoutlet.com/Store/Details/978006057...,product
196,https://bookoutlet.com/Store/Details/978144345...,product
13064,https://bookoutlet.com/Store/Details/978068983...,product
5204,https://bookoutlet.com/Store/Details/978146546...,product
2997,https://bookoutlet.com/Store/Details/978006051...,product
1915,https://bookoutlet.com/Store/Details/978145558...,product
7838,https://bookoutlet.com/Store/Details/978081441...,product
9079,https://bookoutlet.com/Store/Details/978006441...,product


## Evaluate recall and precision

In [37]:
p1,r1=clusteringevaluation.calculate_precision_and_recall(sample,msclustering,'list',0)

Recall is 1.0
Precision is 1.0


In [43]:
p1,r1=clusteringevaluation.calculate_precision_and_recall(sample,msclustering,'product',1)

Recall is 1.0
Precision is 1.0


# DBSCAN clustering

In [64]:
dbsclustering=dbscanclustering(bitsetmat,10,20)

Elapsed time to calculate DBSCAN clustering:00:00:40.00


In [65]:
predictedLabels = dbsclustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(noOfClusters) + 'clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are [-1  0  1]clusters


Cluster labels:


array([-1,  0,  1])

In [67]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,utility.count_occurrences(predictedLabels,el)))

cluster n. 0 has 3 pages
cluster n. 1 has 2108 pages
cluster n. 2 has 889 pages


In [68]:
p1,r1=clusteringevaluation.calculate_precision_and_recall(sample,dbsclustering,'list',0)

Recall is 1.0
Precision is 1.0


In [69]:
p2,r2=clusteringevaluation.calculate_precision_and_recall(sample,dbsclustering,'product',1)

Recall is 1.0
Precision is 1.0
