## Our clustering algorithm evaluation
Evaluating our clustering algorithm on bookswagon.com pages. The aim is to calculate precision and recall for "book details" cluster and the "catalog" cluster in bookswagon.com.

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import ast
FILEPATH = '../../../datasets/bookoutlet.csv'
FILEPATH

'../../../datasets/bookoutlet.csv'

In [2]:
df = pd.read_csv(FILEPATH, converters={'bitset': ast.literal_eval, 'tag_count': ast.literal_eval})

## Data analisys
Some preliminary analisys of the dataset

In [3]:
print("First 5 rows")
print("------------")
df.head()

First 5 rows
------------


Unnamed: 0,url,referer_url,src,shingle_vector,label,tag_count,bitset
0,https://bookoutlet.com/,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 4, 0, 1, 3, 0)",,"[0.001607717041800643, 0.001607717041800643, 0...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,https://bookoutlet.com/Store/Sale,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 0, 0, 1, 3, 0)",,"[0.002036659877800407, 0.002036659877800407, 0...","[0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, ..."
2,https://bookoutlet.com/Store/OtherBrowsing,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 8, 1, 1, 1, 0)",,"[0.0019455252918287938, 0.0019455252918287938,...","[0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ..."
3,https://bookoutlet.com/Store/Browse?N=isTopTen...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",list,"[0.000998003992015968, 0.000998003992015968, 0...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,https://bookoutlet.com/Store/Browse?N=isGiftCe...,https://bookoutlet.com/,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n<head>\...","(0, 3, 1, 1, 1, 1, 0, 0)",list,"[0.001445086705202312, 0.001445086705202312, 0...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [4]:
print("No. of rows and columns")
print("-----------------------")
df.shape

No. of rows and columns
-----------------------


(16387, 7)

In [5]:
print("Check null values")
print("-----------------")
df.isnull().any().any()

Check null values
-----------------


True

In [6]:
print("Check duplicate values")
print("----------------------")
len(df['url'].unique()) != df.shape[0]

Check duplicate values
----------------------


False

In [7]:
print("DataFrame column types")
print("----------------------")
df.info()

DataFrame column types
----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16387 entries, 0 to 16386
Data columns (total 7 columns):
url               16387 non-null object
referer_url       16387 non-null object
src               16387 non-null object
shingle_vector    16387 non-null object
label             16381 non-null object
tag_count         16387 non-null object
bitset            16387 non-null object
dtypes: object(7)
memory usage: 896.2+ KB


In [8]:
fmt_string = 'There are {} row with {} label'
print(fmt_string.format(len(df[df['label'].isnull()]),'no'))
print(fmt_string.format(len(df[df['label']=='product']), 'product'))
print(fmt_string.format(len(df[df['label']=='list']), 'list'))

There are 6 row with no label
There are 4801 row with product label
There are 11580 row with list label


## Run MeanShift clustering algorithm

In [9]:
#add top level folder to sys.path
import sys
sys.path.append('../../../')

In [10]:
from astarwars_clustering.clustering import clusteringevaluation
from astarwars_clustering.utils import utility
from astarwars_clustering.clustering.structural_clustering import dbscanclustering, meanshiftclustering

In [11]:
sample=df.sample(3000)
print(fmt_string.format(len(sample[sample['label'].isnull()]),'no'))
print(fmt_string.format(len(sample[sample['label']=='product']), 'product'))
print(fmt_string.format(len(sample[sample['label']=='list']), 'list'))

There are 4 row with no label
There are 856 row with product label
There are 2140 row with list label


In [12]:
bitsetmat=sample['bitset'].tolist()
tagcountmat=sample['tag_count'].tolist()
msclustering = meanshiftclustering(tagcountmat,0.07)

Elapsed time to calculate MeanShift clustering:00:00:53.57


In [13]:
predictedLabels = msclustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(noOfClusters) + 'clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are [0 1 2 3]clusters


Cluster labels:


array([0, 1, 2, 3])

In [14]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,utility.count_occurrences(predictedLabels,el)))

cluster n. 0 has 2091 pages
cluster n. 1 has 884 pages
cluster n. 2 has 24 pages
cluster n. 3 has 1 pages


In [15]:
sample[sample['predicted_label'] == 0]['url'].head(10)

6465          https://bookoutlet.com/Store/Browse?Npb=6447
4383     https://bookoutlet.com/Store/Browse?Na=88047&p...
9006     https://bookoutlet.com/Store/Browse?Na=239823&...
10833        https://bookoutlet.com/Store/Browse?Na=194881
9690     https://bookoutlet.com/Store/Browse?Na=182844&...
11726    https://bookoutlet.com/Store/Browse?Na=174465&...
2519          https://bookoutlet.com/Store/Browse?Na=32076
14489         https://bookoutlet.com/Store/Browse?Na=22030
7698     https://bookoutlet.com/Store/Browse?Na=330119&...
10504    https://bookoutlet.com/Store/Browse?Na=65077&p...
Name: url, dtype: object

In [16]:
sample[sample['predicted_label'] == 1]['url'].head(10)

6116     https://bookoutlet.com/Store/Details/978068986...
10279    https://bookoutlet.com/Store/Details/978148890...
6746     https://bookoutlet.com/Store/Details/978885440...
1957     https://bookoutlet.com/Store/Details/978145555...
5500     https://bookoutlet.com/Store/Details/978068981...
4839     https://bookoutlet.com/Store/Details/978014312...
4791     https://bookoutlet.com/Store/Details/978111904...
1550     https://bookoutlet.com/Store/Details/978143916...
3306     https://bookoutlet.com/Store/Details/978000813...
8534     https://bookoutlet.com/Store/Details/978150116...
Name: url, dtype: object

## Evaluate recall and precision

In [17]:
p1,r1=clusteringevaluation.calculate_precision_and_recall(sample,msclustering,'list',0)

Recall is 0.9752336448598131
Precision is 0.9980870396939263


In [18]:
p1,r1=clusteringevaluation.calculate_precision_and_recall(sample,msclustering,'product',1)

Recall is 0.9988317757009346
Precision is 0.9671945701357466


# DBSCAN clustering

In [19]:
dbsclustering=dbscanclustering(tagcountmat,0.055,20)

Elapsed time to calculate DBSCAN clustering:00:00:02.11


In [20]:
predictedLabels = dbsclustering.labels_
noOfClusters = np.unique(predictedLabels)
sample['predicted_label'] = predictedLabels
print('There are ' + str(noOfClusters) + 'clusters')
print()
print()
print('Cluster labels:')
noOfClusters

There are [-1  0  1]clusters


Cluster labels:


array([-1,  0,  1])

In [21]:
cluster_fmt = 'cluster n. {} has {} pages'
noOfPages = 0

for index ,el in enumerate(noOfClusters):
    print(cluster_fmt.format(index ,utility.count_occurrences(predictedLabels,el)))

cluster n. 0 has 24 pages
cluster n. 1 has 855 pages
cluster n. 2 has 2121 pages


In [22]:
p1,r1=clusteringevaluation.calculate_precision_and_recall(sample,dbsclustering,'list',1)

Recall is 0.9892523364485981
Precision is 0.9981140971239981


In [23]:
p2,r2=clusteringevaluation.calculate_precision_and_recall(sample,dbsclustering,'product',0)

Recall is 0.9988317757009346
Precision is 1.0
