# Crawl Data Analysis: Clustering Analysis

This notebook analyses the output of the HDBSCAN clustering algorithm with various parameters.

## Euclidean Distance

#### Clustering output

In [1]:
import pandas as pd

# Each output file is whitespace-delimited with no header row; its two columns
# are named 'cluster' and 'count' on load. The separator is a raw string so the
# regex '\s+' reaches pandas verbatim instead of relying on Python passing an
# invalid string escape through unchanged (which triggers a warning).
output_3 = pd.read_csv('/mnt/ssd/amathur/dark-patterns-output/output_3_euclidean',
                       sep=r'\s+', header=None, names=['cluster', 'count'])
output_5 = pd.read_csv('/mnt/ssd/amathur/dark-patterns-output/output_5_euclidean',
                       sep=r'\s+', header=None, names=['cluster', 'count'])
output_10 = pd.read_csv('/mnt/ssd/amathur/dark-patterns-output/output_10_euclidean',
                        sep=r'\s+', header=None, names=['cluster', 'count'])

#### Number of clusters in each output (the row count includes the noise label, -1)

In [2]:
# (rows, columns) of the output_3 table. The row count includes the row for the
# noise label (-1), which is looked up separately further down.
output_3.shape

(76539, 2)

In [3]:
# (rows, columns) of the output_5 table. The row count includes the row for the
# noise label (-1), which is looked up separately further down.
output_5.shape

(40516, 2)

In [4]:
# (rows, columns) of the output_10 table. The row count includes the row for the
# noise label (-1), which is looked up separately further down.
output_10.shape

(19192, 2)

#### Size of the noise cluster (label -1) in each output

In [5]:
# 'count' entry of the row whose cluster label is -1 (the noise label),
# selected in a single .loc lookup.
output_3.loc[output_3['cluster'] == -1, 'count']

0    765081
Name: count, dtype: int64

In [6]:
# 'count' entry of the row whose cluster label is -1 (the noise label),
# selected in a single .loc lookup.
output_5.loc[output_5['cluster'] == -1, 'count']

0    838239
Name: count, dtype: int64

In [7]:
# 'count' entry of the row whose cluster label is -1 (the noise label),
# selected in a single .loc lookup.
output_10.loc[output_10['cluster'] == -1, 'count']

0    905973
Name: count, dtype: int64

#### Read in the cluster labels

In [8]:
# Pickled label objects, one per clustering output, loaded in the order 3, 5, 10.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
_label_pickle_paths = (
    '/mnt/ssd/amathur/dark-patterns-output/labels_3_euclidean.pickle',
    '/mnt/ssd/amathur/dark-patterns-output/labels_5_euclidean.pickle',
    '/mnt/ssd/amathur/dark-patterns-output/labels_10_euclidean.pickle',
)
cluster_3_labels, cluster_5_labels, cluster_10_labels = map(pd.read_pickle, _label_pickle_paths)

#### Attach the cluster labels to the segments

In [9]:
import json
from tqdm import tqdm

# Collect the hostname and inner text of every segment. The file holds one JSON
# object per line, and the line order is preserved because the cluster labels
# are attached positionally when the frame is built.
hostname = []
inner_text = []

# JSON text is UTF-8, so state the encoding explicitly rather than inheriting
# whatever the locale default happens to be.
with open('/mnt/ssd/amathur/dark-patterns-output/segments.json', encoding='utf-8') as f:
    for line in tqdm(f):
        seg = json.loads(line)

        hostname.append(seg['hostname'])
        inner_text.append(seg['inner_text'])

# The labels carry no key to join on, so a length mismatch means the columns
# would be misaligned; fail here with a message that says so.
assert len(hostname) == len(cluster_3_labels) == len(cluster_5_labels) == len(cluster_10_labels), \
    'segments.json and the cluster label files have different numbers of rows'

# One row per segment: its source fields plus its label under each clustering.
frame = pd.DataFrame({'hostname': hostname,
                      'inner_text': inner_text,
                      'cluster_3_euclidean': cluster_3_labels.values,
                      'cluster_5_euclidean': cluster_5_labels.values,
                      'cluster_10_euclidean': cluster_10_labels.values})

1850895it [00:28, 64794.55it/s]


In [10]:
# (rows, columns): one row per line read from segments.json, and five columns
# (hostname, inner_text, and one label column per clustering output).
frame.shape

(1850895, 5)

In [11]:
# Write the labelled segments out as UTF-8 CSV, leaving out the row index.
frame.to_csv(
    '/mnt/ssd/amathur/dark-patterns-output/clusters_bow_euclidean.csv',
    index=False,
    encoding='utf-8',
)

In [12]:
# Also save the same frame as a pickle alongside the CSV.
frame.to_pickle('/mnt/ssd/amathur/dark-patterns-output/clusters_bow_euclidean.pickle')

#### Attach the cluster labels to the features

In [9]:
import numpy as np

# Feature matrix: a space-delimited file with no header row, every column parsed
# as float64, then exposed as a plain NumPy array via .values.
features = pd.read_csv(
    '/mnt/ssd/amathur/dark-patterns-output/svd_bow_output.arr',
    sep=' ',
    header=None,
    dtype=np.float64,
).values

Calculating the Silhouette Coefficient for each clustering output: 

In [None]:
from sklearn.metrics import silhouette_score

# Score only the points that HDBSCAN assigned to a cluster. Label -1 marks
# noise (see the noise-cluster sizes above); silhouette_score has no notion of
# noise and would otherwise score all of those points as one ordinary cluster.
# NOTE(review): this is still a pairwise-distance computation over every
# retained point. silhouette_score's sample_size / random_state arguments can
# bound the cost, at the price of thinning out small clusters in the sample.
labels_3 = cluster_3_labels.values
is_clustered_3 = labels_3 != -1
silhouette_score(features[is_clustered_3], labels_3[is_clustered_3])

In [None]:
# Score only the points that HDBSCAN assigned to a cluster. Label -1 marks
# noise (see the noise-cluster sizes above); silhouette_score has no notion of
# noise and would otherwise score all of those points as one ordinary cluster.
# NOTE(review): this is still a pairwise-distance computation over every
# retained point. silhouette_score's sample_size / random_state arguments can
# bound the cost, at the price of thinning out small clusters in the sample.
labels_5 = cluster_5_labels.values
is_clustered_5 = labels_5 != -1
silhouette_score(features[is_clustered_5], labels_5[is_clustered_5])

In [None]:
# Score only the points that HDBSCAN assigned to a cluster. Label -1 marks
# noise (see the noise-cluster sizes above); silhouette_score has no notion of
# noise and would otherwise score all of those points as one ordinary cluster.
# NOTE(review): this is still a pairwise-distance computation over every
# retained point. silhouette_score's sample_size / random_state arguments can
# bound the cost, at the price of thinning out small clusters in the sample.
labels_10 = cluster_10_labels.values
is_clustered_10 = labels_10 != -1
silhouette_score(features[is_clustered_10], labels_10[is_clustered_10])