# Crawl Data Analysis: Clustering

This notebook tries various clustering techniques on our web crawl data.

In [10]:
import matplotlib.pyplot as plt
import numpy as np

## Read from database

Read the crawl data from the database. Here we read in the `site_visits` and `segments` tables and join them.

In [None]:
import sqlite3
import pandas as pd

# Location of the SQLite database holding the crawl results.
db = '/n/fs/darkpatterns/crawl/2018-12-08_segmentation_pilot2/2018-12-08_segmentation_pilot2.sqlite'

# The connection stays open: `con` is reused further down to read the `segments` table.
con = sqlite3.connect(db)

# Load every row and column of `site_visits` into a DataFrame.
site_visits = pd.read_sql_query(sql='''SELECT * from site_visits''', con=con)

In [None]:
# Report the (rows, columns) shape of the table and its column names.
print('Number of site visits: {}'.format(site_visits.shape))
print('site_visits columns: {}'.format(list(site_visits.columns.values)))

Report how many unique domains we have.

In [None]:
# `urlparse` lives in `urllib.parse` on Python 3 and in the `urlparse` module on Python 2;
# try the Python 3 location first so the cell runs on either interpreter.
try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

# The domain of a visit is the network-location part of its URL.
site_visits['domain'] = site_visits['site_url'].apply(lambda x: urlparse(x).netloc)
# One row per domain holding per-column non-null counts, most-visited domains first.
grouped = site_visits.groupby(['domain']).count().sort_values('visit_id', ascending=False)

In [None]:
# `grouped` has one row per domain, so its row count is the number of distinct domains.
print('Number of unique domains: {}'.format(grouped.shape[0]))

In [None]:
# Load the full `segments` table over the connection opened earlier.
segments = pd.read_sql_query(sql='''SELECT * from segments''', con=con)

In [None]:
# Report the (rows, columns) shape of `segments` and its column names.
print('Number of segments: {}'.format(segments.shape))
print('segments columns: {}'.format(list(segments.columns.values)))

In [None]:
# Per-visit lookup of the site URL and domain, keyed by `visit_id`.
visit_info = site_visits.reset_index()[['visit_id', 'site_url', 'domain']].set_index('visit_id')

# Attach the URL and domain to every segment. The inner join drops segments whose
# `visit_id` has no matching row in `site_visits`.
segments = segments.reset_index().set_index('visit_id').join(visit_info, how='inner')

## Preprocess data

Ignore `body` tags and null `inner_text`, and add columns for number of newlines and length of `inner_text`.

In [None]:
# Strip surrounding whitespace first so whitespace-only text counts as empty.
segments['inner_text'] = segments['inner_text'].str.strip()

# Null text has to be excluded explicitly: `NaN != ''` evaluates to True, so the
# empty-string comparison alone lets null rows through and the string operations
# below would then fail on them.
has_text = segments['inner_text'].notnull() & (segments['inner_text'] != '')

# `.copy()` makes the filtered frame independent, so adding columns below does not
# write into a slice of the unfiltered frame.
segments = segments[(segments['node_name'] != 'BODY') & has_text].copy()

# Number of newline-separated pieces in the text, i.e. newline characters + 1.
segments['newline_count'] = segments['inner_text'].str.count('\n') + 1
# Length of the stripped text in characters.
segments['inner_text_length'] = segments['inner_text'].str.len()

In [None]:
# Print summary statistics for each of the two derived columns.
for col in ('newline_count', 'inner_text_length'):
    print("segments['%s'].describe(): \n %s" % (col, segments[col].describe().to_string()))

Replace numbers with a placeholder.

In [None]:
# Collapse every run of digits into a single placeholder token so that texts differing
# only in their numbers become identical. `regex=True` is passed explicitly because
# newer pandas releases make `Series.str.replace` match the pattern literally by
# default, which would silently leave the digits in place.
segments['inner_text_processed'] = segments['inner_text'].str.replace(r'\d+', 'DPNUM', regex=True)
segments['longest_text_processed'] = segments['longest_text'].str.replace(r'\d+', 'DPNUM', regex=True)

Remove redundant segments: within each domain, keep only the last occurrence of each distinct processed `inner_text`.

In [None]:
def _drop_repeated_text(domain_rows):
    """Keep the last row for each distinct `inner_text_processed` value in one domain's rows."""
    return domain_rows.drop_duplicates(subset=['inner_text_processed'], keep='last')

# Apply the de-duplication separately within every domain.
segments = segments.groupby(['domain']).apply(_drop_repeated_text)

In [None]:
# Report the shape and columns of `segments` again.
print('Number of segments: {}'.format(segments.shape))
print('segments columns: {}'.format(list(segments.columns.values)))

## Create feature vectors

First we define a function that tokenizes text; the vectorizers below use it when converting text into feature vectors.

In [2]:
from nltk.stem.porter import PorterStemmer
import nltk

stemmer = PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
# Frozen-set copy of the stop-word list so the per-token membership test is O(1)
# instead of a scan over the whole list.
_stopword_set = frozenset(stopwords)

def tokenize(line):
    """Split `line` into Porter-stemmed word tokens.

    Tokens that exactly match an English stop word, or that consist only of
    digits, are dropped before stemming.

    Args:
        line: Text to tokenize; `None` is treated as the empty string.

    Returns:
        A list of stemmed tokens in their order of appearance.
    """
    if line is None:
        line = ''
    return [
        stemmer.stem(token)
        for token in nltk.word_tokenize(line)
        if token and token not in _stopword_set and not token.isdigit()
    ]

[1 2 3]


Now select one of the following cells to run to create a feature representation.

### 1. Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

data = segments['inner_text_processed']

# NOTE(review): `binary_rep` is not defined in any cell shown in this notebook; it has to
# be set before this cell runs (True records presence/absence, False keeps raw counts).
vectorizer_options = dict(tokenizer=tokenize, binary=binary_rep, strip_accents='ascii')

# Learn the vocabulary from the processed segment texts.
vec = CountVectorizer(**vectorizer_options).fit(data)

In [None]:
# Number of distinct tokens the vectorizer learned.
print('Length of vocabulary {}'.format(len(vec.vocabulary_)))

In [None]:
# Keep the fitted vectorizer in `vec` and store the document-term matrix under its own
# name: rebinding `vec` to the matrix made this cell fail when it was run a second time,
# because a matrix has no `transform` method.
counts = vec.transform(data)
# axis=0 rescales each vocabulary column (feature) to unit L2 norm.
features = normalize(counts, axis=0)

### 2. TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

data = segments['inner_text_processed']

# NOTE(review): `binary_rep` is not defined in any cell shown in this notebook; it has to
# be set before this cell runs.
vectorizer_options = dict(tokenizer=tokenize, binary=binary_rep, strip_accents='ascii')

# Learn the vocabulary and IDF weights from the processed segment texts.
vec = TfidfVectorizer(**vectorizer_options).fit(data)

In [None]:
# Number of distinct tokens the vectorizer learned.
print('Length of vocabulary {}'.format(len(vec.vocabulary_)))

In [None]:
# Turn each segment's text into one row of the TF-IDF feature matrix.
features = vec.transform(data)

### 3. Word Vectors

We compute a vector for each segment as follows: compute the word vector for each word in the segment's `inner_text`, and then average over all words in that segment.

While it's simple, there are clearly downsides to this approach:

- We lose information about word ordering
- All words are equally weighted, so words that really characterize the text are not prioritized

In [None]:
import spacy

data = segments['inner_text_processed']
nlp = spacy.load('en_core_web_lg')

# Remove digit runs before embedding. `regex=True` is explicit because newer pandas
# releases treat the pattern as a literal string by default, which would leave the
# digits in place.
texts = data.str.replace(r'\d+', '', regex=True).astype('unicode').values

# One vector per segment, taken from spaCy's document vector.
# NOTE(review): a document that is not parsed contributes `None`, which turns `features`
# into an object array; confirm this cannot happen, or substitute a zero vector.
vecs = [
    doc.vector if doc.is_parsed else None
    for doc in nlp.pipe(texts, batch_size=10000, n_threads=7)
]
features = np.array(vecs)

### PCA
Try using PCA to reduce the dimension of the data.

Note that the feature matrix may need to be transposed so that examples are in columns (`num_features` x `num_examples`).

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA model that keeps the first 3 principal components of `features`.
pca = PCA(n_components=3)
# Alternative configuration kept for reference:
# pca = PCA(tol=1e-5)
pca.fit(features)

In [None]:
# Compare the shape of the principal-component matrix with that of the data matrix.
print('Matrix of PCs: {}'.format(pca.components_.shape))
print('Data matrix: {}'.format(features.shape))

Projected data is given by $U^T X$, where $U$ is the matrix with the PCs in its columns (`orig_dim` x `reduced_dim`), and $X$ is the data matrix with examples in its columns (`orig_dim` x `num_examples`).

In [None]:
# `pca.components_` is (reduced_dim x orig_dim) and `features`, as passed to `pca.fit`,
# is (num_examples x orig_dim), so multiplying them directly is a shape mismatch.
# `transform` subtracts the fitted mean and projects onto the components; the transpose
# puts examples in columns, giving a (reduced_dim x num_examples) result.
features_proj = pca.transform(features).T

In [None]:
# Shape of the projected data.
print('feature matrix shape (after PCA): {}'.format(features_proj.shape))

## Clustering

Run one of the following clustering algorithms.

### 1. Hierarchical clustering

In [None]:
from scipy.spatial import distance
import fastcluster

# `pdist` needs a dense array; `features` is expected to be a scipy sparse matrix here.
featdense = features.todense()
# Condensed (1-D) vector of pairwise cosine distances. It is deliberately left in
# condensed form: `linkage` interprets a 2-D input as raw observation vectors, so passing
# the square matrix would cluster the rows of the distance matrix instead of using the
# cosine distances themselves.
distances = distance.pdist(featdense, metric='cosine')

In [None]:
# Ward-linkage agglomerative clustering. preserve_input=False allows fastcluster to
# overwrite `distances` while it works, so `distances` must not be reused afterwards.
clusters = fastcluster.linkage(distances, method='ward', preserve_input=False)
# Persist the linkage matrix; np.save appends '.npy' to a file name that lacks it.
np.save('linkage.matrix', clusters)

Plot a dendrogram of the resulting clusters.

In [None]:
# `dendrogram` is not imported in any visible cell of this notebook; import it here so
# the cell runs on a fresh kernel.
from scipy.cluster.hierarchy import dendrogram

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
# Draw the tree described by the linkage matrix, with vertical, small leaf labels.
dendrogram(
    clusters,
    leaf_rotation=90.,
    leaf_font_size=8.,
)
plt.show()

### 2. DBSCAN clustering

In [None]:
from sklearn.cluster import DBSCAN

# NOTE(review): `eps` and `min_samples` are not defined in any cell shown in this
# notebook; they have to be set before this cell runs.
clusterer = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=10, metric='cosine')

# `fit` returns the fitted estimator itself, so `cluster_labels` is the estimator and the
# per-segment labels live in its `labels_` attribute.
cluster_labels = clusterer.fit(features)

# Labels are assigned to rows by position.
segments['cluster'] = cluster_labels.labels_

In [None]:
# Number of segments assigned to each cluster label.
print("segments['cluster'].value_counts(): \n " + segments['cluster'].value_counts().to_string())

### 3. HDBSCAN clustering

In [None]:
from sklearn.preprocessing import normalize
import hdbscan

# Scale every segment (row) to unit length, since the clusterer below uses euclidean distance.
features = normalize(features, axis=1)

# NOTE(review): `min_cluster_size` is not defined in any cell shown in this notebook;
# it has to be set before this cell runs.
clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean')

# One label per row of `features`, assigned to `segments` by position.
cluster_labels = clusterer.fit_predict(features)
segments['cluster'] = cluster_labels

In [None]:
# Number of segments assigned to each cluster label.
print("segments['cluster'].value_counts(): \n " + segments['cluster'].value_counts().to_string())

## Visualize results

Produce a CSV file that shows the segments in each cluster.

In [None]:
# Pull out the two columns that go into the CSV.
inner_texts, cluster_labels = segments['inner_text'], segments['cluster']

# Both come from the same frame, so their shapes must agree; print them and fail loudly if not.
shape_report = "segments['inner_text'] is %s, segments['cluster'] is %s (should be the same)"
print(shape_report % (str(inner_texts.shape), str(cluster_labels.shape)))
assert inner_texts.shape == cluster_labels.shape

Group the segments by cluster.

In [None]:
from collections import defaultdict

# Map each cluster label (as a string) to the list of segment texts assigned to it.
segments_by_cluster = defaultdict(list)

# Pair labels and texts by position. Iterating over the underlying arrays avoids
# `series[i]`, whose meaning (label lookup vs. position) depends on the index type.
for label, text in zip(cluster_labels.values, inner_texts.values):
    segments_by_cluster[str(label)].append(text)

Write CSV file.

In [None]:
import unicodecsv as csv

outfile = 'clusters.csv'

# The file is opened in binary mode for the unicodecsv writer.
with open(outfile, 'wb') as f:
    writer = csv.writer(f)
    # One row per cluster: the label, then all of its texts separated by blank lines.
    # The loop variable is deliberately not named `segments`, so the `segments` DataFrame
    # is still intact after this cell; `.items()` works on both Python 2 and Python 3.
    for cluster, cluster_texts in segments_by_cluster.items():
        segments_str = '\n\n'.join(cluster_texts)
        writer.writerow([cluster, segments_str])