In [3]:
import sqlite3
import pandas as pd
from urlparse import urlparse
import hashlib
from PIL import Image
import os
import re
from __future__ import unicode_literals
from nltk.stem.porter import PorterStemmer
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
import fastcluster
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fcluster, dendrogram
from collections import defaultdict
import ast
from scipy.sparse import coo_matrix, hstack
import numpy as np
from sklearn.preprocessing import MinMaxScaler

ImportError: No module named matplotlib.pyplot

In [2]:
def print_full(x):
    """Print ``x`` with pandas' row limit raised so that no rows are truncated.

    ``display.max_rows`` is set to ``len(x)`` only for the duration of the
    print. The previous value of the option is restored afterwards, even if
    printing raises.
    """
    # option_context restores whatever value was active before the call;
    # reset_option would instead fall back to the pandas default.
    with pd.option_context('display.max_rows', len(x)):
        print(x)

## Read data about the sites

In [3]:
con = sqlite3.connect('2018-12-03_segmentation_pilot/2018-12-03_segmentation_pilot.sqlite')

In [4]:
sites = pd.read_sql_query('''SELECT * from site_visits''', con)

In [5]:
sites.shape

(62, 3)

In [6]:
list(sites.columns.values)

['visit_id', 'crawl_id', 'site_url']

## Read data about the segments

In [7]:
segments = pd.read_sql_query('''SELECT * from segments''', con)

In [8]:
segments.shape

(28051, 22)

In [9]:
list(segments.columns.values)

['id',
 'crawl_id',
 'visit_id',
 'node_name',
 'node_id',
 'top',
 'left',
 'width',
 'height',
 'style',
 'inner_text',
 'outer_html',
 'longest_text',
 'longest_text_width',
 'longest_text_height',
 'longest_text_top',
 'longest_text_left',
 'longest_text_style',
 'num_buttons',
 'num_imgs',
 'num_anchors',
 'time_stamp']

For this analysis, only consider those segments with text.

In [10]:
segments = segments.loc[segments['inner_text'] != '']

Let's also ignore all segments whose node is a `BODY` tag.

In [11]:
segments = segments.loc[segments['node_name'] != 'BODY']

In [12]:
segments.shape

(19929, 22)

Let's perform some pre-processing. First, let's replace every run of digits with the placeholder token `DPNUM`.

In [13]:
# Replace every run of digits with the DPNUM placeholder. regex=True is passed
# explicitly because r'\d+' is a regular expression and the default value of
# `regex` differs between pandas versions (literal matching from pandas 2.0).
segments['inner_text_processed'] = segments['inner_text'].str.replace(r'\d+', 'DPNUM', regex=True)
segments['longest_text_processed'] = segments['longest_text'].str.replace(r'\d+', 'DPNUM', regex=True)

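Next, let's handle nodes that were updated during a visit: within each visit, keep only the last segment for each distinct combination of processed inner text and processed longest text.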

In [14]:
def handle_node_update(gdata):
    """Return the rows of ``gdata`` with repeated text pairs collapsed.

    Rows are considered duplicates when both ``inner_text_processed`` and
    ``longest_text_processed`` match; of each duplicate set only the last
    row is kept.
    """
    text_keys = ['inner_text_processed', 'longest_text_processed']
    # True for every row that has a later row with the same text pair.
    is_superseded = gdata.duplicated(subset=text_keys, keep='last')
    return gdata[~is_superseded]

# Run the de-duplication separately for each visit_id group.
segments = segments.groupby(['visit_id']).apply(handle_node_update)

In [15]:
list(segments.columns.values)

['id',
 'crawl_id',
 'visit_id',
 'node_name',
 'node_id',
 'top',
 'left',
 'width',
 'height',
 'style',
 'inner_text',
 'outer_html',
 'longest_text',
 'longest_text_width',
 'longest_text_height',
 'longest_text_top',
 'longest_text_left',
 'longest_text_style',
 'num_buttons',
 'num_imgs',
 'num_anchors',
 'time_stamp',
 u'inner_text_processed',
 u'longest_text_processed']

In [16]:
segments.shape

(4036, 24)

## Join the two dataframes

In [17]:
dataset = segments.set_index('visit_id').join(sites.set_index('visit_id'), lsuffix='_1', rsuffix='_2')

In [18]:
dataset.shape

(4036, 25)

In [19]:
list(dataset.columns.values)

['id',
 'crawl_id_1',
 'node_name',
 'node_id',
 'top',
 'left',
 'width',
 'height',
 'style',
 'inner_text',
 'outer_html',
 'longest_text',
 'longest_text_width',
 'longest_text_height',
 'longest_text_top',
 'longest_text_left',
 'longest_text_style',
 'num_buttons',
 'num_imgs',
 'num_anchors',
 'time_stamp',
 u'inner_text_processed',
 u'longest_text_processed',
 'crawl_id_2',
 'site_url']

Let's turn `visit_id` back into a regular column.

In [20]:
dataset = dataset.reset_index()

In [21]:
list(dataset.columns.values)

[u'visit_id',
 'id',
 'crawl_id_1',
 'node_name',
 'node_id',
 'top',
 'left',
 'width',
 'height',
 'style',
 'inner_text',
 'outer_html',
 'longest_text',
 'longest_text_width',
 'longest_text_height',
 'longest_text_top',
 'longest_text_left',
 'longest_text_style',
 'num_buttons',
 'num_imgs',
 'num_anchors',
 'time_stamp',
 u'inner_text_processed',
 u'longest_text_processed',
 'crawl_id_2',
 'site_url']

Let's tokenize inner_text first:

In [22]:
stemmer = PorterStemmer()

def tokenize(line):
    """Split a line of text into stemmed word tokens.

    ``None`` is treated as an empty string. Characters that are not in
    ``string.printable`` are dropped before the text is tokenized with
    ``nltk.word_tokenize``; empty tokens are skipped and every remaining
    token is passed through the module-level ``stemmer``.
    """
    text = '' if line is None else line
    allowed_chars = set(string.printable)
    text = ''.join(ch for ch in text if ch in allowed_chars)

    return [stemmer.stem(word) for word in nltk.word_tokenize(text) if word != '']

In [23]:
countVec = CountVectorizer(tokenizer=tokenize, binary=True).fit(dataset['inner_text_processed'])

What is the length of the vocabulary?

In [24]:
len(countVec.vocabulary_)

5357

Let's create the bag of words representation.

In [25]:
lineVec = countVec.transform(dataset['inner_text_processed'])

In [26]:
lineVec.shape

(4036, 5357)

Next, scale the numeric features (button, image and anchor counts, plus the top and left positions) to the [0, 1] range:

In [27]:
scaler = MinMaxScaler()

In [28]:
# Cast to float64 before scaling: the scaler works in floating point, and
# handing it integer columns is what triggered the dtype-conversion warning
# scikit-learn printed for this cell.
cols = dataset[['num_buttons', 'num_imgs', 'num_anchors', 'top', 'left']].astype('float64')
cols = scaler.fit_transform(cols)

  return self.partial_fit(X, y)


In [29]:
cols

array([[0.        , 0.        , 0.        , 0.40589533, 0.41014195],
       [0.        , 0.        , 0.        , 0.40568516, 0.41009731],
       [0.        , 0.        , 0.00436681, 0.40579025, 0.44830819],
       ...,
       [0.09090909, 0.        , 0.        , 0.44509248, 0.44433533],
       [0.        , 0.        , 0.        , 0.42749054, 0.45920007],
       [0.        , 0.        , 0.        , 0.42785834, 0.44585305]])

Append these scaled columns to the bag-of-words matrix.

In [30]:
# Append the scaled columns to the bag-of-words matrix. lineVec is overwritten
# here, so it is first sliced back to the vocabulary width; this way re-running
# the cell replaces the appended columns instead of stacking them a second time.
lineVec = hstack((lineVec.tocsr()[:, :len(countVec.vocabulary_)], cols))

What is the shape of the combined feature matrix?

In [31]:
lineVec.shape

(4036, 5362)

Let's compute the pairwise Euclidean distances between all rows.

In [32]:
dist = euclidean_distances(lineVec)

Next, let's convert the square distance matrix to condensed (vector) form, since the linkage call below expects the pairwise distances in that form.

In [33]:
distVec = squareform(dist, checks = False)

In [34]:
# Ward linkage over the condensed distance vector.
# NOTE(review): per the fastcluster documentation, preserve_input=False lets
# the routine write temporary data into distVec to save memory, so distVec
# should be treated as consumed after this call — recompute it before re-running.
res = fastcluster.linkage(distVec, method = 'ward', preserve_input = False)

In [35]:
# Cut the hierarchy into flat clusters using a distance threshold of 5
# (criterion='distance') and store each row's label in a new 'cluster' column.
dataset['cluster'] = fcluster(res, 5, criterion='distance')

In [37]:
dataset.to_csv('clusters.csv', encoding='utf-8', columns=['site_url', 'cluster', 'inner_text', 'top', 'left', 'width', 'height', 'time_stamp'], index=False)

In [205]:
dataset.shape

(4894, 25)