In [1]:
import os
import re
import sys
from collections import Counter
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score
from sklearn.cross_validation import cross_val_predict, LabelKFold
# from formasaurus.utils import download

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import get_link_text, get_text_around_selector_list, get_link_href
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.evaluate import get_annotation_folds
from autopager.model import link_to_features, _num_tokens_feature, _elem_attr

In [2]:
# Storage gives access to the annotated records and to the (X, y) data used below.
storage = Storage()
# Page URL of every stored record; used later to build the cross-validation folds.
urls = [rec['Page URL'] for rec in storage.iter_records()]
# X_raw: one sequence of links per page, y: one sequence of labels per page
# (presumably in the same order as `urls` -- TODO confirm against Storage.get_Xy).
X_raw, y = storage.get_Xy()

Not all links are matched {'<a>1</a>'}
Not all links are matched {'<a>6</a>'}


In [3]:
%%time
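# XXX: these functions should be copy-pasted from autopager/model.py
# (note: the link_to_features defined below shadows the one imported
# from autopager.model in the first cell)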

def link_to_features(link):
    """Build the feature dict for a single link.

    ``link`` must expose ``.root`` and ``.xpath()`` (presumably a parsel
    selector for an ``<a>`` element -- TODO confirm against the caller).

    The returned dict contains:

    * ``bias``: a constant 3.0;
    * ``isdigit`` / ``isalpha``: booleans computed on the normalized link text;
    * ``elem_target`` / ``elem_rel``: the element's ``target`` and ``rel``
      attributes as returned by ``_elem_attr``;
    * ``num_tokens<bucket>``: a 1.0-valued feature whose name depends on
      ``_num_tokens_feature(text)``;
    * ``text`` / ``text_exact``: n-grams of the digit-replaced text, and the
      digit-replaced first 20 characters of the text;
    * ``class``: n-grams of the CSS classes of the link, its descendants
      and its parent;
    * ``query``: n-grams of the lowercased query parameter names;
    * ``path_has_page`` / ``path_has_pageXX`` / ``path_has_number``:
      booleans describing the href path;
    * ``href_has_year``: True when the href contains ``20`` followed by
      two digits.
    """
    text = normalize(get_link_text(link))

    href = get_link_href(link)
    url_parts = urlsplit(href)
    path_lower = url_parts.path.lower()

    # Only the (lowercased) parameter names are used; values are dropped.
    query_param_names = [name.lower() for name, _ in parse_qsl(url_parts.query)]
    query_param_names_ngrams = ngrams_wb(
        " ".join(normalize(name) for name in query_param_names), 3, 5, True
    )

    elem = link.root
    elem_target = _elem_attr(elem, 'target')
    elem_rel = _elem_attr(elem, 'rel')

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(self_and_children_classes + ' ' + parent_classes)

    return {
        'bias': 3.0,
        'isdigit': text.isdigit(),
        'isalpha': text.isalpha(),
        'elem_target': elem_target,
        'elem_rel': elem_rel,
        'num_tokens%s' % _num_tokens_feature(text): 1.0,

        'text': ngrams_wb(replace_digits(text), 2, 5),
        'text_exact': replace_digits(text.strip()[:20].strip()),
        'class': ngrams_wb(css_classes, 4, 5),
        'query': query_param_names_ngrams,

        'path_has_page': 'page' in path_lower,
        # "/" or "-", then "p" or "page" (plus at most one word character),
        # an optional "/" and at least one digit, e.g. "/page/2" or "-p3".
        'path_has_pageXX': re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None,
        'path_has_number': any(part.isdigit() for part in url_parts.path.split('/')),

        # Raw string: "\d" in a plain string literal is an invalid escape
        # sequence and triggers a warning on modern Python versions.
        'href_has_year': re.search(r'20\d\d', href) is not None,
    }


def page_to_features(xseq):
    """Return one feature dict per link in ``xseq``.

    Each dict comes from ``link_to_features`` and is extended with
    ``text-before`` / ``text-after`` entries: dicts mapping the n-grams of
    the text found around the link (``max_length=15``) to a fixed weight
    of 0.2.
    """
    link_features = [link_to_features(link) for link in xseq]
    surrounding_text = get_text_around_selector_list(xseq, max_length=15)
    # Context n-grams get a reduced weight compared to the other features.
    context_weight = 0.2
    for link_feats, (text_before, text_after) in zip(link_features, surrounding_text):
        link_feats['text-before'] = dict.fromkeys(
            ngrams_wb(text_before, 5, 5), context_weight
        )
        link_feats['text-after'] = dict.fromkeys(
            ngrams_wb(text_after, 5, 5), context_weight
        )
    return link_features

# Featurize every page: X[i] holds the per-link feature dicts built from X_raw[i].
X = [page_to_features(xseq) for xseq in X_raw]

CPU times: user 6.9 s, sys: 177 ms, total: 7.08 s
Wall time: 7.29 s


In [4]:
# X[60][12]

In [5]:
# TRAIN_SIZE = 80
# X_train, y_train = X[:TRAIN_SIZE], y[:TRAIN_SIZE]
# X_test, y_test = X[TRAIN_SIZE:], y[TRAIN_SIZE:]
# The estimator is only constructed here; it is cross-validated and fitted
# in the following cells. Per the sklearn-crfsuite documentation, c1 and c2
# are the L1 and L2 regularization coefficients, and all_possible_transitions
# also generates transition features for label pairs absent from the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.002, 
    c2=0.05, 
    max_iterations=100, 
    all_possible_transitions=True,
    verbose=False,
)
# crf.fit(X_train, y_train, X_test, y_test)

In [6]:
%%time
# Build 6 cross-validation folds from the page URLs.
folds = get_annotation_folds(urls, 6)
# Out-of-fold predictions for every page; n_jobs=-1 runs the folds in parallel.
y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=-1)
# Per-label metrics for the three pagination labels only ('O' is not listed in `labels`).
print(flat_classification_report(y, y_pred, labels=['PAGE', 'NEXT', 'PREV'], digits=3))
# Sequence-level score: presumably a page counts as correct only when
# all of its labels match -- see the sklearn-crfsuite metrics docs.
print("Sequence accuracy: {:0.3f}".format(sequence_accuracy_score(y, y_pred)))

             precision    recall  f1-score   support

       PAGE      0.939     0.981     0.960       785
       NEXT      0.937     0.747     0.831        99
       PREV      0.966     0.781     0.864        73

avg / total      0.941     0.941     0.939       957

Sequence accuracy: 0.722
CPU times: user 4.42 s, sys: 539 ms, total: 4.96 s
Wall time: 27.8 s


In [7]:
# Fit on the whole dataset; the weights inspected in the next cells come from this model.
crf.fit(X, y)
# crf.attributes_
# Display the attribute count reported by the fitted model.
crf.num_attributes_

5954

In [8]:
# [a for a in sorted(crf.attributes_) if a.startswith('id')]

## Which features are important?

In [9]:
# XXX: weights of correlated features don't show their importance
# XXX: weights of features on different scales don't show their importance
# (e.g. the coefficients of the text-after and text-before features are high,
# but only because the input is scaled down for these features)

def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs,
    e.g. the result of ``Counter(...).most_common()``.
    """
    row_format = "%0.6f %-8s %s"
    for key, weight in state_features:
        attr, label = key
        print(row_format % (weight, label, attr))

# Rank all state features once (highest weight first), then show both ends.
ranked_state_features = Counter(crf.state_features_).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:150])

print("\nTop negative:")
print_state_features(ranked_state_features[-50:])

Top positive:
3.218521 PAGE     isdigit
2.216244 NEXT     class:next
2.039500 O        href_has_year
1.986420 PAGE     text_exact:X
1.757851 NEXT     text:>
1.684981 PAGE     num_tokens=1
1.671320 NEXT     text_exact:»
1.587229 PAGE     text:X
1.578780 PREV     text:<
1.557080 NEXT     text_exact:>
1.527921 PREV     class:prev
1.509971 O        query:m
1.500081 PREV     text_exact:«
1.399768 NEXT     text:»
1.379406 PREV     text:«
1.279033 PAGE     text:XX
1.237740 PREV     text_exact:<
1.205448 O        text-after:page:
1.205448 O        text-after:Jump
1.202873 O        text-after:to
1.198439 PAGE     class:last
1.161882 O        num_tokens>2
1.018063 O        text_exact:>>
1.009589 PAGE     text_exact:<<
1.009589 PAGE     text:<<
0.998993 O        text:>>
0.948619 O        text-after:iten:
0.948619 O        text-after:eiten
0.913744 O        text-after:Seite
0.908013 PAGE     text_exact:XX
0.907451 PREV     text:前の
0.903052 NEXT     path_has_pageXX
0.900852 O        bias
0.874936 O

In [10]:
from collections import Counter

def print_transitions(trans_features):
    """Print one ``from -> to weight`` line per transition feature.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        source_label, target_label = transition
        print(row_format % (source_label, target_label, weight))

print("Transitions:")
# most_common() without an argument lists every transition, highest weight first.
print_transitions(Counter(crf.transition_features_).most_common())

Transitions:
O      -> O       2.025053
PAGE   -> NEXT    1.492881
PREV   -> PAGE    1.436855
PREV   -> NEXT    0.785028
NEXT   -> O       0.557403
PAGE   -> PREV    0.506773
NEXT   -> PAGE    0.390690
O      -> PREV    0.350086
PREV   -> PREV    -0.233181
PAGE   -> PAGE    -0.319011
NEXT   -> PREV    -0.716874
NEXT   -> NEXT    -0.843192
O      -> NEXT    -1.348979
O      -> PAGE    -1.372117
PAGE   -> O       -1.504501
PREV   -> O       -1.828010


## Let's check the errors the model is making

In [11]:
# Recompute the out-of-fold predictions (same call as in the evaluation cell above).
folds = get_annotation_folds(urls, 6)
y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=-1)

# Select the mispredicted pages with plain Python instead of NumPy boolean
# indexing: the per-page sequences generally have different lengths, and
# np.asarray() on such ragged input raises ValueError on NumPy >= 1.24
# (and would build a 2-D array, breaking the per-page mask, if all pages
# happened to have the same number of links).

# errors[i] is True when page i has at least one mispredicted link.
errors = [
    list(yseq_true) != list(yseq_pred)
    for yseq_true, yseq_pred in zip(y, y_pred)
]
error_rows = [row for row, is_error in zip(storage.iter_records(), errors) if is_error]
error_links = [links for links, is_error in zip(X_raw, errors) if is_error]
error_y_pred = [yseq for yseq, is_error in zip(y_pred, errors) if is_error]
error_y_true = [yseq for yseq, is_error in zip(y, errors) if is_error]

In [12]:
# For every page with at least one error, print its URL followed by one
# "<true label> <predicted label> <link html>" line per mispredicted link.
for page_links, predicted_labels, true_labels, record in zip(
        error_links, error_y_pred, error_y_true, error_rows):
    print(record['Page URL'])
    labelled_links = zip(true_labels, predicted_labels, page_links.extract())
    for expected, predicted, link_html in labelled_links:
        if expected == predicted:
            continue
        print("%4s %4s %s" % (expected, predicted, link_html))
    print("\n")

http://www.newschittagong24.com/?cat=1
PAGE NEXT <a href="http://www.newschittagong24.com/?cat=1&amp;paged=1422" class="last" title="শেষ »">শেষ »</a>


http://www.newschittagong24.com/?cat=1&paged=5
PAGE    O <a href="http://www.newschittagong24.com/?cat=1" class="first" title="« প্রথম">« প্রথম</a>
PAGE NEXT <a href="http://www.newschittagong24.com/?cat=1&amp;paged=1422" class="last" title="শেষ »">শেষ »</a>


http://www.newschittagong24.com/?cat=1&paged=1422
PAGE PREV <a href="http://www.newschittagong24.com/?cat=1" class="first" title="« প্রথম">« প্রথম</a>


https://www.icontact.com/blog
NEXT    O <a href="https://www.icontact.com/blog/page/2" class="next_posts">Older Articles <span class="fa fa-arrow-right"></span></a>


https://www.icontact.com/blog/page/2
PREV    O <a href="https://www.icontact.com/blog/" class="prev_posts"><span class="fa fa-arrow-left"></span> Newer Articles</a>
NEXT    O <a href="https://www.icontact.com/blog/page/3" class="next_posts">Older Articles <span class

## Unused code

In [13]:
def _url_parts(url):
    p = urlsplit(url)
    args = parse_qsl(p.query)
    argnames = [name for name, value in args]
    return p.netloc, set(p.path.split('/')) | set(args) | set(argnames)

def url_distance(url1, url2):        
    netloc1, parts1 = _url_parts(url1)
    netloc2, parts2 = _url_parts(url2)
    if netloc1 != netloc2:
        return 1.0
    if not parts1 and not parts2:
        return 0.0
    return 1 - len(parts1 & parts2) / len(parts1 | parts2)

#         dist = url_distance(url, href)
#         if dist == 0:
#             feat['url-distance=0'] = 1.0
#         elif dist == 1.0:
#             feat['url-distance=1'] = 1.0
#         else:
#             feat['url-distance=k'] = dist


# Example: same host, same path and same `page` parameter name; only the `page` value differs.
url_distance('http://example.com/foo/345?page=2', 'http://example.com/foo/345?page=4')

0.33333333333333337

In [14]:
# def guess_page_number(link):
#     text = get_link_text(link).strip()
#     if text.isdigit():
#         return int(text)
#     return None
    
# def number_pattern2(pattern):
#     txt = re.sub('X+', 'X', pattern)
# #     txt = re.sub('C+', 'C', txt)
#     return txt
    
#     pagenums = [guess_page_number(a) for a in xseq]
# #     print(pagenums)
#     for i in range(1, len(xseq)):
#         if pagenums[i-1] is None or pagenums[i] is None:
#             features[i]['page-diff:None'] = 1.0
#         else:
#             diff = pagenums[i] - pagenums[i-1]
#             if diff == 1:
#                 features[i]['page-diff==1'] = 1.0
#             else:
#                 features[i]['page-diff<>1'] = 1.0
