In [1]:
import os
import re
import sys
from collections import Counter
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score
from sklearn.cross_validation import cross_val_predict, LabelKFold
# from formasaurus.utils import download

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import get_link_text, get_text_around_selector_list, get_link_href
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.evaluate import get_annotation_folds
from autopager.model import link_to_features, _num_tokens_feature, _elem_attr

In [2]:
# Load the annotated pages: one URL per record, plus the raw link
# sequences (X_raw) and their label sequences (y).
storage = Storage()
urls = [record['Page URL'] for record in storage.iter_records()]
X_raw, y = storage.get_Xy()
domains = {get_domain(page_url) for page_url in urls}
print("pages: {}  domains: {}".format(len(urls), len(domains)))

Not all links are matched {'<a>1</a>'}
Not all links are matched {'<a>6</a>'}
Not all links are matched {'<a onclick="display(\'grid\');">Mosaico</a>'}
Not all links are matched {'<a onclick="display(\'grid\');">Mosaico</a>'}
Not all links are matched {'<a class="btn prev disabled" type="backward"><i class="icon-chevron-left"> </i></a>'}
pages: 168  domains: 70


In [3]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def link_to_features(link):
    """Build the CRF feature dict for a single link.

    Parameters
    ----------
    link
        Selector for one link; it must expose ``.root`` and ``.xpath()``
        (presumably a ``parsel.Selector`` for an ``<a>`` element --
        confirm against the caller).

    Returns
    -------
    dict
        Features derived from the link text, its CSS classes (own, children
        and parent), selected element attributes and the structure of the
        link's href (query parameter names, path patterns, year-like numbers).
    """
    text = normalize(get_link_text(link))

    href = get_link_href(link)
    p = urlsplit(href)
    path_lower = p.path.lower()

    # Only the *names* of query parameters are used, not their values.
    query_parsed = parse_qsl(p.query)
    query_param_names = [k.lower() for k, v in query_parsed]
    query_param_names_ngrams = ngrams_wb(
        " ".join([normalize(name) for name in query_param_names]), 3, 5, True
    )

    elem = link.root
    elem_target = _elem_attr(elem, 'target')
    elem_rel = _elem_attr(elem, 'rel')
    # NOTE: elem_id is computed but not included in the returned features.
    elem_id = _elem_attr(elem, 'id')

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(self_and_children_classes + ' ' + parent_classes)

    return {
        'bias': 3.0,
        'isdigit': text.isdigit(),
        'isalpha': text.isalpha(),
        'elem_target': elem_target,
        'elem_rel': elem_rel,
        'num_tokens%s' % _num_tokens_feature(text): 1.0,

        'text': ngrams_wb(replace_digits(text), 2, 5),
        'text_exact': replace_digits(text.strip()[:20].strip()),
        'class': ngrams_wb(css_classes, 4, 5),
        'query': query_param_names_ngrams,

        'path_has_page': 'page' in path_lower,
        'path_has_pageXX': re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None,
        'path_has_number': any(part.isdigit() for part in p.path.split('/')),

        # Raw string: '\d' in a plain literal is an invalid escape sequence
        # (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
        'href_has_year': re.search(r'20\d\d', href) is not None,
    }


def page_to_features(xseq):
    """Turn a page's sequence of links into a list of feature dicts.

    Every link gets the features from ``link_to_features`` plus two
    context features, ``'text-before'`` and ``'text-after'``, built from
    the text surrounding the link (up to ``max_length=15``). Each context
    n-gram is given a fixed weight of 0.2 instead of the default 1.0.
    """
    context_weight = 0.2
    link_features = [link_to_features(link) for link in xseq]
    surrounding = get_text_around_selector_list(xseq, max_length=15)
    for features, (text_before, text_after) in zip(link_features, surrounding):
        features['text-before'] = dict.fromkeys(
            ngrams_wb(text_before, 5, 5), context_weight
        )
        features['text-after'] = dict.fromkeys(
            ngrams_wb(text_after, 5, 5), context_weight
        )
    return link_features

# One feature sequence per page: a list with one feature dict per link.
X = [page_to_features(xseq) for xseq in X_raw]

CPU times: user 10.2 s, sys: 164 ms, total: 10.4 s
Wall time: 10.5 s


In [4]:
# X[60][12]

In [5]:
# TRAIN_SIZE = 80
# X_train, y_train = X[:TRAIN_SIZE], y[:TRAIN_SIZE]
# X_test, y_test = X[TRAIN_SIZE:], y[TRAIN_SIZE:]
# Hyper-parameters of the CRF model. c1 and c2 are the L1 and L2
# regularisation coefficients (see the sklearn-crfsuite CRF documentation).
crf_params = dict(
    algorithm='lbfgs',
    c1=0.001,
    c2=0.05,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=False,
)
crf = sklearn_crfsuite.CRF(**crf_params)
# crf.fit(X_train, y_train, X_test, y_test)

In [6]:
%%time
# Cross-validated predictions over 6 annotation folds, followed by
# per-label metrics and whole-sequence accuracy.
folds = get_annotation_folds(urls, 6)
y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=-1)

label_report = flat_classification_report(
    y, y_pred, labels=['PAGE', 'NEXT', 'PREV'], digits=3
)
print(label_report)

seq_accuracy = sequence_accuracy_score(y, y_pred)
print("Sequence accuracy: {:0.3f}".format(seq_accuracy))

             precision    recall  f1-score   support

       PAGE      0.881     0.912     0.896       978
       NEXT      0.922     0.764     0.836       140
       PREV      0.963     0.790     0.868       100

avg / total      0.893     0.885     0.887      1218

Sequence accuracy: 0.607
CPU times: user 4.44 s, sys: 431 ms, total: 4.87 s
Wall time: 20.8 s


In [7]:
# Fit on the full dataset (no hold-out) so the learned weights can be
# inspected in the cells below; the last expression shows how many
# attributes the fitted model has.
crf.fit(X, y)
# crf.attributes_
crf.num_attributes_

9600

In [8]:
# [a for a in sorted(crf.attributes_) if a.startswith('id')]

## What are important features?

In [9]:
# XXX: weights of correlated features don't show their importance
# XXX: weights of features with different scales don't show their importance
# (e.g. the coefficients of the text-after and text-before features are high,
# but only because the input is scaled down for these features)

def print_state_features(state_features):
    """Print one line per state feature: weight, label, attribute.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs.
    """
    for key, weight in state_features:
        attr, label = key
        row = (weight, label, attr)
        print("%0.6f %-8s %s" % row)

# Rank state features by weight once, then show both ends of the ranking.
state_feature_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_feature_weights.most_common(150))

print("\nTop negative:")
print_state_features(state_feature_weights.most_common()[-50:])

Top positive:
2.613692 NEXT     class:next
2.571457 PAGE     isdigit
2.203651 NEXT     text:>
2.151351 PREV     text:<
1.924049 NEXT     text_exact:>
1.862073 PAGE     text_exact:X
1.797784 PREV     text_exact:<
1.652110 PREV     class:prev
1.609996 NEXT     text_exact:»
1.603310 PAGE     text:X
1.581155 PREV     text_exact:«
1.566280 PAGE     text:>|
1.511860 O        text_exact:>>
1.471550 PAGE     class:last
1.464397 PAGE     text-before:1
1.388722 PREV     text:«
1.385084 NEXT     text:»
1.383527 O        href_has_year
1.374747 PAGE     text:<<
1.345415 O        num_tokens>2
1.287747 O        text-after:Seite
1.258773 O        text-after:iten:
1.258773 O        text-after:eiten
1.244647 PAGE     text_exact:XX
1.237721 O        text-after:to
1.235507 O        text-after:page:
1.235507 O        text-after:Jump
1.225438 PAGE     text-before:per
1.222749 PAGE     text-before:page
1.209185 PAGE     text:|<
1.169614 O        bias
1.141643 PAGE     text_exact:<<
1.124517 O        class:g-

In [10]:
from collections import Counter

def print_transitions(trans_features):
    """Print one line per transition: ``from -> to weight``.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs.
    """
    for labels, weight in trans_features:
        source_label, target_label = labels
        print("%-6s -> %-7s %0.6f" % (source_label, target_label, weight))

# All learned label-to-label transition weights, highest weight first.
print("Transitions:")
print_transitions(Counter(crf.transition_features_).most_common())

Transitions:
O      -> O       2.952523
PAGE   -> NEXT    1.837488
PREV   -> PAGE    1.639196
PREV   -> NEXT    0.822399
PAGE   -> PREV    0.730051
NEXT   -> PAGE    0.635045
O      -> PREV    0.241094
NEXT   -> O       0.122725
PAGE   -> PAGE    -0.176183
NEXT   -> PREV    -0.541947
PREV   -> PREV    -0.635343
NEXT   -> NEXT    -1.135175
O      -> NEXT    -1.440473
PREV   -> O       -1.696434
PAGE   -> O       -1.912586
O      -> PAGE    -2.129679


## Let's check the errors the model is making

In [11]:
# Recompute the cross-validated predictions so this cell does not depend
# on the value of y_pred left behind by an earlier cell.
folds = get_annotation_folds(urls, 6)
y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=-1)

# One flag per page: True when the predicted label sequence differs from
# the annotated one. The comparison is done page by page in plain Python
# because pages have different numbers of links; building a NumPy array
# from such ragged sequences raises ValueError on NumPy >= 1.24.
errors = [
    list(yseq_true) != list(yseq_pred)
    for yseq_true, yseq_pred in zip(y, y_pred)
]

# Keep only the pages with at least one mislabelled link. The four lists
# stay aligned with each other because they share the same flags.
records = list(storage.iter_records())
error_rows = [row for row, is_error in zip(records, errors) if is_error]
error_links = [links for links, is_error in zip(X_raw, errors) if is_error]
error_y_pred = [yseq for yseq, is_error in zip(y_pred, errors) if is_error]
error_y_true = [yseq for yseq, is_error in zip(y, errors) if is_error]

In [12]:
# For every page with errors, print its URL followed by one line per
# mislabelled link: true label, predicted label, link HTML.
pages_with_errors = zip(error_rows, error_links, error_y_true, error_y_pred)
for record, page_links, true_labels, pred_labels in pages_with_errors:
    print(record['Page URL'])
    for expected, predicted, html in zip(true_labels, pred_labels, page_links.extract()):
        if expected == predicted:
            continue
        print("%4s %4s %s" % (expected, predicted, html))
    print("\n")

https://www.mypapershop.com/patricks-day-supplies.html
NEXT    O <a href="https://www.mypapershop.com/mm5/merchant.mvc?Session_ID=56e7cf6116038eec3b953a976519a103&amp;Store_Code=MPS&amp;Screen=CTGY&amp;Category_Code=patricks-day-supplies&amp;CatListingOffset=24&amp;Offset=24&amp;Per_Page=24&amp;Sort_By=disp_order" class="searchspring-next">▶</a>
NEXT    O <a href="https://www.mypapershop.com/mm5/merchant.mvc?Session_ID=56e7cf6116038eec3b953a976519a103&amp;Store_Code=MPS&amp;Screen=CTGY&amp;Category_Code=patricks-day-supplies&amp;CatListingOffset=24&amp;Offset=24&amp;Per_Page=24&amp;Sort_By=disp_order" class="searchspring-next">▶</a>


https://www.mypapershop.com/mm5/merchant.mvc?Session_ID=56e7cf6116038eec3b953a976519a103&Store_Code=MPS&Screen=CTGY&Category_Code=patricks-day-supplies&CatListingOffset=24&Offset=24&Per_Page=24&Sort_By=disp_order
NEXT PAGE <a href="https://www.mypapershop.com/mm5/merchant.mvc?Session_ID=56e7cf6116038eec3b953a976519a103&amp;Store_Code=MPS&amp;Screen=CTGY&a

## Unused code

In [13]:
def _url_parts(url):
    p = urlsplit(url)
    args = parse_qsl(p.query)
    argnames = [name for name, value in args]
    return p.netloc, set(p.path.split('/')) | set(args) | set(argnames)

def url_distance(url1, url2):
    """Return a distance in [0, 1] between two URLs.

    URLs with different netlocs are at distance 1.0. Otherwise the result
    is one minus the ratio of shared parts to all distinct parts, where
    the parts come from ``_url_parts``.
    """
    host_a, parts_a = _url_parts(url1)
    host_b, parts_b = _url_parts(url2)
    if host_a != host_b:
        return 1.0
    all_parts = parts_a | parts_b
    if not all_parts:
        return 0.0
    shared_parts = parts_a & parts_b
    return 1 - len(shared_parts) / len(all_parts)

#         dist = url_distance(url, href)
#         if dist == 0:
#             feat['url-distance=0'] = 1.0
#         elif dist == 1.0:
#             feat['url-distance=1'] = 1.0
#         else:
#             feat['url-distance=k'] = dist


# Example: the two URLs differ only in the value of `page`, so they share
# 4 of 6 distinct parts and the distance is 1 - 4/6.
url_distance('http://example.com/foo/345?page=2', 'http://example.com/foo/345?page=4')

0.33333333333333337

In [14]:
# def guess_page_number(link):
#     text = get_link_text(link).strip()
#     if text.isdigit():
#         return int(text)
#     return None
    
# def number_pattern2(pattern):
#     txt = re.sub('X+', 'X', pattern)
# #     txt = re.sub('C+', 'C', txt)
#     return txt
    
#     pagenums = [guess_page_number(a) for a in xseq]
# #     print(pagenums)
#     for i in range(1, len(xseq)):
#         if pagenums[i-1] is None or pagenums[i] is None:
#             features[i]['page-diff:None'] = 1.0
#         else:
#             diff = pagenums[i] - pagenums[i-1]
#             if diff == 1:
#                 features[i]['page-diff==1'] = 1.0
#             else:
#                 features[i]['page-diff<>1'] = 1.0
