In [1]:
import os
import re
import sys
from collections import Counter
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score
from sklearn.cross_validation import cross_val_predict, LabelKFold
# from formasaurus.utils import download

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list, 
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.evaluate import get_annotation_folds
from autopager.model import link_to_features, _num_tokens_feature, _elem_attr

In [2]:
# Load the annotated pages: page URLs come from the storage records,
# raw link sequences and their label sequences from get_Xy().
storage = Storage()
urls = [record['Page URL'] for record in storage.iter_records()]
X_raw, y = storage.get_Xy()
print("pages: {}  domains: {}".format(len(urls), len(set(map(get_domain, urls)))))

Not all links are matched {'<a>1</a>'}
Not all links are matched {'<a>6</a>'}
Not all links are matched {'<a onclick="display(\'grid\');">Mosaico</a>'}
Not all links are matched {'<a onclick="display(\'grid\');">Mosaico</a>'}
Not all links are matched {'<a class="btn prev disabled" type="backward"><i class="icon-chevron-left"> </i></a>'}
pages: 196  domains: 81


In [3]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def link_to_features(link):
    """Turn a single link selector into a feature dict for the CRF.

    Features are derived from the normalized link text, the parsed href
    (path and query parameter names), the ``target``/``rel`` attributes of
    the element returned by ``get_selector_root`` and the CSS classes of the
    link, its descendants and its parent.

    :param link: selector for one link; it must support ``.xpath()`` and be
        accepted by the ``autopager.htmlutils`` helpers used below.
    :return: dict mapping feature names to the feature values
        (bools, floats, strings and whatever ``ngrams_wb`` returns).
    """
    text = normalize(get_link_text(link))

    href = get_link_href(link)
    p = urlsplit(href)
    # Lower-cased once; used by both "page" path checks below.
    path_lower = p.path.lower()

    # Only the parameter *names* are used, values are discarded.
    query_parsed = parse_qsl(p.query)
    query_param_names = [k.lower() for k, v in query_parsed]
    query_param_names_ngrams = ngrams_wb(
        " ".join([normalize(name) for name in query_param_names]), 3, 5, True
    )

    elem = get_selector_root(link)
    elem_target = _elem_attr(elem, 'target')
    elem_rel = _elem_attr(elem, 'rel')

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(self_and_children_classes + ' ' + parent_classes)

    return {
        'bias': 3.0,
        'isdigit': text.isdigit(),
        'isalpha': text.isalpha(),
        'elem-target': elem_target,
        'elem-rel': elem_rel,
        # The token-count bucket is part of the feature *name*, value is 1.0.
        'num-tokens%s' % _num_tokens_feature(text): 1.0,

        'text': ngrams_wb(replace_digits(text), 2, 5),
        # At most the first 20 characters of the stripped text.
        'text-exact': replace_digits(text.strip()[:20].strip()),
        'class': ngrams_wb(css_classes, 4, 5),
        'query': query_param_names_ngrams,

        'path-has-page': 'page' in path_lower,
        # '/' or '-' followed by 'p' or 'page' (+ optional word char),
        # an optional '/', then digits.
        'path-has-pageXX': re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None,
        'path-has-number': any(part.isdigit() for part in p.path.split('/')),

        # Raw string: '\d' in a plain literal is an invalid escape sequence
        # (DeprecationWarning/SyntaxWarning on modern Python).
        'href-has-year': re.search(r'20\d\d', href) is not None,
    }


def page_to_features(xseq):
    """Build the feature sequence for one page.

    Every link in ``xseq`` gets its ``link_to_features`` dict, extended with
    ``text-before`` / ``text-after`` features computed from the text returned
    by ``get_text_around_selector_list`` for that link.

    :param xseq: sequence of link selectors belonging to one page.
    :return: list of feature dicts, one per link, in the same order.
    """
    # The weight is below 1 because these n-grams carry a lot of duplicate
    # information, so they should be regularized more strongly
    # (as if they were a single feature rather than many).
    context_weight = 0.2

    def weighted_ngrams(fragment):
        # Map each n-gram of the normalized fragment to the shared weight.
        return {gram: context_weight
                for gram in ngrams_wb(normalize(fragment), 5, 5)}

    page_features = [link_to_features(link) for link in xseq]
    contexts = get_text_around_selector_list(xseq, max_length=15)
    for link_features, (text_before, text_after) in zip(page_features, contexts):
        link_features['text-before'] = weighted_ngrams(text_before)
        link_features['text-after'] = weighted_ngrams(text_after)
    return page_features


# One feature sequence per page, aligned with the label sequences in `y`
# (both come from the same storage.get_Xy() call).
X = [page_to_features(xseq) for xseq in X_raw]

CPU times: user 13.9 s, sys: 322 ms, total: 14.2 s
Wall time: 14.6 s


In [4]:
# X[60][12]

In [5]:
# TRAIN_SIZE = 80
# X_train, y_train = X[:TRAIN_SIZE], y[:TRAIN_SIZE]
# X_test, y_test = X[TRAIN_SIZE:], y[TRAIN_SIZE:]
# CRF estimator shared by the cross-validation and the final fit below.
# Per the sklearn-crfsuite docs, c1 and c2 are the L1 and L2 regularization
# coefficients; all_possible_transitions also generates transition features
# for label pairs that never occur in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.001, 
    c2=0.05, 
    max_iterations=100, 
    all_possible_transitions=True,
    verbose=False,
)
# crf.fit(X_train, y_train, X_test, y_test)

In [6]:
%%time
# 6 folds built from the page URLs.
# NOTE(review): presumably pages are grouped so that one site never ends up in
# both train and test -- verify in autopager.evaluate.get_annotation_folds.
folds = get_annotation_folds(urls, 6)
# Out-of-fold predictions for every page, using all available cores.
y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=-1)
# Per-label report restricted to the pagination labels ('O' is left out).
print(flat_classification_report(y, y_pred, labels=['PAGE', 'NEXT', 'PREV'], digits=3))
print("Sequence accuracy: {:0.3f}".format(sequence_accuracy_score(y, y_pred)))

             precision    recall  f1-score   support

       PAGE      0.886     0.925     0.905      1178
       NEXT      0.945     0.781     0.855       155
       PREV      0.902     0.821     0.860       112

avg / total      0.894     0.902     0.896      1445

Sequence accuracy: 0.622
CPU times: user 8.92 s, sys: 1.06 s, total: 9.98 s
Wall time: 1min 4s


In [7]:
# Fit on all annotated pages so that the learned weights can be inspected
# in the cells below.
crf.fit(X, y)
# crf.attributes_
# Displayed as the cell output: number of attributes known to the fitted model.
crf.num_attributes_

12622

In [8]:
# [a for a in sorted(crf.attributes_) if a.startswith('id')]

## Which features are important?

In [9]:
# XXX: weights of correlated features don't reflect their importance
# XXX: weights of features on different scales don't reflect their importance
# (e.g. coefficients of the text-after and text-before features are high, but only
# because the input is scaled down for these features)

def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    :param state_features: iterable of ``((attr, label), weight)`` pairs,
        e.g. the output of ``Counter(...).most_common()``.
    """
    for feature_key, weight in state_features:
        attr, label = feature_key
        print("%0.6f %-8s %s" % (weight, label, attr))

# Wrap the learned state-feature weights in a Counter once, so that both
# ends of the weight ranking can be taken from the same object.
state_feature_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_feature_weights.most_common(150))

print("\nTop negative:")
print_state_features(state_feature_weights.most_common()[-50:])

Top positive:
2.932516 PAGE     text:末页
2.932516 PAGE     text-exact:末页
2.900327 PAGE     isdigit
2.582526 NEXT     class:next
2.418373 PREV     query:p
2.362808 NEXT     text:>
2.288422 O        elem-rel:nofollow
2.129136 PREV     text:<
2.005038 NEXT     text-exact:»
1.957721 PREV     class:prev
1.944739 PAGE     text-exact:首页
1.892849 PAGE     text:首页
1.882324 O        num-tokens>2
1.827425 NEXT     text-exact:>
1.732028 PREV     text-exact:«
1.719730 PREV     text-exact:<
1.687878 PAGE     text-exact:X
1.666720 O        text:..
1.519860 NEXT     text:»
1.468086 NEXT     text-exact:下一页
1.468086 NEXT     text:下一
1.468086 NEXT     text:下一页
1.412462 O        text-exact:>>
1.407156 PREV     text:«
1.399919 PAGE     text:<<
1.376962 O        href-has-year
1.346167 O        text-after:...
1.318125 O        query:m
1.284652 PREV     text-exact:
1.281044 PREV     num-tokens=0
1.277307 PAGE     text:X
1.275660 PAGE     text-after:ninfo
1.275660 PAGE     text-after:oninf
1.275660 PAGE     tex

In [10]:
from collections import Counter

def print_transitions(trans_features):
    """Print one ``from -> to weight`` line per transition feature.

    :param trans_features: iterable of ``((label_from, label_to), weight)``
        pairs, e.g. the output of ``Counter(...).most_common()``.
    """
    for label_pair, weight in trans_features:
        label_from, label_to = label_pair
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

# List every label-to-label transition weight of the fitted CRF,
# from the highest weight to the lowest.
print("Transitions:")
print_transitions(Counter(crf.transition_features_).most_common())

Transitions:
O      -> O       1.962734
PAGE   -> NEXT    1.674836
PREV   -> PAGE    1.600434
NEXT   -> PAGE    1.102763
PAGE   -> PREV    0.926758
PREV   -> NEXT    0.783433
O      -> PREV    0.334344
PAGE   -> PAGE    -0.105393
NEXT   -> O       -0.424473
NEXT   -> PREV    -0.626934
PREV   -> PREV    -0.703653
NEXT   -> NEXT    -1.249935
PAGE   -> O       -1.563525
O      -> PAGE    -1.567571
PREV   -> O       -1.638279
O      -> NEXT    -1.671897


## Let's check the errors the model is making

In [11]:
# Recompute the 6-fold out-of-fold predictions with the same arguments as the
# evaluation cell, so this cell does not depend on that cell's `y_pred`.
folds = get_annotation_folds(urls, 6)
y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=-1)
# Mask over pages: intended to be True where the predicted label sequence
# differs from the annotated one.
# NOTE(review): this relies on NumPy comparing the per-page sequences as
# objects; pages have different numbers of links, so confirm the behaviour
# with the installed NumPy version.
errors = np.asarray(y) != np.asarray(y_pred)
# Storage records and raw link sequences of the mispredicted pages.
error_rows = np.asarray(list(storage.iter_records()))[errors]
error_links = np.asarray(X_raw)[errors]
# NOTE(review): `y_pred` is indexed with the mask directly, so it is assumed
# to already be a NumPy array (unlike `y`, which is wrapped) -- TODO confirm.
error_y_pred = y_pred[errors]
error_y_true = np.asarray(y)[errors]

In [12]:
# For every mispredicted page print its URL, followed by one
# "<true label> <predicted label> <link html>" line per mislabelled link.
for row, links, yseq_true, yseq_pred in zip(error_rows, error_links, error_y_true, error_y_pred):
    print(row['Page URL'])
    for label_correct, label_pred, link_html in zip(yseq_true, yseq_pred, links.extract()):
        if label_correct == label_pred:
            # Correctly labelled link: nothing to report.
            continue
        print("%4s %4s %s" % (label_correct, label_pred, link_html))
    print("\n")

https://www.mypapershop.com/patricks-day-supplies.html
NEXT    O <a href="https://www.mypapershop.com/mm5/merchant.mvc?Session_ID=56e7cf6116038eec3b953a976519a103&amp;Store_Code=MPS&amp;Screen=CTGY&amp;Category_Code=patricks-day-supplies&amp;CatListingOffset=24&amp;Offset=24&amp;Per_Page=24&amp;Sort_By=disp_order" class="searchspring-next">▶</a>
NEXT    O <a href="https://www.mypapershop.com/mm5/merchant.mvc?Session_ID=56e7cf6116038eec3b953a976519a103&amp;Store_Code=MPS&amp;Screen=CTGY&amp;Category_Code=patricks-day-supplies&amp;CatListingOffset=24&amp;Offset=24&amp;Per_Page=24&amp;Sort_By=disp_order" class="searchspring-next">▶</a>


http://www.newschittagong24.com/?cat=1
PAGE NEXT <a href="http://www.newschittagong24.com/?cat=1&amp;paged=1422" class="last" title="শেষ »">শেষ »</a>


http://www.newschittagong24.com/?cat=1&paged=5
PAGE    O <a href="http://www.newschittagong24.com/?cat=1" class="first" title="« প্রথম">« প্রথম</a>
PAGE NEXT <a href="http://www.newschittagong24.com/?cat=1

## Unused code

In [13]:
def _url_parts(url):
    p = urlsplit(url)
    args = parse_qsl(p.query)
    argnames = [name for name, value in args]
    return p.netloc, set(p.path.split('/')) | set(args) | set(argnames)

def url_distance(url1, url2):
    """Return a distance in ``[0, 1]`` between two URLs.

    URLs with different netlocs are at distance 1.0. Otherwise the distance
    is one minus the ratio of shared parts to all parts, where the parts are
    those produced by ``_url_parts``.

    :param url1: first URL string.
    :param url2: second URL string.
    :return: float distance; 0.0 means the part sets are identical.
    """
    (host_a, parts_a), (host_b, parts_b) = _url_parts(url1), _url_parts(url2)
    if host_a != host_b:
        return 1.0
    all_parts = parts_a | parts_b
    if not all_parts:
        # Both part sets are empty: treat the URLs as identical.
        return 0.0
    return 1 - len(parts_a & parts_b) / len(all_parts)

# Disabled experiment: turning url_distance(page url, link href) into
# 'url-distance=...' features of the link's feature dict.
#         dist = url_distance(url, href)
#         if dist == 0:
#             feat['url-distance=0'] = 1.0
#         elif dist == 1.0:
#             feat['url-distance=1'] = 1.0
#         else:
#             feat['url-distance=k'] = dist


# Sanity check: same host and path, only the query value differs.
url_distance('http://example.com/foo/345?page=2', 'http://example.com/foo/345?page=4')

0.33333333333333337

In [14]:
# def guess_page_number(link):
#     text = get_link_text(link).strip()
#     if text.isdigit():
#         return int(text)
#     return None
    
# def number_pattern2(pattern):
#     txt = re.sub('X+', 'X', pattern)
# #     txt = re.sub('C+', 'C', txt)
#     return txt
    
#     pagenums = [guess_page_number(a) for a in xseq]
# #     print(pagenums)
#     for i in range(1, len(xseq)):
#         if pagenums[i-1] is None or pagenums[i] is None:
#             features[i]['page-diff:None'] = 1.0
#         else:
#             diff = pagenums[i] - pagenums[i-1]
#             if diff == 1:
#                 features[i]['page-diff==1'] = 1.0
#             else:
#                 features[i]['page-diff<>1'] = 1.0