In [1]:
# !pip3 install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter lab clean
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [2]:
from __future__ import print_function
from ipywidgets import interact, Box, HBox, VBox
import ipywidgets as widgets
from IPython.display import display

import re
from dataclasses import dataclass, asdict
from typing import List

In [3]:
import sqlite3
import json

class dbopen(object):
    """Context manager that opens a SQLite database and yields a cursor.

    On a clean exit the pending transaction is committed. If the ``with``
    body raises, the transaction is rolled back so that a half-finished
    write is not persisted. The connection is closed in both cases.

    Args:
        path: Path of the SQLite database file (default ``'data.db'``).
    """

    def __init__(self, path='data.db'):
        self.path = path

    def __enter__(self):
        """Open the connection and return a cursor bound to it."""
        self.conn = sqlite3.connect(self.path)
        self.cursor = self.conn.cursor()
        return self.cursor

    def __exit__(self, exc_class, exc, traceback):
        """Commit on success, roll back on error, and always close.

        Returns None, so an exception raised inside the ``with`` body
        keeps propagating to the caller.
        """
        try:
            if exc_class is None:
                self.conn.commit()
            else:
                self.conn.rollback()
        finally:
            # Close even if commit/rollback itself fails, so the handle never leaks.
            self.conn.close()

## Debounce

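A decorator that debounces a function: the wrapped call only runs once `wait` seconds have passed without another call. Example: `@debounce(0.25)`.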

References:
- https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20Events.html?highlight=throttle#Debouncing

In [4]:
import asyncio

class Timer:
    """One-shot timer that runs ``callback`` after ``timeout`` seconds.

    The countdown is scheduled on the asyncio event loop as soon as the
    instance is created; call ``cancel()`` to abort it.
    """

    def __init__(self, timeout, callback):
        self._timeout, self._callback = timeout, callback
        # Scheduling happens here, so simply constructing a Timer arms it.
        self._task = asyncio.ensure_future(self._job())

    async def _job(self):
        """Wait out the delay, then invoke the callback with no arguments."""
        delay = self._timeout
        await asyncio.sleep(delay)
        fire = self._callback
        fire()

    def cancel(self):
        """Cancel the scheduled task."""
        pending = self._task
        pending.cancel()

def debounce(wait):
    """Decorator factory that postpones a function's execution until
    ``wait`` seconds have elapsed since the last time it was invoked.

    Each call cancels the previously scheduled run and schedules a new one,
    so only the arguments of the most recent call are used. The debounced
    function itself returns ``None`` immediately.

    Args:
        wait: Quiet period in seconds.

    Returns:
        A decorator that wraps a callable in the debouncing logic.
    """
    # Local import keeps this cell independent of import cells elsewhere.
    from functools import wraps

    def decorator(fn):
        timer = None

        # Preserve fn's __name__, __doc__ and signature so that tools which
        # introspect the callable see the real function, not the
        # anonymous (*args, **kwargs) wrapper.
        @wraps(fn)
        def debounced(*args, **kwargs):
            nonlocal timer

            def call_it():
                fn(*args, **kwargs)

            if timer is not None:
                timer.cancel()
            timer = Timer(wait, call_it)

        return debounced

    return decorator

In [5]:
@dataclass
class Annotation:
    """A labelled span of text found inside a sentence."""

    # The matched text itself.
    entity: str
    # Label attached to the span.
    label: str
    # Offset of the first character of the span within the sentence.
    start: int
    # Offset one past the last character of the span.
    end: int
        
@dataclass
class Sentence:
    """A sentence together with the annotations found in it."""

    # The sentence text.
    text: str
    # Spans found in ``text``.
    annotations: List[Annotation]

In [6]:
def f(x):
    """Return the argument unchanged (identity function)."""
    return x

# Build the `interact` controls for `f` inside an Output widget's capture
# context. NOTE(review): `output` is not displayed in any visible cell - confirm
# whether this cell is still needed.
output = widgets.Output()
with output:
    interact(f, x=10)

In [7]:
# Install a pip package in the current Jupyter kernel
# import sys
# !{sys.executable} -m pip install BeautifulSoup4

In [8]:
import nltk
import requests
from collections import Counter
from bs4 import BeautifulSoup
from functools import partial
import json

In [123]:
# English stopwords from NLTK, held as a set; Annotator.inspect_tokens uses it
# to drop common words before counting.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded beforehand - confirm.
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

class Annotator:
    """Scrape a web page, split it into sentences and tag keyword matches.

    Typical flow: ``scrape()`` -> ``prepare_sentences()`` -> step through the
    sentences with ``accept()`` / ``undo()``, calling ``preprocess()`` on the
    current one -> ``save()``.

    Attributes:
        url: Address of the page to scrape.
        label: Label given to every annotation produced by this instance.
        soup: Parsed page (set by ``scrape()``), ``None`` before that.
        sentences: Sentences extracted by ``prepare_sentences()``.
        result: One slot per sentence; ``None`` until that sentence has been
            preprocessed, afterwards a ``Sentence``.
        idx: Index of the current sentence; ``-1`` means "not started".
        keywords: Keywords searched for in each sentence.
    """

    def __init__(self, url, label):
        self.url = url
        self.label = label
        self.result = []
        self.idx = -1
        self.keywords = []
        # Defined up front so navigation methods work before any scraping.
        self.sentences = []
        self.soup = None

    def scrape(self):
        """Download ``self.url`` and parse it into ``self.soup``.

        Raises:
            Exception: If the server does not answer with HTTP 200.
        """
        # Use the instance's own URL, not whatever global `url` happens to exist.
        print('Fetching:', self.url)
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
        page = requests.get(self.url,
                            headers=headers,
                            timeout=5)
        # Deal with weird characters.
        page.encoding = page.apparent_encoding
        if page.status_code != 200:
            raise Exception("unable to fetch page")
        print('[Status]:', page.status_code)
        self.soup = BeautifulSoup(page.text, 'html.parser')
        self.clear_elements()

    def clear_elements(self):
        """Clear HTML elements that are not providing value, e.g. header, footer, code, pre..."""
        htmltags = ['header', 'footer', 'nav', 'code', 'pre', 'script', 'style', 'head']
        for tag in htmltags:
            for el in self.soup.find_all(tag):
                el.decompose()

    def inspect_tokens(self, n=10):
        """Return the ``n`` most frequent tokens of the page as ``(token, count)`` pairs.

        The text is lower-cased and tokenised; stopwords, single-character
        tokens and tokens that are not purely alphanumeric are ignored.
        """
        text = self.soup.get_text()
        tokens = [word for sent in nltk.sent_tokenize(text.lower())
                  for word in nltk.word_tokenize(sent)]
        tokens = [token for token in tokens
                  if token not in STOPWORDS and len(token) > 1 and token.isalnum()]
        return Counter(tokens).most_common(n)

    def prepare_sentences(self, min_word=3):
        """Split the page into sentences and seed the keyword list.

        Args:
            min_word: Sentences must contain more than this many
                space-separated words to be kept.
        """
        # Read from this instance (not a global `annotator`).
        text = ' '.join(self.soup.find_all(string=True))
        # Collapse newlines and repeated whitespace inline so this method does
        # not depend on a helper defined in a later notebook cell.
        text = re.sub(r'\s+', ' ', text).strip()
        sentences = nltk.sent_tokenize(text)
        self.sentences = [sentence for sentence in sentences
                          if len(sentence.split(' ')) > min_word]
        self.keywords = [token for token, _count in self.inspect_tokens()]
        self.result = [None] * len(self.sentences)

    def accept(self):
        """Advance to the next sentence; no-op when already on the last one."""
        if self.idx >= len(self.sentences) - 1:
            return
        self.idx += 1

    def undo(self):
        """Step back one sentence; no-op at the first sentence or before starting."""
        if self.idx <= 0:
            return
        self.idx -= 1

    def reset(self):
        """Return to the "not started" state and drop all results."""
        self.idx = -1
        self.result = [None] * len(self.sentences)

    def build_regexp(self):
        """Build a matcher for the current keywords.

        Returns:
            A callable taking a string and returning an iterator of
            case-insensitive regex matches.
        """
        if not self.keywords:
            # An empty alternation would match the empty string at every
            # position; use a pattern that can never match instead.
            pattern = r'(?!)'
        else:
            # Sort regex from longest to shortest - this will match "golang"
            # first, then "go" instead of the opposite.
            longest_to_shortest_keywords = sorted(self.keywords,
                                                  key=len,
                                                  reverse=True)

            # Set boundary matching for keyword, so that "go" won't match "going"
            def escape_all(s):
                # Word boundary only applies for character, so non-character
                # like c++ will not match.
                if s.isalpha():
                    return r'\b{}\b'.format(re.escape(s))
                return r'{}'.format(re.escape(s))

            pattern = '|'.join(map(escape_all, longest_to_shortest_keywords))
        return partial(re.finditer, pattern, flags=re.IGNORECASE|re.MULTILINE|re.UNICODE)

    def preprocess(self, sentence):
        """Annotate ``sentence`` and store the outcome in the current result slot."""
        regexp = self.build_regexp()
        annotations: List[Annotation] = []

        for m in regexp(sentence):
            s, e = m.span()
            annotations.append(Annotation(sentence[s:e], self.label, s, e))

        self.result[self.idx] = Sentence(sentence, annotations)

    def update_keywords(self, new_kw):
        """Replace the keyword list from a comma-separated string.

        Blank entries are ignored and duplicates removed while keeping the
        order in which the keywords were typed. If nothing usable remains,
        the current keywords are left untouched.
        """
        stripped = (tag.strip() for tag in new_kw.split(','))
        # dict.fromkeys de-duplicates deterministically (a set would not keep order).
        tags = list(dict.fromkeys(tag for tag in stripped if tag))
        if not tags:
            return
        self.keywords = tags

    def current_result(self):
        """Return the stored result for the current index."""
        return self.result[self.idx]

    def current_sentence(self):
        """Return the sentence at the current index."""
        return self.sentences[self.idx]

    def save(self):
        """Insert or update this page's text and annotations in the ``skill`` table.

        Sentences that were never preprocessed (``None`` slots) are skipped.
        NOTE(review): relies on a ``skill`` table with a uniqueness constraint
        on ``url``; its creation is not shown here - confirm.
        """
        data_to_db = [asdict(item) for item in self.result if item is not None]
        with dbopen() as c:
            url = self.url
            text = self.soup.get_text()
            data = json.dumps(data_to_db)
            # text/data appear twice: once for INSERT, once for the UPDATE branch.
            payload = (url, text, data, text, data)
            c.execute('INSERT INTO skill (url, text, data) VALUES (?, ?, ?) ON CONFLICT (url) DO UPDATE SET text = ?, data = ?',
                      payload)
            print(c.execute('SELECT id, url FROM skill').fetchall())

In [133]:
# Page to scrape and annotate.
url = 'https://appinventiv.com/blog/go-vs-rust/'

In [134]:
# Every keyword match found by this annotator is labelled 'SKILL'.
annotator = Annotator(url, 'SKILL')

In [135]:
# Download and parse the page; scrape() also strips the unwanted HTML elements.
annotator.scrape()

Fetching: https://appinventiv.com/blog/go-vs-rust/
[Status]: 200


In [136]:
# Most frequent non-stopword tokens on the page (default: top 10).
annotator.inspect_tokens()

[('go', 44),
 ('rust', 43),
 ('language', 17),
 ('vs', 16),
 ('languages', 11),
 ('speed', 10),
 ('development', 9),
 ('better', 8),
 ('would', 8),
 ('performance', 8)]

In [137]:
# Split the page into sentences and derive the initial keyword list.
annotator.prepare_sentences()

In [138]:
def clear_newlines_and_whitespaces(s):
    """Collapse every run of whitespace (newlines included) into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', s)
    return collapsed.strip()

In [139]:
# Navigation buttons for stepping through the sentences.
button_prev = widgets.Button(description="Prev")
button_next = widgets.Button(description="Next")
button_reset = widgets.Button(description='Reset')

# Group the three buttons in a single HBox container.
hbox = HBox([button_prev, button_next, button_reset])

def handle_prev(e):
    """Click handler for "Prev": step back one sentence, then redraw."""
    annotator.undo()
    render()


def handle_next(e):
    """Click handler for "Next": advance one sentence, then redraw."""
    annotator.accept()
    render()


def handle_reset(e):
    """Click handler for "Reset": clear progress and results, then redraw."""
    annotator.reset()
    render()

# Wire each button to its handler.
button_prev.on_click(handle_prev)
button_next.on_click(handle_next)
button_reset.on_click(handle_reset)

# `html` is the output area written by render(); `input_keyword` starts out
# holding the annotator's current keywords as a comma-separated string.
html = widgets.HTML(value='', placeholder='Enter html', description='')
input_keyword = widgets.Text(value=','.join(annotator.keywords),
                             description='Keywords')

@debounce(0.25)
def update_keyword(new_kw):
    """Apply the edited comma-separated keyword string and redraw.

    Debounced: runs only after 0.25 s without a further call.
    """
    annotator.update_keywords(new_kw)
    render()
    
# Pass the keyword text box to update_keyword as its `new_kw` argument.
interact(update_keyword, new_kw=input_keyword)

# Show the navigation buttons followed by the HTML output area.
display(hbox,
        html)

def _highlight_annotations(sentence, annotations):
    """Return ``sentence`` as HTML with each annotated span wrapped in a highlight tag.

    The markup is built from the annotations' ``start``/``end`` offsets, so
    each span is wrapped exactly once and keeps its original casing. All text
    is HTML-escaped because it originates from a scraped web page.
    """
    # Local import: at notebook level the name `html` refers to a widget.
    from html import escape

    pieces = []
    cursor = 0
    for annotation in sorted(annotations, key=lambda a: a.start):
        if annotation.start < cursor:
            # Overlaps a span that was already emitted; skip it.
            continue
        pieces.append(escape(sentence[cursor:annotation.start]))
        pieces.append('<b style="background: #FFFBCC">{}</b>'.format(
            escape(sentence[annotation.start:annotation.end])))
        cursor = annotation.end
    pieces.append(escape(sentence[cursor:]))
    return ''.join(pieces)


def render():
    """Redraw the HTML output area for the annotator's current position.

    Shows a prompt before the first sentence, 'Completed' at the end, and
    otherwise the current sentence with its keyword matches highlighted.
    """
    idx = annotator.idx
    sentences = annotator.sentences
    if idx < 0:
        html.value = "Press next to start"
        return
    # NOTE(review): this also fires while idx points at the last sentence, so
    # that sentence is never annotated - confirm this is intended.
    if idx >= len(sentences) - 1:
        html.value = 'Completed'
        return

    # Only fetch the sentence once idx is known to be a valid position.
    sentence = annotator.current_sentence()
    annotator.preprocess(sentence)
    highlighted = _highlight_annotations(sentence,
                                         annotator.current_result().annotations)

    # Assign once so the widget is updated a single time per render.
    html.value = (f"Total: {len(sentences)}, Current: {idx+1}"
                  '<br/>'
                  f'{idx+1}. {highlighted}'
                  '<br/>')

interactive(children=(Text(value='go,rust,language,vs,languages,speed,development,better,would,performance', d…

HBox(children=(Button(description='Prev', style=ButtonStyle()), Button(description='Next', style=ButtonStyle()…

HTML(value='', placeholder='Enter html')

In [140]:
# Persist the page text and the annotated sentences for this URL to the database.
annotator.save()

[(9, 'https://appinventiv.com/blog/go-vs-rust/'), (3, 'https://buttercms.com/blog/vue-vs-react-which-is-the-better-framework'), (8, 'https://hackr.io/blog/kotlin-vs-java'), (4, 'https://www.edureka.co/blog/what-is-scala/'), (1, 'https://www.fullstackacademy.com/blog/nine-best-programming-languages-to-learn'), (2, 'https://www.ignite.digital/10-best-programming-languages-to-learn-in-2020/'), (6, 'https://www.sam-solutions.com/blog/top-10-programming-languages-and-their-use-cases/')]


In [141]:
# Preview what is stored for each page: id, URL and the first 100 characters
# of the JSON annotation payload.
with dbopen() as c:
    # SQLite's substr() is 1-indexed; starting at 0 would yield only 99 characters.
    result = c.execute('select id, url, substr(data, 1, 100) from skill').fetchall()
    # Distinct loop names: `url` would overwrite the notebook-level `url` used
    # to build the Annotator, and `id` would shadow the builtin.
    for row_id, row_url, data_preview in result:
        print('id:', row_id)
        print('url:', row_url)
        print('data:', data_preview)
        print()

id: 1
url: https://www.fullstackacademy.com/blog/nine-best-programming-languages-to-learn
data: [{"text": "The 9 Best Programming Languages to Learn in 2020 | Fullstack Academy Programs New York 

id: 2
url: https://www.ignite.digital/10-best-programming-languages-to-learn-in-2020/
data: [{"text": "10 Best programming languages to learn in 2020 - Ignite Digital Talent Skip to content J

id: 3
url: https://buttercms.com/blog/vue-vs-react-which-is-the-better-framework
data: [{"text": "Vue vs React: Which is the better framework?", "annotations": [{"entity": "Vue", "label"

id: 4
url: https://www.edureka.co/blog/what-is-scala/
data: [{"text": "What is Scala?", "annotations": [{"entity": "Scala", "label": "SKILL", "start": 8, "end"

id: 6
url: https://www.sam-solutions.com/blog/top-10-programming-languages-and-their-use-cases/
data: [{"text": "Most Popular Programming Languages 2020 [And Key Use Cases] | SaM Solutions", "annotatio

id: 8
url: https://hackr.io/blog/kotlin-vs-java
data: [{"t