In [1]:
# !pip3 install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter lab clean
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [2]:
from __future__ import print_function
from ipywidgets import interact, Box, HBox, VBox
import ipywidgets as widgets
from IPython.display import display

import re
from dataclasses import dataclass
from typing import List

## Debounce

Decorator to debounce. E.g. `@debounce(0.25)`

References:
- https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20Events.html?highlight=throttle#Debouncing

In [3]:
import asyncio

class Timer:
    def __init__(self, timeout, callback):
        self._timeout = timeout
        self._callback = callback
        self._task = asyncio.ensure_future(self._job())

    async def _job(self):
        await asyncio.sleep(self._timeout)
        self._callback()

    def cancel(self):
        self._task.cancel()

def debounce(wait):
    """ Decorator that will postpone a function's
        execution until after `wait` seconds
        have elapsed since the last time it was invoked. """
    def decorator(fn):
        timer = None
        def debounced(*args, **kwargs):
            nonlocal timer
            def call_it():
                fn(*args, **kwargs)
            if timer is not None:
                timer.cancel()
            timer = Timer(wait, call_it)
        return debounced
    return decorator

In [4]:
@dataclass
class Annotation:
    entity: str
    label: str
    start: int
    end: int
        
@dataclass
class Sentence:
    text: str
    annotations: List[Annotation]

In [5]:
def f(x):
    return x

output = widgets.Output()
with output:
    interact(f, x=10)

In [6]:
# Install a pip package in the current Jupyter kernel
# import sys
# !{sys.executable} -m pip install BeautifulSoup4

In [7]:
import nltk
import requests
from collections import Counter
from bs4 import BeautifulSoup
from functools import partial
import json

In [8]:
url = 'https://buttercms.com/blog/vue-vs-react-which-is-the-better-framework'

In [9]:
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
page = requests.get(url, headers=headers,timeout=5)
# Deal with weird characters.
page.encoding = page.apparent_encoding
if page.status_code != 200:
    raise Exception("unable to fetch page")

In [10]:
soup = BeautifulSoup(page.text, 'html.parser')
text = soup.get_text()
len(text)

14937

In [11]:
tokens = [word for sent in nltk.sent_tokenize(text.lower()) 
             for word in nltk.word_tokenize(sent)]
stopwords = set(nltk.corpus.stopwords.words('english'))
tokens = [token for token in tokens if token not in stopwords and len(token) > 1]
tokens = Counter(tokens)
tokens.most_common(10)

[('react', 52),
 ('vue', 50),
 ('content', 25),
 ('developers', 18),
 ('butter', 11),
 ('api', 11),
 ('buttercms', 10),
 ("''", 10),
 ('use', 9),
 ('state', 9)]

In [12]:
def clear_newlines_and_whitespaces(s):
    return re.sub(r'\s+', ' ', s).strip()

In [13]:
sentences = nltk.sent_tokenize(text)
sentences = list(map(clear_newlines_and_whitespaces, sentences))
len(sentences), sentences[0]

(76, 'Vue vs React: Which is the better framework?')

In [14]:
keywords = dict(tokens.most_common(10)).keys()
keywords

dict_keys(['react', 'vue', 'content', 'developers', 'butter', 'api', 'buttercms', "''", 'use', 'state'])

In [15]:
label = 'SKILL'
result: List[Sentence] = [None] * len(sentences)
idx = -1

In [16]:
def to_next():
    global idx
    if idx >= len(sentences) - 1:
        return
    idx = idx + 1

def to_prev():
    global idx
    if idx <= 0:
        return
    idx = idx - 1
    
def reset():
    global idx
    global result
    result = [None] * len(sentences)
    idx = -1

def reject():
    pass

def build_regexp():
    # Sort regex from longest to shortest - this will match "golang" first, then "go" instead of the opposite.
    longest_to_shortest_keywords = sorted(keywords, key=lambda item: len(item), reverse=True)
    
    # Set boundary matching for keyword, so that "go" won't match "going"
    def escape_all(s):
        # Word boundary only applies for character, so non-character like c++ will not match.
        if s.isalpha():
            return r'\b{}\b'.format(re.escape(s))
        return r'{}'.format(re.escape(s))
    
    keywords_to_match = '|'.join(map(escape_all, longest_to_shortest_keywords))
    pattern = r'{}'.format(keywords_to_match)
    return partial(re.finditer, pattern, flags=re.IGNORECASE|re.MULTILINE|re.UNICODE)

def preprocess(sentence):
    global keywords
    global result
    
    # Create raw regex string to match the boundary, so that searching for "go" won't match "going".
    regexp = build_regexp()
    annotations: List[Annotation] = []
        
    for m in regexp(sentence):
        s, e = m.span()
        annotations.append(Annotation(sentence[s:e], label, s, e))
    
    display(idx)
    result[idx] = Sentence(sentence, annotations)
    
def update_keywords(new_kw):
    global keywords
    tags = new_kw.split(',')
    tags = map(lambda s: s.strip(), tags)
    tags = filter(lambda s: len(s) > 0, tags)
    tags = list(set(list(tags)))
    keywords = tags

In [19]:
button_prev = widgets.Button(description="Prev")
button_next = widgets.Button(description="Next")
button_reset = widgets.Button(description='Reset')

hbox = HBox([button_prev, button_next, button_reset])

def handle_prev(e):
    to_prev()
    render()

def handle_next(e):
    to_next()
    render()

def handle_reset(e):
    reset()
    render()

button_prev.on_click(handle_prev)
button_next.on_click(handle_next)
button_reset.on_click(handle_reset)

html = widgets.HTML(value='', placeholder='Enter html', description='')
input_keyword = widgets.Text(value=','.join(keywords),
                             description='Keywords')

@debounce(0.25)
def update_keyword(new_kw):
    update_keywords(new_kw)
    render()
    
interact(update_keyword, new_kw=input_keyword)

display(hbox,
        html)

def render():
    global keywords
    if idx < 0:
        html.value = "Press next to start"
        return
    if idx >= len(sentences) - 1:
        html.value = 'Completed'
        return

    sentence = sentences[idx]
    display(sentence, idx)
    preprocess(sentence)
    
    annotations = result[idx].annotations
    for annotation in annotations:
        sentence = re.sub(r'\b{}\b'.format(annotation.entity),
                          f'<b style="background: #FFFBCC">{annotation.entity}</b>', 
                          sentence, flags=re.IGNORECASE|re.MULTILINE|re.UNICODE)
    
    html.value = f"Total: {len(sentences)}, Current: {idx+1}"
    html.value += '<br/>'
    html.value += f'{idx+1}. {sentence}'
    html.value += '<br/>'

interactive(children=(Text(value="content,api,use,developers,vue,state,buttercms,butter,react,''", description…

HBox(children=(Button(description='Prev', style=ButtonStyle()), Button(description='Next', style=ButtonStyle()…

HTML(value='', placeholder='Enter html')

"| ButterCMS ButterCMS Solutions Ecommerce Improve conversion and product offerings Agencies Manage your clients' CMS in one place SaaS Scale content with company growth Marketplaces Extend your reach and boost organic traffic Features Flexible Content Modeling Make content changes dead simple for your content editors Components Components enable your marketers to compose flexible page layouts and easily reorder those layouts."

1

1

In [36]:
from dataclasses import asdict
data_to_db = [asdict(item) for item in result if item is not None]
len(data_to_db), data_to_db[0]

(75,
 {'text': 'Vue vs React: Which is the better framework?',
  'annotations': [{'entity': 'Vue', 'label': 'SKILL', 'start': 0, 'end': 3},
   {'entity': 'React', 'label': 'SKILL', 'start': 7, 'end': 12}]})

In [33]:
import sqlite3
import json

class dbopen(object):
    def __init__(self, path='data.db'):
        self.path = path
        
    def __enter__(self):
        self.conn = sqlite3.connect(self.path)
        self.cursor = self.conn.cursor()
        return self.cursor
    
    def __exit__(self, exc_class, exc, traceback):
        self.conn.commit()
        self.conn.close()

In [40]:
with dbopen() as c:
    c.execute('INSERT INTO skill (url, text, data) VALUES (?, ?, ?)', (url, text, json.dumps(data_to_db),))
    print(c.execute('SELECT id, url FROM skill').fetchall())

IntegrityError: UNIQUE constraint failed: skill.url