In [None]:
# !pip3 install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter lab clean
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [1]:
from __future__ import print_function
from ipywidgets import interact, Box, HBox, VBox
import ipywidgets as widgets
from IPython.display import display

import re
from dataclasses import dataclass
from typing import List

## Debounce

Decorator to debounce. E.g. `@debounce(0.25)`

References:
- https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20Events.html?highlight=throttle#Debouncing

In [2]:
import asyncio

class Timer:
    def __init__(self, timeout, callback):
        self._timeout = timeout
        self._callback = callback
        self._task = asyncio.ensure_future(self._job())

    async def _job(self):
        await asyncio.sleep(self._timeout)
        self._callback()

    def cancel(self):
        self._task.cancel()

def debounce(wait):
    """ Decorator that will postpone a function's
        execution until after `wait` seconds
        have elapsed since the last time it was invoked. """
    def decorator(fn):
        timer = None
        def debounced(*args, **kwargs):
            nonlocal timer
            def call_it():
                fn(*args, **kwargs)
            if timer is not None:
                timer.cancel()
            timer = Timer(wait, call_it)
        return debounced
    return decorator

In [3]:
@dataclass
class Annotation:
    text: str
    start: int
    end: int
        
@dataclass
class Sentence:
    text: str
    annotations: List[Annotation]

In [4]:
def f(x):
    return x

output = widgets.Output()
with output:
    interact(f, x=10)

In [5]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install BeautifulSoup4



In [6]:
import nltk
import requests
from collections import Counter
from bs4 import BeautifulSoup
from functools import partial
import json

In [32]:
url = 'https://www.ignite.digital/10-best-programming-languages-to-learn-in-2020/'

In [8]:
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
page = requests.get(url, headers=headers,timeout=5)
# Deal with weird characters.
page.encoding = page.apparent_encoding
if page.status_code != 200:
    raise Exception("unable to fetch page")

In [9]:
soup = BeautifulSoup(page.text, 'html.parser')
text = soup.get_text()
len(text)

11874

In [10]:
tokens = [word for sent in nltk.sent_tokenize(text.lower()) 
             for word in nltk.word_tokenize(sent)]
stopwords = set(nltk.corpus.stopwords.words('english'))
tokens = [token for token in tokens if token not in stopwords and len(token) > 1]
tokens = Counter(tokens)
tokens.most_common(10)

[('programming', 31),
 ('language', 22),
 ('languages', 13),
 ('used', 13),
 ('python', 12),
 ('learn', 11),
 ('use', 11),
 ('best', 9),
 ('web', 9),
 ('applications', 9)]

In [11]:
def clear_newlines_and_whitespaces(s):
    return re.sub(r'\s+', ' ', s).strip()

In [12]:
sentences = nltk.sent_tokenize(text)
sentences = list(map(clear_newlines_and_whitespaces, sentences))
len(sentences), sentences[0]

(80,
 '10 Best programming languages to learn in 2020 - Ignite Digital Talent Skip to content Job Search Find PermanentContract permanent Jobs AnywhereB2 4QAB40 1PABA1 1SUBA11SUBasingstokeBathBH8 8HXBirminghamBournemouthBracknellBrightonBristolBS1 6QFBS37 5HZCambridgeCardiffCB1 2JWChippenhamCornwallE14EC2M 7PYG1 3SLGermanyGlasgowHampshireIlfordIrelandKarlsruheLeedsLeicesterLondonMaidenheadManchesterNewburyOX1 1HSOxfordOxfordshireReadingRedhillReigateremoteRG11LYRG12 1FLRG14 5DGRG21 5AHRH2 0BDSL61EWSM4 5AZSO15 1GPSomersetSouth GloucestershireSouth LondonSouthamptonSwindonTelfordTF3 4LZW1T 2HFWaterfordWC1A 1DDWC2E 9JTWiltshireYork Anywhere earning Any Salary£20k +£30k +£40k +£50k +£60k + Any Salary including words: Search: Search Jobs About UsJobsFor EmployersBlogContact Ignite Digital Talent About UsJobsFor EmployersBlogContact10 Best programming languages to learn in 2020 Ollie HepburnTweetPinShareWhatsAppEmailShare10The digital age is upon us, and knowing the best programming language

In [13]:
keywords = dict(tokens.most_common(10)).keys()
keywords

dict_keys(['programming', 'language', 'languages', 'used', 'python', 'learn', 'use', 'best', 'web', 'applications'])

In [14]:
tag = 'SKILL'
result: List[Sentence] = [None] * len(sentences)
idx = -1

In [15]:

def to_next():
    global idx
    if idx >= len(sentences) - 1:
        return
    idx = idx + 1

def to_prev():
    global idx
    if idx <= 0:
        return
    idx = idx - 1
    
def reset():
    global idx
    global result
    result = [None] * len(sentences)
    idx = -1

def reject():
    pass

def build_regexp():
    # Sort regex from longest to shortest - this will match "golang" first, then "go" instead of the opposite.
    longest_to_shortest_keywords = sorted(keywords, key=lambda item: len(item), reverse=True)
    
    # Set boundary matching for keyword, so that "go" won't match "going"
    def escape_all(s):
        # Word boundary only applies for character, so non-character like c++ will not match.
        if s.isalpha():
            return r'\b{}\b'.format(re.escape(s))
        return r'{}'.format(re.escape(s))
    
    keywords_to_match = '|'.join(map(escape_all, longest_to_shortest_keywords))
    pattern = r'{}'.format(keywords_to_match)
    return partial(re.finditer, pattern, flags=re.IGNORECASE|re.MULTILINE|re.UNICODE)

def preprocess(sentence):
    global keywords
    global result
    
    # Create raw regex string to match the boundary, so that searching for "go" won't match "going".
    regexp = build_regexp()
    annotations: List[Annotation] = []
        
    for m in regexp(sentence):
        s, e = m.span()
        annotations.append(Annotation(sentence[s:e], s, e))
    
    display(idx)
    result[idx] = Sentence(sentence, annotations)
    
def update_keywords(new_kw):
    global keywords
    tags = new_kw.split(',')
    tags = map(lambda s: s.strip(), tags)
    tags = filter(lambda s: len(s) > 0, tags)
    tags = list(set(list(tags)))
    keywords = tags

In [16]:
button_prev = widgets.Button(description="Prev")
button_next = widgets.Button(description="Next")
button_reset = widgets.Button(description='Reset')

hbox = HBox([button_prev, button_next, button_reset])

def handle_prev(e):
    to_prev()
    render()

def handle_next(e):
    to_next()
    render()

def handle_reset(e):
    reset()
    render()

button_prev.on_click(handle_prev)
button_next.on_click(handle_next)
button_reset.on_click(handle_reset)

html = widgets.HTML(value='', placeholder='Enter html', description='')
input_keyword = widgets.Text(value=','.join(keywords),
                             description='Keywords')

@debounce(0.25)
def update_keyword(new_kw):
    update_keywords(new_kw)
    render()
    
interact(update_keyword, new_kw=input_keyword)

display(hbox,
        html)

def render():
    global keywords
    if idx < 0:
        html.value = "Press next to start"
        return
    if idx >= len(sentences) - 1:
        html.value = 'Completed'
        return

    sentence = sentences[idx]
    display(sentence, idx)
    preprocess(sentence)
    
    annotations = result[idx].annotations
    for annotation in annotations:
        sentence = sentence.replace(annotation.text, f'<b style="background: #FFFBCC">{annotation.text}</b>')
    
    html.value = f"Total: {len(sentences)}, Current: {idx+1}"
    html.value += '<br/>'
    html.value += f'{idx+1}. {sentence}'
    html.value += '<br/>'

interactive(children=(Text(value='programming,language,languages,used,python,learn,use,best,web,applications',…

HBox(children=(Button(description='Prev', style=ButtonStyle()), Button(description='Next', style=ButtonStyle()…

HTML(value='', placeholder='Enter html')

In [63]:
import sqlite3

conn = sqlite3.connect('skills.db')

In [181]:
# c = conn.cursor()
# c.execute("""
# CREATE TABLE IF NOT EXISTS skill (
#     url text UNIQUE NOT NULL,
#     created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
#     updated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
#     text text NOT NULL,
#     data json NOT NULL
# )""")

# c.execute("""
# CREATE TRIGGER update_timestamp
# AFTER UPDATE ON skill 
# BEGIN
#     update skill set timestamp = current_timestamp WHERE url = NEW.url;
# END
# """)
# conn.commit()

<sqlite3.Cursor at 0x1227a4ce0>

In [91]:
c = conn.cursor()
c.execute("select url, substr(text, 0, 100), data from skill")
for (url, text, data) in c.fetchall():
    print(f'url: {url}')
    print(f'text: {clear_newlines_and_whitespaces(text)}')
    annotations = json.loads(data)
    print(f'found {len(annotations)} sentences')
    print()

url: https://www.fullstackacademy.com/blog/nine-best-programming-languages-to-learn
text: The 9 Best Programming Languages to Learn in 2020 | Fullstack Academy
found 88 sentences

url: https://www.ignite.digital/10-best-programming-languages-to-learn-in-2020/
text: 10 Best programming languages to learn in 2020 - Ignite Digital Talent Skip to conten
found 79 sentences



In [56]:
objs = []
for item in result:
    if item is None: continue
    obj = {"text": item.text,
           "annotations": [{"start": ann.start,
                            "end": ann.end,
                            "text": ann.text}
                           for ann in item.annotations]}
    objs.append(obj)
len(objs)

79

In [57]:
c.execute("INSERT INTO skill (url, text, data) VALUES (?, ?, ?)", (url, text, json.dumps(objs),))

IntegrityError: UNIQUE constraint failed: skill.url

In [61]:
conn.commit()

OperationalError: database is locked

In [None]:
c.execute("select url from skill")
c.fetchmany()

In [62]:
conn.close()