# sqlite3


Scripts to run database migration, table creation etc.

In [1]:
import sqlite3
import json

In [2]:
class dbopen(object):
    """Context manager that opens a SQLite database and yields a cursor.

    Usage::

        with dbopen('data.db') as c:
            c.execute('select 1')

    On a clean exit the pending transaction is committed. If the ``with``
    body raises, the transaction is rolled back so that partial writes are
    not persisted. The connection is closed in both cases, and the
    exception (if any) propagates to the caller.

    Args:
        path: Path of the SQLite database file (default ``'data.db'``).
    """

    def __init__(self, path='data.db'):
        self.path = path

    def __enter__(self):
        """Open the connection and return a cursor bound to it."""
        self.conn = sqlite3.connect(self.path)
        self.cursor = self.conn.cursor()
        return self.cursor

    def __exit__(self, exc_class, exc, traceback):
        """Commit on success, roll back on error, and always close."""
        try:
            if exc_class is None:
                self.conn.commit()
            else:
                # Do not persist half-finished work from a failed block.
                self.conn.rollback()
        finally:
            # Close even if commit/rollback itself fails.
            self.conn.close()

In [3]:
def create_table(path='data.db'):
    """Create the ``skill`` table and its timestamp trigger if missing.

    Both statements use ``IF NOT EXISTS``, so calling this repeatedly is
    safe.

    Args:
        path: Path of the SQLite database file, passed to ``dbopen``.
            Defaults to ``'data.db'``, the same default ``dbopen`` uses.
    """
    with dbopen(path) as c:
        c.execute("""
        CREATE TABLE IF NOT EXISTS skill (
            id integer PRIMARY KEY AUTOINCREMENT,
            url text UNIQUE NOT NULL,
            text text NOT NULL,
            data json NOT NULL,
            created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
            updated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
        )""")

        # Refresh `updated_at` on the row whenever it is updated.
        c.execute("""
        CREATE TRIGGER IF NOT EXISTS update_timestamp
        AFTER UPDATE ON skill 
        BEGIN
            update skill set updated_at = current_timestamp WHERE url = NEW.url;
        END
        """)

In [4]:
# Sanity check: show the id and url of every row stored in `skill`.
with dbopen('data.db') as c:
    c.execute('select id, url from skill')
    result = c.fetchall()
    print(result)

[(9, 'https://appinventiv.com/blog/go-vs-rust/'), (3, 'https://buttercms.com/blog/vue-vs-react-which-is-the-better-framework'), (8, 'https://hackr.io/blog/kotlin-vs-java'), (4, 'https://www.edureka.co/blog/what-is-scala/'), (1, 'https://www.fullstackacademy.com/blog/nine-best-programming-languages-to-learn'), (2, 'https://www.ignite.digital/10-best-programming-languages-to-learn-in-2020/'), (6, 'https://www.sam-solutions.com/blog/top-10-programming-languages-and-their-use-cases/')]


# Preparing Training Data

To train spaCy's NER, we first need to convert our stored data into spaCy's training format. We load the data that is already stored in the database and convert it to the following format:

```python
train_data = [
    ('React, Vue is good', {"entities": [(0, 5, 'SKILL'), (7, 10, 'SKILL')]})
]
```


References:
- https://spacy.io/usage/training
- https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

In [5]:
# Convert every stored annotation into spaCy's training tuple:
#   (text, {"entities": [(start, end, label), ...]})
train_data = []
with dbopen() as c:
    result = c.execute('select id, data from skill').fetchall()
    # The row id is selected but not needed here; it is bound to
    # `_skill_id` so the builtin `id` is not shadowed for later cells.
    for _skill_id, raw_json in result:
        json_data = json.loads(raw_json)
        # Each decoded item carries a 'text' and a list of 'annotations',
        # and each annotation has 'start', 'end' and 'label' keys.
        for row in json_data:
            txt, annotations = row['text'], row['annotations']

            annotation_format = [(annotation['start'],
                                  annotation['end'],
                                  annotation['label']) for annotation in annotations]
            item_format = (txt, {"entities": annotation_format})
            train_data.append(item_format)

In [6]:
# Number of (text, annotations) training examples collected in `train_data`.
len(train_data)

706

# Updating the Named Entity Recognizer

In [7]:
import spacy

# Load the pre-existing spaCy model `en_core_web_sm`; the cells below
# update this pipeline with the training data.
nlp = spacy.load('en_core_web_sm')

# Get the pipeline's `ner` component so new labels can be added to it.
ner = nlp.get_pipe('ner')

In [8]:
# Register every entity label that occurs in the training data with `ner`.
for _text, example_annotations in train_data:
    for entity in example_annotations.get('entities'):
        # Each entity is a (start, end, label) tuple; index 2 is the label.
        ner.add_label(entity[2])

In [9]:
# The model contains pipeline components besides `ner`, and training
# should leave those untouched. Collect every component that is NOT in
# the exception list; these are disabled while training runs.

pipe_exceptions = ['ner', 'trf_wordpiecer', 'trf_tok2vec']
unaffected_pipes = [
    name
    for name in nlp.pipe_names
    if name not in pipe_exceptions
]
unaffected_pipes

['tagger', 'parser']

In [10]:
import random
from spacy.util import minibatch, compounding
from pathlib import Path

In [11]:
%time
# Training the model.
with nlp.disable_pipes(*unaffected_pipes):
    

    for iteration in range(30): # 1
        random.shuffle(train_data) # 2
        losses = {}
        
        batches = minibatch(train_data, 
                            size=compounding(4.0, 32.0, 1.001)) # 3
            
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts,
                       annotations,
                       drop=0.5, # Make it harder to memorize data.
                       losses=losses)
        print('Iteration:', iteration, 'Losses:', losses)
print('Completed')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
scala.util.Sorting.quickS..." with entities "[(25, 30, 'SKILL'), (151, 156, 'SKILL')]". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Iteration: 0 Losses: {'ner': 13531.097412541509}
Iteration: 1 Losses: {'ner': 12047.109838575125}
Iteration: 2 Losses: {'ner': 11484.289995610714}
Iteration: 3 Losses: {'ner': 11353.18250232935}
Iteration: 4 Losses: {'ner': 11238.585773326457}
Iteration: 5 Losses: {'ner': 10996.600533246994}
Iteration: 6 Losses: {'ner': 10827.820795580745}
Iteration: 7 Losses: {'ner': 10880.216635167599}
Iteration: 8 Losses: {'ner': 10854.939557492733}
Iteration: 9 Losses: {'ner': 10873.098056912422}
Iteration: 10 Losses: {'ner': 10603.298558929004}
Iteration: 11 Losses: {'ner': 10716.066584825516}
Iteration: 12 Losses: {'ner': 10665.229402981699}
Iteration: 13 Losses: {'ner': 10666.979851424694}
Iteration: 14 Losses: {'ner': 10586.12006700039}
Iteration: 15 Losses: {'ner': 10492.14964979887}
Iteration: 16 Losses: {'ner': 10504.174914717674}
Iteration: 17 Losses: {'ner': 10495.253044366837}
Iteration: 18 Losses: {'ner': 10407.500497195462}
Iteration: 19 Losses: {'ner': 10522.784501433372}
Iteration: 20

### Notes

1. To train the `ner` model, the training loop has to run over the examples for a sufficient number of iterations. Training for only 5-6 iterations may not be effective. NOTE: find out what the most effective number of iterations is.
2. Before every iteration, it is good practice to shuffle the examples randomly with the `random.shuffle()` function. 
This ensures that the model does not generalize based on the order of the examples.
3. Training data is passed in batches. We use the `minibatch()` function over the training data, which returns the data in batches. 

# Testing the model

In [12]:
# Run the updated pipeline on a sample sentence and list what it tagged.
doc = nlp('I want to learn Scala, go is also good.')
print('Entities', [(span.text, span.label_) for span in doc.ents])

Entities [('Scala', 'SKILL'), ('go', 'SKILL')]


In [13]:
# Run the updated pipeline on a second sample sentence and list what it tagged.
doc = nlp('can I learn js? Using a single threaded language like node.js is good for IO')
print('Entities', [(span.text, span.label_) for span in doc.ents])

Entities []


# Saving the model

In [14]:
from datetime import datetime

# Date stamp (YYYY_MM_DD) used to name the directory the model is saved to.
today = f'{datetime.now():%Y_%m_%d}'
today

'2020_08_07'

In [15]:
import os

# Save the updated pipeline under a date-stamped directory.
target_dir = f'content/{today}/'

# `exist_ok=True` creates the directory (and any parents) only when it is
# missing, avoiding the check-then-create race of a separate
# `os.path.exists` test.
os.makedirs(target_dir, exist_ok=True)
output_dir = Path(target_dir)
nlp.to_disk(output_dir)
print('Saved model to', output_dir)

Saved model to content/2020_08_07


# Loading the model and predicting

In [16]:
# Show the directory path held in `target_dir`.
print('Loading from', target_dir)

Loading from content/2020_08_07/


In [18]:
# Reload the pipeline from disk and run it over a few sample sentences.
nlp_updated = spacy.load(output_dir)

test_data = [
    'Kotlin is an awesome programming language',
    'Should I learn ReactJS in 2020?',
    'Should I learn react in 2020?',
    'Peter is learning Python, Go, JavaScript and Rust',
    'Apple uses Swift programming language for iOS',
]
for sentence in test_data:
    doc = nlp_updated(sentence)
    print('Entities', [(span.text, span.label_) for span in doc.ents])

Entities [('Kotlin', 'SKILL')]
Entities []
Entities [('react', 'SKILL')]
Entities [('Python', 'SKILL'), ('Go', 'SKILL'), ('JavaScript', 'SKILL'), ('Rust', 'SKILL')]
Entities [('Swift', 'SKILL')]
