## Load Packages

In [20]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm 

In [21]:
# Pretrained small English pipeline; used in the next cells to show the stock NER output.
nlp1 = spacy.load("en_core_web_sm")

## How the pretrained NER works

In [22]:
# Run the pretrained pipeline on a question that contains a single given name.
docx1 = nlp1("Who is Nishanth?")

In [23]:
# Print text, character offsets and label for every entity found in docx1.
for ent in docx1.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [24]:
# Same question pattern, this time with a two-word name.
docx2 = nlp1("Who is Kamal Khumar?")

In [25]:
# Print text, character offsets and label for every entity found in docx2.
for ent in docx2.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Kamal Khumar 7 19 PERSON


## Train and Test Data

In [39]:
# Annotated examples in the form (text, {'entities': [(start_char, end_char, label)]}).
# Offsets are character positions into `text`; `end_char` is exclusive, so
# text[start_char:end_char] must be exactly the course code (e.g. 'cosc 301').
# 'COR' is the label used for course codes.
TRAIN_DATA = [
    ('I am taking cosc 301', {
        'entities': [(12, 20, 'COR')]
    }),
    ('I am taking phil 331', {
        'entities': [(12, 20, 'COR')]
    }),
    ('I am taking data 301', {
        'entities': [(12, 20, 'COR')]
    }),
    ('I love engl 351 so much', {
        'entities': [(7, 15, 'COR')]
    }),
    ('Why is span 419 so hard', {
        'entities': [(7, 15, 'COR')]
    }),
    ('I just enrolled in math 101 and I am excited to start learning about calculus.', {
        'entities': [(19, 27, 'COR')]
    }),
    ('Last semester, I took psyc 200 and it was one of my favorite classes.', {
        'entities': [(22, 30, 'COR')]
    }),
    ('I am struggling in econ 301, but I am determined to improve my understanding of macroeconomics.', {
        'entities': [(19, 27, 'COR')]
    }),
    ('I am considering taking chem 110 next semester.', {
        'entities': [(24, 32, 'COR')]
    }),
    ('I have a final exam in stat 400 next week and I have been studying non-stop to prepare.', {
        'entities': [(23, 31, 'COR')]
    }),
    ('I am struggling in mus 210, but I am determined to improve my musical abilities.', {
        'entities': [(19, 26, 'COR')]
    })
]

# Same format as TRAIN_DATA. Course codes carry the 'COR' label.
TEST_DATA = [
    ('How is math 101?', {
        'entities': [(7, 15, 'COR')]
    }),
    ('Who is cosc 554?', {
        'entities': [(7, 15, 'COR')]
    }),
    # Note: the LOC label does not occur anywhere in TRAIN_DATA.
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    }),
    ('what is phil 331 about', {
        'entities': [(8, 16, 'COR')]
    })
]

# Sentences that each mention a course code (mostly upper-case, unlike TRAIN_DATA).
TEST_Sentences = [
    "I am taking COSC 301",
    "I'm taking PSYC 210 and learning about the different approaches to psychology.",
    "I just enrolled in math 101 and I'm excited to start learning about calculus.",
    "I'm considering taking MATH 202 next semester, but I'm not sure if I'm ready for the challenge.",
    "Last semester, I took PSYC 200 and it was one of my favorite classes.",
    "I'm struggling in ECON 301, but I'm determined to improve my understanding of macroeconomics.",
    "This summer, I'm enrolled in SPAN 301 and looking forward to improving my language skills.",
    "I have a final exam in STAT 400 next week and I have been studying non-stop to prepare.",
    "I loved taking HIST 205 and learning about the Civil Rights Movement.",
    "I am considering taking PHIL 331 next semester to fulfill my ethics requirement.",
    "I'm taking ART 101 this semester to fulfill my fine arts requirement.",
    "I just enrolled in CS 201 and I'm excited to start learning about algorithms.",
    "I'm struggling in GYM 210, but I'm determined to improve my physical fitness.",
    "This summer, I'm enrolled in BIO 301 and looking forward to learning about genetics.",
    "I'm considering taking PE 201 next semester to fulfill my physical education requirement.",
]

# Everyday sentences that contain no course code at all.
sentences = [
    "I woke up early this morning and went for a run.",
    "Sheila is a great singer and songwriter.",
    "The pizza at that new restaurant is amazing.",
    "I can't wait for summer vacation to start.",
    "The movie we saw last night was really funny.",
    "My dog is always excited to see me when I come home.",
    "I'm thinking about starting a new hobby, like painting or woodworking.",
    "I'm really proud of my little brother for getting into college.",
    "I love listening to music while I'm driving.",
    "I have a lot of work to do, but I'm trying to stay focused.",
    "I can't believe how fast time is flying by.",
    "I'm going to the beach this weekend with some friends.",
    "I'm trying to eat healthier and exercise more.",
    "I'm thinking about taking a cooking class to improve my skills.",
    "I'm really enjoying this book that I'm reading right now.",
    "I'm trying to learn a new language, but it's difficult.",
    "I'm excited to see what the future holds.",
    "I'm grateful for all the support I've received from my family and friends.",
    "I'm always looking for ways to improve myself and grow as a person.",
]


## Define our variables

In [40]:
# Training configuration.
n_iter = 100                    # number of passes over TRAIN_DATA
output_dir = Path("NerModels")  # directory the trained pipeline is written to
model = None                    # None: a blank English pipeline is created instead of loading one

## Load the model

In [41]:
# Start from an empty English pipeline unless `model` names an existing one to load.
if model is None:
    nlp = spacy.blank("en")
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

Created blank 'en' model


## Set up the pipeline

In [42]:
# Make sure the pipeline has an NER component and keep a handle to it in `ner`.
# `add_pipe` builds the component, inserts it into the pipeline and returns that
# same instance. Building one with `create_pipe` and then calling
# `add_pipe('ner')` would produce two separate components, leaving `ner`
# pointing at one that is not part of the pipeline.
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

## Train the Recognizer

In [43]:
from spacy.training import Example

# Look the NER component up on the pipeline itself, so labels are registered on
# the component that is actually trained rather than on a stale handle.
ner = nlp.get_pipe('ner')
for _, annotations in TRAIN_DATA:
    # Tolerate examples without an 'entities' key (treated as having no entities).
    for _start, _end, label in annotations.get('entities', []):
        ner.add_label(label)

# Train with only the NER component enabled; any other component is disabled
# for the duration of the block and restored afterwards.
with nlp.select_pipes(enable=['ner']):
    # `initialize` is the spaCy v3 name for the deprecated `begin_training`;
    # it returns the optimizer used for the updates below.
    optimizer = nlp.initialize()
    for itn in range(n_iter):
        # Shuffle a copy each epoch so TRAIN_DATA itself keeps its original order.
        epoch_data = random.sample(TRAIN_DATA, len(TRAIN_DATA))
        losses = {}  # accumulated by nlp.update over this epoch
        for text, annotations in tqdm(epoch_data):
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update(
                [example],      # one example per update step
                drop=0.5,       # dropout rate
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 112.22it/s]


{'ner': 71.75096745789051}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 106.29it/s]


{'ner': 20.62442627787823}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 118.63it/s]


{'ner': 12.729863587112789}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 119.93it/s]


{'ner': 11.773973840874742}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 118.28it/s]


{'ner': 12.239193911853967}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 123.06it/s]


{'ner': 10.066534044280207}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 116.82it/s]


{'ner': 9.52878390995864}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 122.76it/s]


{'ner': 10.190890536953043}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 121.67it/s]


{'ner': 4.774805301474298}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 121.47it/s]


{'ner': 3.0994983894618664}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 110.84it/s]


{'ner': 0.7727659025758478}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 122.65it/s]


{'ner': 1.9360509260779342}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 124.80it/s]


{'ner': 0.28857574874537306}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 124.69it/s]


{'ner': 0.0022834607994119934}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 123.64it/s]


{'ner': 5.291604426667024}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 123.91it/s]


{'ner': 1.7740236629359274}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 125.28it/s]


{'ner': 0.06877754889724502}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 124.28it/s]


{'ner': 1.6144758946648616}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 125.52it/s]


{'ner': 0.00630651765638292}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 121.46it/s]


{'ner': 1.666977546866079}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 110.72it/s]


{'ner': 0.2828312971811244}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 116.82it/s]


{'ner': 0.0001290884850905492}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 117.19it/s]


{'ner': 1.5868710100870258}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 118.62it/s]


{'ner': 0.0008146441695769086}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 117.87it/s]


{'ner': 0.13452777859108234}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 117.96it/s]


{'ner': 0.1258960778485864}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 116.83it/s]


{'ner': 8.575201208365285e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 118.47it/s]


{'ner': 0.0004911754290514856}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 118.99it/s]


{'ner': 1.722125593953533e-05}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 119.76it/s]


{'ner': 0.0008869231134449916}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 117.14it/s]


{'ner': 1.7002182822679136e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 123.66it/s]


{'ner': 0.00012602331433197037}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 127.10it/s]


{'ner': 5.6892928937885236e-05}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.69it/s]


{'ner': 1.5593075439765754e-05}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.36it/s]


{'ner': 0.0004352572083195625}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.28it/s]


{'ner': 6.389197377010254e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.20it/s]


{'ner': 6.1158502994604965e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.30it/s]


{'ner': 0.004777154850645347}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.62it/s]


{'ner': 0.009881988556603787}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.00it/s]


{'ner': 3.570714238119732e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.98it/s]


{'ner': 4.54292073340965e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.12it/s]


{'ner': 2.1052928334640505e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.19it/s]


{'ner': 1.608333734604596e-05}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.84it/s]


{'ner': 2.9004360150361452e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.74it/s]


{'ner': 9.220475213114075e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.09it/s]


{'ner': 6.709621160889043e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.31it/s]


{'ner': 0.0002034089714915925}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 131.16it/s]


{'ner': 1.085744578014083e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.19it/s]


{'ner': 2.474283173012168e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.42it/s]


{'ner': 0.0005132381905404548}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.04it/s]


{'ner': 0.01813832629871636}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.74it/s]


{'ner': 1.4150601507201092e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.93it/s]


{'ner': 6.703885382540837e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 123.27it/s]


{'ner': 0.00030730373963563213}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 121.33it/s]


{'ner': 0.8944927139074922}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 126.13it/s]


{'ner': 0.00013572981366428226}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.12it/s]


{'ner': 0.00010375179204750446}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 110.94it/s]


{'ner': 4.022107521641572e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.79it/s]


{'ner': 0.0325671048293405}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.17it/s]


{'ner': 3.981766795186756e-05}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.04it/s]


{'ner': 6.361820111584707e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.31it/s]


{'ner': 9.686642266574221e-05}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.81it/s]


{'ner': 7.819452940334253e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.98it/s]


{'ner': 2.0342949337913004e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 127.66it/s]


{'ner': 1.3516382951433084e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 120.69it/s]


{'ner': 4.520359844213109e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.63it/s]


{'ner': 9.59345570350574e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.20it/s]


{'ner': 4.270889425617967e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.72it/s]


{'ner': 1.3104016672666093e-05}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.47it/s]


{'ner': 1.8857186314214634e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.03it/s]


{'ner': 1.4775696689874959e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.66it/s]


{'ner': 7.169203008148959e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.47it/s]


{'ner': 1.0152202895031126e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.49it/s]


{'ner': 7.3822857008434145e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.14it/s]


{'ner': 0.1454759225539951}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.01it/s]


{'ner': 2.247048737716948e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 122.37it/s]


{'ner': 0.0037563024587874737}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 121.79it/s]


{'ner': 3.283993147097197e-05}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.61it/s]


{'ner': 2.2211936917607938e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.10it/s]


{'ner': 2.4407646756311936e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 123.40it/s]


{'ner': 3.291603597289971e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.94it/s]


{'ner': 1.6274838171756227e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.90it/s]


{'ner': 4.301372458474138e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.13it/s]


{'ner': 1.004580809032939e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.64it/s]


{'ner': 5.90926110814081e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.56it/s]


{'ner': 6.692186409672567e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.81it/s]


{'ner': 0.01118066297175901}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.95it/s]


{'ner': 2.7347964883757713e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.17it/s]


{'ner': 1.3489336056699493e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.62it/s]


{'ner': 3.499989765760642e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.61it/s]


{'ner': 7.344436843135839e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 119.39it/s]


{'ner': 1.1497664255972875e-06}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 115.78it/s]


{'ner': 1.4643090964293665e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 118.74it/s]


{'ner': 2.9212977315749932e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.59it/s]


{'ner': 3.375647710315969e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.80it/s]


{'ner': 1.4529358860328276e-09}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 128.48it/s]


{'ner': 3.245639390790732e-07}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.87it/s]


{'ner': 2.5151659589621365e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 129.33it/s]


{'ner': 1.409711697664176e-08}


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 130.92it/s]

{'ner': 6.946587431377889e-07}





## Test the trained model

In [44]:
# for text, _ in TRAIN_DATA:
#     doc = nlp(text)
#     print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
#     print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

## Save the model

In [45]:
# Write the trained pipeline to `output_dir`; skipped when no directory is configured.
if output_dir is not None:
    output_dir = Path(output_dir)
    # Create the directory together with any missing parents. An already
    # existing directory is accepted, so no separate existence check is needed.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to NerModels


## Test the saved model

In [46]:
# Reload the pipeline from disk and show, for each TEST_DATA text, the entities
# found plus the per-token entity type and IOB code.
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for sample_text, _gold in TEST_DATA:
    parsed = nlp2(sample_text)
    entity_pairs = [(span.text, span.label_) for span in parsed.ents]
    token_rows = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in parsed]
    print('Entities', entity_pairs)
    print('Tokens', token_rows)

Loading from NerModels
Entities []
Tokens [('How', '', 2), ('is', '', 2), ('math', '', 2), ('101', '', 2), ('?', '', 2)]
Entities [('cosc 554', 'COR')]
Tokens [('Who', '', 2), ('is', '', 2), ('cosc', 'COR', 3), ('554', 'COR', 1), ('?', '', 2)]
Entities []
Tokens [('I', '', 2), ('like', '', 2), ('London', '', 2), ('and', '', 2), ('Berlin', '', 2), ('.', '', 2)]
Entities [('phil 331', 'COR')]
Tokens [('what', '', 2), ('is', '', 2), ('phil', 'COR', 3), ('331', 'COR', 1), ('about', '', 2)]


In [47]:
# Every sentence in TEST_Sentences mentions a course code, so each printed
# list is expected to contain a COR entity.
for sentence in TEST_Sentences:
    found = [(span.text, span.label_) for span in nlp2(sentence).ents]
    print('Entities', found)

Entities [('COSC 301', 'COR')]
Entities [('PSYC 210', 'COR')]
Entities [('math 101', 'COR')]
Entities [('MATH 202', 'COR')]
Entities [('PSYC 200', 'COR')]
Entities [('ECON 301', 'COR')]
Entities [('SPAN 301', 'COR')]
Entities [('STAT 400', 'COR')]
Entities [('HIST 205', 'COR'), ('Civil Rights', 'COR')]
Entities [('PHIL 331', 'COR')]
Entities [('ART 101', 'COR')]
Entities [('CS 201', 'COR')]
Entities [('GYM 210', 'COR')]
Entities [('BIO 301', 'COR')]
Entities [('PE 201', 'COR')]


In [48]:
# None of these sentences mentions a course, so every printed list should be empty.
for plain_sentence in sentences:
    hits = [(span.text, span.label_) for span in nlp2(plain_sentence).ents]
    print('Entities', hits)

Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []


## Clean version (inference only, using the saved model)

In [49]:
# from __future__ import unicode_literals, print_function
# import plac
# import random
from pathlib import Path
import spacy
# from tqdm import tqdm 

In [61]:
# Annotated examples in the form (text, {'entities': [(start_char, end_char, label)]}).
# Offsets are character positions into `text`; `end_char` is exclusive, so
# text[start_char:end_char] must be exactly the course code (e.g. 'cosc 301').
# 'COR' is the label used for course codes.
TRAIN_DATA = [
    ('I am taking cosc 301', {
        'entities': [(12, 20, 'COR')]
    }),
    ('I am taking phil 331', {
        'entities': [(12, 20, 'COR')]
    }),
    ('I am taking data 301', {
        'entities': [(12, 20, 'COR')]
    }),
    ('I love engl 351 so much', {
        'entities': [(7, 15, 'COR')]
    }),
    ('Why is span 419 so hard', {
        'entities': [(7, 15, 'COR')]
    }),
    ('I just enrolled in math 101 and I am excited to start learning about calculus.', {
        'entities': [(19, 27, 'COR')]
    }),
    ('Last semester, I took psyc 200 and it was one of my favorite classes.', {
        'entities': [(22, 30, 'COR')]
    }),
    ('I am struggling in econ 301, but I am determined to improve my understanding of macroeconomics.', {
        'entities': [(19, 27, 'COR')]
    }),
    ('I am considering taking chem 110 next semester.', {
        'entities': [(24, 32, 'COR')]
    }),
    ('I have a final exam in stat 400 next week and I have been studying non-stop to prepare.', {
        'entities': [(23, 31, 'COR')]
    }),
    ('I am struggling in mus 210, but I am determined to improve my musical abilities.', {
        'entities': [(19, 26, 'COR')]
    })
]

# Same format as TRAIN_DATA. Course codes carry the 'COR' label.
TEST_DATA = [
    ('How is math 101?', {
        'entities': [(7, 15, 'COR')]
    }),
    ('Who is cosc 554?', {
        'entities': [(7, 15, 'COR')]
    }),
    # Note: the LOC label does not occur anywhere in TRAIN_DATA.
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    }),
    ('what is phil 331 about', {
        'entities': [(8, 16, 'COR')]
    })
]

# Sentences that each mention a course code (mostly upper-case, unlike TRAIN_DATA).
TEST_Sentences = [
    "I am taking COSC 301",
    "I'm taking PSYC 210 and learning about the different approaches to psychology.",
    "I just enrolled in math 101 and I'm excited to start learning about calculus.",
    "I'm considering taking MATH 202 next semester, but I'm not sure if I'm ready for the challenge.",
    "Last semester, I took PSYC 200 and it was one of my favorite classes.",
    "I'm struggling in ECON 301, but I'm determined to improve my understanding of macroeconomics.",
    "This summer, I'm enrolled in SPAN 301 and looking forward to improving my language skills.",
    "I have a final exam in STAT 400 next week and I have been studying non-stop to prepare.",
    "I loved taking HIST 205 and learning about the Civil Rights Movement.",
    "I am considering taking PHIL 331 next semester to fulfill my ethics requirement.",
    "I'm taking ART 101 this semester to fulfill my fine arts requirement.",
    "I just enrolled in CS 201 and I'm excited to start learning about algorithms.",
    "I'm struggling in GYM 210, but I'm determined to improve my physical fitness.",
    "This summer, I'm enrolled in BIO 301 and looking forward to learning about genetics.",
    "I'm considering taking PE 201 next semester to fulfill my physical education requirement.",
]

# Everyday sentences that contain no course code at all.
sentences = [
    "I woke up early this morning and went for a run.",
    "Sheila is a great singer and songwriter.",
    "The pizza at that new restaurant is amazing.",
    "I can't wait for summer vacation to start.",
    "The movie we saw last night was really funny.",
    "My dog is always excited to see me when I come home.",
    "I'm thinking about starting a new hobby, like painting or woodworking.",
    "I'm really proud of my little brother for getting into college.",
    "I love listening to music while I'm driving.",
    "I have a lot of work to do, but I'm trying to stay focused.",
    "I can't believe how fast time is flying by.",
    "I'm going to the beach this weekend with some friends.",
    "I'm trying to eat healthier and exercise more.",
    "I'm thinking about taking a cooking class to improve my skills.",
    "I'm really enjoying this book that I'm reading right now.",
    "I'm trying to learn a new language, but it's difficult.",
    "I'm excited to see what the future holds.",
    "I'm grateful for all the support I've received from my family and friends.",
    "I'm always looking for ways to improve myself and grow as a person.",
]


In [62]:
# Directory the saved pipeline is loaded from; no base model name is set.
output_dir = Path("NerModels")
model = None

In [63]:
# Load the saved pipeline and list the entities found in each course sentence.
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for sentence in TEST_Sentences:
    found = [(span.text, span.label_) for span in nlp2(sentence).ents]
    print('Entities', found)

Loading from NerModels
Entities [('COSC 301', 'COR')]
Entities [('PSYC 210', 'COR')]
Entities [('math 101', 'COR')]
Entities [('MATH 202', 'COR')]
Entities [('PSYC 200', 'COR')]
Entities [('ECON 301', 'COR')]
Entities [('SPAN 301', 'COR')]
Entities [('STAT 400', 'COR')]
Entities [('HIST 205', 'COR'), ('Civil Rights', 'COR')]
Entities [('PHIL 331', 'COR')]
Entities [('ART 101', 'COR')]
Entities [('CS 201', 'COR')]
Entities [('GYM 210', 'COR')]
Entities [('BIO 301', 'COR')]
Entities [('PE 201', 'COR')]


In [64]:
# None of these sentences mentions a course, so every printed list should be empty.
for plain_sentence in sentences:
    hits = [(span.text, span.label_) for span in nlp2(plain_sentence).ents]
    print('Entities', hits)

Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []


In [59]:
# Ad-hoc check: run the reloaded pipeline on one more sentence that starts with a course code.
doc = nlp2("cosc 121 is my fac language")

In [60]:
# Show (text, label) for each entity detected in `doc`.
detected = [(span.text, span.label_) for span in doc.ents]
print('Entities', detected)

Entities [('cosc 121', 'COR')]
