In [34]:
import os
import numpy as np
import pandas as pd
import spacy
import plac
import random
from pathlib import Path
from tqdm import tqdm
import time
from datetime import datetime

# Functions

In [4]:
def write_data(data_list:list, file_name:str):
    """Create (or overwrite) ``file_name`` and dump ``data_list`` into it.

    Every item is converted with ``str()`` and written on its own line,
    terminated by a newline character.
    """
    with open(file_name, 'w') as out:
        out.writelines(str(item) + '\n' for item in data_list)

In [5]:
def load_train_data(file_name:str, label:str):
    """Load training examples from ``file_name`` in spaCy's (text, annotations) format.

    Each non-blank line of the file becomes one example. The line is stripped
    of surrounding whitespace and, unless ``label`` is ``'none'`` (case
    insensitive), the whole stripped text is annotated as a single entity
    spanning ``(0, len(text))`` with the upper-cased label.

    Parameters
    ----------
    file_name : path of a text file with one example per line.
    label     : entity label for every line; ``'none'`` yields examples
                with an empty entity list.

    Returns
    -------
    list of ``(text, {'entities': [...]})`` tuples.
    """
    tag = label.upper()
    data_list = []
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file_name, 'r') as file:
        for line in file:
            text = line.strip()
            if not text:
                # A blank line would produce an empty text with a zero-length
                # (0, 0, LABEL) span, which is not a usable training example.
                continue
            entities = [] if tag == 'NONE' else [(0, len(text), tag)]
            data_list.append((text, {'entities': entities}))
    return data_list
    

In [6]:
def load_test_data(file:str, true_label:str):
    """Read ``file`` line by line and return the samples as a DataFrame.

    The frame has two columns: ``data`` (each line stripped of surrounding
    whitespace) and ``true_label`` (the upper-cased ``true_label`` repeated
    for every row).
    """
    with open(file, 'r') as handle:
        samples = [raw.strip() for raw in handle.readlines()]
    frame = pd.DataFrame({'data':samples, 'true_label':true_label.upper()})
    return frame

# Train data

In [92]:
# Load the four labelled subsets. Paths are built with os.path.join so the
# cell also works on non-Windows systems (a literal '\\' separator is only
# understood as a directory separator on Windows).
train_addr = load_train_data(os.path.join('train', 'train_addr.txt'),'addr')
train_email = load_train_data(os.path.join('train', 'train_email.txt'),'email')
train_none = load_train_data(os.path.join('train', 'train_none.txt'),'none')
train_phone = load_train_data(os.path.join('train', 'train_phone.txt'),'phone')

# Report the size and the first ten examples of each subset, in load order.
for subset in (train_addr, train_email, train_none, train_phone):
    print(len(subset))
    print(subset[:10])

1700
[('2 RECTOR STREET NEW YORK NY 10006', {'entities': [(0, 33, 'ADDR')]}), ('115 S. LASALLE ST. CHICAGO', {'entities': [(0, 26, 'ADDR')]}), ('1818 State Route 3, Fulton NY 13069', {'entities': [(0, 35, 'ADDR')]}), ('P.O. Box 929 4189 Nunc Road, Lebanon KY 69409', {'entities': [(0, 45, 'ADDR')]}), ('3535 CANAL STREET NEW ORLEANS 70119', {'entities': [(0, 35, 'ADDR')]}), ('6140A Univ Drive, Huntsville AL 35806', {'entities': [(0, 37, 'ADDR')]}), ('5949 SHERRY LANE  SUITE 1500 DALLAS TX 75225', {'entities': [(0, 44, 'ADDR')]}), ('5050 POPLAR AVE., STE 1430 NJ 7044', {'entities': [(0, 34, 'ADDR')]}), ('589 5TH AVENUE NEW YORK NY 10017', {'entities': [(0, 32, 'ADDR')]}), ('701 Mcmeans Ave, Bay Minette AL 36507', {'entities': [(0, 37, 'ADDR')]})]
1300
[('Lauren_Campagne@gmail.com', {'entities': [(0, 25, 'EMAIL')]}), ('Edward_Briere@gmail.com', {'entities': [(0, 23, 'EMAIL')]}), ('Phillip_Dedmond@gmail.com', {'entities': [(0, 25, 'EMAIL')]}), ('Frank_Dent318@yahoo.com', {'entities': [(0, 2

In [93]:
# Merge the four labelled subsets into one training list (addr, email, none, phone).
TRAIN_DATA = [*train_addr, *train_email, *train_none, *train_phone]

print(len(TRAIN_DATA))
TRAIN_DATA[:10]

6700


[('2 RECTOR STREET NEW YORK NY 10006', {'entities': [(0, 33, 'ADDR')]}),
 ('115 S. LASALLE ST. CHICAGO', {'entities': [(0, 26, 'ADDR')]}),
 ('1818 State Route 3, Fulton NY 13069', {'entities': [(0, 35, 'ADDR')]}),
 ('P.O. Box 929 4189 Nunc Road, Lebanon KY 69409',
  {'entities': [(0, 45, 'ADDR')]}),
 ('3535 CANAL STREET NEW ORLEANS 70119', {'entities': [(0, 35, 'ADDR')]}),
 ('6140A Univ Drive, Huntsville AL 35806', {'entities': [(0, 37, 'ADDR')]}),
 ('5949 SHERRY LANE  SUITE 1500 DALLAS TX 75225',
  {'entities': [(0, 44, 'ADDR')]}),
 ('5050 POPLAR AVE., STE 1430 NJ 7044', {'entities': [(0, 34, 'ADDR')]}),
 ('589 5TH AVENUE NEW YORK NY 10017', {'entities': [(0, 32, 'ADDR')]}),
 ('701 Mcmeans Ave, Bay Minette AL 36507', {'entities': [(0, 37, 'ADDR')]})]

In [94]:
# Peek at the last ten combined examples (the phone subset was concatenated last).
TRAIN_DATA[-10:]

[('399.670-0945', {'entities': [(0, 12, 'PHONE')]}),
 ('+3(558) 904.1728', {'entities': [(0, 16, 'PHONE')]}),
 ('+1 995-765-4530', {'entities': [(0, 15, 'PHONE')]}),
 ('551 978 8144', {'entities': [(0, 12, 'PHONE')]}),
 ('+7.(693).480 4334', {'entities': [(0, 17, 'PHONE')]}),
 ('+2.(294)-415-5499', {'entities': [(0, 17, 'PHONE')]}),
 ('+16102347339', {'entities': [(0, 12, 'PHONE')]}),
 ('+3 (768).819.4668', {'entities': [(0, 17, 'PHONE')]}),
 ('+0-(381)-635.0731', {'entities': [(0, 17, 'PHONE')]}),
 ('+1 (945) 879-2933', {'entities': [(0, 17, 'PHONE')]})]

# Model

### Load

In [95]:
# Load the pretrained English pipeline
nlp = spacy.load('en_core_web_sm')
print("Model loaded")

# Fetch the pipeline's NER component; if the pipeline has none,
# create one and append it as the last component.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

Model loaded


In [96]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [97]:
# Display the NER component object that the training cell will update.
ner

<spacy.pipeline.pipes.EntityRecognizer at 0x299010054c0>

### Train

In [98]:
n_iter = 50
drop = 0.5

# Register every entity label found in the training data with the NER component.
# Labels are de-duplicated and sorted so the registration order does not depend
# on the current order of TRAIN_DATA.
entity_labels = {
    ent[2]
    for _, annotations in TRAIN_DATA
    for ent in annotations.get('entities')
}
for entity_label in sorted(entity_labels):
    ner.add_label(entity_label)

# Shuffle a private copy each epoch so TRAIN_DATA keeps the order displayed in
# earlier cells and this cell can be re-run without hidden state.
train_examples = list(TRAIN_DATA)

# Build the log path portably and make sure the directory exists; open() would
# otherwise raise FileNotFoundError when 'log' is missing.
log_dir = 'log'
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, 'log_train_'+str(datetime.now().strftime("%Y%m%d_%H%M%S"))+'.txt')

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    # NOTE(review): in spaCy v2, begin_training() appears to re-initialise the
    # weights of the enabled components, so the pretrained NER would be trained
    # from scratch here. If the intent is to update the pretrained weights,
    # nlp.resume_training() is the documented alternative - confirm before changing.
    optimizer = nlp.begin_training()
    # The ``with`` block closes the log file; no explicit close() is needed.
    with open(log_path,'w') as logfile:
        logfile.write('Training points: '+str(len(TRAIN_DATA))+'\n')
        logfile.write('Iterations: '+str(n_iter)+ '\n')
        logfile.write('drop: '+ str(drop) + '\n\n')
        start_outer = time.time()

        for itn in range(n_iter):
            start_inner = time.time()
            random.shuffle(train_examples)
            losses = {}  # accumulated by nlp.update over the whole epoch
            # One update per example (batch size 1).
            for text, annotations in tqdm(train_examples):
                nlp.update(
                    [text],
                    [annotations],
                    drop=drop,
                    sgd=optimizer,
                    losses=losses)

            stop_inner = time.time()
            print(str(itn) +'\t'+ 't:'+str(stop_inner - start_inner) +'\t'+ str(losses))
            logfile.write(str(itn)+'\t'+'t: '+str(stop_inner - start_inner)+'\t'+str(losses)+'\n')

        stop_outer = time.time()
        print(f'\n\n Total Training time: {stop_outer-start_outer} seconds')
        logfile.write('\n\nTotal Training time: '+ str(stop_outer - start_outer)+ ' s')


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:47<00:00, 12.71it/s]
  0%|                                                                                 | 2/6700 [00:00<09:36, 11.61it/s]

t:527.3010256290436	{'ner': 17638.887911547314}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:08<00:00, 12.21it/s]
  0%|                                                                                 | 2/6700 [00:00<09:59, 11.17it/s]

t:548.7311816215515	{'ner': 16149.14939398244}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:57<00:00, 12.45it/s]
  0%|                                                                                 | 2/6700 [00:00<07:51, 14.21it/s]

t:537.9407660961151	{'ner': 15521.476014318034}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:32<00:00, 13.07it/s]
  0%|                                                                                 | 2/6700 [00:00<07:18, 15.27it/s]

t:512.5814142227173	{'ner': 15641.776689201917}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:42<00:00, 12.81it/s]
  0%|                                                                                 | 2/6700 [00:00<09:13, 12.09it/s]

t:522.8810827732086	{'ner': 15532.878592632664}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:24<00:00, 11.86it/s]
  0%|                                                                                 | 1/6700 [00:00<15:25,  7.24it/s]

t:564.8687679767609	{'ner': 15458.144909538147}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [11:18<00:00,  9.88it/s]
  0%|                                                                                         | 0/6700 [00:00<?, ?it/s]

t:678.4474263191223	{'ner': 15321.594589240247}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [11:25<00:00,  9.77it/s]
  0%|                                                                                 | 2/6700 [00:00<07:34, 14.75it/s]

t:685.4371466636658	{'ner': 15528.711187297919}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [11:26<00:00,  9.76it/s]
  0%|                                                                                 | 1/6700 [00:00<11:31,  9.69it/s]

t:686.1377377510071	{'ner': 15253.763574390665}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [11:10<00:00,  9.99it/s]
  0%|                                                                                 | 2/6700 [00:00<05:38, 19.77it/s]

t:670.7308793067932	{'ner': 15248.977620899917}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [07:25<00:00, 15.05it/s]
  0%|                                                                                 | 2/6700 [00:00<09:59, 11.17it/s]

t:445.32083010673523	{'ner': 15393.161695057768}


100%|███████████████████████████████████████████████████████████████████████████| 6700/6700 [15:52:34<00:00,  8.53s/it]
  0%|                                                                                 | 2/6700 [00:00<07:16, 15.35it/s]

t:57154.440643548965	{'ner': 15509.54621102179}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [07:24<00:00, 15.09it/s]
  0%|                                                                                 | 2/6700 [00:00<06:33, 17.01it/s]

t:444.1333954334259	{'ner': 15299.732333014781}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [07:31<00:00, 14.84it/s]
  0%|                                                                                 | 2/6700 [00:00<07:26, 15.01it/s]

t:451.4244031906128	{'ner': 15387.526214542726}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:19<00:00, 13.41it/s]
  0%|                                                                                 | 2/6700 [00:00<08:52, 12.58it/s]

t:499.7312581539154	{'ner': 15218.680126901034}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:59<00:00, 12.43it/s]
  0%|                                                                                 | 2/6700 [00:00<09:35, 11.64it/s]

t:539.1376550197601	{'ner': 15282.173661301555}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:02<00:00, 13.88it/s]
  0%|                                                                                 | 2/6700 [00:00<08:32, 13.07it/s]

t:482.7048680782318	{'ner': 15404.734739058347}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:30<00:00, 11.75it/s]
  0%|                                                                                 | 2/6700 [00:00<09:16, 12.03it/s]

t:570.4524850845337	{'ner': 15458.806988648857}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:16<00:00, 12.04it/s]
  0%|                                                                                 | 2/6700 [00:00<09:01, 12.38it/s]

t:556.5537662506104	{'ner': 15308.487770590888}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:07<00:00, 12.24it/s]
  0%|                                                                                 | 2/6700 [00:00<08:24, 13.28it/s]

t:547.2020313739777	{'ner': 15474.149386067449}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:05<00:00, 12.27it/s]
  0%|                                                                                 | 2/6700 [00:00<08:12, 13.59it/s]

t:545.8417272567749	{'ner': 15502.651523485572}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:29<00:00, 11.77it/s]
  0%|                                                                                 | 2/6700 [00:00<07:42, 14.49it/s]

t:569.3568961620331	{'ner': 15284.096107997122}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:21<00:00, 11.93it/s]
  0%|                                                                                 | 1/6700 [00:00<12:26,  8.98it/s]

t:561.7874991893768	{'ner': 15490.785928772262}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:39<00:00, 11.57it/s]
  0%|                                                                                         | 0/6700 [00:00<?, ?it/s]

t:579.1214370727539	{'ner': 15475.558139838387}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:48<00:00, 11.39it/s]
  0%|                                                                                 | 2/6700 [00:00<08:22, 13.33it/s]

t:588.4969365596771	{'ner': 15464.272327768194}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:01<00:00, 12.36it/s]
  0%|                                                                                 | 2/6700 [00:00<09:23, 11.89it/s]

t:541.8881475925446	{'ner': 15359.981092274724}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:56<00:00, 12.50it/s]
  0%|                                                                                 | 2/6700 [00:00<09:30, 11.74it/s]

t:536.1348550319672	{'ner': 15549.49531828538}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [10:02<00:00, 11.12it/s]
  0%|                                                                                 | 2/6700 [00:00<09:16, 12.04it/s]

t:602.6763031482697	{'ner': 15608.52637454117}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:48<00:00, 11.39it/s]
  0%|                                                                                 | 2/6700 [00:00<09:42, 11.49it/s]

t:588.3951013088226	{'ner': 15487.922938745101}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [10:16<00:00, 10.87it/s]
  0%|                                                                                 | 2/6700 [00:00<09:56, 11.24it/s]

t:616.1677544116974	{'ner': 15525.811214801815}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:52<00:00, 12.58it/s]
  0%|                                                                                 | 2/6700 [00:00<08:10, 13.64it/s]

t:532.5624620914459	{'ner': 15415.328714291289}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:20<00:00, 13.38it/s]
  0%|                                                                                 | 2/6700 [00:00<07:35, 14.70it/s]

t:500.77957677841187	{'ner': 15276.29907814089}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:19<00:00, 13.41it/s]
  0%|                                                                                 | 2/6700 [00:00<07:52, 14.16it/s]

t:499.6730456352234	{'ner': 15544.648153099139}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:16<00:00, 13.49it/s]
  0%|                                                                                 | 2/6700 [00:00<08:04, 13.83it/s]

t:496.5537829399109	{'ner': 15465.01425478943}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:55<00:00, 12.51it/s]
  0%|                                                                                 | 2/6700 [00:00<09:22, 11.90it/s]

t:535.5794496536255	{'ner': 15440.296046555113}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:24<00:00, 11.87it/s]
  0%|                                                                                 | 2/6700 [00:00<08:53, 12.56it/s]

t:564.6243178844452	{'ner': 15501.422416101772}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:08<00:00, 12.21it/s]
  0%|                                                                                 | 2/6700 [00:00<08:32, 13.06it/s]

t:548.5859892368317	{'ner': 15441.717721933675}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:08<00:00, 12.22it/s]
  0%|                                                                                 | 2/6700 [00:00<09:29, 11.75it/s]

t:548.185868024826	{'ner': 15704.820254443854}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:09<00:00, 12.20it/s]
  0%|                                                                                 | 2/6700 [00:00<09:00, 12.40it/s]

t:549.3895680904388	{'ner': 15594.881770063817}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:34<00:00, 13.01it/s]
  0%|                                                                                 | 2/6700 [00:00<08:05, 13.81it/s]

t:514.8238291740417	{'ner': 15469.41903139613}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:27<00:00, 13.20it/s]
  0%|                                                                                 | 2/6700 [00:00<07:57, 14.03it/s]

t:507.48176288604736	{'ner': 15530.214493742264}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:44<00:00, 12.77it/s]
  0%|                                                                                 | 1/6700 [00:00<12:36,  8.86it/s]

t:524.6264808177948	{'ner': 15438.436614486007}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:33<00:00, 13.06it/s]
  0%|                                                                                 | 1/6700 [00:00<16:35,  6.73it/s]

t:513.2032594680786	{'ner': 15593.581602048354}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:30<00:00, 13.12it/s]
  0%|                                                                                 | 2/6700 [00:00<08:15, 13.53it/s]

t:510.5876486301422	{'ner': 15583.523384026512}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:30<00:00, 13.12it/s]
  0%|                                                                                 | 2/6700 [00:00<08:24, 13.28it/s]

t:510.84014320373535	{'ner': 15696.70593512132}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [08:32<00:00, 13.07it/s]
  0%|                                                                                 | 1/6700 [00:00<11:54,  9.38it/s]

t:512.7981867790222	{'ner': 15616.212403604997}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:32<00:00, 11.70it/s]
  0%|                                                                                 | 2/6700 [00:00<08:44, 12.77it/s]

t:572.836761713028	{'ner': 15755.396491781543}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:26<00:00, 11.82it/s]
  0%|                                                                                 | 2/6700 [00:00<08:59, 12.41it/s]

t:566.9180226325989	{'ner': 15492.81969218689}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:24<00:00, 11.86it/s]
  0%|                                                                                 | 2/6700 [00:00<09:40, 11.54it/s]

t:564.7556402683258	{'ner': 15611.32711907463}


100%|██████████████████████████████████████████████████████████████████████████████| 6700/6700 [09:25<00:00, 11.84it/s]

t:565.9345936775208	{'ner': 15745.414783854812}


 Total Training time: 83996.78734016418 seconds





### Save

In [99]:
# Save model: serialize the `nlp` object to `output_dir`
# (a path relative to the current working directory).
output_dir = 'model_spacy_email_phone_addr_update_pretrained'

nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to model_spacy_email_phone_addr_update_pretrained


### Test - general

In [100]:
# Spot check: e-mail address (the recorded run tagged it EMAIL)
doc = nlp("amsf.as@yahoo.com")
print([(span.text, span.label_) for span in doc.ents])

[('amsf.as@yahoo.com', 'EMAIL')]


In [101]:
# Spot check: phone number with country prefix (the recorded run tagged it PHONE)
doc = nlp("+16102899411")
print([(span.text, span.label_) for span in doc.ents])

[('+16102899411', 'PHONE')]


In [102]:
# Spot check: short street address (the recorded run tagged it ADDR)
doc = nlp("302 Park street")
print([(span.text, span.label_) for span in doc.ents])

[('302 Park street', 'ADDR')]


In [103]:
# Spot check: upper-case address with suite number (the recorded run tagged it ADDR)
doc = nlp("909 FANNIN TWO HOUSTON CENTER, SUITE 2310")
print([(span.text, span.label_) for span in doc.ents])

[('909 FANNIN TWO HOUSTON CENTER, SUITE 2310', 'ADDR')]


In [104]:
# Spot check: shorter digit string with '+' prefix (the recorded run tagged it PHONE)
doc = nlp("+159901011")
print([(span.text, span.label_) for span in doc.ents])

[('+159901011', 'PHONE')]


In [105]:
# Negative check: company name (the recorded run returned no entities)
doc = nlp("DUKE ENERGY TRADING AND MARKETING")
print([(span.text, span.label_) for span in doc.ents])

[]


In [106]:
# Negative check: city name alone (the recorded run returned no entities)
doc = nlp("New york")
print([(span.text, span.label_) for span in doc.ents])

[]


In [107]:
# Negative check: two-letter abbreviation (the recorded run returned no entities)
doc = nlp("LA")
print([(span.text, span.label_) for span in doc.ents])

[]


In [108]:
# Negative check: bare six-digit number (the recorded run returned no entities)
doc = nlp("495668")
print([(span.text, span.label_) for span in doc.ents])

[]


In [127]:
# Negative check: single organisation name (the recorded run returned no entities)
doc = nlp("Microsoft")
print([(span.text, span.label_) for span in doc.ents])

[]


In [129]:
# Negative check: URL (the recorded run returned no entities)
doc = nlp("https://www.google.com")
print([(span.text, span.label_) for span in doc.ents])

[]


In [130]:
# Negative check: plain lower-case word (the recorded run returned no entities)
doc = nlp("english")
print([(span.text, span.label_) for span in doc.ents])

[]
