# Import the English Language Model

If you have not already done so, you will need to run this code to download the language model.

In [5]:
import sys
!{sys.executable} -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


# Defining variables

In [1]:
## define directory path and entity type
import os
cwd = os.getcwd()
data_loc = cwd + "/data"
output_loc = cwd + "/output/"
ent_type = "PERSON"

### entity type can be "PERSON", "NORP", "ORG", "GPE", etc.
### https://spacy.io/api/annotation#named-entities

# Imports and setup

In [3]:
import spacy
#import PyPDF2
import os
import csv
import random
import re
import string
#import docx2txt
#import docx
import codecs
import subprocess
from collections import Counter

nlp = spacy.load('en_core_web_sm')

# Walking directory and compiling text files

In [4]:
allfiles = []
onlypdf = []
onlydoc = []
onlydocx = []
onlytxt = []

for root, dirs, files in os.walk(data_loc):
    for file in files:
        allfiles.append(os.path.join(root, file))
        if file.endswith(".doc"):
           onlydoc.append(os.path.join(root, file))
        elif file.endswith(".docx"):
            onlydocx.append(os.path.join(root, file))     
        elif file.endswith(".txt"):
            onlytxt.append(os.path.join(root, file))
        elif file.endswith(".pdf"):
            onlypdf.append(os.path.join(root, file))
            
print('files total: %d ' % len(allfiles))
print('doc: %d' % len(onlydoc))
print('docx: %d' % len(onlydocx))
print('pdf: %d' % len(onlypdf))
print('txt: %d' % len(onlytxt))

files total: 4 
doc: 0
docx: 0
pdf: 0
txt: 4


# Extract text and compile entities

In [10]:
all_ent = []
pdf_ent = []
doc_ent = []
docx_ent = []
txt_ent = []

## the separate lists aren't necessary for this but keeping them anyway

In [12]:
##ents from pdf

for filename in onlypdf: 
    try:
        pdfFileObj = open(filename, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        for i in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(i)
            pagetext = ((pageObj.extractText()).replace('\n', ' '))
            pdfdoc = nlp(pagetext)
            for ent in pdfdoc.ents:
                entpair = (ent.text, ent.label_)
                pdf_ent.append(entpair)
                all_ent.append(entpair)
    except: 
        pass
    
print('entities from .pdf: %d' % len(doc_ent))


entities from .pdf: 0


In [13]:
##ents from doc

for filename in onlydoc: 
    try:
        cmd = ['antiword', filename]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        (output, err) = p.communicate()
        uniout = unicode(output, "utf-8")
        doc = nlp(uniout)
        for ent in doc.ents:
            entpair = (ent.text, ent.label_)
            doc_ent.append(entpair)
            all_ent.append(entpair)
    except:
        pass
            
print('entities from .doc: %d' % len(doc_ent))


entities from .doc: 0


In [14]:
##ents from docx

for filename in onlydocx: 
    try:
        pagetext = docx2txt.process(filename)
        docxdoc = nlp(pagetext)
        for ent in docxdoc.ents:
            entpair = (ent.text, ent.label_)
            docx_ent.append(entpair)
            all_ent.append(entpair)
    except:
        pass
            
print('entities from .docx: %d' % len(docx_ent))

entities from .docx: 0


In [15]:
##ents from txt

for filename in onlytxt: 
    try:
        with codecs.open(filename, 'r', encoding='utf-8') as myfile:
            pagetext=myfile.read().replace('\n', ' ')
            txtdoc = nlp(pagetext)
            for ent in txtdoc.ents:
                entpair = (ent.text, ent.label_)
                txt_ent.append(entpair)
                all_ent.append(entpair)
    except:
        pass
            
print('entities from .txt: %d' % len(txt_ent))

entities from .txt: 230


In [16]:
print('total entities: %d' % len(all_ent))

total entities: 230


# Specify entity type

In [17]:
entlist = [x for x in all_ent if ent_type in x]
print(len(entlist))

44


# Clean output

In [18]:
filter_entlist = [x for x in entlist if (x[0])[0].isalpha() and (x[0])[-1].isalpha()] 

## this will limit list to entities that start and end with an alphanumerical character

print(len(filter_entlist))

31


In [19]:
print((list(set(entlist) - set(filter_entlist)))[0:20])

## check this output for data loss

[('Willie\t\t', 'PERSON'), ('James\t', 'PERSON'), ('Mary\t', 'PERSON'), ('Alice\t\t', 'PERSON'), ('Carl\t', 'PERSON'), ('Abernethy\t', 'PERSON'), ('Joseph\t', 'PERSON'), ('Jennie\t\t', 'PERSON'), ('Abernethy\t\t\t', 'PERSON'), ('William\t', 'PERSON'), ('Frances\t', 'PERSON')]


In [22]:
## filter again to only include names w/ more than 1 word

filter_entlist2 = [x for x in entlist if (x[0])[0].isalpha() and (x[0])[-1].isalpha() and ' ' in x[0]]
print(filter_entlist2)

[('Jas M', 'PERSON'), ('s McDowell', 'PERSON'), ('s McDowell', 'PERSON'), ('Fire Ext', 'PERSON'), ('w Palmer', 'PERSON'), ('Henry Miller', 'PERSON')]


# Return top entities

In [26]:
os.makedirs(output_loc)
os.chdir(output_loc)


namecount = Counter(filter_entlist)
fullnamecount = Counter(filter_entlist2)
commonnames = [x for x in fullnamecount.most_common() if x[1] > 5]
commonall = [x for x in namecount.most_common() if x[1] > 5]

entities_table = []

for name in commonnames:
    row = [(name[0])[0].encode('utf-8'), name[1]]
    entities_table.append(row)

out_path = "entities_fullnames.csv"

header = ['Name', 'Frequency']

with open(out_path, 'w') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(header)
    csv_writer.writerows(entities_table)
    
entities_table2 = []

for name in commonall:
    row = [(name[0])[0].encode('utf-8'), name[1]]
    entities_table2.append(row)

out_path = "names_all.csv"

header = ['Name', 'Frequency']

with open(out_path, 'w') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(header)
    csv_writer.writerows(entities_table2)
