In [1]:
import re
import json
import spacy
from pprint import pprint
# Load the spaCy `en_core_web_sm` pipeline once at start-up; the entity
# extraction cell further down calls this shared `nlp` object on each text chunk.
nlp = spacy.load("en_core_web_sm")
from bs4 import BeautifulSoup


In [25]:
def basic_cleanup(document):
    """Return the text content of an HTML ``document``.

    The markup is parsed with BeautifulSoup using the ``html5lib`` parser,
    every ``script``, ``style`` and ``form`` element is detached from the
    parse tree, and the text of whatever remains is returned.

    Args:
        document: HTML (or plain text) to parse.

    Returns:
        The string produced by ``soup.get_text()`` after the unwanted
        elements have been removed.
    """
    soup = BeautifulSoup(document, 'html5lib')

    # Detach elements whose contents are not article text before extracting.
    for unwanted in soup.find_all(["script", "style", "form"]):
        unwanted.extract()

    return soup.get_text()

def entity_level_cleaning(document):
    """Normalise text for named-entity recognition without changing its case.

    The steps are applied in this order:

    1. Every character that is not a letter, a digit or ``.`` becomes a space.
       Letters and digits are matched Unicode-aware, so accented or non-Latin
       names (e.g. "Beyoncé", "Zürich") are kept intact instead of being cut
       at the first non-ASCII character.
    2. A space is inserted after every ``.`` so that a sentence end glued to
       the following word is separated from it.
    3. A space is inserted at ASCII lower->upper case boundaries and before
       the last capital of an upper-case run ("UsFollow" -> "Us Follow",
       "NYMag" -> "NY Mag").
    4. Runs of spaces are collapsed and the result is stripped.

    The text is deliberately NOT lower-cased: capitalisation is needed to
    detect entities such as persons, organisations and locations.

    Args:
        document (str): Text to clean.

    Returns:
        str: The cleaned, single-spaced text.
    """
    # For str patterns `\w` covers Unicode letters and digits. It also matches
    # '_', which the original ASCII-only class treated as a separator, so the
    # underscore is replaced explicitly to keep that behaviour.
    review_text = re.sub(r"[^\w.]|_", " ", document)

    # '.' is preserved above so that sentence boundaries stay visible.
    review_text = re.sub("[.]", ". ", review_text)

    # Split words that were concatenated without whitespace (camelCase / ABCDef).
    review_text = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', review_text)

    return re.sub(' +', ' ', review_text).strip()

def get_entities(document, spacy_model):
    """Extract person, organisation and geo-political entities from ``document``.

    The text is first passed through ``entity_level_cleaning`` and then through
    ``spacy_model``. Only entities labelled ``PERSON``, ``ORG`` or ``GPE`` are
    kept.

    Args:
        document: Text to analyse.
        spacy_model: Callable that takes the cleaned text and returns an
            object exposing ``.ents``; each entity must provide ``.text`` and
            ``.label_``.

    Returns:
        set: ``(name, label)`` tuples, where ``name`` is the entity text
        lower-cased and then title-cased.
    """
    wanted_labels = {'PERSON', 'ORG', 'GPE'}
    parsed = spacy_model(entity_level_cleaning(document))

    entities = set()
    for ent in parsed.ents:
        if ent.label_ in wanted_labels:
            entities.add((ent.text.lower().title(), ent.label_))
    return entities

In [3]:
# Path (relative to the working directory) of the input file. The loading cell
# parses each line of it with json.loads and collects the 'content' field.
file_path = 'signalmedia-1m.jsonl'

In [11]:
# `text` holds the 'content' field of the first ten records in the input file.
text = []

# Iterate over the file lazily instead of calling readlines(): the latter loads
# every line of the corpus into memory although only ten are used.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open(file_path, encoding='utf-8') as fp:
    # zip() stops as soon as range(10) is exhausted, so at most ten lines are
    # read; a shorter file simply yields fewer documents instead of raising
    # IndexError.
    for i, line in zip(range(10), fp):
        text.append(json.loads(line).get('content'))

In [26]:
# Inspect one sample: print the raw document, a 100-character rule, and then the
# same document after HTML stripping and entity-level cleaning.
index = 3

print(text[index], "=" * 100, sep="\n")
cleaned_text = entity_level_cleaning(basic_cleanup(text[index]))
print(cleaned_text)

NYMag.com Daily Intelligencer Vulture The Cut Science of Us Grub Street Bedford & Bowery FOLLOW: Facebook Twitter UserName LOG IN REGISTER Fashions Runway Street Style Designers Fame Beauty Goods Love & War search Sections Fashions Fame Beauty Goods Love & War Plus Runway Street Style Designers Sites NYMag.com Daily Intelligencer Vulture Science of Us Grub Street Bedford & Bowery Like UsFollow Us Popular on The Cut Ask Polly: Should I Just Give Up on My Writing? » Top Shows Oscar de la Renta See it » Michael Kors See it » Suno See it » Coach See it » Narciso Rodriguez See it » Tory Burch See it » Carolina Herrera See it » Rodarte See it » Diesel Black Gold See it » Jeremy Scott See it » Thom Browne See it » rag & bone See it » Tommy Hilfiger See it » Prabal Gurung See it » Diane Von Furstenberg See it » keeping us honest September 16, 2015 7:12 p.m. This New Dating App Will Ruin Your Internet Game By Allison P. Davis Follow @allisonpdavis No photos over six hours old allowed. 37 Shares

In [29]:
# For each of the first four documents: clean it, split the cleaned text on '.',
# run spaCy on every chunk separately, then print the number of chunks, a rule,
# and the set of PERSON / ORG / GPE entities found across all chunks.
#
# NOTE(review): entity names here are built from `lemma_`, whereas get_entities
# uses `text`; this is presumably why artefacts such as '-Pron-' show up in the
# recorded output -- confirm which form is intended.
entity_labels = {'PERSON', 'ORG', 'GPE'}

for i in range(4):
    plain_text = basic_cleanup(text[i])
    chunks = entity_level_cleaning(plain_text).split('.')
    docs = list(map(nlp, chunks))
    print(len(docs))
    print("=" * 100)

    found = set()
    for doc in docs:
        for ent in doc.ents:
            if ent.label_ in entity_labels:
                found.add((ent.lemma_.lower().title(), ent.label_))
    pprint(found)

23
{('Andy Wilson', 'PERSON'),
 ('Bromsgrove', 'ORG'),
 ('Dave Carney', 'PERSON'),
 ('Derek Hardman', 'PERSON'),
 ('Droitwich', 'GPE'),
 ('Five', 'ORG'),
 ('Germany', 'GPE'),
 ('Gloucester', 'PERSON'),
 ('Hill Worcester', 'PERSON'),
 ('Hm Forces Veteran', 'ORG'),
 ('Hull', 'GPE'),
 ('Mr Carney', 'ORG'),
 ('Newcastle', 'GPE'),
 ('Royal British Legion', 'ORG'),
 ('Royal Engineers', 'ORG'),
 ('Share', 'ORG'),
 ('The Postal Order', 'ORG'),
 ('Veteran', 'ORG'),
 ('Worcester Breakfast Club', 'ORG'),
 ('Worcester S', 'PERSON')}
24
{('Bulleit Group', 'ORG'),
 ('Deren Baker', 'PERSON'),
 ('Historical Data', 'ORG'),
 ('Jumpshot', 'GPE'),
 ('Kelly Mayes', 'PERSON'),
 ('Pay', 'PERSON'),
 ('San Francisco', 'GPE'),
 ('Seo', 'ORG'),
 ('Sys Con Media Inc', 'ORG'),
 ('Variable', 'ORG'),
 ('Visibility', 'ORG')}
4
{('Post', 'ORG')}
81
{('-Pron-', 'GPE'),
 ('1K Shares Share', 'ORG'),
 ('A Salute To', 'ORG'),
 ('Air Make Fun Of Contour More', 'ORG'),
 ('Air S World', 'ORG'),
 ('Airline', 'ORG'),
 ('Allison

In [8]:
import os

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Directory containing the Stanford NER distribution (the jar plus the
# `classifiers/` folder). It can be overridden with the STANFORD_NER_DIR
# environment variable so the notebook also runs on other machines; the default
# is the location that was previously hard-coded.
STANFORD_NER_DIR = os.environ.get(
    'STANFORD_NER_DIR',
    '/Users/anuragsharma/Work/submissions/logically/stanford-ner',
)

# Tagger built from the `english.all.3class` model file and the Stanford NER jar.
st = StanfordNERTagger(
    os.path.join(STANFORD_NER_DIR, 'classifiers', 'english.all.3class.distsim.crf.ser.gz'),
    os.path.join(STANFORD_NER_DIR, 'stanford-ner.jar'),
    encoding='utf-8',
)

In [28]:
from itertools import groupby

# Run the Stanford tagger on one cleaned document and collect its entities.
index = 3

print("=" * 100)
cleaned_text = entity_level_cleaning(basic_cleanup(text[index]))

tokenized_text = word_tokenize(cleaned_text)
classified_text = st.tag(tokenized_text)  # sequence of (token, tag) pairs

# Consecutive tokens that share a tag are joined into one entity string; runs
# tagged "O" are skipped. Because grouping is purely by adjacency, two
# neighbouring entities with the same tag end up merged into a single entry.
entities = set()
for tag, chunk in groupby(classified_text, key=lambda pair: pair[1]):
    if tag == "O":
        continue
    entities.add((" ".join(token for token, _ in chunk), tag))

pprint(entities)

{('Allison P. Davis', 'PERSON'),
 ('Best Party Photos of New York Fashion Week', 'ORGANIZATION'),
 ('Bionic Dong Listen', 'ORGANIZATION'),
 ('Brandon Maxwell', 'PERSON'),
 ('Catherine Deneuve', 'PERSON'),
 ('Depp', 'PERSON'),
 ('Diane Von Furstenberg', 'PERSON'),
 ('Don', 'PERSON'),
 ('Ellen Page', 'PERSON'),
 ('Facebook Tweet', 'ORGANIZATION'),
 ('Fifty Shades of Grey', 'LOCATION'),
 ('Gloria Steinem', 'PERSON'),
 ('Grey', 'PERSON'),
 ('Harvard Women Finally Receive Invitation to Boring Rich Dude Club',
  'ORGANIZATION'),
 ('Helen Mirren', 'PERSON'),
 ('Hollywood', 'LOCATION'),
 ('Huff', 'PERSON'),
 ('Jeremy Scott', 'PERSON'),
 ('Joan Didion', 'PERSON'),
 ('Johnny Depp', 'PERSON'),
 ('Kanye Relegates Kendall', 'PERSON'),
 ('Kardashians', 'ORGANIZATION'),
 ('Kate Mc Kinnon', 'PERSON'),
 ('Kate Spade', 'PERSON'),
 ('Kim Cattrall', 'PERSON'),
 ('Kim Kardashian', 'PERSON'),
 ('Kylie Jenner', 'PERSON'),
 ('My Writing Top Shows Oscar de la Renta See it Michael Kors See',
  'ORGANIZATION'),


In [21]:
# Display the unprocessed 'content' string of the fourth loaded document (index 3).
text[3]

'NYMag.com Daily Intelligencer Vulture The Cut Science of Us Grub Street Bedford & Bowery FOLLOW: Facebook Twitter UserName LOG IN REGISTER Fashions Runway Street Style Designers Fame Beauty Goods Love & War search Sections Fashions Fame Beauty Goods Love & War Plus Runway Street Style Designers Sites NYMag.com Daily Intelligencer Vulture Science of Us Grub Street Bedford & Bowery Like UsFollow Us Popular on The Cut Ask Polly: Should I Just Give Up on My Writing? » Top Shows Oscar de la Renta See it » Michael Kors See it » Suno See it » Coach See it » Narciso Rodriguez See it » Tory Burch See it » Carolina Herrera See it » Rodarte See it » Diesel Black Gold See it » Jeremy Scott See it » Thom Browne See it » rag & bone See it » Tommy Hilfiger See it » Prabal Gurung See it » Diane Von Furstenberg See it » keeping us honest September 16, 2015 7:12 p.m. This New Dating App Will Ruin Your Internet Game By Allison P. Davis Follow @allisonpdavis No photos over six hours old allowed. 37 Share