# Web content text tokenizer 

Splits titles from web articles into verbs, adjectives, adverbs, superlatives and named entities. Outputs the results to Excel.

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
input_filename  = 'data/source/find-keywords.xlsx'
output_filename = 'data/find-keywords-nouns.xlsx'

Load the articles and display a sample of the data

In [3]:
df = pd.read_excel(input_filename)
print('Article count: ', len(df))
# For testing
# df = df.head(10)
df.head(10)

Article count:  593


Unnamed: 0,Url,Title,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Abstract
0,,ParaDocs provides our clients with the highest...,,,,,
1,,"AGNITY Global Inc., (AGNITY) is a global provi...",,,,,
2,,FlashCo has grown to be one of the largest man...,,,,,
3,,PAC Worldwide is a global leader in the manufa...,,,,,
4,,"DynexÂ® Technologies, Inc. is an original pion...",,,,,
5,,"Auris Surgical Robotics, Inc. is a technology ...",,,,,
6,,Sense Corp powers insight-driven organizations...,,,,,
7,,We provide quality software and uncompromising...,,,,,
8,,Driven by the ever-changing needs of our clien...,,,,,
9,,"Americo Manufacturing Company, headquartered i...",,,,,


Load spacy, a Natural Language Processing tool

In [4]:
import spacy
nlp = spacy.load('en')
spacy.__version__

'2.0.8'

In [5]:
def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=4)]
    return docs

def to_text(tokens):
    return next(map(lambda token: token.orth_, tokens), '')


def to_text(tokens):
  for token in tokens:
    print(token.orth_)
  return next(map(lambda token: token.orth_, tokens), '')

def filter_first_punct(noun_chunks):
    noun_chunks = list(noun_chunks)
    if len(noun_chunks) > 0:
        print('ROOT', noun_chunks[0].sent[noun_chunks[0].start])
    return []

def get_nouns(sentences):
    return [to_text(docs.noun_chunks) for docs in sentences]

Tokenize the titles and abstracts. Token types available [here](https://spacy.io/docs/usage/pos-tagging).

In [6]:
df['docs'] = tokenize_text(df['Title'].astype(str))

In [7]:
def print_tokens(article_docs):
    # print('Domain:       ', url)
    print('Title:     ', article_docs)
    print('-------------')
    print('Words:     ', list(map(lambda word: word, article_docs)))
    print('Lemma:     ', list(map(lambda word: word.lemma_, article_docs)))
    print('Types:     ', list(map(lambda word: word.pos_, article_docs)))
    print('Tags:      ', list(map(lambda word: word.tag_, article_docs)))
    print('>')
    print('Nouns:     ', list(filter(lambda word: word.pos_ == 'NOUN' or word.tag_ == 'NNP' or word.tag_ == 'NNPS', article_docs)))
    print('Nouns sentences (chunks):     ', get_nouns(article_docs.sents))
    print('Noun chunks:', list(article_docs.noun_chunks))
    print('Noun chunks +1 words:', list(filter(lambda chunk: len(str(chunk).split(' ')) >= 2, list(article_docs.noun_chunks))))
    print('Verbs:     ', list(filter(lambda word: word.pos_ == 'VERB', article_docs)))
    print('Verbs Lemma:', list(map(lambda word: word.lemma_, filter(lambda word: word.pos_ == 'VERB', article_docs))))
    print('Adjectives:', list(filter(lambda word: word.pos_ == 'ADJ', article_docs)))
    print('Adjs Lemma:', list(map(lambda word: word.lemma_, filter(lambda word: word.pos_ == 'ADJ', article_docs))))
    print('Adverbs:   ', list(filter(lambda word: word.pos_ == 'ADV', article_docs)))
    print('Adverbs Lemma:', list(map(lambda word: word.lemma_, filter(lambda word: word.pos_ == 'ADV', article_docs))))
    print('Superlatives:', list(filter(lambda word: word.tag_ == 'JJS' or word.tag_ == 'RBS', article_docs)))
    print('Entities:  ', list(map(lambda entity: (entity, entity.label_), article_docs.ents)))
    
def df_url_docs(id, docs_field = 'docs'):
    return df[docs_field][id]

Manually check that the correct data types have been identified

In [8]:
print_tokens(df_url_docs(0))

Title:      ParaDocs provides our clients with the highest level of emergency medical standby. Professionalism, discretion, and clinical excellence are our core values. From large public concerts to intimate private dinners we can provide an array of medical services to fit your needs. We provide only the most highly trained EMTâ€™s, paramedics, doctors, or ambulances services.
-------------
Words:      [ParaDocs, provides, our, clients, with, the, highest, level, of, emergency, medical, standby, ., Professionalism, ,, discretion, ,, and, clinical, excellence, are, our, core, values, ., From, large, public, concerts, to, intimate, private, dinners, we, can, provide, an, array, of, medical, services, to, fit, your, needs, ., We, provide, only, the, most, highly, trained, EMTâ€, ™, s, ,, paramedics, ,, doctors, ,, or, ambulances, services, .]
Lemma:      ['paradocs', 'provide', '-PRON-', 'client', 'with', 'the', 'high', 'level', 'of', 'emergency', 'medical', 'standby', '.', 'professional

## Excel File Generation

In [9]:
# Loops through every article and applies f() to it. 
# Then applies token_extractor() to convert from a Token to a string.
# Finally, concatenates the tokens of a single type with commas
def filter_bad_excel_strings(tokens_string):
    tokens_string = re.sub('[\000-\010]|[\013-\014]|[\016-\037]', '', tokens_string)
    if tokens_string.startswith("="):
        return tokens_string[1:]
    elif tokens_string.startswith("- "):
        return tokens_string[2:]
    else:
        return tokens_string

def map_articles(token_extractor, f, articles):
    def map_article(article):
        tokens_string = ",".join(map(token_extractor, f(article)))
        # Replace excel invalid chars
        tokens_strings = filter_bad_excel_strings(tokens_string)
        return tokens_string
    return list(map(map_article, articles))

def make_excel_df(docs_column_name = 'docs'):
    df_excel = pd.DataFrame()
    docs = df[docs_column_name]
    df_excel['Title'] = df['Title']
    df_excel['Nouns'] = map_articles(lambda token: token.orth_, 
                                     lambda sentence : filter(lambda word: word.pos_ == 'NOUN' or word.tag_ == 'NNP' or word.tag_ == 'NNPS', sentence), 
                                     docs)
    df_excel['Noun Chunks (1)'] = map_articles(lambda token: token, 
                                     lambda doc : get_nouns(doc.sents), 
                                     docs)
    df_excel['Noun Chunks (2)'] = map_articles(
                                     lambda chunk: str(chunk),
                                     lambda doc : list(doc.noun_chunks), 
                                     docs)
    df_excel['Noun Chunks (3) +1 words'] = map_articles(
                                     lambda chunk: str(chunk),
                                     lambda doc : list(filter(lambda chunk: len(str(chunk).split(' ')) >= 2, list(doc.noun_chunks))), 
                                     docs)
    #df_excel['Noun Chunks +1 words (3)'] = map_articles(lambda token: token, 
    #                                 list(filter(lambda chunk: len(str(chunk).split(' ')) >= 2, list(article_docs.noun_chunks))))
    df_excel['Verbs'] = map_articles(lambda token: token.orth_, 
                                     lambda sentence : filter(lambda word: word.pos_ == 'VERB', sentence), 
                                     docs)
    df_excel['Verbs Lemma'] = map_articles(lambda token: token.lemma_, 
                                     lambda sentence : filter(lambda word: word.pos_ == 'VERB', sentence), 
                                     docs)
    df_excel['Adjectives'] = map_articles(lambda token: token.orth_, 
                                     lambda sentence : filter(lambda word: word.pos_ == 'ADJ', sentence), 
                                     docs)
    df_excel['Adjectives Lemma'] = map_articles(lambda token: token.lemma_, 
                                     lambda sentence : filter(lambda word: word.pos_ == 'ADJ', sentence), 
                                     docs)
    df_excel['Adverbs'] = map_articles(lambda token: token.orth_, 
                                     lambda sentence : filter(lambda word: word.pos_ == 'ADV', sentence), 
                                     docs)
    df_excel['Adverbs Lemma'] = map_articles(lambda token: token.lemma_, 
                                     lambda sentence : filter(lambda word: word.pos_ == 'ADV', sentence), 
                                     docs)
    df_excel['Superlatives'] = map_articles(lambda token: token.orth_, 
                                     lambda sentence : filter(lambda word: word.tag_ == 'JJS' or word.tag_ == 'RBS', sentence), 
                                     docs)
    df_excel['Superlatives Lemma'] = map_articles(lambda token: token.orth_, 
                                     lambda sentence : filter(lambda word: word.tag_ == 'JJS' or word.tag_ == 'RBS', sentence), 
                                     docs)
    df_excel['Entities'] = map_articles(lambda ent: ent.orth_, 
                                     lambda sentence : sentence.ents, 
                                     docs)
    return df_excel

In [10]:
df_excel_titles = make_excel_df()
df_excel_titles.head()

Unnamed: 0,Title,Nouns,Noun Chunks (1),Noun Chunks (2),Noun Chunks (3) +1 words,Verbs,Verbs Lemma,Adjectives,Adjectives Lemma,Adverbs,Adverbs Lemma,Superlatives,Superlatives Lemma,Entities
0,ParaDocs provides our clients with the highest...,"ParaDocs,clients,level,emergency,standby,Profe...","ParaDocs,Professionalism,large public concerts,We","ParaDocs,our clients,the highest level,emergen...","our clients,the highest level,emergency medica...","provides,are,can,provide,fit,provide,trained","provide,be,can,provide,fit,provide,train","our,highest,medical,clinical,our,core,large,pu...","-PRON-,high,medical,clinical,-PRON-,core,large...","only,most,highly","only,most,highly","highest,most","highest,most","ParaDocs,™s"
1,"AGNITY Global Inc., (AGNITY) is a global provi...","AGNITY,Global,Inc.,AGNITY,provider,Intelligent...","AGNITY Global Inc.,AGNITYâ€™s products,All AGN...","AGNITY Global Inc.,AGNITY,a global provider,In...","AGNITY Global Inc.,a global provider,Intellige...","is,enable,transform,become,capitalize,are,powe...","be,enable,transform,become,capitalize,be,power...","global,real,their,workplace,competitive,new,it...","global,real,-PRON-,workplace,competitive,new,-...",more,more,,,"AGNITY Global Inc.,AGNITY,Intelligent Business..."
2,FlashCo has grown to be one of the largest man...,"FlashCo,manufacturers,flashings,accessories,No...","FlashCo,We,FlashCo,Our designs,FlashCo,its pro...","FlashCo,the largest manufacturers,roof flashin...","the largest manufacturers,roof flashings,North...","has,grown,be,are,providing,designed,lasting,ha...","have,grow,be,be,provide,design,last,have,make,...","largest,roof,committed,longest,roof,available,...","large,roof,committed,long,roof,available,signi...","best,strongly,when","best,strongly,when","largest,best,longest,best,best,most,highest,ne...","largest,best,longest,best,best,most,highest,ne...","FlashCo,North America,FlashCo,FlashCo,FlashCo,..."
3,PAC Worldwide is a global leader in the manufa...,"PAC,Worldwide,leader,manufacturing,distributio...","PAC Worldwide,Our products,We","PAC Worldwide,a global leader,the manufacturin...","PAC Worldwide,a global leader,the manufacturin...","is,branded,include,provide,branded,will,receive","be,brand,include,provide,brand,will,receive","global,protective,Our,flat,our,highest,our,you...","global,protective,-PRON-,flat,-PRON-,high,-PRO...",now,now,highest,highest,
4,"DynexÂ® Technologies, Inc. is an original pion...","DynexÂ,®,Technologies,Inc.,pioneer,technology,...","DynexÂ®,Our talented, multidisciplinary staff,...","DynexÂ®,Technologies,Inc.,an original pioneer,...","an original pioneer,microplate technology,Our ...","is,include,deliver,cutting,meet,improve,enhanc...","be,include,deliver,cut,meet,improve,enhance,le...","original,microplate,Our,talented,multidiscipli...","original,microplate,-PRON-,talented,multidisci...","approximately,worldwide,most,ultimately,also,w...","approximately,worldwide,most,ultimately,also,w...",most,most,"Technologies, Inc.,approximately 100,DSXÂ,ELIS..."


In [11]:
writer = pd.ExcelWriter(output_filename)
df_excel_titles.to_excel(writer,'Titles')
writer.save()