In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import warnings
import seaborn as sns
%matplotlib inline
warnings.filterwarnings('ignore')
from time import sleep
import re



# Text Exploration with spaCy


In [23]:
import spacy

# Load the English pipeline by its full package name. The bare 'en' shortcut
# link is obsolete as of spaCy v3 and no longer loads there; the package name
# works on both v2 and v3.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (what
# `python -m spacy download en` used to install) — confirm, or switch to
# 'en_core_web_lg' if the larger model is wanted.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read the entire article corpus into memory as one string.
# The context manager guarantees the handle is closed even if read() raises,
# which the manual open/read/close sequence did not.
with open("articles.txt", "r") as file_in:
    corpus = file_in.read()

In [4]:
%%time
# Run the loaded spaCy pipeline over the whole corpus string in one call.
# The cells below iterate over `parsed_corpus` (its tokens, .sents and .ents),
# so this cell must run before them.
parsed_corpus= nlp(corpus)

CPU times: user 39.1 s, sys: 1.41 s, total: 40.5 s
Wall time: 41.1 s


In [5]:
# Maximum number of items to print; the entity-listing cell reads it as well.
limit = 50

# Number sentences from 1 so the counter doubles as the printed label, and
# check the cap *before* printing. The previous check ran after the print with
# a 0-based counter, so it emitted limit + 1 sentences.
for num, sentence in enumerate(parsed_corpus.sents, start=1):
    if num > limit:
        break
    print('Sentence {}:'.format(num))
    print(sentence)
    print('')


Sentence 1:
     Abstract Purpose of Study: The authors determined short-term effects of a home environmental intervention on self-efficacy and upset in caregivers and daily function of dementia patients.

Sentence 2:
They also determined if treatment effect varied by caregiver gender, race, and relationship to patient.

Sentence 3:
Design and Methods: Families (N = 171) of dementia patients were randomized to intervention or usual care control group.

Sentence 4:
The intervention involved 5 90-min home visits by occupational therapists who provided education and physical and social environmental modifi-cations.

Sentence 5:
Results: Compared with controls, intervention caregivers reported fewer declines in patients' instrumental activities of daily living (p = .030) and less decline in self-care and fewer behavior problems in patients at 3 months post-test.

Sentence 6:
Also, intervention spouses reported reduced upset (p = .049), women reported enhanced self-efficacy in managing beha

## Part-of-Speech (POS) Tags

In [6]:
# Gather every token's verbatim text and its part-of-speech tag, then show
# the two lists side by side as a two-column table.
token_text = [tok.orth_ for tok in parsed_corpus]
token_pos = [tok.pos_ for tok in parsed_corpus]

pos_columns = {'token_text': token_text, 'part_of_speech': token_pos}
pd.DataFrame(pos_columns)


Unnamed: 0,token_text,part_of_speech
0,,SPACE
1,Abstract,PROPN
2,Purpose,PROPN
3,of,ADP
4,Study,PROPN
5,:,PUNCT
6,The,DET
7,authors,NOUN
8,determined,VERB
9,short,ADJ


## Named-entity recognition (NER) 

In [7]:
# Print at most `limit` named entities with their labels. Counting from 1 and
# testing the cap before printing fixes the off-by-one in the previous loop,
# which compared a 0-based counter after the print and showed limit + 1 items.
for num, entity in enumerate(parsed_corpus.ents, start=1):
    if num > limit:
        break
    print('Entity {}:'.format(num), entity, '-', entity.label_)
    print('')

Entity 1:      Abstract Purpose of Study - ORG

Entity 2: 171 - CARDINAL

Entity 3: 5 - CARDINAL

Entity 4: daily - DATE

Entity 5: at 3 months - DATE

Entity 6: IADL - ORG

Entity 7:  Home - ORG

Entity 8: Laurence G. Branch - PERSON

Entity 9: Schulz - PERSON

Entity 10: Burgio - PERSON

Entity 11: 1996 - DATE

Entity 12: Biegel - PERSON

Entity 13: Schulz - PERSON

Entity 14: 1999 - DATE

Entity 15: Cox 1998 - DATE

Entity 16: African - NORP

Entity 17: American - NORP

Entity 18: Zarit - GPE

Entity 19: Stephens - GPE

Entity 20: Townsend - GPE

Entity 21: Greene - PERSON

Entity 22: Leitsch 1999 - DATE

Entity 23: daily - DATE

Entity 24: Lawton - PERSON

Entity 25: Nahemow 1973 - DATE

Entity 26: e.g. - LAW

Entity 27: Schulz - PERSON

Entity 28: Heckhausen 1999 - DATE

Entity 29: Pynoos - PERSON

Entity 30: Ohta - PERSON

Entity 31: 1991 - DATE

Entity 32: 12 - CARDINAL

Entity 33: 66% of - PERCENT

Entity 34: 89% remained - PERCENT

Entity 35: Gitlin - PERSON

Entity 36: Corcor

## Lemma and Shape

In [8]:
# Per-token lemma and shape strings, shown next to the token text
# collected in the part-of-speech cell.
token_lemma = [tok.lemma_ for tok in parsed_corpus]
token_shape = [tok.shape_ for tok in parsed_corpus]

lemma_rows = list(zip(token_text, token_lemma, token_shape))
lemma_headers = ['token_text', 'token_lemma', 'token_shape']
pd.DataFrame(lemma_rows, columns=lemma_headers)

Unnamed: 0,token_text,token_lemma,token_shape
0,,,
1,Abstract,abstract,Xxxxx
2,Purpose,purpose,Xxxxx
3,of,of,xx
4,Study,study,Xxxxx
5,:,:,:
6,The,the,Xxx
7,authors,author,xxxx
8,determined,determine,xxxx
9,short,short,xxxx


## Entity Type and IOB Tags

In [9]:
# Per-token entity type and inside/outside/begin code, shown next to the
# token text collected in the part-of-speech cell.
token_entity_type = [tok.ent_type_ for tok in parsed_corpus]
token_entity_iob = [tok.ent_iob_ for tok in parsed_corpus]

entity_rows = list(zip(token_text, token_entity_type, token_entity_iob))
entity_headers = ['token_text', 'entity_type', 'inside_outside_begin']
pd.DataFrame(entity_rows, columns=entity_headers)

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,,ORG,B
1,Abstract,ORG,I
2,Purpose,ORG,I
3,of,ORG,I
4,Study,ORG,I
5,:,,O
6,The,,O
7,authors,,O
8,determined,,O
9,short,,O


## Miscellaneous Token Attributes

In [10]:
# One tuple per token: its text followed by the lexical attributes shown below.
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_corpus]

attribute_columns = ['text',
                     'log_probability',
                     'stop?',
                     'punctuation?',
                     'whitespace?',
                     'number?',
                     'out of vocab.?']

df = pd.DataFrame(token_attributes, columns=attribute_columns)

# Everything from 'stop?' through 'out of vocab.?' is a flag column.
flag_columns = attribute_columns[2:]

# Render truthy flags as 'Yes' and falsy ones as '' for readability.
# Whole columns are replaced (df[cols] = ...) rather than written through
# .loc, because putting strings into the existing bool columns in place is an
# incompatible-dtype assignment; Series.map per column also avoids the
# deprecated DataFrame.applymap.
df[flag_columns] = df[flag_columns].apply(
    lambda column: column.map(lambda flag: u'Yes' if flag else u''))

df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,,-11.804298,,,Yes,,
1,Abstract,-19.579313,,,,,
2,Purpose,-19.579313,,,,,
3,of,-4.128464,Yes,,,,
4,Study,-19.579313,,,,,
5,:,-6.052439,,Yes,,,
6,The,-5.774222,Yes,,,,
7,authors,-11.414688,,,,,
8,determined,-10.898046,,,,,
9,short,-8.915989,,,,,
