In [1]:
# install needed Packages
import nltk
from nltk.tokenize import word_tokenize
import textstat
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import re
import spacy
from spacy import displacy
import json
nlp = spacy.load('en_core_web_sm')
print('Done')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/sourabh/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Done


**Functions**+


In [51]:
def lexical_diversity(text):
    word_count = len(text)
    vocab_size = len(set(text))
    diversity_score = word_count / vocab_size
    return diversity_score


def freq_dist_sentence(text,stop_flag = False):
    '''
    Returns word count for Each Sentence
    '''
    tokenized_word = word_tokenize(text.lower())
    if stop_flag:
        stop_words=set(stopwords.words("english"))
        tokenized_word = [x for x in tokenized_word if (x not in stop_words and x.isalpha())]
    fdist = FreqDist(tokenized_word)
    return fdist

def polarity_sc(text):
    sid = SentimentIntensityAnalyzer()
    scores = sid.polarity_scores(text)
    return scores

def reading_standard(text):
    x = textstat.text_standard(text)
    match = re.search(r'(.?\d+)th(\s\w{3}\s((.?\d+)))?',x)
    r_stan = []
    if match:
        r_stan.append(match.group(1))
        r_stan.append(match.group(3))
    return r_stan

def spacy_vizualizer(title,text):
    text = nlp(text)
    title = nlp(title)
    html_dep = displacy.render(title, style = 'dep', minify = True)
    html_ent = displacy.render(text, style = 'ent', minify = True)
    return (html_dep, html_ent)   
    

def get_article_features(title, text):
    result = {}
    Result = []
    result['lexical_diversity'] = lexical_diversity(text)
    result['word_dist'] = freq_dist_sentence(text).most_common()
    result['word_dist_without_stopwords'] = freq_dist_sentence(text,stop_flag=True)
    result['polarity_title_pos'] = polarity_sc(title)['pos']
    result['polarity_title_neg'] = polarity_sc(title)['neg']
    result['polarity_title_neu'] = polarity_sc(title)['neu']
    result['reading_standard'] = reading_standard(text)
    result['dependency_html'],result['ner_html'] = spacy_vizualizer(title,text)
    Result.append(result)# list of Dictionaries
    json_result = json.dumps(Result)
    return json_result
    

In [18]:
response = get_article_features(title,text)

In [19]:
response

'[{"lexical_diversity": 12.033333333333333, "word_dist": [[".", 6], ["the", 5], [",", 4], ["aneurysms", 2], ["often", 2], ["in", 2], ["of", 2], ["aneurysm", 2], ["can", 2], ["and", 2], ["occur", 1], ["aorta", 1], ["brain", 1], ["back", 1], ["knee", 1], ["intestine", 1], ["or", 1], ["spleen", 1], ["a", 1], ["ruptured", 1], ["result", 1], ["internal", 1], ["bleeding", 1], ["stroke", 1], ["it", 1], ["sometimes", 1], ["be", 1], ["fatal", 1], ["have", 1], ["no", 1], ["symptoms", 1], ["until", 1], ["they", 1], ["rupture", 1], ["treatment", 1], ["varies", 1], ["from", 1], ["watchful", 1], ["waiting", 1], ["to", 1], ["emergency", 1], ["surgery", 1], ["choice", 1], ["depends", 1], ["on", 1], ["location", 1], ["size", 1], ["condition", 1]], "word_dist_without_stopwords": {"aneurysms": 2, "often": 2, "occur": 1, "aorta": 1, "brain": 1, "back": 1, "knee": 1, "intestine": 1, "spleen": 1, "ruptured": 1, "aneurysm": 2, "result": 1, "internal": 1, "bleeding": 1, "stroke": 1, "sometimes": 1, "fatal": 1

# JSON READ

In [12]:
# spacy vizuals
# most important word
# similarity
text = '''
Aneurysms often occur in the aorta, brain, back of the knee, intestine or spleen. 
A ruptured aneurysm can result in internal bleeding and stroke. It can sometimes be fatal.
Aneurysms often have no symptoms until they rupture.
Treatment varies from watchful waiting to emergency surgery. The choice depends on the location, size and condition of the aneurysm.
'''
title = 'Aneurysm'


In [30]:
d =json.loads(response)

In [38]:
d[0]['dependency_html']
'''
['lexical_diversity']
['word_dist'] 
['word_dist_without_stopwords']
['polarity_title_pos']
['polarity_title_neg'] 
['polarity_title_neu'] 
['reading_standard']
['dependency_html'],['ner_html']
'''

'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" id="0" class="displacy" width="225" height="137.0" style="max-width: none; height: 137.0px; color: #000000; background: #ffffff; font-family: Arial"><text class="displacy-token" fill="currentColor" text-anchor="middle" y="47.0"><tspan class="displacy-word" fill="currentColor" x="50">Aneurysm</tspan><tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">PROPN</tspan></text></svg>'

### Lexical Diversity

In [39]:
d[0]['lexical_diversity']

12.033333333333333

## Word Distribution with and without stopwords

In [40]:
d[0]['word_dist']

[['.', 6],
 ['the', 5],
 [',', 4],
 ['aneurysms', 2],
 ['often', 2],
 ['in', 2],
 ['of', 2],
 ['aneurysm', 2],
 ['can', 2],
 ['and', 2],
 ['occur', 1],
 ['aorta', 1],
 ['brain', 1],
 ['back', 1],
 ['knee', 1],
 ['intestine', 1],
 ['or', 1],
 ['spleen', 1],
 ['a', 1],
 ['ruptured', 1],
 ['result', 1],
 ['internal', 1],
 ['bleeding', 1],
 ['stroke', 1],
 ['it', 1],
 ['sometimes', 1],
 ['be', 1],
 ['fatal', 1],
 ['have', 1],
 ['no', 1],
 ['symptoms', 1],
 ['until', 1],
 ['they', 1],
 ['rupture', 1],
 ['treatment', 1],
 ['varies', 1],
 ['from', 1],
 ['watchful', 1],
 ['waiting', 1],
 ['to', 1],
 ['emergency', 1],
 ['surgery', 1],
 ['choice', 1],
 ['depends', 1],
 ['on', 1],
 ['location', 1],
 ['size', 1],
 ['condition', 1]]

In [None]:
d[0]['word_dist_without_stopwords']

## Polarity Scores

In [42]:
d[0]['polarity_title_pos']

0.0

In [43]:
d[0]['polarity_title_neg'] 

0.0

In [44]:
d[0]['polarity_title_neu']

1.0

## Reading Standard (8th grade and so on)

In [45]:
d[0]['reading_standard']

['8', '9']

## Dependency Graph of Article

In [53]:
d[0]['dependency_html']

'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" id="0" class="displacy" width="225" height="137.0" style="max-width: none; height: 137.0px; color: #000000; background: #ffffff; font-family: Arial"><text class="displacy-token" fill="currentColor" text-anchor="middle" y="47.0"><tspan class="displacy-word" fill="currentColor" x="50">Aneurysm</tspan><tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">PROPN</tspan></text></svg>'

## NER of Content Entered

In [52]:
d[0]['ner_html']

'<div class="entities" style="line-height: 2.5"><mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone"><span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">GPE</span></mark><mark class="entity" style="background: #c887fb; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">Aneurysms<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">NORP</span></mark> often occur in the aorta, brain, back of the knee, intestine or spleen. <mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.

In [50]:
spacy_vizualizer('Aneurysm is fun',text)

(None, None)