In [1]:
# install needed Packages
import nltk
from nltk.tokenize import word_tokenize
import textstat
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import re
import spacy
from spacy import displacy
import json
nlp = spacy.load('en_core_web_sm')
print('Done')



Done


**Functions**


In [2]:
def lexical_diversity(text):
    word_count = len(text)
    vocab_size = len(set(text))
    diversity_score = vocab_size / word_count
    return (diversity_score * 100)


def freq_dist_sentence(text,stop_flag = False):
    '''
    Returns word count for Each Sentence
    '''
    tokenized_word = word_tokenize(text.lower())
    if stop_flag:
        stop_words=set(stopwords.words("english"))
        tokenized_word = [x for x in tokenized_word if (x not in stop_words and x.isalpha())]
    fdist = FreqDist(tokenized_word)
    return fdist

def polarity_sc(text):
    sid = SentimentIntensityAnalyzer()
    scores = sid.polarity_scores(text)
    return scores

def reading_standard(text):
    x = textstat.text_standard(text)
    match = re.search(r'(.?\d+)th(\s\w{3}\s((.?\d+)))?',x)
    r_stan = []
    if match:
        r_stan.append(match.group(1))
        r_stan.append(match.group(3))
    return r_stan

def spacy_vizualizer(title,text):
    text = nlp(text)
    title = nlp(title)
    html_dep = displacy.render(title, style = 'dep', minify = True)
    html_ent = displacy.render(text, style = 'ent', minify = True)
    return (html_dep, html_ent)   
    

def get_article_features(title, text):
    result = {}
    Result = []
    result['lexical_diversity'] = lexical_diversity(text)
    result['word_dist'] = dict(freq_dist_sentence(text))
    result['word_dist_without_stopwords'] = freq_dist_sentence(text,stop_flag=True)
    result['polarity_title_pos'] = polarity_sc(title)['pos'] * 100
    result['polarity_title_neg'] = polarity_sc(title)['neg'] * 100
    result['polarity_title_neu'] = polarity_sc(title)['neu'] * 100
    result['reading_standard'] = reading_standard(text)
    result['dependency_html'],result['ner_html'] = spacy_vizualizer(title,text)
    Result.append(result)# list of Dictionaries
    json_result = json.dumps(Result)
    return json_result
    

# JSON READ

In [3]:
# spacy vizuals
# most important word
# similarity
text = '''Researchers have developed a photocatalytic system based on a material in the class of metal-organic frameworks.
The system can be used to degrade pollutants present in water while simultaneously producing hydrogen that can be captured
and used further.'''

title = 'New material cleans and splits water which made them happy!'


In [4]:
response = get_article_features(title,text)
d = json.loads(response)

### Lexical Diversity

In [5]:
d[0]['lexical_diversity']

11.067193675889328

## Word Distribution with and without stopwords

In [6]:
dict(d[0]['word_dist'])

{'.': 2,
 'a': 2,
 'and': 1,
 'based': 1,
 'be': 2,
 'can': 2,
 'captured': 1,
 'class': 1,
 'degrade': 1,
 'developed': 1,
 'frameworks': 1,
 'further': 1,
 'have': 1,
 'hydrogen': 1,
 'in': 2,
 'material': 1,
 'metal-organic': 1,
 'of': 1,
 'on': 1,
 'photocatalytic': 1,
 'pollutants': 1,
 'present': 1,
 'producing': 1,
 'researchers': 1,
 'simultaneously': 1,
 'system': 2,
 'that': 1,
 'the': 2,
 'to': 1,
 'used': 2,
 'water': 1,
 'while': 1}

In [7]:
d[0]['word_dist_without_stopwords']

{'based': 1,
 'captured': 1,
 'class': 1,
 'degrade': 1,
 'developed': 1,
 'frameworks': 1,
 'hydrogen': 1,
 'material': 1,
 'photocatalytic': 1,
 'pollutants': 1,
 'present': 1,
 'producing': 1,
 'researchers': 1,
 'simultaneously': 1,
 'system': 2,
 'used': 2,
 'water': 1}

## Polarity Scores

In [8]:
d[0]['polarity_title_pos']

30.7

In [9]:
d[0]['polarity_title_neg'] 

0.0

In [10]:
d[0]['polarity_title_neu']

69.3

## Reading Standard (8th grade and so on)

In [11]:
d[0]['reading_standard']

['13', '14']

## Dependency Graph of Article

In [12]:
d[0]['dependency_html']

'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" id="0" class="displacy" width="1800" height="399.5" style="max-width: none; height: 399.5px; color: #000000; background: #ffffff; font-family: Arial"><text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"><tspan class="displacy-word" fill="currentColor" x="50">New</tspan><tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">ADJ</tspan></text><text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"><tspan class="displacy-word" fill="currentColor" x="225">material</tspan><tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">NOUN</tspan></text><text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"><tspan class="displacy-word" fill="currentColor" x="400">cleans</tspan><tspan class="displacy-tag" dy="2em" fill="currentColor" x="400">NOUN</tspan></text><text class="displacy-token" fill="currentColor" text-anchor="mid

## NER of Content Entered

In [13]:
d[0]['ner_html']

'<div class="entities" style="line-height: 2.5">Researchers have developed a photocatalytic system based on a material in the class of metal-organic frameworks.<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone"><span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">GPE</span></mark>The system can be used to degrade pollutants present in water while simultaneously producing hydrogen that can be captured<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone"><span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-lef