In [22]:
import sys

f = open('nyt.txt', 'r')

news_content=f.read()
news_content

"Mr. Obama planned to promote the effort on Monday during a visit to Camden, N.J. The ban is part of Mr. Obama's push to ease tensions between law enforcement and minority \\communities in reaction to the crises in Baltimore; Ferguson, Mo. We are, without a doubt, sitting at a defining moment in American policing, Ronald L. Davis, the director of the Office of Community Oriented Policing Services at the Department of Justice, told reporters in a conference call organized by the White House"

In [27]:
import nltk

results=[]

for sent_no,sentence in enumerate(nltk.sent_tokenize(news_content)):
    no_of_tokens = len(nltk.word_tokenize(sentence))
    
    # Let's do POS tagging
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

    # Count the no of Nouns in the sentence
    no_of_nouns=len([word for word, pos in tagged if pos in ["NN","NNP"] ])

    #Use NER to tag the named entities.
    ners = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)), binary=False)
    no_of_ners= len([chunk for chunk in ners if hasattr(chunk, 'node')])
    
    score=(no_of_ners + no_of_nouns) / float(no_of_tokens)

    results.append((sent_no, no_of_tokens, no_of_ners, no_of_nouns,
                    score,
                    sentence))

In [26]:
results

[(0,
  17,
  0,
  7,
  0.4117647058823529,
  'Mr. Obama planned to promote the effort on Monday during a visit to Camden, N.J.'),
 (1,
  30,
  0,
  12,
  0.4,
  "The ban is part of Mr. Obama's push to ease tensions between law enforcement and minority \\communities in reaction to the crises in Baltimore; Ferguson, Mo."),
 (2,
  47,
  0,
  18,
  0.3829787234042553,
  'We are, without a doubt, sitting at a defining moment in American policing, Ronald L. Davis, the director of the Office of Community Oriented Policing Services at the Department of Justice, told reporters in a conference call organized by the White House')]

In [29]:
for sent in sorted(results, key=lambda x: x[4],reverse=True):
    print sent[5]
    print

Mr. Obama planned to promote the effort on Monday during a visit to Camden, N.J.

The ban is part of Mr. Obama's push to ease tensions between law enforcement and minority \communities in reaction to the crises in Baltimore; Ferguson, Mo.

We are, without a doubt, sitting at a defining moment in American policing, Ronald L. Davis, the director of the Office of Community Oriented Policing Services at the Department of Justice, told reporters in a conference call organized by the White House



### TF-IDF 

In [41]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
results = []

news_content = """Mr. Obama planned to promote the effort on Monday during
a visit to Camden, N.J. The ban is part of Mr. Obama's push to ease
tensions between law enforcement and minority \communities in reaction to
the crises in Baltimore; Ferguson, Mo. We are, without a doubt, sitting
at a defining moment in American policing, Ronald L. Davis, the director
of the Office of Community Oriented Policing Services at the Department
of Justice, told reporters in a conference call organized by the White
House"""

# sentence tokenize
sentences = nltk.sent_tokenize(news_content)

vectorizer = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)

sklearn_binary = vectorizer.fit_transform(sentences)
print vectorizer.get_feature_names()
print sklearn_binary.toarray()
for i in sklearn_binary.toarray():
    results.append(i.sum()/float(len(i.nonzero()[0])))

results

[u'american', u'and', u'are', u'at', u'baltimore', u'ban', u'between', u'by', u'call', u'camden', u'communities', u'community', u'conference', u'crises', u'davis', u'defining', u'department', u'director', u'doubt', u'during', u'ease', u'effort', u'enforcement', u'ferguson', u'house', u'in', u'is', u'justice', u'law', u'minority', u'mo', u'moment', u'monday', u'mr', u'obama', u'of', u'office', u'on', u'organized', u'oriented', u'part', u'planned', u'policing', u'promote', u'push', u'reaction', u'reporters', u'ronald', u'services', u'sitting', u'tensions', u'the', u'to', u'told', u'visit', u'we', u'white', u'without']
[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.30993994  0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.30993994
   0.          0.30993994  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.309

[0.28281630091973642, 0.20674753944674082, 0.17991562138192305]