In [1]:
import nltk

In [2]:
import sys
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Approach 1 ：score = (no_of_ners + no_of_nouns)/float(no_of_tokens)

### read the content of the news article 

In [3]:
f = open('text_data.txt')
news_content = f.read()
news_content

'Tests showed that the chemical fipronil, which can harm people\'s kidneys, liver and thyroid glands, was found in eggs from the Netherlands.\nFipronil is used to treat lice and ticks in chickens.\nOne German official said up to 10 million of the contaminated eggs may have been sold in Germany.\nChristian Meyer, the agriculture minister for Lower Saxony, told German television that there was a risk to children if they ate two of the eggs a day.\nAbout 180 poultry farms in the Netherlands have been temporarily shut in recent days while investigations are held.\nMeanwhile, European supermarkets have moved to halt the distribution of eggs from the affected batches.\nHowever, Aldi - which has close to 4,000 stores in Germany - is the first retailer to stop selling all eggs as a precaution.\n"This is merely a precaution, there is no reason to assume there are any health risks," Aldi said in a statement.\nA spokeswoman for Aldi UK told the BBC its eggs were all British and were not affected 

In [4]:
results = []
sent_tokens = nltk.sent_tokenize(news_content)
for sent_no, sentence in enumerate(sent_tokens):
    word_tokens = nltk.word_tokenize(sentence) # word tokenization
    
    no_of_tokens = len(word_tokens)  
    print('Number of tokens: ', no_of_tokens)
    
    tagged = nltk.pos_tag(word_tokens)  # POS tagging
    no_of_nouns = len([word for word,pos in tagged if pos in ["NN","NNP"]]) # Count the no of Nouns in the sentence
    
    ners = nltk.ne_chunk(nltk.pos_tag(word_tokens), binary=False) # Use NER to tag the named entities
    no_of_ners = len([chunk for chunk in ners if hasattr(chunk, 'label')]) # has attr 判定chunk 是否有属性‘node‘
    
    score = (no_of_ners + no_of_nouns)/float(no_of_tokens)
    results.append((sent_no, no_of_tokens, no_of_ners, no_of_nouns, score, sentence))
    

Number of tokens:  27
Number of tokens:  11
Number of tokens:  19
Number of tokens:  30
Number of tokens:  19
Number of tokens:  17
Number of tokens:  26
Number of tokens:  26
Number of tokens:  21
Number of tokens:  18
Number of tokens:  19
Number of tokens:  22
Number of tokens:  16


In [5]:
import pandas as pd

In [6]:
df = pd.DataFrame(results,columns=['sent_no','no_of_tokens','no_of_ners','no_of_nouns','score','sentence'])
df

Unnamed: 0,sent_no,no_of_tokens,no_of_ners,no_of_nouns,score,sentence
0,0,27,1,4,0.185185,"Tests showed that the chemical fipronil, which..."
1,1,11,1,2,0.272727,Fipronil is used to treat lice and ticks in ch...
2,2,19,2,2,0.210526,One German official said up to 10 million of t...
3,3,30,4,8,0.4,"Christian Meyer, the agriculture minister for ..."
4,4,19,1,2,0.157895,About 180 poultry farms in the Netherlands hav...
5,5,17,1,1,0.117647,"Meanwhile, European supermarkets have moved to..."
6,6,26,2,4,0.230769,"However, Aldi - which has close to 4,000 store..."
7,7,26,1,5,0.230769,"""This is merely a precaution, there is no reas..."
8,8,21,3,5,0.380952,A spokeswoman for Aldi UK told the BBC its egg...
9,9,18,0,2,0.111111,Reuters reports that investigators believe the...


In [7]:
# 按 score排序
df.sort_values('score',ascending=False)

Unnamed: 0,sent_no,no_of_tokens,no_of_ners,no_of_nouns,score,sentence
3,3,30,4,8,0.4,"Christian Meyer, the agriculture minister for ..."
8,8,21,3,5,0.380952,A spokeswoman for Aldi UK told the BBC its egg...
10,10,19,2,5,0.368421,Poultry World reported that fipronil may have ...
11,11,22,2,5,0.318182,The Netherlands is Europe's largest exporter o...
1,1,11,1,2,0.272727,Fipronil is used to treat lice and ticks in ch...
6,6,26,2,4,0.230769,"However, Aldi - which has close to 4,000 store..."
7,7,26,1,5,0.230769,"""This is merely a precaution, there is no reas..."
2,2,19,2,2,0.210526,One German official said up to 10 million of t...
0,0,27,1,4,0.185185,"Tests showed that the chemical fipronil, which..."
4,4,19,1,2,0.157895,About 180 poultry farms in the Netherlands hav...


# Approach 2
## 除了用上述的score，另一个句子重要性的度量是TF-IDF score
重要的句子通常包含重要的词，其中大部分是内涵丰富（富有歧义）的词——discriminatory words，其度量可采用TF-IDF

In [8]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

## 1.content(string)

In [9]:
results = []
news_content = "Mr. Obama planned to promote the effort on Monday during\
a visit to Camden, N.J. The ban is part of Mr. Obama's push to ease\
tensions between law enforcement and minority communities in reaction to\
the crises in Baltimore; Ferguson, Mo. We are, without a doubt, sitting\
at a defining moment in American policing, Ronald L. Davis, the director\
of the Office of Community Oriented Policing Services at the Department\
of Justice, told reporters in a conference call organized by the White\
House"

## 2.sentence tokens(list)

In [10]:
sentences = nltk.sent_tokenize(news_content)
vectorizer = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)

## 3. TfidfVectorizer 
### which is a scoring method 
### that calculate a vector of TF-IDF scores for each sentence in a given list of sentences.

## 4. term-doc matrix

For a given list of sentences/documents, will give you the score corresponding to each sentence and 

will also provide the ability to build ** a term-doc matrix** that will look just like our output.

In [14]:
sklearn_binary = vectorizer.fit_transform(sentences) 
len(sentences)
len(sklearn_binary.toarray())
print(".get_feature_names() :\n",vectorizer.get_feature_names())
print("\nsklearn_binary.toarray() term-doc matirx :\n",sklearn_binary.toarray()) # .fit_transfomr().toarray()

3

3

.get_feature_names() :
 ['american', 'and', 'are', 'at', 'baltimore', 'ban', 'between', 'by', 'call', 'camden', 'communities', 'community', 'conference', 'crises', 'davis', 'defining', 'departmentof', 'directorof', 'doubt', 'duringa', 'easetensions', 'effort', 'enforcement', 'ferguson', 'in', 'is', 'justice', 'law', 'minority', 'mo', 'moment', 'monday', 'mr', 'obama', 'of', 'office', 'on', 'organized', 'oriented', 'part', 'planned', 'policing', 'promote', 'push', 'reaction', 'reporters', 'ronald', 'services', 'sittingat', 'the', 'to', 'told', 'tothe', 'visit', 'we', 'whitehouse', 'without']

sklearn_binary.toarray() term-doc matirx :
 [[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.30993994  0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.30993994
   0.          0.30993994  0.          0.          0.          0.          0.
   0.          0.          0.          0.        

### 停用词的分值近乎0，特异词如ban、Obama 分值高

In [21]:
# 用非零TF-IDF 词 来计算平均TF-IDF score
results = []
for i in sklearn_binary.toarray(): 
    score_sum = i.sum()
    no_of_nonzero = len(i.nonzero()[0]) # TF-IDF值非零的个数,
    s = score_sum / float(no_of_nonzero)
    results.append(s)
    
print(results)

[0.28281630091973642, 0.205405136454686, 0.1838455955948535]


会发现两种score 很相近

# 了解了上述两种算法，就可以 对任意两篇给定的新闻搞进行信息摘要处理，即可写一个新闻摘要器。