In [36]:
import pandas as pd
import numpy as np 
import textwrap
import nltk 
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK data this notebook relies on: the 'punkt' sentence-tokenizer
# models and the 'stopwords' corpus. Packages already present are left as-is.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# sent_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Yash
[nltk_data]     Phatak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Yash
[nltk_data]     Phatak\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
# Load the BBC news articles; the path is relative to the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably an
# index that was saved with the CSV — index_col=0 would drop it; confirm.
data = pd.read_csv('bbc_text_cls.csv')

In [5]:
# Preview the first rows: each record holds the article `text` and its `labels` category.
data.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [6]:
# Draw one business article at random; the fixed seed keeps the pick reproducible.
# The result is a one-element Series, so the text itself is read with doc.iloc[0].
doc = data.loc[data["labels"] == "business", "text"].sample(random_state=42)

In [7]:
def wrap(x):
    """Return ``x`` re-flowed to textwrap's default line width for nicer printing.

    Whitespace already in the text (for example newlines) is kept rather than
    replaced by spaces, and sentence endings are normalised to be followed by
    two spaces.
    """
    wrapper = textwrap.TextWrapper(
        fix_sentence_endings=True,
        replace_whitespace=False,
    )
    return wrapper.fill(x)

In [19]:
# Print the sampled article in full (headline and body), re-flowed by wrap().
print(wrap(doc.iloc[0]))

Christmas sales worst since 1981

UK retail sales fell in December,
failing to meet expectations and making it by some counts the worst
Christmas since 1981.

Retail sales dropped by 1% on the month in
December, after a 0.6% rise in November, the Office for National
Statistics (ONS) said.  The ONS revised the annual 2004 rate of growth
down from the 5.9% estimated in November to 3.2%. A number of
retailers have already reported poor figures for December.  Clothing
retailers and non-specialist stores were the worst hit with only
internet retailers showing any significant growth, according to the
ONS.

The last time retailers endured a tougher Christmas was 23 years
previously, when sales plunged 1.7%.

The ONS echoed an earlier
caution from Bank of England governor Mervyn King not to read too much
into the poor December figures.  Some analysts put a positive gloss on
the figures, pointing out that the non-seasonally-adjusted figures
showed a performance comparable with 2003. The Novembe

In [30]:
# Drop the headline before tokenising: split("\n", 1) performs at most one
# split, giving [title, body], and only the body is sentence-tokenised.
# The body begins with the blank line that separates it from the title, so it
# is stripped first; otherwise the first sentence keeps a stray leading "\n".
sents = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1].strip())

In [31]:
# Inspect the tokenised sentences.
sents

['\nUK retail sales fell in December, failing to meet expectations and making it by some counts the worst Christmas since 1981.',
 'Retail sales dropped by 1% on the month in December, after a 0.6% rise in November, the Office for National Statistics (ONS) said.',
 'The ONS revised the annual 2004 rate of growth down from the 5.9% estimated in November to 3.2%.',
 'A number of retailers have already reported poor figures for December.',
 'Clothing retailers and non-specialist stores were the worst hit with only internet retailers showing any significant growth, according to the ONS.',
 'The last time retailers endured a tougher Christmas was 23 years previously, when sales plunged 1.7%.',
 'The ONS echoed an earlier caution from Bank of England governor Mervyn King not to read too much into the poor December figures.',
 'Some analysts put a positive gloss on the figures, pointing out that the non-seasonally-adjusted figures showed a performance comparable with 2003.',
 'The November-De

In [41]:
# TF-IDF vectorizer that ignores NLTK's English stop words and L1-normalises
# each row (each sentence).
# NOTE(review): per the scikit-learn docs, norm='l1' scales every row so its
# weights sum to 1. The mean of a row's non-zero weights — which is what
# get_sentence_score computes — then works out to 1 / (number of distinct
# non-stop-word terms), so the ranking favours short sentences regardless of
# the TF-IDF weights. Confirm this is intended; norm=None would rank on the
# raw TF-IDF weights instead.
featurizer = TfidfVectorizer(norm='l1', stop_words=stopwords.words('english'))

In [42]:
# Fit the vectorizer on this article's sentences; X has one row per sentence.
X = featurizer.fit_transform(sents)

In [43]:
def get_sentence_score(tfidf_row):
    """Score a sentence as the mean of its non-zero TF-IDF weights.

    Parameters
    ----------
    tfidf_row
        TF-IDF representation of a single sentence. Anything that supports
        ``row[row != 0]`` works; the notebook passes one row of the matrix
        returned by ``featurizer.fit_transform``.

    Returns
    -------
    float
        Mean of the non-zero entries, or ``0.0`` when the row has none
        (for example a sentence made up only of stop words).
    """
    x = tfidf_row[tfidf_row != 0]
    if x.size == 0:
        # No scored terms: return 0 instead of the NaN (and RuntimeWarning)
        # that the mean of an empty selection would produce.
        return 0.0
    return x.mean()

In [44]:
# Score every sentence from its row of the TF-IDF matrix.
scores = np.array(
    [get_sentence_score(X[row, :]) for row in range(len(sents))],
    dtype=float,
)

In [50]:
# argsort orders ascending, so negating the scores yields sentence indices
# from the highest score to the lowest.
sort_idx = np.argsort(-scores) #descending indices

In [51]:
# Options for deciding which sentences go into the summary:
# 1. Top N sentences
# 2. Top N words or characters
# 3. Top X% of sentences or top X% of words
# 4. Sentences with score > average score
# 5. Sentences with score > factor * average score
# Sorting by score is also optional: keeping the original sentence order may
# make more sense, e.g. when summarising a story.


In [52]:
# Show the five best-scoring sentences, highest score first, each prefixed
# with its score.
print("Generated Summary:")
summary_lines = ["%.2f:%s" % (scores[i], sents[i]) for i in sort_idx[:5]]
for line in summary_lines:
    print(wrap(line))

Generated Summary:
0.14:A number of retailers have already reported poor figures for
December.
0.13:However, reports from some High Street retailers highlight the
weakness of the sector.
0.12:The ONS revised the annual 2004 rate of growth down from the 5.9%
estimated in November to 3.2%.
0.10:"Our view is the Bank of England will keep its powder dry and
wait to see the big picture."
0.10:And a British Retail Consortium survey found that Christmas 2004
was the worst for 10 years.


In [53]:
# The article's headline (the text before the first newline), for comparison with the summary.
doc.iloc[0].split("\n",1)[0]

'Christmas sales worst since 1981'

In [54]:
def summarize(text, n_sentences=5):
    """Print an extractive summary of ``text``.

    The text is split into sentences with ``nltk.sent_tokenize``, the
    module-level ``featurizer`` is refit on those sentences, every sentence is
    scored with ``get_sentence_score``, and the highest-scoring sentences are
    printed best-first as ``"<score>:<sentence>"`` through ``wrap``.

    Parameters
    ----------
    text : str
        Body of the document to summarise.
    n_sentences : int, default 5
        Maximum number of sentences to print.

    Returns
    -------
    None
        The summary is printed, not returned.

    Notes
    -----
    This refits the shared ``featurizer``, so after the call its vocabulary
    reflects ``text`` rather than whatever it was fitted on before.
    """
    sents = nltk.sent_tokenize(text)
    if not sents:
        # Nothing to rank; scikit-learn vectorizers raise on an empty corpus.
        return
    X = featurizer.fit_transform(sents)
    scores = np.array(
        [get_sentence_score(X[i, :]) for i in range(len(sents))],
        dtype=float,
    )
    # A sentence with no scored terms can come back as NaN; treat it as 0 so
    # it ranks last and is never printed as "nan".
    scores[np.isnan(scores)] = 0.0
    sort_idx = np.argsort(-scores)
    for i in sort_idx[:n_sentences]:
        print(wrap("%.2f:%s" % (scores[i], sents[i])))

In [58]:
# Repeat the experiment on one randomly drawn entertainment article (fixed
# seed for reproducibility); the headline line is removed before summarising.
doc = data.loc[data["labels"] == "entertainment", "text"].sample(random_state=121)
summarize(doc.iloc[0].split("\n", 1)[1])

0.50:"We're sure fans will too."
0.20:Weekend tickets are now on sale priced at £125 each.
0.11:Other acts in this year's line-up include The Charlatans, Marilyn
Manson and Kings of Leon.
0.11:Organiser Mean Fiddler said more acts were still to be confirmed
for the summer event.
0.10:It will the first time the veteran British metal band have played
Reading in 23 years.


In [59]:
# Headline of the entertainment article (the text before the first newline).
doc.iloc[0].split("\n",1)[0]

'Pixies take on Reading and Leeds'

In [60]:
# Print the full entertainment article (headline and body) to compare against the summary.
print(wrap(doc.iloc[0]))

Pixies take on Reading and Leeds

Pixies, Foo Fighters and Iron Maiden
will headline this summer's Leeds and Reading festivals.

The trio of
rock heavyweights will top the bill for the three-day events at
Bramham Park, near Wetherby, and at Richfield Avenue, Reading.  They
are the Pixies' and Iron Maiden's only UK festival gigs, while Foo
Fighters are also at T In The Park.  The Killers, Razorlight and
Queens of the Stone Age are also playing the twin festivals, to be
held on 26-28 August.  Other acts in this year's line-up include The
Charlatans, Marilyn Manson and Kings of Leon.  Pete Doherty's band
Babyshamblers will appear on the NME/Radio One stage along with
Kasabian, Black Rebel Motorcycle Club and Bloc Party.

Organiser Mean
Fiddler said more acts were still to be confirmed for the summer
event.  "We are all very excited to be going back to Reading and
intend to have a fantastic time," said Iron Maiden's Bruce Dickinson.
"We're sure fans will too."  It will the first time the v