In [156]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk 

In [157]:
import textwrap
def wrap(x):
    """Return ``x`` re-flowed to textwrap's default line width.

    Existing whitespace characters (including newlines) are kept as they are
    instead of being collapsed to spaces, and sentence endings are normalised
    to be followed by two spaces.
    """
    wrapper = textwrap.TextWrapper(replace_whitespace=False,
                                   fix_sentence_endings=True)
    return wrapper.fill(x)

In [158]:
def load_corpus(root):
    """Read every ``*.txt`` article below ``root``.

    Each article is labelled with the name of the directory that contains it.
    Directories and files are visited in sorted order so that the resulting
    row order (and therefore positional lookups such as ``iloc[2]``) does not
    depend on the order in which the operating system lists entries.

    Parameters
    ----------
    root : str
        Top-level corpus directory.  A missing directory yields empty lists.

    Returns
    -------
    (list of str, list of str)
        Article texts and their labels, in matching order.  Every line of an
        article is prefixed with a single space, exactly as before.
    """
    texts, labels = [], []
    for dirname, dirnames, filenames in os.walk(root):
        dirnames.sort()  # in-place edit steers the order os.walk descends in
        for filename in sorted(filenames):
            # Case-sensitive test: this also skips the corpus' README.TXT,
            # so no separate (silently failing) removal step is needed.
            if not filename.endswith(".txt"):
                continue
            # basename works with both '/' and '\\' separators, unlike
            # splitting on a hard-coded backslash.
            labels.append(os.path.basename(dirname))
            fullpathfile = os.path.join(dirname, filename)
            with open(fullpathfile, 'r', encoding="utf8", errors='ignore') as infile:
                texts.append(''.join(' ' + line for line in infile))
    return texts, labels


text, label = load_corpus('bbc')

In [159]:
# One row per article: the raw text next to its category label.
rows = [(body, category) for body, category in zip(text, label)]
df = pd.DataFrame(rows, columns=['text', 'label'])
df.head()

Unnamed: 0,text,label
0,Ad sales boost Time Warner profit\n \n Quarte...,business
1,Dollar gains on Greenspan speech\n \n The dol...,business
2,Yukos unit buyer faces loan claim\n \n The ow...,business
3,High fuel prices hit BA's profits\n \n Britis...,business
4,Pernod takeover talk lifts Domecq\n \n Shares...,business


In [160]:
# Number of articles per category.
df['label'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: label, dtype: int64

In [161]:
# Keep only the text of the articles labelled 'tech'.
is_tech = df['label'] == 'tech'
tech = df.loc[is_tech, 'text']
tech.head()

1824     Ink helps drive democracy in Asia\n \n The Ky...
1825     China net cafe culture crackdown\n \n Chinese...
1826     Microsoft seeking spyware trojan\n \n Microso...
1827     Digital guru floats sub-$100 PC\n \n Nicholas...
1828     Technology gets the creative bug\n \n The hi-...
Name: text, dtype: object

**Test on one text**

In [162]:
# Show the third tech article, re-wrapped for readability.
sample_article = tech.iloc[2]
print(wrap(sample_article))

 Microsoft seeking spyware trojan
 
 Microsoft is investigating a
trojan program that attempts to switch off the firm's anti-spyware
software.
 
 The spyware tool was only released by Microsoft in the
last few weeks and has been downloaded by six million people.  Stephen
Toulouse, a security manager at Microsoft, said the malicious program
was called Bankash-A Trojan and was being sent as an e-mail
attachment.  Microsoft said it did not believe the program was
widespread and recommended users to use an anti-virus program.  The
program attempts to disable or delete Microsoft's anti-spyware tool
 
 It may also try to
steal online banking passwords or other personal information by
tracking users' keystrokes.
 
 Microsoft said in a statement it is
investigating what it called a criminal attack on its software.
Earlier this week, Microsoft said it would buy anti-virus software
maker Sybari Software to improve its security in its Windows and
e-mail software.  Microsoft has said it plans to o

In [163]:
# Tokenize the article body (everything after the headline line) into sentences.
# sent_tokenize needs the 'punkt' resource, so fetch it here instead of relying
# on a later cell having been run first; the call is a no-op when it is present.
nltk.download('punkt', quiet=True)
sents = nltk.sent_tokenize(tech.iloc[2].split("\n",1)[1])
# The original printed an undefined name (sent_sample), which raised NameError.
print(sents)



In [164]:
# Fetch the NLTK resources used by this notebook: the sentence tokenizer
# model and the stop-word lists.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are dropped before weighting.  norm='l1' rescales each
# row by the sum of its weights so that long sentences are not favoured.
english_stopwords = stopwords.words('english')
vectorize = TfidfVectorizer(stop_words=english_stopwords, norm='l1')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\e175932\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\e175932\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [165]:
# Fit the TF-IDF vectorizer on this article's sentences only; the scoring loop
# below reads X one row per sentence.
X = vectorize.fit_transform(sents)

#calculate scores for the ranking
def get_sentence_score(tfidf_vec):
    """Score one sentence as the mean of its non-zero TF-IDF weights.

    Parameters
    ----------
    tfidf_vec : array-like or scipy sparse matrix
        TF-IDF weights of a single sentence (typically one row of the
        vectorizer output).

    Returns
    -------
    float
        Mean of the non-zero weights, or 0.0 when the sentence has none
        (e.g. it consists only of stop words).  Previously that case produced
        NaN together with a "mean of empty slice" RuntimeWarning.
    """
    # Sparse rows are densified first; a single row is small enough for this.
    if hasattr(tfidf_vec, "toarray"):
        weights = tfidf_vec.toarray()
    else:
        weights = np.asarray(tfidf_vec)
    nonzero = weights[weights != 0]  # zero weight = term absent from the sentence
    if nonzero.size == 0:
        return 0.0
    return float(nonzero.mean())
# One score per sentence, in the same order as `sents`.
scores = np.array(
    [get_sentence_score(X[row, :]) for row in range(len(sents))],
    dtype=float,
)


In [166]:
# Sentence indices ordered from highest to lowest score: argsort sorts
# ascending, so the scores are negated first.
sort_idx = (-scores).argsort()

In [167]:
# Build the summary from the best-scoring 30% of the sentences (rounded down),
# printing each one prefixed with its score.
n = int(0.3 * len(sents))
for rank in sort_idx[:n]:
    summary_line = "%.2f: %s" % (scores[rank], sents[rank])
    print(wrap(summary_line))

0.12: Microsoft said in a statement it is investigating what it called
a criminal attack on its software.
0.10:  
 Microsoft is investigating a trojan program that attempts to
switch off the firm's anti-spyware software.
0.10: The spyware tool was only released by Microsoft in the last few
weeks and has been downloaded by six million people.


In [168]:
# Compare the summary with the headline, i.e. the text before the first newline.
tech.iloc[2].partition("\n")[0]

' Microsoft seeking spyware trojan'

**Compile the process**

In [169]:
def summarize(text, ratio=0.3):
    """Print an extractive summary of ``text``.

    The text is split into sentences with ``nltk.sent_tokenize``, the
    notebook-level ``vectorize`` is re-fitted on those sentences, and every
    sentence row is scored with ``get_sentence_score``.  The best-scoring
    sentences are printed, highest score first, as ``"<score>: <sentence>"``
    wrapped by ``wrap``.

    Parameters
    ----------
    text : str
        Document body to summarise.
    ratio : float, default 0.3
        Fraction of the sentences to print (rounded down).  At least one
        sentence is always printed for a non-empty text, so documents with
        fewer than four sentences no longer produce an empty summary.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    sents = nltk.sent_tokenize(text)
    if not sents:
        # Nothing to rank; fitting the vectorizer on an empty corpus would fail.
        return
    X = vectorize.fit_transform(sents)

    #compute scores for each sentence
    scores = np.array([get_sentence_score(X[i, :]) for i in range(len(sents))],
                      dtype=float)
    sort_idx = np.argsort(-scores)  # negated so the highest score comes first
    n = max(1, int(ratio * len(sents)))
    for i in sort_idx[:n]:
        print(wrap( "%.2f: %s" % (scores[i], sents[i])))

**Entertainment Article**

In [171]:
# Draw one entertainment article at random (a different one on every run).
doc = df[df['label'] == "entertainment"]['text'].sample()
# The first line is the headline; only the text after it is summarised.
summarize(doc.iloc[0].split("\n", 1)[1])
# Heading spelling fixed ("Orignal" -> "Original").
print("\n Original Document:")
print(wrap(doc.iloc[0]))

0.20: His brother and bandmate Noel was in bed at the time.
0.17: The band said they were victims of an "unprovoked attack" in a
nightclub.
0.10: At the time, police said a "physical altercation" broke out
among the musicians at about 0200 local time.

 Orignal Document:
 Oasis star fined for German brawl
 
 Oasis singer Liam Gallagher has
been fined 50,000 euros (£35,000) after a fight in a German hotel two
years ago.
 
 Gallagher was arrested along with drummer Alan White and
three other members of the band's entourage after the brawl in Munich
in December 2002. The band said they were victims of an "unprovoked
attack" in a nightclub.  But police said Gallagher kicked an officer
in the chest and had large amounts of alcohol and drugs - possibly
cocaine - in his blood.  Gallagher lost two front teeth in the fight,
which led to the band abandoning their German tour.  His brother and
bandmate Noel was in bed at the time.
 
 "The process has stopped by
paying 50,000 euros," said Anton Wi

**Politics Article**

In [144]:
from termcolor import colored

In [172]:
# Draw one politics article at random (a different one on every run).
doc = df[df['label'] == "politics"]['text'].sample()
# The first line is the headline; only the text after it is summarised.
summarize(doc.iloc[0].split("\n", 1)[1])
# Heading spelling fixed ("Orignal" -> "Original").
print(colored("\n Original Document:", attrs=['bold']))
print(wrap(doc.iloc[0]))

0.17: Israel will not attend, but is said to be closely watching the
outcome.
0.12: The attack, which killed five, was the first of its kind since
he took office.
0.11: The Israeli government refuses to accept Syria's denials that it
was implicated in the nightclub bombing.
0.11: He added: "We have an opportunity and it would be irresponsible
if we, the Israelis, or the world allow it to slip away."
[1m
 Orignal Document:[0m
 Abbas 'will not tolerate' attacks
 
 Palestinian leader Mahmoud Abbas
has said he will not tolerate attacks such as last Friday's suicide
bombing in the Israeli city of Tel Aviv.
 
 In an interview ahead of a
meeting in London to discuss Palestinian reforms, Mr Abbas said such
attacks were against Palestinian interests.  The Palestinian Authority
(PA) was exerting "a 100% effort" to end the violence, Mr Abbas added.
The attack, which killed five, was the first of its kind since he took
office.  Mr Abbas confirmed Israel shared information with the PA in
the hunt

**Sport Article**

In [150]:
# Draw one sport article at random (a different one on every run).
doc = df[df['label'] == "sport"]['text'].sample()
# The first line is the headline; only the text after it is summarised.
summarize(doc.iloc[0].split("\n", 1)[1])
# Heading spelling fixed ("Orignal" -> "Original").
print(colored("\n Original Document:", attrs=['bold']))
print(wrap(doc.iloc[0]))

0.13: "We are watching it closely," said London race director David
Bedford.
0.12: "There is a long way to go before the race and we are hoping the
situation will be satisfactorily resolved."
[1m
 Orignal Document:[0m
 London hope over Chepkemei
 
 London Marathon organisers are hoping
that banned athlete Susan Chepkemei will still take part in this
year's race on 17 April.
 
 Chepkemei was suspended from all
competition until the end of the year by Athletics Kenya after failing
to report to a national training camp.  "We are watching it closely,"
said London race director David Bedford.  "There is a long way to go
before the race and we are hoping the situation will be satisfactorily
resolved."  The camp in Embu was to prepare for the IAAF World Cross
Country Championships later this month.  Chepkemei however took part
and finished third in last Sunday's world best 10K race in Puerto
Rico.  The 29-year-old has finished second to Paula Radcliffe in the
2002 and 2003 London races as w