In [2]:
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


In [4]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashut\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
df=pd.read_csv('bbc_text_cls (1).csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [7]:
lables=set(df['labels'])
lables

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [12]:
label='politics'

896    Labour plans maternity pay rise\n\nMaternity p...
897    Watchdog probes e-mail deletions\n\nThe inform...
898    Hewitt decries 'career sexism'\n\nPlans to ext...
899    Labour chooses Manchester\n\nThe Labour Party ...
900    Brown ally rejects Budget spree\n\nChancellor ...
Name: text, dtype: object

In [26]:
doc=df[df.labels=='politics']['text'].sample(random_state=10)
doc

1124    Iraqis win death test case probe\n\nThe family...
Name: text, dtype: object

In [27]:
def wrap(x):
    return textwrap.fill(x,replace_whitespace=False,fix_sentence_endings=True)



In [28]:
print(wrap(doc.iloc[0]))

Iraqis win death test case probe

The family of an Iraqi civilian
allegedly killed by UK troops have won a challenge against the
government's refusal to order a full inquiry.

The High Court ruled on
Tuesday that Baha Mousa's death in British custody in Iraq fell within
the European Convention on Human Rights.  And the judges paved the way
for an independent inquiry by saying previous investigations were
inadequate.  But judicial reviews into five other deaths in southern
Iraq were ruled out.  Their families will be appealing against the
judgement.

The families' solicitor Phil Shiner described it as "a
historic day for human rights and the rule of law in the UK". Father-
of-two Mr Mousa, 28, a hotel receptionist, was arrested with eight men
seized at a hotel in Basra in September 2003. He was allegedly beaten
to death while in the custody of the Queen's Lancashire Regiment.  The
Iraqi families' lawyer argued that failing to adequately investigate
the death breached the European Conven

In [31]:
sents=nltk.sent_tokenize(doc.iloc[0].split("\n",1)[1])
sents

["\nThe family of an Iraqi civilian allegedly killed by UK troops have won a challenge against the government's refusal to order a full inquiry.",
 "The High Court ruled on Tuesday that Baha Mousa's death in British custody in Iraq fell within the European Convention on Human Rights.",
 'And the judges paved the way for an independent inquiry by saying previous investigations were inadequate.',
 'But judicial reviews into five other deaths in southern Iraq were ruled out.',
 'Their families will be appealing against the judgement.',
 'The families\' solicitor Phil Shiner described it as "a historic day for human rights and the rule of law in the UK".',
 'Father-of-two Mr Mousa, 28, a hotel receptionist, was arrested with eight men seized at a hotel in Basra in September 2003.',
 "He was allegedly beaten to death while in the custody of the Queen's Lancashire Regiment.",
 "The Iraqi families' lawyer argued that failing to adequately investigate the death breached the European Convention

In [32]:
featurizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    norm='l1',
)

In [33]:
x=featurizer.fit_transform(sents)

In [34]:
def getSentScore(tfidf_row):
    x=tfidf_row[tfidf_row!=0]
    return x.mean()



In [36]:
# this score is the sum of all non zero text frequency of the vector of particular sentences
# more the score better that occurence of important term as more freq is important
scores = np.zeros(len(sents))
for i in range(len(sents)):
  score = getSentScore(x[i,:])
  scores[i] = score
scores  

array([0.07692308, 0.0625    , 0.11111111, 0.14285714, 0.33333333,
       0.08333333, 0.07142857, 0.14285714, 0.07692308, 0.08333333,
       0.05882353, 0.1       , 0.14285714, 0.1       , 0.0625    ,
       0.14285714, 0.14285714, 0.07692308, 0.25      , 0.08333333,
       0.09090909, 0.25      , 0.25      , 0.05555556, 0.1       ,
       0.09090909])

In [37]:
indexSort=np.argsort(-scores)
print("genetated summary")

genetated summary


In [39]:
for i in indexSort[:20]:
    print(wrap("%.2f: %s" % (scores[i], sents[i])))



0.33: Their families will be appealing against the judgement.
0.25: The MoD are considering whether to appeal."
0.25: "I can't say anything further for obvious reasons.
0.25: Both sides were granted permission to appeal.
0.14: "There must be an effective public investigation by an
independent official body.
0.14: But judicial reviews into five other deaths in southern Iraq
were ruled out.
0.14: He was allegedly beaten to death while in the custody of the
Queen's Lancashire Regiment.
0.14: Only such an investigation could reveal what really happened and
who might be responsible."
0.14: The other five Iraqis did not die in custody, so their cases had
to fail, they said.
0.11: And the judges paved the way for an independent inquiry by
saying previous investigations were inadequate.
0.10: Because we don't want people thinking that British soldiers beat
up civilians and get away with it'," he said.
0.10: They said as Mr Mousa was in custody when he died, his case came
within the UK's jurisd