In [1]:
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

--2024-08-01 16:57:27--  https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.21.23.210, 172.67.213.166, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.21.23.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5085081 (4.8M) [text/csv]
Saving to: ‘bbc_text_cls.csv’


2024-08-01 16:57:27 (62.4 MB/s) - ‘bbc_text_cls.csv’ saved [5085081/5085081]



In [8]:
df = pd.read_csv('bbc_text_cls.csv')

In [9]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [16]:
doc=df[df.labels == 'sport']['text'].sample(random_state=42)

In [17]:
doc

Unnamed: 0,text
1437,"Premier League planning Cole date\n\nThe Premier League is attempting to find a mutually convenient date to investigate allegations Chelsea made an illegal approach for Ashley Cole.\n\nBoth Chelsea and Arsenal will be asked to give evidence to a Premier League commission, but no deadline has been put on when that meeting will convene. ""It's hard to put a date on it,"" a Premier League spokesman confirmed to BBC Sport. ""It's not a formal situation where they've got so much time to respond."" Arsenal and England defender Cole reportedly met Blues boss Jose Mourinho and chief executive Peter Kenyon in a London hotel 11 days ago. Chelsea have yet to officially confirm or deny the meeting, which would be in breach of Premier League rule K3.\n\nNow the Gunners have asked for an inquiry to look into claims that their player has been ""tapped up"". Both clubs have pledged to co-operate with the inquiry which will be conducted on a single day as opposed to being run as an ongoing evaluation. Cole is in negotiations with the Gunners over extending his current deal which ends in 2007. And his Arsenal team-mate Robert Pires has urged the England left-back to stay at Highbury. Pires told the Evening Standard: ""He has been at Arsenal for ever. He is a very attacking left-back and I think he is enjoying his football because at Arsenal he plays in an offensive team. ""I am not sure he will get the same pleasure at Chelsea, even though they are doing so well at the moment. ""I have built a fantastic playing relationship with Ashley. ""We play together so well - we could do it with our eyes shut. ""But you have to respect the decision of the player. Everybody has that right."""


In [18]:
def wrap(x):
  return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [19]:
print(wrap(doc.iloc[0]))

Premier League planning Cole date

The Premier League is attempting to
find a mutually convenient date to investigate allegations Chelsea
made an illegal approach for Ashley Cole.

Both Chelsea and Arsenal
will be asked to give evidence to a Premier League commission, but no
deadline has been put on when that meeting will convene.  "It's hard
to put a date on it," a Premier League spokesman confirmed to BBC
Sport.  "It's not a formal situation where they've got so much time to
respond."  Arsenal and England defender Cole reportedly met Blues boss
Jose Mourinho and chief executive Peter Kenyon in a London hotel 11
days ago.  Chelsea have yet to officially confirm or deny the meeting,
which would be in breach of Premier League rule K3.

Now the Gunners
have asked for an inquiry to look into claims that their player has
been "tapped up". Both clubs have pledged to co-operate with the
inquiry which will be conducted on a single day as opposed to being
run as an ongoing evaluation.  Cole is

In [22]:
print(doc.iloc[0].split("\n", 1)[1])


The Premier League is attempting to find a mutually convenient date to investigate allegations Chelsea made an illegal approach for Ashley Cole.

Both Chelsea and Arsenal will be asked to give evidence to a Premier League commission, but no deadline has been put on when that meeting will convene. "It's hard to put a date on it," a Premier League spokesman confirmed to BBC Sport. "It's not a formal situation where they've got so much time to respond." Arsenal and England defender Cole reportedly met Blues boss Jose Mourinho and chief executive Peter Kenyon in a London hotel 11 days ago. Chelsea have yet to officially confirm or deny the meeting, which would be in breach of Premier League rule K3.

Now the Gunners have asked for an inquiry to look into claims that their player has been "tapped up". Both clubs have pledged to co-operate with the inquiry which will be conducted on a single day as opposed to being run as an ongoing evaluation. Cole is in negotiations with the Gunners over 

In [23]:
sents = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])

In [24]:
featurizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    norm='l1')

In [25]:
X = featurizer.fit_transform(sents)


In [27]:
# compute similarity matrix
S = cosine_similarity(X)

In [28]:
# normalize similarity matrix
S /= S.sum(axis=1, keepdims=True)

In [30]:
# uniform transition matrix
U = np.ones_like(S) / len(S)

In [31]:
# smoothed similarity matrix
factor = 0.15
S = (1 - factor) * S + factor * U

In [32]:
# find the limiting / stationary distribution
eigenvals, eigenvecs = np.linalg.eig(S.T)

In [33]:
eigenvecs[:,0] / eigenvecs[:,0].sum()


array([0.06655955, 0.07237271, 0.05798974, 0.05882353, 0.05516387,
       0.0593196 , 0.06413955, 0.05499482, 0.05497252, 0.06324132,
       0.05531656, 0.05782044, 0.05674999, 0.05339598, 0.05519106,
       0.05512522, 0.05882353])

In [34]:
limiting_dist = np.ones(len(S)) / len(S)
threshold = 1e-8
delta = float('inf')
iters = 0
while delta > threshold:
  iters += 1

  # Markov transition
  p = limiting_dist.dot(S)

  # compute change in limiting distribution
  delta = np.abs(p - limiting_dist).sum()

  # update limiting distribution
  limiting_dist = p

print(iters)

61


In [35]:
scores = limiting_dist


In [36]:
sort_idx = np.argsort(-scores)


In [37]:
# Many options for how to choose which sentences to include:

# 1) top N sentences
# 2) top N words
# 3) top X% sentences or top X% words
# 4) sentences with scores > average score
# 5) sentences with scores > factor * average score

# You also don't have to sort. May make more sense in order.

print("Generated summary:")
for i in sort_idx[:5]:
  print(wrap("%.2f: %s" % (scores[i], sents[i])))

Generated summary:
0.07: Both Chelsea and Arsenal will be asked to give evidence to a
Premier League commission, but no deadline has been put on when that
meeting will convene.
0.07: 
The Premier League is attempting to find a mutually convenient
date to investigate allegations Chelsea made an illegal approach for
Ashley Cole.
0.06: Now the Gunners have asked for an inquiry to look into claims
that their player has been "tapped up".
0.06: And his Arsenal team-mate Robert Pires has urged the England
left-back to stay at Highbury.
0.06: Chelsea have yet to officially confirm or deny the meeting,
which would be in breach of Premier League rule K3.


In [38]:
doc.iloc[0].split("\n")[0]


'Premier League planning Cole date'

Libraries for Text Summarization

In [39]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

In [40]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

In [41]:
summarizer = TextRankSummarizer()
parser = PlaintextParser.from_string(
    doc.iloc[0].split("\n", 1)[1],
    Tokenizer("english"))
summary = summarizer(parser.document, sentences_count=5)

In [42]:
summary


(<Sentence: The Premier League is attempting to find a mutually convenient date to investigate allegations Chelsea made an illegal approach for Ashley Cole.>,
 <Sentence: Both Chelsea and Arsenal will be asked to give evidence to a Premier League commission, but no deadline has been put on when that meeting will convene.>,
 <Sentence: "It's hard to put a date on it," a Premier League spokesman confirmed to BBC Sport.>,
 <Sentence: Both clubs have pledged to co-operate with the inquiry which will be conducted on a single day as opposed to being run as an ongoing evaluation.>,
 <Sentence: He is a very attacking left-back and I think he is enjoying his football because at Arsenal he plays in an offensive team.>)

In [43]:
for s in summary:
  print(wrap(str(s)))

The Premier League is attempting to find a mutually convenient date to
investigate allegations Chelsea made an illegal approach for Ashley
Cole.
Both Chelsea and Arsenal will be asked to give evidence to a Premier
League commission, but no deadline has been put on when that meeting
will convene.
"It's hard to put a date on it," a Premier League spokesman confirmed
to BBC Sport.
Both clubs have pledged to co-operate with the inquiry which will be
conducted on a single day as opposed to being run as an ongoing
evaluation.
He is a very attacking left-back and I think he is enjoying his
football because at Arsenal he plays in an offensive team.


In [44]:
summarizer = LsaSummarizer()
summary = summarizer(parser.document, sentences_count=5)
for s in summary:
  print(wrap(str(s)))

"It's not a formal situation where they've got so much time to
respond."
Chelsea have yet to officially confirm or deny the meeting, which
would be in breach of Premier League rule K3.
Now the Gunners have asked for an inquiry to look into claims that
their player has been "tapped up".
And his Arsenal team-mate Robert Pires has urged the England left-back
to stay at Highbury.
Pires told the Evening Standard: "He has been at Arsenal for ever.


In [47]:
# https://radimrehurek.com/gensim_3.8.3/summarization/summariser.html
# https://arxiv.org/abs/1602.03606
# Parameters
# text (str) – Given text.
# ratio (float, optional) – Number between 0 and 1 that determines the
#     proportion of the number of sentences of the original text to be
#     chosen for the summary.
# word_count (int or None, optional) – Determines how many words will the
#     output contain. If both parameters are provided, the ratio will be
#     ignored.
# split (bool, optional) – If True, list of sentences will be returned.
#     Otherwise joined strings will bwe returned.
# from gensim.summarization.summarizer import summarize
# summary = summarize(doc.iloc[0].split("\n", 1)[1])
# print(wrap(summary))