In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/cnn/train.csv")

In [4]:
df

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...
...,...,...,...
287108,fffdfb56fdf1a12d364562cc2b9b1d4de7481dee,By . James Rush . Former first daughter Chelse...,Chelsea Clinton said question of running for o...
287109,fffeecb8690b85de8c3faed80adbc7a978f9ae2a,An apologetic Vanilla Ice has given his first ...,"Vanilla Ice, 47 - real name Robert Van Winkle ..."
287110,ffff5231e4c71544bc6c97015cdb16c60e42b3f4,America's most lethal sniper claimed he wished...,America's most lethal sniper made comment in i...
287111,ffff924b14a8d82058b6c1c5368ff1113c1632af,"By . Sara Malm . PUBLISHED: . 12:19 EST, 8 Mar...",A swarm of more than one million has crossed b...


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Randomly select 2000 rows
sampled_df = df.sample(n=2000, random_state=42)  # Adjust random_state for reproducibility

# Extract articles from the sampled DataFrame
articles = sampled_df['article'].tolist()

# Convert articles to TF-IDF matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(articles)

# Calculate similarities between sentences
sentence_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to extract summary
def extractive_summary(article, num_sentences=3):
    sentences = article.split('. ')  # Split the article into sentences
    sentence_scores = [(sentence, np.mean(sentence_similarities[i])) for i, sentence in enumerate(sentences)]
    sentence_scores = sorted(sentence_scores, key=lambda x: x[1], reverse=True)[:num_sentences]
    summary = '. '.join([score[0] for score in sentence_scores])
    return summary

# Create summaries for each article in the sampled DataFrame
sampled_df['summary'] = sampled_df['article'].apply(lambda x: extractive_summary(x))


In [7]:
sampled_df

Unnamed: 0,id,article,highlights,summary
272581,ed0fed726929c1eeabe6c390e47128dbb7d7a055,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...,"towards to south and south east, and further n..."
772,023cd84001b33aed4ff0f3f5ecb0fdd2151cf543,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...,I have more energy. Our weight loss has helped...
171868,6a70a0d8d3ed365fe1df6d35f1587a8b9b298618,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...,The boy was taken to Milton Ulladulla hospital...
63167,b37204c13ea38b511265e41ac69fb12acfb63f85,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...,"""They are in good health condition. The offici..."
68522,c24e5805afd5145bc48410e876db91d44a06be5e,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...,landing site in the snow next to the Russian s...
...,...,...,...,...
51971,93302a6eb3612d76b9a344b5b9da71df9af2613a,By . Nicola Harley . A legal firm that made mo...,Insult came in a training manual Raleys Solici...,"‘I feel very bitter about it,’ he said. ‘When ..."
169533,676526749ee6c87e2fabd9558d06a2bcc31dc8ea,The average interest rate on an easy-access IS...,It is the lowest average rate since MoneyFacts...,The scheme allowed banks and building societie...
109664,195c7db04d3745352471544e3beadf5805ae3f1f,(CNN) -- There are plenty of reasons to fall i...,Staircases are important elements in home deco...,The staircase decor represents a significant c...
118100,248087aad653122712d059e0d14eae65dcf346e7,"By . Steve Robson . PUBLISHED: . 01:29 EST, 25...",Victor Ponta says he is 'rather perplexed' by ...,"He wrote: 'Our people are, by far, our most va..."


In [19]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# Function to extract summary using LexRank
def lexrank_summary(article, num_sentences=3):
    parser = PlaintextParser.from_string(article, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return ' '.join(str(sentence) for sentence in summary)

# Create summaries for each article in the sampled DataFrame using LexRank
sampled_df['lexrank_summary'] = sampled_df['article'].apply(lambda x: lexrank_summary(x))


In [21]:
sampled_df

Unnamed: 0,id,article,highlights,lexrank_summary
272581,ed0fed726929c1eeabe6c390e47128dbb7d7a055,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...,Although frost is set to hit the south tonight...
772,023cd84001b33aed4ff0f3f5ecb0fdd2151cf543,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...,"Since joining Weight Watchers in January, Marg..."
171868,6a70a0d8d3ed365fe1df6d35f1587a8b9b298618,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...,The teenager can be seen trying to stab the sh...
63167,b37204c13ea38b511265e41ac69fb12acfb63f85,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...,Turkish Foreign Ministry spokesman Selcuk Unal...
68522,c24e5805afd5145bc48410e876db91d44a06be5e,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...,The Aurora Australis had been cracking through...
...,...,...,...,...
51971,93302a6eb3612d76b9a344b5b9da71df9af2613a,By . Nicola Harley . A legal firm that made mo...,Insult came in a training manual Raleys Solici...,A legal firm that made more than £77 million f...
169533,676526749ee6c87e2fabd9558d06a2bcc31dc8ea,The average interest rate on an easy-access IS...,It is the lowest average rate since MoneyFacts...,The average interest rate on an easy-access IS...
109664,195c7db04d3745352471544e3beadf5805ae3f1f,(CNN) -- There are plenty of reasons to fall i...,Staircases are important elements in home deco...,Jacki Poovey and her husband designed their st...
118100,248087aad653122712d059e0d14eae65dcf346e7,"By . Steve Robson . PUBLISHED: . 01:29 EST, 25...",Victor Ponta says he is 'rather perplexed' by ...,Romania's prime minister has said that the UK ...


In [22]:
from sumy.summarizers.sum_basic import SumBasicSummarizer

# Function to extract summary using SumBasic
def sumbasic_summary(article, num_sentences=3):
    parser = PlaintextParser.from_string(article, Tokenizer("english"))
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return ' '.join(str(sentence) for sentence in summary)

# Create summaries for each article in the sampled DataFrame using SumBasic
sampled_df['sumbasic_summary'] = sampled_df['article'].apply(lambda x: sumbasic_summary(x))


In [23]:
sampled_df

Unnamed: 0,id,article,highlights,lexrank_summary,sumbasic_summary
272581,ed0fed726929c1eeabe6c390e47128dbb7d7a055,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...,Although frost is set to hit the south tonight...,"'The best of the sunshine has been . Tonight, ..."
772,023cd84001b33aed4ff0f3f5ecb0fdd2151cf543,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...,"Since joining Weight Watchers in January, Marg...","Margaret, has dropped from a hefty size 18 to ..."
171868,6a70a0d8d3ed365fe1df6d35f1587a8b9b298618,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...,The teenager can be seen trying to stab the sh...,The curious boy went under the water to film t...
63167,b37204c13ea38b511265e41ac69fb12acfb63f85,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...,Turkish Foreign Ministry spokesman Selcuk Unal...,The people hustled to the southern Turkish Yay...
68522,c24e5805afd5145bc48410e876db91d44a06be5e,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...,The Aurora Australis had been cracking through...,A . to the Aurora. Helicopter .
...,...,...,...,...,...
51971,93302a6eb3612d76b9a344b5b9da71df9af2613a,By . Nicola Harley . A legal firm that made mo...,Insult came in a training manual Raleys Solici...,A legal firm that made more than £77 million f...,By . The insult came in a training manual Rale...
169533,676526749ee6c87e2fabd9558d06a2bcc31dc8ea,The average interest rate on an easy-access IS...,It is the lowest average rate since MoneyFacts...,The average interest rate on an easy-access IS...,Experts said it was another sign that savers h...
109664,195c7db04d3745352471544e3beadf5805ae3f1f,(CNN) -- There are plenty of reasons to fall i...,Staircases are important elements in home deco...,Jacki Poovey and her husband designed their st...,"It's a home's conduit and connector. ""The stai..."
118100,248087aad653122712d059e0d14eae65dcf346e7,"By . Steve Robson . PUBLISHED: . 01:29 EST, 25...",Victor Ponta says he is 'rather perplexed' by ...,Romania's prime minister has said that the UK ...,By . And we will do our utmost to keep them in...


In [24]:
from transformers import pipeline

# Function to extract summary using BERT
def bert_summary(article, max_length=150):
    summarizer = pipeline("summarization")
    summary = summarizer(article, max_length=max_length, min_length=30, do_sample=False)
    return summary[0]['summary_text']

# Create summaries for each article in the sampled DataFrame using BERT
sampled_df['bert_summary'] = sampled_df['article'].apply(lambda x: bert_summary(x))


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Token indices sequence length is longer than the specified maximum sequence length for this model (1067 > 1024). Running this sequence through the model will result in indexing errors


IndexError: ignored