In [161]:
from rouge import Rouge
from datasets import load_dataset
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import regex as re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from rouge_score import rouge_scorer

# Load data

In [67]:
cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

In [68]:
cnn_daily_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [77]:
train_df = pd.DataFrame.from_dict(cnn_daily_dataset['train']).drop(columns='id')
val_df = pd.DataFrame.from_dict(cnn_daily_dataset['validation']).drop(columns='id')
test_df = pd.DataFrame.from_dict(cnn_daily_dataset['test']).drop(columns='id')

In [81]:
train_df

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."
...,...,...
287108,"The nine-year-old daughter of a black, unarmed...","Rumain Brisbon, 34, was killed after Phoenix p..."
287109,Legalising assisted suicide is a slippery slop...,"Theo Boer, a European assisted suicide watchdo..."
287110,A group calling itself 'The Women of the 99 Pe...,Ohio congressman criticised for 'condoning the...
287111,Most men enjoy a good pint of lager or real al...,The Black Country Ale Tairsters have been to 1...


In [None]:
df = pd.concat([train_df,test_df, val_df], ignore_index=True)

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [105]:
df_tmp = df.copy()
df = df[:5]

In [175]:
# Reset
df = df_tmp[:5]

In [176]:
print(df['article'][0])

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

# Preprocess data

In [168]:
def cleaned_list_of_sentences(article):
    # Split text into sentences
    sentences = sent_tokenize(article)

    # Prepare the set of stop words
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # For each sentence, tokenize -> remove stopwords -> rejoin
    filtered_sentences = []
    for sent in sentences:
        filtered_stemmed_tokens = []
        # Tokenize words in this sentence
        tokens = word_tokenize(sent)
        # Apply steaming and filter out stopwords 
        for word in tokens:
            word = stemmer.stem(word) 
            if word not in stop_words:
                filtered_stemmed_tokens.append(word)

        # Rejoin tokens to form a sentence
        filtered_sentence = " ".join(filtered_stemmed_tokens)
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences

In [None]:
# Make lower case
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Remove none alphabetics, none digits and none dot signs 
pattern = r'[^A-Za-z0-9.\s]+'
df['article'] = df['article'].apply(lambda x: re.sub(pattern, '', x))

# Remove stopwords and apply steeming
df['article'] = df['article'].apply(cleaned_list_of_sentences)

# Create summaries

In [171]:
def create_matrix(senteces_list):
    n = len(senteces_list)
    matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            common_words = set(senteces_list[i]) & set(senteces_list[j])
            matrix[i][j] = matrix[j][i] = len(common_words)
    return matrix

def get_rank(matrix):
    ranking_vector = matrix.sum(axis=1)
    # Step 4: Rank Sentences
    ranked_sentences = sorted(enumerate(ranking_vector), key=lambda x: x[1], reverse=True)

    return ranked_sentences

def produce_summary(summary_threshold, sentences, ranked_sentences):
    summary_indices = [index for index, _ in ranked_sentences[:summary_threshold]]
    summary = " ".join([sentences[i] for i in sorted(summary_indices)])

    return summary 

In [177]:
df['article'][0]

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [172]:
summaries = []

for idx, row in df.iterrows():
    list_of_sentences = row["article"]
    print(list_of_sentences)
    matrix = create_matrix(list_of_sentences)
    ranked_sents = get_rank(matrix)
    print(ranked_sents)
    summary_text = produce_summary(summary_threshold=2, 
                                   sentences=list_of_sentences, 
                                   ranked_sentences=ranked_sents)
    summaries.append(summary_text)

df["summary"] = summaries

['london england reuter harri potter star daniel radcliff gain access report 20 million 41.1 million fortun turn 18 monday insist money wont cast spell .', 'daniel radcliff harri potter harri potter order phoenix disappoint gossip columnist around world young actor say ha plan fritter hi cash away fast car drink celebr parti .', 'dont plan one peopl soon turn 18 suddenli buy themselv massiv sport car collect someth similar told australian interview earlier thi month .', 'dont think ill particularli extravag .', 'thing like buy thing cost 10 pound book cd dvd .', '18 radcliff abl gambl casino buy drink pub see horror film hostel part ii current six place hi number one movi uk box offic chart .', 'detail hell mark hi landmark birthday wrap .', 'hi agent publicist comment hi plan .', 'ill definit sort parti said interview .', 'hope none read .', 'radcliff earn first five potter film held trust fund ha abl touch .', 'despit hi grow fame rich actor say keep hi feet firmli ground .', 'peopl 

# Evaluate solution

In [164]:
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
    use_stemmer=True
)

all_scores = []

for idx, row in df.iterrows():
    candidate = row['summary']
    reference = row['highlights']
    
    # Compute the ROUGE scores for this pair
    scores = scorer.score(candidate, reference)
    
    # Each score is a dict of { 'precision': float, 'recall': float, 'fmeasure': float }
    # for each of the requested metrics.
    all_scores.append(scores)

# Let's attach these raw scores back to the DataFrame as a new column
df['rouge_scores'] = all_scores


In [163]:
df

Unnamed: 0,article,highlights,summary,rouge_scores
0,[london england reuter harri potter star danie...,harry potter star daniel radcliffe gets £20m f...,daniel radcliff harri potter harri potter orde...,"{'rouge1': (0.23076923076923078, 0.17307692307..."
1,[editor note behind scene seri cnn correspond ...,mentally ill inmates in miami are housed on th...,end ninth floor sever mental disturb get ani r...,"{'rouge1': (0.04081632653061224, 0.05714285714..."
2,[minneapoli minnesota cnn driver minneapoli br...,"new: ""i thought i was going to die,"" driver sa...",whole bridg one side mississippi complet gave ...,"{'rouge1': (0.04878048780487805, 0.05714285714..."
3,[washington cnn doctor remov five small polyp ...,"five small polyps found during procedure; ""non...",washington cnn doctor remov five small polyp p...,"{'rouge1': (0.2916666666666667, 0.175, 0.21875..."
4,[cnn nation footbal leagu ha indefinit suspend...,"new: nfl chief, atlanta falcons owner critical...",cnn nation footbal leagu ha indefinit suspend ...,"{'rouge1': (0.1951219512195122, 0.2, 0.1975308..."


In [178]:
df['article'][0]

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [165]:
df['summary'][0]

'daniel radcliff harri potter harri potter order phoenix disappoint gossip columnist around world young actor say ha plan fritter hi cash away fast car drink celebr parti . 18 radcliff abl gambl casino buy drink pub see horror film hostel part ii current six place hi number one movi uk box offic chart .'