In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benjamindrai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benjamindrai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Dataset

https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail

# EDA

In [2]:
df_train = pd.read_csv("cnn_dailymail/train.csv")

In [3]:
df_train.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
df_

In [4]:
print("Shape: ", df_train.shape)

Shape:  (287113, 3)


In [5]:
df_train.isna().sum()

id            0
article       0
highlights    0
dtype: int64

In [6]:
df_train.duplicated(subset= ['article', 'highlights']).sum()

3098

In [7]:
df_train = df_train.drop_duplicates(subset= ['article', 'highlights'])
assert df_train.duplicated(subset= ['article', 'highlights']).sum() == 0

In [8]:
print('Example: \n')

print("Article: \n")
sample = df_train.sample()
article = sample["article"].values[0]
print(article)
print("\nSummary: \n")
summary = sample["highlights"].values[0]
print(summary)

Example: 

Article: 

(CNN Student News) -- October 15, 2013 . With the debt ceiling deadline days away, leaders in Washington say they're optimistic about reaching a deal. We also discuss an infectious bacterium that appeared in an NFL locker room, and we explain the Muslim ritual of Hajj. Plus, we make a Career Connection with an opera singer, and a high school elects a king and queen that students say embody the spirit of homecoming. On this page you will find today's show Transcript, the Daily Curriculum, Maps pertaining to today's show, and a place for you to leave feedback. TRANSCRIPT . Click here to access the transcript of today's CNN Student News program. Please note that there may be a delay between the time when the video is available and when the transcript is published. DAILY CURRICULUM . Click here for a printable version of the Daily Curriculum (PDF). Media Literacy Question of the Day: . What are the benefits and limitations of a news report about a medical issue? Where

In [9]:
df_train = df_train.set_index("id")

In [10]:
df = df_train.sample(10000)
df.shape

(10000, 2)

# Preprocessing

In [11]:
def preprocessing(df):
    result = df.copy()
    result["sent_token"] = result["article"].apply(lambda article: sent_tokenize(article))

In [12]:
df["sent_token"] = df["article"].apply(lambda article: sent_tokenize(article))

# Models

## Random Model
This model takes randomly some sentences in the text to create the summary.

In [64]:
class RandomModel:
    def summarize(self, articles, threshold=0.2):
        result = []
        for article in articles:
            sentence_tokenized = sent_tokenize(article)
            num_sentences = int(threshold*len(sentence_tokenized))
            index_sentences_summary = np.sort(np.random.choice(len(sentence_tokenized), 
                                                               num_sentences, 
                                                               replace=False))
            summary = "".join(list(np.array(sentence_tokenized)[index_sentences_summary]))
            result.append(summary)
        return result

In [65]:
rm = RandomModel()
summaries = rm.summarize(df["article"].values)

## Weight Sentences Model

In [113]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def _create_dictionary_table(text_string) -> dict:
   
    # Removing stop words
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    # Reducing words to their root form
    stem = PorterStemmer()
    
    # Creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benjamindrai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benjamindrai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [114]:
_create_dictionary_table(article)

{'.': 30,
 'dan': 1,
 'bloom': 1,
 'tonn': 2,
 'heroin': 11,
 'worth': 2,
 'almost': 1,
 '£160million': 2,
 'ha': 4,
 'found': 2,
 'hidden': 1,
 'insid': 1,
 'sack': 2,
 'cement': 2,
 'tini': 1,
 'sail': 3,
 'boat': 3,
 'indian': 6,
 'ocean': 6,
 'crew': 2,
 'australian': 2,
 'warship': 2,
 'intercept': 2,
 'dhow': 5,
 '27': 1,
 'nautic': 3,
 'mile': 3,
 'east': 3,
 'kenyan': 2,
 'port': 1,
 'citi': 1,
 'mombasa': 1,
 'discov': 1,
 'drug': 3,
 'stow': 1,
 '46': 1,
 'separ': 1,
 'bag': 1,
 'seizur': 3,
 'largest': 1,
 'ever': 1,
 'histori': 1,
 'combin': 1,
 'maritim': 3,
 'forc': 3,
 ',': 20,
 'joint': 2,
 'oper': 2,
 '30': 1,
 'countri': 1,
 'combat': 1,
 'piraci': 2,
 'milit': 1,
 'smuggl': 1,
 'water': 5,
 'africa': 4,
 'haul': 2,
 ':': 4,
 'seiz': 4,
 'small': 1,
 'dramat': 1,
 'moment': 1,
 'approach': 1,
 'wa': 4,
 'fill': 1,
 'cover': 1,
 'frigat': 1,
 'hma': 2,
 'darwin': 2,
 'vessel': 2,
 'wednesday': 2,
 'night': 1,
 'confisc': 2,
 '1,023': 1,
 'kg': 1,
 'accord': 2,
 'austra

In [115]:
from nltk.tokenize import word_tokenize, sent_tokenize

sentences = sent_tokenize(article)
sentences

['By .',
 'Dan Bloom .',
 'A tonne of heroin worth almost £160million has been found hidden inside sacks of cement on a tiny sailing boat in the Indian Ocean.',
 'The crew of an Australian warship intercepted the dhow 27 nautical miles east of the Kenyan port city of Mombasa and discovered the drugs stowed in 46 separate bags.',
 'The seizure is largest ever in the history of the Combined Maritime Forces, a joint operation between 30 countries to combat piracy, militancy and smuggling in the waters east of Africa.',
 'Haul: More than a tonne of heroin worth £160million has been seized from a small boat in the Indian Ocean .',
 'Dramatic: The moment crews approached the dhow, which was filled with sacks of cement as cover .',
 'The frigate HMAS Darwin intercepted .',
 "the vessel on Wednesday night and confiscated 1,023 kg\xa0 of heroin, according to Australia's Defence Department.",
 'The drugs were destroyed, the online statement said.',
 "'This is a major heroin seizure, which has re

In [116]:
def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    # Algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount_without_stop_words
      
    return sentence_weight

In [117]:
def _calculate_average_score(sentence_weight) -> int:
   
    # Calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    # Getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

In [118]:
def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary

In [119]:
def _run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)
    print(sentence_scores)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores,  threshold)

    return article_summary


In [120]:
_run_article_summary(first_article)

{'Los Ang': 4.6, '"Termin': 5.478260869565218, 'ET) Sat': 6.833333333333333, "That's ": 6.0, 'Through': 4.137931034482759, 'Uniform': 5.222222222222222, 'The FBI': 5.869565217391305, 'Meanwhi': 4.5, 'By late': 6.846153846153846, 'In all,': 4.482758620689655, 'Those t': 4.35, '"Please': 6.6875, '"Thank ': 4.25, 'The Fed': 3.533333333333333, 'ET).': 11.333333333333334, 'It appl': 4.909090909090909, 'On Frid': 6.071428571428571, 'ET), a ': 6.25, 'One TSA': 5.222222222222222, 'By Frid': 6.15, 'There w': 7.857142857142857, 'Passeng': 8.545454545454545, 'Many tr': 5.833333333333333, 'Some st': 5.6, '"First ': 4.0, 'Prayers': 7.625}


' ET) Saturday. That\'s the terminal where a gunman shot three TSA officers Friday morning, killing one of them, before being shot himself and taken into custody. The FBI was still working the shooting scene Saturday inside Terminal 3, and its flights had been assigned to other terminals during the morning, airport officials said. By late Saturday morning, all of LAX\'s roadways were open. "Please contact ur airline for flight status before coming to LAX," the airport said on Twitter. ET). On Friday morning, about 9:20 a.m. PT (12:20 p.m. ET), a gunman opened fire at a Transportation Security Administration checkpoint. By Friday afternoon, the airport was letting employees back into Terminals 1 and 2, said Lindsey, the airport\'s executive director. There was "no time frame" of when the FBI will complete its investigation in Terminal 3, the airport said. Passengers expecting to fly out of Terminal 3 should contact their airlines for more information, Lindsey said. Prayers go out to all

# Model Evaluation

In [None]:
first_article = df.iloc[0, 0]
first_summary = df.iloc[0, 1]
model_summary = summaries[0]

In [105]:
def evaluate_similarity(true_summary, pred_summary):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([true_summary, pred_summary])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)[0, 1]
    return cosine_sim

evaluate_similarity(first_summary, model_summary)

0.16719361979636152

In [107]:
def calculate_all_similarities(true_summaries, pred_summaries):
    assert len(true_summaries) == len(pred_summaries)
    similarities = []
    for true_summary, pred_summary in zip(true_summaries, pred_summaries):
        similarities.append(evaluate_similarity(true_summary, pred_summary))
    return similarities

In [108]:
calculate_all_similarities(df['highlights'].values, summaries)

[0.16719361979636152,
 0.3739501403499871,
 0.4196820108364419,
 0.3716262840381357,
 0.33533902977581237,
 0.318861260559221,
 0.25999013124950704,
 0.34536669516707,
 0.1210220991210687,
 0.36178424704792467,
 0.18186875019474125,
 0.15436920988396846,
 0.14859355444514669,
 0.08445347645448935,
 0.31199090173065386,
 0.3486407169768039,
 0.04163176004221193,
 0.3086521386488764,
 0.23467696015119352,
 0.11159182298754723,
 0.2385960984262253,
 0.24460828333116208,
 0.3275280300523088,
 0.23433823999672979,
 0.31498374813759134,
 0.1432573161437739,
 0.3606949133120981,
 0.4099188117782805,
 0.2504723883126947,
 0.3351166276173549,
 0.5877592532963498,
 0.3376588302293561,
 0.44248599191751714,
 0.1680985646837237,
 0.20269830624225665,
 0.4315665471311296,
 0.24623297815296374,
 0.2892218417324774,
 0.5005152795813375,
 0.1565129971602251,
 0.18814556277652839,
 0.428735117465904,
 0.17748649914778847,
 0.24423510870769363,
 0.28544325265534615,
 0.38465695422575286,
 0.138394613206

In [109]:
def evaluate_model(true_summaries, pred_summaries):
    avg_similarity = np.mean(calculate_all_similarities(true_summaries, pred_summaries))
    print(f"Average similarity: {avg_similarity:.3f}")
    return avg_similarity

In [110]:
evaluate_model(df['highlights'].values, summaries)

Average similarity: 0.293


0.29267890433623517