In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benjamindrai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benjamindrai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Dataset

https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail

# EDA

In [2]:
df_train = pd.read_csv("cnn_dailymail/train.csv")

In [3]:
df_train.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
df_

In [4]:
print("Shape: ", df_train.shape)

Shape:  (287113, 3)


In [5]:
df_train.isna().sum()

id            0
article       0
highlights    0
dtype: int64

In [6]:
df_train.duplicated(subset= ['article', 'highlights']).sum()

3098

In [7]:
df_train = df_train.drop_duplicates(subset= ['article', 'highlights'])
assert df_train.duplicated(subset= ['article', 'highlights']).sum() == 0

In [8]:
print('Example: \n')

print("Article: \n")
sample = df_train.sample()
article = sample["article"].values[0]
print(article)
print("\nSummary: \n")
summary = sample["highlights"].values[0]
print(summary)

Example: 

Article: 

(CNN Student News) -- October 15, 2013 . With the debt ceiling deadline days away, leaders in Washington say they're optimistic about reaching a deal. We also discuss an infectious bacterium that appeared in an NFL locker room, and we explain the Muslim ritual of Hajj. Plus, we make a Career Connection with an opera singer, and a high school elects a king and queen that students say embody the spirit of homecoming. On this page you will find today's show Transcript, the Daily Curriculum, Maps pertaining to today's show, and a place for you to leave feedback. TRANSCRIPT . Click here to access the transcript of today's CNN Student News program. Please note that there may be a delay between the time when the video is available and when the transcript is published. DAILY CURRICULUM . Click here for a printable version of the Daily Curriculum (PDF). Media Literacy Question of the Day: . What are the benefits and limitations of a news report about a medical issue? Where

In [9]:
df_train = df_train.set_index("id")

In [10]:
df = df_train.sample(10000)
df.shape

(10000, 2)

# Preprocessing

In [11]:
def preprocessing(df):
    result = df.copy()
    result["sent_token"] = result["article"].apply(lambda article: sent_tokenize(article))

In [12]:
df["sent_token"] = df["article"].apply(lambda article: sent_tokenize(article))

# Models

## Random Model
This model takes randomly some sentences in the text to create the summary.

In [13]:
class RandomModel:
    def summarize(self, articles, threshold=0.2):
        result = []
        for article in articles:
            sentence_tokenized = sent_tokenize(article)
            num_sentences = int(threshold*len(sentence_tokenized))
            index_sentences_summary = np.sort(np.random.choice(len(sentence_tokenized), 
                                                               num_sentences, 
                                                               replace=False))
            summary = "".join(list(np.array(sentence_tokenized)[index_sentences_summary]))
            result.append(summary)

In [14]:
rm = RandomModel()
summaries = rm.summarize(df["article"].values)

## Weight Sentences Model

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def create_dictionary_table(text_string) -> dict:
   
    # Removing stop words
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    # Reducing words to their root form
    stem = PorterStemmer()
    
    # Creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benjamindrai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benjamindrai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
create_dictionary_table(article)

{'(': 3,
 'cnn': 5,
 'student': 6,
 'new': 5,
 ')': 3,
 '--': 1,
 'octob': 2,
 '15': 1,
 ',': 16,
 '2013': 1,
 '.': 36,
 'debt': 3,
 'ceil': 3,
 'deadlin': 1,
 'day': 2,
 'away': 1,
 'leader': 1,
 'washington': 2,
 'say': 2,
 "'re": 2,
 'optimist': 1,
 'reach': 2,
 'deal': 2,
 'also': 2,
 'discuss': 2,
 'infecti': 1,
 'bacterium': 2,
 'appear': 1,
 'nfl': 1,
 'locker': 1,
 'room': 1,
 'explain': 2,
 'muslim': 1,
 'ritual': 1,
 'hajj': 2,
 'plu': 1,
 'make': 1,
 'career': 2,
 'connect': 1,
 'opera': 2,
 'singer': 2,
 'high': 1,
 'school': 1,
 'elect': 1,
 'king': 2,
 'queen': 2,
 'embodi': 1,
 'spirit': 1,
 'homecom': 3,
 'thi': 5,
 'page': 3,
 'find': 1,
 'today': 7,
 "'s": 8,
 'show': 5,
 'transcript': 4,
 'daili': 4,
 'curriculum': 4,
 'map': 3,
 'pertain': 1,
 'place': 1,
 'leav': 2,
 'feedback': 4,
 'click': 3,
 'access': 1,
 'program': 5,
 'pleas': 2,
 'note': 1,
 'may': 2,
 'delay': 1,
 'time': 1,
 'video': 1,
 'avail': 1,
 'publish': 1,
 'printabl': 1,
 'version': 1,
 'pdf': 2,


In [17]:
from nltk.tokenize import word_tokenize, sent_tokenize

sentences = sent_tokenize(article)
sentences

['(CNN Student News) -- October 15, 2013 .',
 "With the debt ceiling deadline days away, leaders in Washington say they're optimistic about reaching a deal.",
 'We also discuss an infectious bacterium that appeared in an NFL locker room, and we explain the Muslim ritual of Hajj.',
 'Plus, we make a Career Connection with an opera singer, and a high school elects a king and queen that students say embody the spirit of homecoming.',
 "On this page you will find today's show Transcript, the Daily Curriculum, Maps pertaining to today's show, and a place for you to leave feedback.",
 'TRANSCRIPT .',
 "Click here to access the transcript of today's CNN Student News program.",
 'Please note that there may be a delay between the time when the video is available and when the transcript is published.',
 'DAILY CURRICULUM .',
 'Click here for a printable version of the Daily Curriculum (PDF).',
 'Media Literacy Question of the Day: .',
 'What are the benefits and limitations of a news report abou

In [18]:
def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    # Algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount_without_stop_words
      
    return sentence_weight

In [19]:
def _calculate_average_score(sentence_weight) -> int:
   
    # Calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    # Getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

In [20]:
def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary

In [21]:
def _run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores,  threshold)

    return article_summary


In [22]:
_run_article_summary(article)

NameError: name '_create_dictionary_table' is not defined

# Model Evaluation