In [1]:
## Importing necessary libraries

import pandas as pd
import numpy as np
import re
import nltk
import string
import contractions
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
from nlp_utils import *

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

import transformers
from transformers import pipeline

from heapq import nlargest
from torchmetrics.text.rouge import ROUGEScore
from pprint import pprint

In [2]:
## Importing the dataset
df = pd.read_csv('/Users/simrankodwani/Desktop/Fall 2022/NLP/Project/cnn_dailymail/train.csv')
df = df[0:10000]
df.head(4)

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...


In [3]:
df = df.drop(['id'],axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   article     10000 non-null  object
 1   highlights  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [4]:
## Tokenizing the sentences
tokens = []
for sentence in df['article']:
    tokens.append(word_tokenize(sentence))

In [5]:
## Removing punctuations and stopwords from the tokens

stop_words= stopwords.words('english') 
punctuation = string.punctuation
punctuation = punctuation.replace('.','')

format_sentences = []
for sentence in tokens:
    words = []
    for word in sentence:
        if word not in stop_words and word not in punctuation:  
            words.append(word.lower())
    format_sentences.append(words)

In [6]:
## Removing re-occuring words (otherwords) and numbers
otherwords = ['by','associated','press','published','est','october','updated']

formatted_sentences = []
for sentence in format_sentences:
    words = []
    for word in sentence:
        if word not in otherwords and not re.search('\d{2}:\d{2}',word) and not re.search('\d{2,4}',word):  
            words.append(word)
    formatted_sentences.append(words)

In [7]:
## Lemmatizing sentences
lemmatize_sentences = []

lemma = WordNetLemmatizer()

for sentence in formatted_sentences:
    lemmas = []
    for word in sentence:
        lemmas.append(lemma.lemmatize(word))
    lemmatize_sentences.append(lemmas)

In [8]:
## Generating sentences from formatted lists
corpus = []

for sentence in lemmatize_sentences:
    words = ' '.join([str(word) for word in sentence])
    corpus.append(words)

In [9]:
for sentence in corpus:
    sentence = sentence.replace('\'s','')

In [10]:
## Deleting lists that are no longer needed

del tokens
del format_sentences
del formatted_sentences
del lemmatize_sentences

In [11]:
summarizer = pipeline("summarization",  model = "facebook/bart-large-cnn")
rouge = ROUGEScore()
def summarize(sentence, target):
    ## Getting word counts for sentence
    word_count = {}
    for word in word_tokenize(sentence):
        if word not in ['.']:
            if word in word_count.keys():
                word_count[word] +=1
            else:
                word_count[word] = 1
    
                
    ## Getting the word with max count
    max_count = max(word_count, key=word_count.get)
    
    ## 
    max_val = word_count[max_count]
    
    ## Dividing the occurence of each word with value of max_val
    for key,value in word_count.items():
        word_count[key] = word_count[key]/max_val
        
        
    ## Getting sentence scores for each sentence in the corpus
    sentence_score = {}
    for sentences in sent_tokenize(sentence):
        sentence_score[sentences] = 0
        for word in sentences:
            if word in word_count.keys():
                sentence_score[sentences]+=word_count[word]
                
                    
    sentence_list = []
    for sentences in sent_tokenize(sentence):
        sentence_list.append(sentences)
        
        
    summary = summarizer(sentence_list, 
                         min_length = 8, 
                         max_length = 10)
    
    s = []
    for i in range(len(summary)):
        s.append(list(summary[i].values()))
    flat = [item for sublist in s for item in sublist]
    preds = ' '.join(flat)
    print(preds)
    pprint(rouge(preds, target))

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
summarize(corpus[10], df['highlights'][10])

Your max_length is set to 10, but you input_length is only 3. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)
Your max_length is set to 10, but you input_length is only 8. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 10, but you input_length is only 5. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Your max_length is set to 10, but you input_length is only 6. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 10, but you input_length is only 8. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


CNN.com will feature iRep  ellie zolfagharif  take look map today ’ likely China is four times bigger than green The Mercator projection map is a The mercator projection map is a Land mass cost distorting size favour You might think advent satellite imagery tool Much due technical reason said mr The biggest challenge impossible portray reality spherical One best alternative mercator projection presented The gall-peters projection make A certain place appear stretched horizontally near A depiction of henricus mart It 's said columbus The map reflects latest theory form world Africa around time larger greenland  brazil five time larger alaska The map suggests scandinavian While look like europe larger north  russia also n't large depicted World map typically diverse ranging heart cone One model invented gerardus merc  land mass cost distorting size favour For instance mercator projection north americ Greenland and greenland also look Africa is bigger than Africa. In fact fit north americ

In [15]:
corpus[10]

". ellie zolfagharifard . take look map today ’ likely see north america larger africa alaska larger mexico china smaller greenland . but reality china four time bigger greenland africa three time bigger north america mexico larger alaska . the distortion result mercator projection map commonly seen hanging classroom text book created help sailor navigate world . the mercator projection map commonly seen hanging classroom text book created help sailor navigate world . the familiar map give right shape land mass cost distorting size favour wealthy land north . you might think advent satellite imagery tool google map improved view world ’ necessarily case according james wan writing guardian . much due technical reason said mr wan others inconsistences caused ideological assumption change way see world . the biggest challenge impossible portray reality spherical world flat map – problem haunted cartographer century . one best alternative mercator projection presented d. arno peter pictur