In [1]:
import ssl

# Disable SSL certificate verification
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


In [2]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy

# # Download NLTK resources if not already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')

# Load English language model for spaCy
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenize text using NLTK
    tokens = word_tokenize(text)
    
    # Remove stopwords using NLTK
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    preprocessed_text = ""

    # Join tokens back into a single string
    if len(tokens) > 3:
        preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


In [3]:
import pandas as pd

articles_df = pd.read_csv('/Users/viru/Documents/news-headline-generator/dataset/CNN_Articels_clean.csv')
articles_df.head()

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d..."
1,2,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...
2,3,"Words by Stephanie Bailey, video by Zahra Jamshed",2021-06-16 02:51:30,news,asia,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,"In a Hong Kong warehouse, a swarm of autonomou...","asia, This swarm of robots gets smarter the mo...",This swarm of robots gets smarter the more it ...,"(CNN)In a Hong Kong warehouse, a swarm of aut..."
3,4,"Paul R. La Monica, CNN Business",2022-03-15 09:57:36,business,investing,https://www.cnn.com/2022/03/15/investing/brics...,Russia is no longer an option for investors. T...,"For many years, the world's most popular emerg...","investing, Russia is no longer an option for i...",Russia is no longer an option for investors. T...,"New York (CNN Business)For many years, the wor..."
4,7,Reuters,2022-03-15 11:27:02,business,business,https://www.cnn.com/2022/03/15/business/russia...,Russian energy investment ban part of new EU s...,The European Union formally approved on Tuesda...,"business, Russian energy investment ban part o...",EU bans investment in Russian energy in new sa...,The European Union formally approved on Tuesda...


In [4]:
articles_df.iloc[0]['Article text']

' (CNN)Right now, there\'s a shortage of truck drivers in the US and worldwide, exacerbated by the e-commerce boom brought on by the pandemic. One solution to the problem is autonomous trucks, and several companies are in a race to be the first to launch one. Among them is San Diego-based TuSimple.Founded in 2015, TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the US, China and Europe. Although these are simply commercially available trucks retrofitted with its technology, TuSimple has deals in place with two of the world\'s largest truck manufacturers -- Navistar in the US and Traton, Volkswagen\'s trucking business, in Europe -- to design and build fully autonomous models, which it hopes to launch by 2024. Photos: The Yara Birkeland is what its builders call the world\'s first zero-emission, autonomous cargo ship. The ship is scheduled to make its first journey between two Norwegian towns before the end of the year. Click through to see

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def extractive_summarization(text, num_sentences):
    # Tokenize sentences
    sentences = nltk.sent_tokenize(text)
    
    # Initialize TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    
    # Fit and transform the text
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    
    # Compute similarity matrix
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Sort sentences by their score
    scores = list(enumerate(similarity_matrix.mean(axis=1)))
    ranked_sentences = sorted(scores, key=lambda x: x[1], reverse=True)
    
    # Get top sentences as summary
    top_sentence_indices = [ranked_sentences[i][0] for i in range(num_sentences)]
    top_sentence_indices.sort()
    
    # Generate the summary
    summary = ' '.join([sentences[i] for i in top_sentence_indices])

    headline = ' '.join(summary.split(' ')[:10])
    
    return headline

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

def extract_key_phrases(text, num_key_phrases):
    # Tokenize sentences
    sentences = nltk.sent_tokenize(text)
    word_tokens = [word_tokenize(sentence.lower()) for sentence in sentences]
    stop_words = set(stopwords.words('english'))
    word_tokens = [[word for word in tokens if word not in stop_words and word not in string.punctuation] for tokens in word_tokens]
    word_tokens = [word for sublist in word_tokens for word in sublist]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(word_tokens)])
    
    feature_names = tfidf_vectorizer.get_feature_names_out()
    feature_scores = list(zip(feature_names, tfidf_matrix.toarray()[0]))
    sorted_features = sorted(feature_scores, key=lambda x: x[1], reverse=True)
    top_key_phrases = [phrase for phrase, score in sorted_features[:num_key_phrases]]
    
    return top_key_phrases


In [17]:
from rouge import Rouge

def generate_headline(key_phrases):
    # Combine key phrases with some creative language to construct the headline
    headline = f"{', '.join(key_phrases[:-1])} {key_phrases[-1]}"
    return headline


def calculate_average_rouge_1(predicted_headlines, actual_headlines):
    rouge = Rouge()
    rouge_1_scores = []
    for predicted_headline, actual_headline in zip(predicted_headlines, actual_headlines):
        scores = rouge.get_scores(predicted_headline, actual_headline)
        rouge_1_score = scores[0]['rouge-1']['f']
        rouge_1_scores.append(rouge_1_score)
    average_rouge_1_score = sum(rouge_1_scores) / len(rouge_1_scores)
    return average_rouge_1_score

In [8]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [22]:
test_set = []

for index, row in articles_df.iterrows():
    if index < 2:
        continue

    encoding = tokenizer.encode_plus(row['Article text'], return_tensors = "pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    beam_outputs = model.generate(
        input_ids = input_ids,
        attention_mask = attention_masks,
        max_length = 64,
        num_beams = 10,
        early_stopping = True,
    )

    result = tokenizer.decode(beam_outputs[0])
    news_article = {
        'article text': row['Article text'],
        'headline': result
    }

    test_set.append(news_article)
    if index == 10:
        break

testset_df = pd.DataFrame(test_set)
testset_df.head()

Unnamed: 0,article text,headline
0,"(CNN)In a Hong Kong warehouse, a swarm of aut...",<pad> The Robots Running Our Warehouses</s>
1,"New York (CNN Business)For many years, the wor...",<pad> Emerging Markets - Forget the BRICS or M...
2,The European Union formally approved on Tuesda...,<pad> EU formally Approves New Sanctions on Ru...
3,(CNN Business)A woman holding a sign reading ...,<pad> Russia's Invasion of Ukraine is a Crime</s>
4,New York (CNN Business)Fox News correspondent ...,<pad> Fox News' Benjamin Hall Has Been Injured...


In [24]:
test_headlines = []

for index, row in testset_df.iterrows():
    sen = row['article text'].split(".")
    main = []
    for i in sen:
        pre_pro_sen = preprocess_text(i)
        if pre_pro_sen != "":
            main.append(pre_pro_sen)

    text = '. '.join(main)

    headline = extractive_summarization(text, 1)
    test_headlines.append(headline)

test_headlines

['hide caption photos robots running warehousesstretch latest robot boston dynamics',
 'sen sharma said investors may start look emerging markets replace',
 'european union formally approved tuesday new barrage sanctions russia invasion',
 'cnn businessa woman holding sign reading war interrupted live news',
 'new york cnn businessfox news correspondent benjamin hall deployed recent',
 'even moscow halts payments foreign investors sovereign debt roughly billion',
 'oil prices stay current levels national average price regular gasoline',
 'new delhiindia may take russian offer buy crude oil commodities',
 'japanese authorities ordered crypto exchanges monday process transactions involving crypto']

In [25]:
test_headlines_actual = list(testset_df['headline'])

average_rouge_1_score = calculate_average_rouge_1(test_headlines, test_headlines_actual)
print("Average ROUGE-1 Score:", average_rouge_1_score)

Average ROUGE-1 Score: 0.01234567846364886
