In [349]:
import re
from nltk.corpus import stopwords
import pandas as pd
from collections import Counter
import spacy
from heapq import nlargest
import spacy
from transformers import pipeline

In [290]:
df = pd.read_csv('news_summary_more.csv')
df.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [291]:
df.shape

(98401, 2)

In [292]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98401 entries, 0 to 98400
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headlines  98401 non-null  object
 1   text       98401 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [293]:
df.isnull().sum()

headlines    0
text         0
dtype: int64

In [294]:
df.shape

(98401, 2)

In [295]:
df.drop(columns = ['headlines'], axis = 1, inplace = True)

In [296]:
df.head()

Unnamed: 0,text
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Kunal Shah's credit card bill payment platform...
2,New Zealand defeated India by 8 wickets in the...
3,"With Aegon Life iTerm Insurance plan, customer..."
4,Speaking about the sexual harassment allegatio...


In [297]:
sw = stopwords.words('english')

In [298]:
def text_preprocessing(text):
    text = text.lower()
    text = re.sub('[^a-z]', ' ', text)
    text = re.sub('\s+[a-z]\s+',' ', text)
    text = re.sub('\s+', ' ', text)
    words = [word for word in text.split() if word not in sw]
    return words

In [299]:
df['words'] = df['text'].apply(text_preprocessing)

In [300]:
df.head()

Unnamed: 0,text,words
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...","[saurav, kant, alumnus, upgrad, iiit, pg, prog..."
1,Kunal Shah's credit card bill payment platform...,"[kunal, shah, credit, card, bill, payment, pla..."
2,New Zealand defeated India by 8 wickets in the...,"[new, zealand, defeated, india, wickets, fourt..."
3,"With Aegon Life iTerm Insurance plan, customer...","[aegon, life, iterm, insurance, plan, customer..."
4,Speaking about the sexual harassment allegatio...,"[speaking, sexual, harassment, allegations, ra..."


In [301]:
def summarizing1(words):
    word_freq = Counter(words)
    return word_freq

In [302]:
df['word_freq'] = df['words'].apply(summarizing1)

In [303]:
df.head()

Unnamed: 0,text,words,word_freq
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...","[saurav, kant, alumnus, upgrad, iiit, pg, prog...","{'saurav': 1, 'kant': 1, 'alumnus': 1, 'upgrad..."
1,Kunal Shah's credit card bill payment platform...,"[kunal, shah, credit, card, bill, payment, pla...","{'kunal': 1, 'shah': 1, 'credit': 1, 'card': 1..."
2,New Zealand defeated India by 8 wickets in the...,"[new, zealand, defeated, india, wickets, fourt...","{'new': 1, 'zealand': 1, 'defeated': 1, 'india..."
3,"With Aegon Life iTerm Insurance plan, customer...","[aegon, life, iterm, insurance, plan, customer...","{'aegon': 1, 'life': 3, 'iterm': 1, 'insurance..."
4,Speaking about the sexual harassment allegatio...,"[speaking, sexual, harassment, allegations, ra...","{'speaking': 1, 'sexual': 1, 'harassment': 1, ..."


In [304]:
def summarizing2(word_freq):
    max_freq = max(word_freq.values())
    for word in word_freq.keys():
        word_freq[word] = word_freq[word] / max_freq
    return word_freq

In [305]:
df['word_freq'] = df['word_freq'].apply(summarizing2)

In [306]:
df.head()

Unnamed: 0,text,words,word_freq
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...","[saurav, kant, alumnus, upgrad, iiit, pg, prog...","{'saurav': 0.3333333333333333, 'kant': 0.33333..."
1,Kunal Shah's credit card bill payment platform...,"[kunal, shah, credit, card, bill, payment, pla...","{'kunal': 0.3333333333333333, 'shah': 0.333333..."
2,New Zealand defeated India by 8 wickets in the...,"[new, zealand, defeated, india, wickets, fourt...","{'new': 0.25, 'zealand': 0.25, 'defeated': 0.2..."
3,"With Aegon Life iTerm Insurance plan, customer...","[aegon, life, iterm, insurance, plan, customer...","{'aegon': 0.3333333333333333, 'life': 1.0, 'it..."
4,Speaking about the sexual harassment allegatio...,"[speaking, sexual, harassment, allegations, ra...","{'speaking': 0.3333333333333333, 'sexual': 0.3..."


In [307]:
df['text'][5]

'Pakistani singer Rahat Fateh Ali Khan has denied receiving any notice from the Enforcement Directorate over allegedly smuggling foreign currency out of India. "It would have been better if the authorities would have served the notice first if any and then publicised this," reads a press release issued on behalf of Rahat. The statement further called the allegation "bizarre".'

In [308]:
nlp = spacy.load('en_core_web_sm')

In [309]:
def sen_splitting(text):
    doc = nlp(text)
    return [sent.text for sent in doc.sents]

In [310]:
df['sents'] = df['text'].apply(sen_splitting)

In [311]:
df.head()

Unnamed: 0,text,words,word_freq,sents
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...","[saurav, kant, alumnus, upgrad, iiit, pg, prog...","{'saurav': 0.3333333333333333, 'kant': 0.33333...","[Saurav Kant, an alumnus of upGrad and IIIT-B'..."
1,Kunal Shah's credit card bill payment platform...,"[kunal, shah, credit, card, bill, payment, pla...","{'kunal': 0.3333333333333333, 'shah': 0.333333...",[Kunal Shah's credit card bill payment platfor...
2,New Zealand defeated India by 8 wickets in the...,"[new, zealand, defeated, india, wickets, fourt...","{'new': 0.25, 'zealand': 0.25, 'defeated': 0.2...",[New Zealand defeated India by 8 wickets in th...
3,"With Aegon Life iTerm Insurance plan, customer...","[aegon, life, iterm, insurance, plan, customer...","{'aegon': 0.3333333333333333, 'life': 1.0, 'it...","[With Aegon Life iTerm Insurance plan, custome..."
4,Speaking about the sexual harassment allegatio...,"[speaking, sexual, harassment, allegations, ra...","{'speaking': 0.3333333333333333, 'sexual': 0.3...",[Speaking about the sexual harassment allegati...


In [312]:
df['sents'][1]

["Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year.",
 'Pranav Kaushik, a Delhi techie, bagged this reward after spending 2000 CRED coins.',
 'Users get one CRED coin per rupee of bill paid, which can be used to avail rewards from brands like Ixigo, BookMyShow, UberEats, Cult.',
 'Fit and more.']

In [313]:
word_freq_n = df['word_freq']

In [314]:
word_freq_n

0        {'saurav': 0.3333333333333333, 'kant': 0.33333...
1        {'kunal': 0.3333333333333333, 'shah': 0.333333...
2        {'new': 0.25, 'zealand': 0.25, 'defeated': 0.2...
3        {'aegon': 0.3333333333333333, 'life': 1.0, 'it...
4        {'speaking': 0.3333333333333333, 'sexual': 0.3...
                               ...                        
98396    {'crpf': 0.5, 'jawan': 1.0, 'tuesday': 0.5, 'a...
98397    {'uff': 0.3333333333333333, 'yeh': 0.333333333...
98398    {'according': 0.25, 'reports': 0.5, 'new': 0.2...
98399    {'new': 0.3333333333333333, 'music': 0.3333333...
98400    {'madhesi': 0.5, 'morcha': 1.0, 'alliance': 0....
Name: word_freq, Length: 98401, dtype: object

In [315]:
sents_n = df['sents']

In [316]:
sents_n

0        [Saurav Kant, an alumnus of upGrad and IIIT-B'...
1        [Kunal Shah's credit card bill payment platfor...
2        [New Zealand defeated India by 8 wickets in th...
3        [With Aegon Life iTerm Insurance plan, custome...
4        [Speaking about the sexual harassment allegati...
                               ...                        
98396    [A CRPF jawan was on Tuesday axed to death wit...
98397    ['Uff Yeh', the first song from the Sonakshi S...
98398    [According to reports, a new version of the 19...
98399    [A new music video shows rapper Snoop Dogg aim...
98400    [Madhesi Morcha, an alliance of seven politica...
Name: sents, Length: 98401, dtype: object

In [317]:
for i in sents_n[1]:
    print(i)

Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year.
Pranav Kaushik, a Delhi techie, bagged this reward after spending 2000 CRED coins.
Users get one CRED coin per rupee of bill paid, which can be used to avail rewards from brands like Ixigo, BookMyShow, UberEats, Cult.
Fit and more.


In [318]:
df.shape

(98401, 4)

In [319]:
df.head()

Unnamed: 0,text,words,word_freq,sents
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...","[saurav, kant, alumnus, upgrad, iiit, pg, prog...","{'saurav': 0.3333333333333333, 'kant': 0.33333...","[Saurav Kant, an alumnus of upGrad and IIIT-B'..."
1,Kunal Shah's credit card bill payment platform...,"[kunal, shah, credit, card, bill, payment, pla...","{'kunal': 0.3333333333333333, 'shah': 0.333333...",[Kunal Shah's credit card bill payment platfor...
2,New Zealand defeated India by 8 wickets in the...,"[new, zealand, defeated, india, wickets, fourt...","{'new': 0.25, 'zealand': 0.25, 'defeated': 0.2...",[New Zealand defeated India by 8 wickets in th...
3,"With Aegon Life iTerm Insurance plan, customer...","[aegon, life, iterm, insurance, plan, customer...","{'aegon': 0.3333333333333333, 'life': 1.0, 'it...","[With Aegon Life iTerm Insurance plan, custome..."
4,Speaking about the sexual harassment allegatio...,"[speaking, sexual, harassment, allegations, ra...","{'speaking': 0.3333333333333333, 'sexual': 0.3...",[Speaking about the sexual harassment allegati...


In [320]:
df['word_freq']

0        {'saurav': 0.3333333333333333, 'kant': 0.33333...
1        {'kunal': 0.3333333333333333, 'shah': 0.333333...
2        {'new': 0.25, 'zealand': 0.25, 'defeated': 0.2...
3        {'aegon': 0.3333333333333333, 'life': 1.0, 'it...
4        {'speaking': 0.3333333333333333, 'sexual': 0.3...
                               ...                        
98396    {'crpf': 0.5, 'jawan': 1.0, 'tuesday': 0.5, 'a...
98397    {'uff': 0.3333333333333333, 'yeh': 0.333333333...
98398    {'according': 0.25, 'reports': 0.5, 'new': 0.2...
98399    {'new': 0.3333333333333333, 'music': 0.3333333...
98400    {'madhesi': 0.5, 'morcha': 1.0, 'alliance': 0....
Name: word_freq, Length: 98401, dtype: object

In [321]:
word_freq_n[0].keys()

dict_keys(['saurav', 'kant', 'alumnus', 'upgrad', 'iiit', 'pg', 'program', 'machine', 'learning', 'artificial', 'intelligence', 'sr', 'systems', 'engineer', 'infosys', 'almost', 'years', 'work', 'experience', 'degree', 'career', 'support', 'helped', 'transition', 'data', 'scientist', 'tech', 'mahindra', 'salary', 'hike', 'online', 'power', 'powered', 'lakh', 'careers'])

In [322]:
word_freq_n

0        {'saurav': 0.3333333333333333, 'kant': 0.33333...
1        {'kunal': 0.3333333333333333, 'shah': 0.333333...
2        {'new': 0.25, 'zealand': 0.25, 'defeated': 0.2...
3        {'aegon': 0.3333333333333333, 'life': 1.0, 'it...
4        {'speaking': 0.3333333333333333, 'sexual': 0.3...
                               ...                        
98396    {'crpf': 0.5, 'jawan': 1.0, 'tuesday': 0.5, 'a...
98397    {'uff': 0.3333333333333333, 'yeh': 0.333333333...
98398    {'according': 0.25, 'reports': 0.5, 'new': 0.2...
98399    {'new': 0.3333333333333333, 'music': 0.3333333...
98400    {'madhesi': 0.5, 'morcha': 1.0, 'alliance': 0....
Name: word_freq, Length: 98401, dtype: object

In [323]:
def sent_sc(data1, data2):
    l = {}
    word_freq = data1
    sents = data2
    for sen in sents:
        for word in sen.split():
            if word in word_freq.keys():
                if sen not in l.keys():
                    l[sen] = word_freq[word]
                else:
                    l[sen] += word_freq[word]
    return l

In [324]:
n = []
for i in range(df.shape[0]):
    n.append(sent_sc(df['word_freq'][i], df['sents'][i]))

In [325]:
n[:10]

[{"Saurav Kant, an alumnus of upGrad and IIIT-B's PG Program in Machine learning and Artificial Intelligence, was a Sr Systems Engineer at Infosys with almost 5 years of work experience.": 1.9999999999999998,
  "The program and upGrad's 360-degree career support helped him transition to a Data Scientist at Tech Mahindra with 90% salary hike.": 2.333333333333333,
  "upGrad's Online Power Learning has powered 3 lakh+ careers.": 0.3333333333333333},
 {"Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year.": 4.666666666666667,
  'Pranav Kaushik, a Delhi techie, bagged this reward after spending 2000 CRED coins.': 1.0,
  'Users get one CRED coin per rupee of bill paid, which can be used to avail rewards from brands like Ixigo, BookMyShow, UberEats, Cult.': 4.333333333333333},
 {'New Zealand defeated India by 8 wickets in the fourth ODI at Hamilton on Thursday to win their first match of the five-match ODI series.': 2.25,
  "Indi

In [326]:
len(n)

98401

In [327]:
m = []
num_sentences = 1
for i in n:
    m.append(nlargest(num_sentences, i, key = i.get))

In [328]:
m[:10]

[["The program and upGrad's 360-degree career support helped him transition to a Data Scientist at Tech Mahindra with 90% salary hike."],
 ["Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year."],
 ["India lost an international match under Rohit Sharma's captaincy after 12 consecutive victories dating back to March 2018."],
 ['Also, customers have options to insure against Critical Illnesses, Disability and Accidental Death Benefit Rider with a life cover up to the age of 80 years.'],
 ['Speaking about the sexual harassment allegations against Rajkumar Hirani, Sonam Kapoor said, "I\'ve known Hirani for many years...'],
 ['"It would have been better if the authorities would have served the notice first if any and then publicised this," reads a press release issued on behalf of Rahat.'],
 ['India recorded their lowest ODI total in New Zealand after getting all out for 92 runs in 30.5 overs in the fourth ODI at Hamilton on Th

In [329]:
df['headline'] = m

In [330]:
df.head()

Unnamed: 0,text,words,word_freq,sents,headline
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...","[saurav, kant, alumnus, upgrad, iiit, pg, prog...","{'saurav': 0.3333333333333333, 'kant': 0.33333...","[Saurav Kant, an alumnus of upGrad and IIIT-B'...",[The program and upGrad's 360-degree career su...
1,Kunal Shah's credit card bill payment platform...,"[kunal, shah, credit, card, bill, payment, pla...","{'kunal': 0.3333333333333333, 'shah': 0.333333...",[Kunal Shah's credit card bill payment platfor...,[Kunal Shah's credit card bill payment platfor...
2,New Zealand defeated India by 8 wickets in the...,"[new, zealand, defeated, india, wickets, fourt...","{'new': 0.25, 'zealand': 0.25, 'defeated': 0.2...",[New Zealand defeated India by 8 wickets in th...,[India lost an international match under Rohit...
3,"With Aegon Life iTerm Insurance plan, customer...","[aegon, life, iterm, insurance, plan, customer...","{'aegon': 0.3333333333333333, 'life': 1.0, 'it...","[With Aegon Life iTerm Insurance plan, custome...","[Also, customers have options to insure agains..."
4,Speaking about the sexual harassment allegatio...,"[speaking, sexual, harassment, allegations, ra...","{'speaking': 0.3333333333333333, 'sexual': 0.3...",[Speaking about the sexual harassment allegati...,[Speaking about the sexual harassment allegati...


In [331]:
df.columns

Index(['text', 'words', 'word_freq', 'sents', 'headline'], dtype='object')

In [332]:
df.index

RangeIndex(start=0, stop=98401, step=1)

In [337]:
df['headline'][0][0]

"The program and upGrad's 360-degree career support helped him transition to a Data Scientist at Tech Mahindra with 90% salary hike."

In [345]:
s = []
for i in df['headline']:
    s.append(i[0])

In [347]:
df['headline1'] = s

In [348]:
df.head()

Unnamed: 0,text,words,word_freq,sents,headline,headline1
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...","[saurav, kant, alumnus, upgrad, iiit, pg, prog...","{'saurav': 0.3333333333333333, 'kant': 0.33333...","[Saurav Kant, an alumnus of upGrad and IIIT-B'...",[The program and upGrad's 360-degree career su...,The program and upGrad's 360-degree career sup...
1,Kunal Shah's credit card bill payment platform...,"[kunal, shah, credit, card, bill, payment, pla...","{'kunal': 0.3333333333333333, 'shah': 0.333333...",[Kunal Shah's credit card bill payment platfor...,[Kunal Shah's credit card bill payment platfor...,Kunal Shah's credit card bill payment platform...
2,New Zealand defeated India by 8 wickets in the...,"[new, zealand, defeated, india, wickets, fourt...","{'new': 0.25, 'zealand': 0.25, 'defeated': 0.2...",[New Zealand defeated India by 8 wickets in th...,[India lost an international match under Rohit...,India lost an international match under Rohit ...
3,"With Aegon Life iTerm Insurance plan, customer...","[aegon, life, iterm, insurance, plan, customer...","{'aegon': 0.3333333333333333, 'life': 1.0, 'it...","[With Aegon Life iTerm Insurance plan, custome...","[Also, customers have options to insure agains...","Also, customers have options to insure against..."
4,Speaking about the sexual harassment allegatio...,"[speaking, sexual, harassment, allegations, ra...","{'speaking': 0.3333333333333333, 'sexual': 0.3...",[Speaking about the sexual harassment allegati...,[Speaking about the sexual harassment allegati...,Speaking about the sexual harassment allegatio...


In [356]:
df.head()

Unnamed: 0,text,headline1
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...",The program and upGrad's 360-degree career sup...
1,Kunal Shah's credit card bill payment platform...,Kunal Shah's credit card bill payment platform...
2,New Zealand defeated India by 8 wickets in the...,India lost an international match under Rohit ...
3,"With Aegon Life iTerm Insurance plan, customer...","Also, customers have options to insure against..."
4,Speaking about the sexual harassment allegatio...,Speaking about the sexual harassment allegatio...


In [357]:
df['text'][0]

"Saurav Kant, an alumnus of upGrad and IIIT-B's PG Program in Machine learning and Artificial Intelligence, was a Sr Systems Engineer at Infosys with almost 5 years of work experience. The program and upGrad's 360-degree career support helped him transition to a Data Scientist at Tech Mahindra with 90% salary hike. upGrad's Online Power Learning has powered 3 lakh+ careers."

In [358]:
df['headline1'][0]

"The program and upGrad's 360-degree career support helped him transition to a Data Scientist at Tech Mahindra with 90% salary hike."

In [359]:
df['text'][1]

"Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year. Pranav Kaushik, a Delhi techie, bagged this reward after spending 2000 CRED coins. Users get one CRED coin per rupee of bill paid, which can be used to avail rewards from brands like Ixigo, BookMyShow, UberEats, Cult.Fit and more."

In [360]:
df['headline1'][1]

"Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year."

In [361]:
df['text'][2]

"New Zealand defeated India by 8 wickets in the fourth ODI at Hamilton on Thursday to win their first match of the five-match ODI series. India lost an international match under Rohit Sharma's captaincy after 12 consecutive victories dating back to March 2018. The match witnessed India getting all out for 92, their seventh lowest total in ODI cricket history."

In [362]:
df['headline1'][2]

"India lost an international match under Rohit Sharma's captaincy after 12 consecutive victories dating back to March 2018."