In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mannacharya/aeon-essays-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/aeon-essays-dataset


In [27]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from collections import Counter
import spacy
from heapq import nlargest
from transformers import BartForConditionalGeneration, BartTokenizer

In [4]:
df = pd.read_csv('/kaggle/input/aeon-essays-dataset/essays.csv')
df.head()

Unnamed: 0,title,description,essay,authors,source_url,thumbnail_url
0,Space exploration,When self-replicating craft bring life to the ...,"Some time late this century, someone will push...",Jay Olson,https://aeon.co//essays/cosmic-expansion-is-a-...,https://images.aeonmedia.co/images/9239658f-b9...
1,History of science,"To the detriment of the public, scientists and...",Would boycotting Russian scientists be an effe...,Lorraine Daston & Peter Harrison,https://aeon.co//essays/science-and-history-ca...,https://images.aeonmedia.co/images/7e9ea9e3-03...
2,Religion,"Once a centre of Afghan culture, Sufism seems ...",My introduction into the world of Afghanistan’...,Annika Schmeding,https://aeon.co//essays/sufi-transitions-betwe...,https://images.aeonmedia.co/images/957fb6c9-40...
3,Thinkers and theories,The intrepid logician Kurt Gödel believed in t...,"As the foremost logician of the 20th century, ...",Alexander T Englert,https://aeon.co//essays/kurt-godel-his-mother-...,https://images.aeonmedia.co/images/cbe24f46-98...
4,Thinkers and theories,"For Rachel Bespaloff, philosophy was a sensual...",Shortly after Rachel Bespaloff’s suicide in 19...,Isabel Jacobs,https://aeon.co//essays/for-rachel-bespaloff-p...,https://images.aeonmedia.co/images/536e31b1-dc...


In [5]:
df.shape

(2235, 6)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2235 entries, 0 to 2234
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          2235 non-null   object
 1   description    2235 non-null   object
 2   essay          2235 non-null   object
 3   authors        2235 non-null   object
 4   source_url     2235 non-null   object
 5   thumbnail_url  2235 non-null   object
dtypes: object(6)
memory usage: 104.9+ KB


In [7]:
df.isnull().sum()

title            0
description      0
essay            0
authors          0
source_url       0
thumbnail_url    0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
df['essay'][0]

'Some time late this century, someone will push a button, unleashing a life force on the cosmos. Within 1,000 years, every star you can see at night will host intelligent life. In less than a million years, that life will saturate the entire Milky Way; in 20 million years – the local group of galaxies. In the fullness of cosmic time, thousands of superclusters of galaxies will be saturated in a forever-expanding sphere of influence, centred on Earth. This won’t require exotic physics. The basic ingredients have been understood since the 1960s. What’s needed is an automated spacecraft that can locate worlds on which to land, build infrastructure, and eventually make copies of itself. The copies are then sent forth to do likewise – in other words, they are von Neumann probes (VNPs). We’ll stipulate a very fast one, travelling at a respectable fraction of the speed of light, with an extremely long range (able to coast between galaxies) and carrying an enormous trove of information. Ambiti

In [10]:
sw = stopwords.words('english')

In [11]:
def text_cleaning(text):
    text = text.lower()
    text = re.sub('[^a-z]', ' ', text)
    text = re.sub('\s+[a-z]\s+', ' ', text)
    text = re.sub('\s+', ' ', text)
    words = [word for word in text.split() if word not in sw]
    return words

In [12]:
def Counting(words):
    words_freq = Counter(words)
    max_freq = max(words_freq.values())
    for word, freq in words_freq.items():
        words_freq[word] = freq / max_freq
    return words_freq

In [13]:
nlp = spacy.load('en_core_web_sm')

In [14]:
def text_summarization(text, n):
    cleaned_words = text_cleaning(text)
    words_freq = Counting(cleaned_words)
    doc = nlp(text)
    sentences = [sentence.text for sentence in doc.sents]
    sents_freq = dict()
    for sent in sentences:
        for word in sent.split():
            if word in words_freq.keys():
                if sent not in sents_freq:
                    sents_freq[sent] = words_freq[word]
                else:
                    sents_freq[sent] += words_freq[word]
    m = nlargest(n, sents_freq, key = sents_freq.get)
    sent_list = []
    for i in sentences:
        for j in m:
            if i == j:
                sent_list.append(i)
    return ' '.join(sent_list)

In [15]:
df2 = df[['essay']].sample(10)

In [17]:
df2

Unnamed: 0,essay
221,As the planet lurches towards a climate emerge...
1188,Scientists now know much about human mating. T...
226,Steady blows from flint axes echo through the ...
926,"The easy question came first, a few months aft..."
23,"Twenty-five years ago, the burgeoning science ..."
1362,It’s surprising how stressful the first time c...
754,"What is anarchy? The word evokes lawlessness, ..."
1321,The problem of variation haunts medical scienc...
910,"When I was nine years old, on a sunny summer a..."
1829,"So I’m posted up, sharing a sandwich and a cig..."


In [18]:
df2['extractive_sum'] = df2['essay'].apply(lambda x : text_summarization(x, 10))

In [20]:
df2

Unnamed: 0,essay,extractive_sum
221,As the planet lurches towards a climate emerge...,"That same year, as we discussed A Rough Ride t..."
1188,Scientists now know much about human mating. T...,"Yet women do have affairs, a phenomenon that, ..."
226,Steady blows from flint axes echo through the ...,"Next year, once the last of the fallen trees h..."
926,"The easy question came first, a few months aft...","It also, however, has a tendency to produce th..."
23,"Twenty-five years ago, the burgeoning science ...",Solving the hard problem would give us a secur...
1362,It’s surprising how stressful the first time c...,A hypnosis researcher had pointed me to a scri...
754,"What is anarchy? The word evokes lawlessness, ...","In anarchia, an authoritative leader, ruler or..."
1321,The problem of variation haunts medical scienc...,Anything that veers – from having green eyes o...
910,"When I was nine years old, on a sunny summer a...",They looked for consistency in the statements ...
1829,"So I’m posted up, sharing a sandwich and a cig...","How could we be in the middle of Baltimore, a ..."


In [22]:
df2['essay'][221]

'As the planet lurches towards a climate emergency and its life support systems falter, the need for visionary thinkers with fresh insights and big ideas has never been more pressing. No wonder, then, that the world mourned the death earlier this year of James (‘Jim’) Lovelock, whose Gaia theory provided a new framework to think about nature, one that changed the way we regard our relationship with Earth. Lovelock contributed to many fields, such as environmental science, cryobiology and exobiology, from thawing hamsters to building exquisitely sensitive detectors to find life on Mars or to sniff out ozone-destroying chemicals. But when he died on 26 July, the day of his 103rd birthday, the world lost what the Earth scientist Timothy Lenton in Science magazine called ‘a genius and iconoclast of immense intellectual courage’. Lovelock was a true original who was detached from the pressure to conform, one who had found a way to do research outside an institution, and who showed a disrega

In [24]:
df2['extractive_sum'][221]

'That same year, as we discussed A Rough Ride to the Future (2014), one of the many books he wrote to help fund his work, Lovelock bemoaned how research had become a ‘contest of hugely expensive teams in the science equivalent of an Olympic stadium.’ The science that preoccupied him during his career came from what he’d gleaned as an ‘SBN’ or ‘small bespectacled nerd’ (his term) in the museum and in libraries, not in school science lessons or homework, which he found tedious. More than 70 years later, he would write: ‘I suspect that posing the same need now to a civil service laboratory would provide several months’ work for a team of scientists and technicians.’ Lovelock also lived his life inventively: he had used a prototype microwave to reanimate a chilled hamster, avoiding the burns caused by thawing its heart with a heated spoon, but was delighted to find the technology could also warm his lunch at the NIMR, a pioneering use of microwave cooking in the 1950s. The referees refused

In [28]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [29]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [30]:
def abstractive_sum(text):
    tokenization = tokenizer(text, max_length = 1024, return_tensors = 'pt', truncation = True)
    ids = tokenization['input_ids']
    summ = model.generate(ids, max_length = 1000, min_length = 300, length_penalty = 2.0, num_beams = 4, early_stopping = True)
    final_summ = tokenizer.decode(summ[0], skip_special_tokens = True)
    return final_summ

In [31]:
df2.head()

Unnamed: 0,essay,extractive_sum
221,As the planet lurches towards a climate emerge...,"That same year, as we discussed A Rough Ride t..."
1188,Scientists now know much about human mating. T...,"Yet women do have affairs, a phenomenon that, ..."
226,Steady blows from flint axes echo through the ...,"Next year, once the last of the fallen trees h..."
926,"The easy question came first, a few months aft...","It also, however, has a tendency to produce th..."
23,"Twenty-five years ago, the burgeoning science ...",Solving the hard problem would give us a secur...


In [32]:
abstractive_sum(df2['essay'][221])

'James Lovelock was a true original who was detached from the pressure to conform. He found a way to do research outside an institution, and showed a disregard for disciplinary boundaries. Driven by his scepticism about conventional wisdom, enabled by his skill as an inventor, and guided by visceral scientific insights, he made much of his independence. When he started as a lone scientist-inventor in 1961, the bureaucratic restrictions were easy to overcome. But being independent would be 100 times harder to do today, he told me. His kind were going extinct. Even so, with his death, has the world seen the end of the golden age of the independent expert? My colleague at the Science Museum Alex Rose, who curates his archive, says: ‘Jim carefully cultivated his persona as an independent scientist – but we might ask exactly what “independent scientist” means, or why he chose himself this way.’ He was raised in Brixton, south London, where the public library fired up his fascination with sc