**Packages**

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from datasets import load_dataset
import evaluate

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import regex as re

**Necessary Functions**

In [3]:
# ROUGE metric object from the `evaluate` library; the baseline cells call rouge.compute(...) on it.
rouge = evaluate.load('rouge')

In [6]:
# chrF metric object from the `evaluate` library; the baseline cells call chrf.compute(...) on it.
chrf = evaluate.load("chrf")

**Data**

In [7]:
# Load the "english" configuration of the csebuetnlp/xlsum dataset; only its 'train' split is used below.
dataset = load_dataset("csebuetnlp/xlsum", "english")

Found cached dataset xlsum (/home/ubuntu/.cache/huggingface/datasets/csebuetnlp___xlsum/english/2.0.0/518ab0af76048660bcc2240ca6e8692a977c80e384ffb18fdddebaca6daebdce)
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 304.36it/s]


In [8]:
# EDA: number of examples in the training split
len(dataset['train'])

306522

In [9]:
# EDA: look at one raw training record to see which fields it carries
dataset['train'][1]

{'id': 'uk-scotland-highlands-islands-11069985',
 'url': 'https://www.bbc.com/news/uk-scotland-highlands-islands-11069985',
 'title': 'Huge tidal turbine installed at Orkney test site',
 'summary': 'The massive tidal turbine AK1000 has been installed in 35m (114.8ft) of water at a test site in Orkney.',
 'text': 'Atlantis Resources unveiled the marine energy device at Invergordon ahead of it being shipped to Kirkwall. Trials on the device will now be run at the European Marine Energy Centre test site off Eday. The device stands 22.5m (73ft) tall, weighs 1,300 tonnes and has two sets of blades on a single unit. It could generate enough power for 1,000 homes.'}

In [10]:
# Reproducibly draw 2000 distinct row positions from the training split.
n_train = len(dataset['train'])
index = pd.DataFrame({"index": range(n_train)})
sample_index = index.sample(n=2000, random_state=1004, replace=False)
sample_index.head(5)

Unnamed: 0,index
235420,235420
172024,172024
253546,253546
224954,224954
214134,214134


In [11]:
# Copy the sampled training rows into one list per field; the next cell turns
# these lists into `df`.
id = []  # NOTE: shadows the builtin `id`; name kept because the next cell reads it
url = []
title = []
article = []
summary = []

for i in sample_index["index"]:
    # Indexing the split by position returns the whole record as a dict, so
    # fetch it once per row instead of once per field.
    row = dataset["train"][i]
    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])
    summary.append(row['summary'])
    article.append(row['text'])

In [12]:
# One row per sampled article, built from the per-field lists collected above.
d = dict(id=id, url=url, title=title, article=article, summary=summary)
df = pd.DataFrame(d)
df.head()

Unnamed: 0,id,url,title,article,summary
0,uk-england-cornwall-55191422,https://www.bbc.com/news/uk-england-cornwall-5...,Care home manager: 'It felt like we were losin...,By Rebecca Ricks & Johnny O'SheaBBC Spotlight ...,"During the spring, at the height of the Covid-..."
1,uk-43893709,https://www.bbc.com/news/uk-43893709,Tafida Raqeeb: Who decides the care of sick ch...,"By Rachel SchraerBBC Reality Check So, why did...","The parents of five-year-old Tafida Raqeeb, wh..."
2,uk-politics-57050659,https://www.bbc.com/news/uk-politics-57050659,Labour reshuffle: Anneliese Dodds out in Starm...,Anneliese Dodds will now become the Labour Par...,Sir Keir Starmer has sacked his shadow chancel...
3,entertainment-arts-38221420,https://www.bbc.com/news/entertainment-arts-38...,Vinyl sales made more than downloads last week,By Mark SavageBBC Music reporter Vinyl sales m...,More money was spent on vinyl than downloaded ...
4,entertainment-arts-24046991,https://www.bbc.com/news/entertainment-arts-24...,Pirates of the Caribbean sequel delayed,Disney's Pirates of The Caribbean: Dead Men Te...,The next Pirates of the Caribbean film has bee...


**Baseline**

In [18]:
# Lead-3 baseline: take the first three sentences of each article as the
# candidate summary and score it against the reference summary, one example
# at a time, with ROUGE and chrF.
base_r1 = []
base_r2 = []
base_rL = []
base_rLs = []
base_chrf = []

for i in range(len(df['article'])):

    # first three sentences (naive split on ". "), with a closing period appended
    candidate = ". ".join(df["article"][i].split('. ')[0:3]) + "."
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate, references=ref)
    results2 = chrf.compute(predictions=candidate, references=ref)

    base_r1.append(results['rouge1'])
    base_r2.append(results['rouge2'])
    base_rL.append(results['rougeL'])
    base_rLs.append(results['rougeLsum'])

    base_chrf.append(results2['score'])

    # Checkpoint every 100 rows. A modulo test works for any sample size and
    # avoids building a NumPy array on every iteration just to test membership.
    if i % 100 == 0:
        # NOTE: the 'rogueL'/'rogueLs' keys are misspelled but left unchanged so
        # the CSV header stays the same for anything that already reads it.
        data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
        scores = pd.DataFrame(data)
        scores.to_csv(r'base_scores.csv', index=False)
        print(i)

# Final write so the CSV holds every scored row, not just those up to the last checkpoint.
data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
scores = pd.DataFrame(data)
scores.to_csv(r'base_scores.csv', index=False)
print(i)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
1999


In [19]:
# Mean of every per-example metric over the full set of baseline scores.
print("Average of all 2000")
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
    ('chrf average:', base_chrf),
):
    print(label, np.mean(values))

Average of all 2000
rouge1 average:  0.18615699708157343
rouge2 average:  0.02667181874604599
rougeL average:  0.12080542692742077
rougeLs average: 0.12080542692742077
chrf average: 26.710645519890537


In [20]:
# Same averages restricted to the first 1000 scored examples.
print("Average of First 1000")
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
    ('chrf average:', base_chrf),
):
    print(label, np.mean(values[:1000]))

Average of First 1000
rouge1 average:  0.18572856885833472
rouge2 average:  0.025796662812245914
rougeL average:  0.12019463217968686
rougeLs average: 0.12019463217968686
chrf average: 26.625614296560503


In [21]:
# Same averages restricted to the examples from position 1000 onwards.
print("Average of Last 1000")
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
    ('chrf average:', base_chrf),
):
    print(label, np.mean(values[1000:]))

Average of Last 1000
rouge1 average:  0.18658542530481215
rouge2 average:  0.027546974679846067
rougeL average:  0.12141622167515467
rougeLs average: 0.12141622167515467
chrf average: 26.795676743220575


**Baseline by Top 5 Largest Categories**

In [22]:
def find_indices(list_to_check, item_to_find):
    """Return the positions in ``list_to_check`` whose value equals ``item_to_find``.

    Positions are zero-based and listed in ascending order. An empty list is
    returned when nothing matches.
    """
    return [pos for pos, element in enumerate(list_to_check) if element == item_to_find]

In [25]:
# Derive a coarse category for every training article from its id,
# e.g. 'uk-scotland-highlands-islands-11069985' -> 'uk'.
categories = []

# Iterate over the 'id' column only: indexing the split row by row returns the
# full record each time, which is wasted work when just the id is needed.
for cat in dataset['train']['id']:
    # Remove all digits, then drop the last remaining character (the trailing
    # '-' for ids shaped like the example above). Raw string so '\d' is not
    # treated as a string escape sequence.
    result = re.sub(r'\d', '', cat)[:-1]
    # Keep only the text before the first '-' (and before any '.').
    result = result.split('-')[0].split('.')[0]
    categories.append(result)

**Category 1: uk**

In [26]:
# Reproducibly draw 1000 distinct training-row positions from the 'uk' category.
uk = find_indices(categories, 'uk')
index = pd.DataFrame(uk, columns=["index"])
sample_index = index.sample(n=1000, random_state=1004, replace=False)
sample_index.head(5)

Unnamed: 0,index
103144,184549
135979,242139
122918,219346
41576,76571
23230,44271


In [27]:
# Copy the sampled training rows into one list per field; the next cell turns
# these lists into `df`.
id = []  # NOTE: shadows the builtin `id`; name kept because the next cell reads it
url = []
title = []
article = []
summary = []

for i in sample_index["index"]:
    # Indexing the split by position returns the whole record as a dict, so
    # fetch it once per row instead of once per field.
    row = dataset["train"][i]
    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])
    summary.append(row['summary'])
    article.append(row['text'])

In [28]:
# One row per sampled article, built from the per-field lists collected above.
d = dict(id=id, url=url, title=title, article=article, summary=summary)
df = pd.DataFrame(d)
df.head()

Unnamed: 0,id,url,title,article,summary
0,uk-england-bristol-51032231,https://www.bbc.com/news/uk-england-bristol-51...,Author Emily Koch: 'I'm not angry at the drive...,Emily Koch suffered two broken legs and ligame...,A woman left seriously injured after being hit...
1,uk-england-birmingham-54264699,https://www.bbc.com/news/uk-england-birmingham...,Blind TikTok star Lucy Edwards says reaction t...,"Lucy Edwards, who became Radio 1's first blind...",A blind vlogger hopes her TikTok videos on liv...
2,uk-england-london-16091997,https://www.bbc.com/news/uk-england-london-160...,Bendy bus makes final journey for Transport fo...,The vehicles were used on 12 routes over the p...,The last of London's bendy buses was taken off...
3,uk-england-norfolk-54058083,https://www.bbc.com/news/uk-england-norfolk-54...,Horsey seals: Volunteers remove rubber ring fr...,Four members of Friends of Horsey Seals netted...,"Volunteers have helped capture a ""feisty"" seal..."
4,uk-scotland-edinburgh-east-fife-28838287,https://www.bbc.com/news/uk-scotland-edinburgh...,Tim Vine wins funniest Edinburgh Fringe joke a...,"He won with the one-liner: ""I decided to sell ...",A joke by comedian Tim Vine about a vacuum cle...


In [29]:
# Lead-3 baseline on the current `df`: take the first three sentences of each
# article as the candidate summary and score it against the reference summary,
# one example at a time, with ROUGE and chrF.
base_r1 = []
base_r2 = []
base_rL = []
base_rLs = []
base_chrf = []

for i in range(len(df['article'])):

    # first three sentences (naive split on ". "), with a closing period appended
    candidate = ". ".join(df["article"][i].split('. ')[0:3]) + "."
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate, references=ref)
    results2 = chrf.compute(predictions=candidate, references=ref)

    base_r1.append(results['rouge1'])
    base_r2.append(results['rouge2'])
    base_rL.append(results['rougeL'])
    base_rLs.append(results['rougeLsum'])

    base_chrf.append(results2['score'])

    # Checkpoint every 100 rows. A modulo test works for any sample size and
    # avoids building a NumPy array on every iteration just to test membership.
    if i % 100 == 0:
        # NOTE: the 'rogueL'/'rogueLs' keys are misspelled but left unchanged so
        # the CSV header stays the same for anything that already reads it.
        data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
        scores = pd.DataFrame(data)
        scores.to_csv(r'base_scores_uk.csv', index=False)
        print(i)

# Final write so the CSV holds every scored row, not just those up to the last checkpoint.
data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
scores = pd.DataFrame(data)
scores.to_csv(r'base_scores_uk.csv', index=False)
print(i)

0
100
200
300
400
500
600
700
800
900
999


In [30]:
# Mean of every per-example metric currently held in the base_* lists.
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
    ('chrf average:', base_chrf),
):
    print(label, np.mean(values))

rouge1 average:  0.18939253006007692
rouge2 average:  0.027133586978308793
rougeL average:  0.12278381086366534
rougeLs average: 0.12278381086366534
chrf average: 26.572484469570067


**Category 2: world**

In [35]:
# Reproducibly draw 1000 distinct training-row positions from the 'world' category.
cat_inx = find_indices(categories, 'world')
index = pd.DataFrame(cat_inx, columns=["index"])
sample_index = index.sample(n=1000, random_state=1004, replace=False)
sample_index.head(5)

Unnamed: 0,index
38266,209050
5910,25805
14047,71645
23996,127936
25699,137681


In [36]:
# Copy the sampled training rows into one list per field; the next cell turns
# these lists into `df`.
id = []  # NOTE: shadows the builtin `id`; name kept because the next cell reads it
url = []
title = []
article = []
summary = []

for i in sample_index["index"]:
    # Indexing the split by position returns the whole record as a dict, so
    # fetch it once per row instead of once per field.
    row = dataset["train"][i]
    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])
    summary.append(row['summary'])
    article.append(row['text'])

In [37]:
# One row per sampled article, built from the per-field lists collected above.
d = dict(id=id, url=url, title=title, article=article, summary=summary)
df = pd.DataFrame(d)
df.head()

Unnamed: 0,id,url,title,article,summary
0,world-asia-53847400,https://www.bbc.com/news/world-asia-53847400,Kim Jong-un gives sister Yo-jong 'more respons...,"Mr Kim still maintains ""absolute authority"", b...",North Korean leader Kim Jong-un has delegated ...
1,world-europe-guernsey-18129181,https://www.bbc.com/news/world-europe-guernsey...,Guernsey overgrown trees prompt road crash fears,Overgrown bushes and trees could result in peo...,Residents are being urged to trim vegetation o...
2,world-africa-44629681,https://www.bbc.co.uk/news/world-africa-44629681,Bringing Gay Pride to Africa's last absolute m...,And anyone doubting the determination needed t...,Africa's last absolute monarchy is holding its...
3,world-asia-41840069,https://www.bbc.com/news/world-asia-41840069,Pakistan polygamy: Lahore man jailed over unap...,The court in Lahore also ordered Shahzad Saqib...,A Pakistan court has jailed a man for six mont...
4,world-europe-guernsey-13052445,https://www.bbc.com/news/world-europe-guernsey...,Alternative recycling bank site proposed,The old facility at Manor Stores closed last y...,Work is continuing to try to find a permanent ...


In [None]:
# Lead-3 baseline on the current `df`: take the first three sentences of each
# article as the candidate summary and score it against the reference summary,
# one example at a time, with ROUGE and chrF.
base_r1 = []
base_r2 = []
base_rL = []
base_rLs = []
base_chrf = []

for i in range(len(df['article'])):

    # first three sentences (naive split on ". "), with a closing period appended
    candidate = ". ".join(df["article"][i].split('. ')[0:3]) + "."
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate, references=ref)
    results2 = chrf.compute(predictions=candidate, references=ref)

    base_r1.append(results['rouge1'])
    base_r2.append(results['rouge2'])
    base_rL.append(results['rougeL'])
    base_rLs.append(results['rougeLsum'])

    base_chrf.append(results2['score'])

    # Checkpoint every 100 rows. A modulo test works for any sample size and
    # avoids building a NumPy array on every iteration just to test membership.
    if i % 100 == 0:
        # NOTE: the 'rogueL'/'rogueLs' keys are misspelled but left unchanged so
        # the CSV header stays the same for anything that already reads it.
        data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
        scores = pd.DataFrame(data)
        scores.to_csv(r'base_scores_world.csv', index=False)
        print(i)

# Final write so the CSV holds every scored row, not just those up to the last checkpoint.
data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
scores = pd.DataFrame(data)
scores.to_csv(r'base_scores_world.csv', index=False)
print(i)

0
100


In [None]:
# Mean of every per-example metric currently held in the base_* lists.
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
    ('chrf average:', base_chrf),
):
    print(label, np.mean(values))

**Category 3: business**

In [None]:
# Reproducibly draw 1000 distinct training-row positions from the 'business' category.
cat_inx = find_indices(categories, 'business')
index = pd.DataFrame(cat_inx, columns=["index"])
sample_index = index.sample(n=1000, random_state=1004, replace=False)
sample_index.head(5)

In [None]:
# Copy the sampled training rows into one list per field; the next cell turns
# these lists into `df`.
id = []  # NOTE: shadows the builtin `id`; name kept because the next cell reads it
url = []
title = []
article = []
summary = []

for i in sample_index["index"]:
    # Indexing the split by position returns the whole record as a dict, so
    # fetch it once per row instead of once per field.
    row = dataset["train"][i]
    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])
    summary.append(row['summary'])
    article.append(row['text'])

In [None]:
# One row per sampled article, built from the per-field lists collected above.
d = dict(id=id, url=url, title=title, article=article, summary=summary)
df = pd.DataFrame(d)
df.head()

In [None]:
# Lead-3 baseline on the current `df`: take the first three sentences of each
# article as the candidate summary and score it against the reference summary,
# one example at a time, with ROUGE and chrF.
base_r1 = []
base_r2 = []
base_rL = []
base_rLs = []
base_chrf = []

for i in range(len(df['article'])):

    # first three sentences (naive split on ". "), with a closing period appended
    candidate = ". ".join(df["article"][i].split('. ')[0:3]) + "."
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate, references=ref)
    results2 = chrf.compute(predictions=candidate, references=ref)

    base_r1.append(results['rouge1'])
    base_r2.append(results['rouge2'])
    base_rL.append(results['rougeL'])
    base_rLs.append(results['rougeLsum'])

    base_chrf.append(results2['score'])

    # Checkpoint every 100 rows. A modulo test works for any sample size and
    # avoids building a NumPy array on every iteration just to test membership.
    if i % 100 == 0:
        # NOTE: the 'rogueL'/'rogueLs' keys are misspelled but left unchanged so
        # the CSV header stays the same for anything that already reads it.
        data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
        scores = pd.DataFrame(data)
        scores.to_csv(r'base_scores_business.csv', index=False)
        print(i)

# Final write so the CSV holds every scored row, not just those up to the last checkpoint.
data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
scores = pd.DataFrame(data)
scores.to_csv(r'base_scores_business.csv', index=False)
print(i)

In [None]:
# Mean of every per-example metric currently held in the base_* lists.
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
    ('chrf average:', base_chrf),
):
    print(label, np.mean(values))

**Category 4: entertainment**

In [None]:
# Reproducibly draw 1000 distinct training-row positions from the 'entertainment' category.
cat_inx = find_indices(categories, 'entertainment')
index = pd.DataFrame(cat_inx, columns=["index"])
sample_index = index.sample(n=1000, random_state=1004, replace=False)
sample_index.head(5)

In [None]:
# Copy the sampled training rows into one list per field; the next cell turns
# these lists into `df`.
id = []  # NOTE: shadows the builtin `id`; name kept because the next cell reads it
url = []
title = []
article = []
summary = []

for i in sample_index["index"]:
    # Indexing the split by position returns the whole record as a dict, so
    # fetch it once per row instead of once per field.
    row = dataset["train"][i]
    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])
    summary.append(row['summary'])
    article.append(row['text'])

In [None]:
# One row per sampled article, built from the per-field lists collected above.
d = dict(id=id, url=url, title=title, article=article, summary=summary)
df = pd.DataFrame(d)
df.head()

In [None]:
# Lead-3 baseline on the current `df`: take the first three sentences of each
# article as the candidate summary and score it against the reference summary,
# one example at a time, with ROUGE and chrF.
base_r1 = []
base_r2 = []
base_rL = []
base_rLs = []
base_chrf = []

for i in range(len(df['article'])):

    # first three sentences (naive split on ". "), with a closing period appended
    candidate = ". ".join(df["article"][i].split('. ')[0:3]) + "."
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate, references=ref)
    results2 = chrf.compute(predictions=candidate, references=ref)

    base_r1.append(results['rouge1'])
    base_r2.append(results['rouge2'])
    base_rL.append(results['rougeL'])
    base_rLs.append(results['rougeLsum'])

    base_chrf.append(results2['score'])

    # Checkpoint every 100 rows. A modulo test works for any sample size and
    # avoids building a NumPy array on every iteration just to test membership.
    if i % 100 == 0:
        # NOTE: the 'rogueL'/'rogueLs' keys are misspelled but left unchanged so
        # the CSV header stays the same for anything that already reads it.
        data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
        scores = pd.DataFrame(data)
        scores.to_csv(r'base_scores_entertainment.csv', index=False)
        print(i)

# Final write so the CSV holds every scored row, not just those up to the last checkpoint.
data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
scores = pd.DataFrame(data)
scores.to_csv(r'base_scores_entertainment.csv', index=False)
print(i)

In [None]:
# Mean of every per-example metric currently held in the base_* lists.
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
    ('chrf average:', base_chrf),
):
    print(label, np.mean(values))

**Category 5: technology**

In [None]:
# Reproducibly draw 1000 distinct training-row positions from the 'technology' category.
cat_inx = find_indices(categories, 'technology')
index = pd.DataFrame(cat_inx, columns=["index"])
sample_index = index.sample(n=1000, random_state=1004, replace=False)
sample_index.head(5)

In [None]:
# Copy the sampled training rows into one list per field; the next cell turns
# these lists into `df`.
id = []  # NOTE: shadows the builtin `id`; name kept because the next cell reads it
url = []
title = []
article = []
summary = []

for i in sample_index["index"]:
    # Indexing the split by position returns the whole record as a dict, so
    # fetch it once per row instead of once per field.
    row = dataset["train"][i]
    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])
    summary.append(row['summary'])
    article.append(row['text'])

In [None]:
# One row per sampled article, built from the per-field lists collected above.
d = dict(id=id, url=url, title=title, article=article, summary=summary)
df = pd.DataFrame(d)
df.head()

In [None]:
# Lead-3 baseline on the current `df`: take the first three sentences of each
# article as the candidate summary and score it against the reference summary,
# one example at a time, with ROUGE and chrF.
base_r1 = []
base_r2 = []
base_rL = []
base_rLs = []
base_chrf = []

for i in range(len(df['article'])):

    # first three sentences (naive split on ". "), with a closing period appended
    candidate = ". ".join(df["article"][i].split('. ')[0:3]) + "."
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate, references=ref)
    results2 = chrf.compute(predictions=candidate, references=ref)

    base_r1.append(results['rouge1'])
    base_r2.append(results['rouge2'])
    base_rL.append(results['rougeL'])
    base_rLs.append(results['rougeLsum'])

    base_chrf.append(results2['score'])

    # Checkpoint every 100 rows. A modulo test works for any sample size and
    # avoids building a NumPy array on every iteration just to test membership.
    if i % 100 == 0:
        # NOTE: the 'rogueL'/'rogueLs' keys are misspelled but left unchanged so
        # the CSV header stays the same for anything that already reads it.
        data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
        scores = pd.DataFrame(data)
        scores.to_csv(r'base_scores_tech.csv', index=False)
        print(i)

# Final write so the CSV holds every scored row, not just those up to the last checkpoint.
data = {'rouge1': base_r1, 'rouge2': base_r2, 'rogueL': base_rL, 'rogueLs': base_rLs, 'chrf': base_chrf}
scores = pd.DataFrame(data)
scores.to_csv(r'base_scores_tech.csv', index=False)
print(i)

In [None]:
# Mean of every per-example metric currently held in the base_* lists.
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
    ('chrf average:', base_chrf),
):
    print(label, np.mean(values))