**Packages**

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from datasets import load_dataset
import evaluate

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

import re

**Necessary Functions**

In [14]:
import inspect

def get_default_args(func):
    """Return a dict mapping each parameter of ``func`` that declares a
    default value to that default.

    Parameters without a default are left out. Keys appear in the order the
    parameters are declared in the signature.
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        # inspect.Parameter.empty is the sentinel for "no default declared".
        if param.default is inspect.Parameter.empty:
            continue
        defaults[name] = param.default
    return defaults

In [15]:
def data_organize(sample_index, source=None, split="train"):
    """Collect article texts and summaries for the sampled row positions.

    Parameters
    ----------
    sample_index : pandas.DataFrame
        Frame with an ``"index"`` column holding row positions into the
        chosen split.
    source : mapping, optional
        Object indexable by split name whose splits are indexable by row
        position and yield records with ``'text'`` and ``'summary'`` keys.
        Defaults to the notebook-level ``dataset``.
    split : str, optional
        Name of the split to read from (default ``"train"``).

    Returns
    -------
    (article, summary) : tuple of list
        Two parallel lists in the same order as ``sample_index["index"]``.
    """
    if source is None:
        source = dataset  # notebook-level dataset, resolved at call time
    records = source[split]

    article = []
    summary = []

    for i in sample_index["index"]:
        # Fetch the record once; each lookup materialises the whole row.
        row = records[i]
        summary.append(row['summary'])
        article.append(row['text'])

    return article, summary

**Data**

In [16]:
# Load the "english" configuration of the csebuetnlp/xlsum dataset; the cells
# below read its 'train' split.
dataset = load_dataset("csebuetnlp/xlsum", "english")

In [17]:
# EDA: number of records in the train split.
len(dataset['train'])

In [18]:
# EDA: display one train record (position 1) to see which fields a record carries.
dataset['train'][1]

**Sampling for Train, Val, and Test sets for all XL Sum**

In [19]:
# Draw 2000 distinct row positions from the train split. The fixed
# random_state makes the draw repeatable.
n_train_rows = len(dataset['train'])
index = pd.DataFrame({"index": list(range(n_train_rows))})
sample_index = index.sample(n=2000, replace=False, random_state=1004)
sample_index.head(5)

In [20]:
# Pull the article text and summary for every sampled row position; both lists
# follow the order of sample_index["index"].
article, summary = data_organize(sample_index)

In [42]:
# Show the current working directory; the relative CSV paths written by the
# following cells resolve against it.
%pwd

In [21]:
# Train split: the first 1000 sampled articles and their summaries.
# NOTE(review): this cell writes under '../Datasets/' while the val/test cells
# write under '../w266_project/' - confirm that the different folders are intended.
train_rows = slice(None, 1000)
d = {'text': article[train_rows], 'summary': summary[train_rows]}
df = pd.DataFrame(d)
df.to_csv('../Datasets/xl_sum_sample_train.csv', index=False)

In [22]:
# Validation split: sampled articles 1000-1099 (the 100 after the train rows).
val_rows = slice(1000, 1100)
d = {'text': article[val_rows], 'summary': summary[val_rows]}
df = pd.DataFrame(d)
df.to_csv('../w266_project/xl_sum_sample_val.csv', index=False)

In [23]:
# Test split: sampled articles 1100-1199 (the 100 after the validation rows).
test_rows = slice(1100, 1200)
d = {'text': article[test_rows], 'summary': summary[test_rows]}
df = pd.DataFrame(d)
df.to_csv('../w266_project/xl_sum_sample_test.csv', index=False)

**EDA on Train**

In [24]:
def _count_sentences(text):
    """Approximate sentence count: non-empty pieces of ``text`` split on '.'.

    Skipping empty pieces keeps a trailing period from adding a phantom
    sentence ("One. Two." -> 2, not 3). Still a rough proxy: abbreviations
    and decimal numbers are counted as sentence breaks.
    """
    return sum(1 for piece in text.split(".") if piece.strip())


def _count_words(text):
    """Number of whitespace-delimited tokens in ``text`` (0 for empty text).

    ``split()`` with no argument collapses runs of spaces and also splits on
    newlines and tabs, so repeated blanks do not produce empty "words".
    """
    return len(text.split())


# Per-record columns for the EDA frame; every list gets one entry per sampled
# train record, in sample order.
# NOTE: `id` shadows the builtin of the same name; the name is kept because the
# next cell reads it when building the DataFrame.
id = []
url = []
title = []
article = []
article_num_sentences = []
article_num_characters = []
article_num_words = []
summary = []
summary_num_sentences = []
summary_num_characters = []
summary_num_words = []

# First 1000 sampled positions (positional slice), i.e. the train portion.
for i in sample_index["index"].iloc[:1000]:
    row = dataset["train"][i]  # fetch the record once instead of once per field
    summary_text = row['summary']
    article_text = row['text']

    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])

    summary.append(summary_text)
    summary_num_sentences.append(_count_sentences(summary_text))
    summary_num_words.append(_count_words(summary_text))
    summary_num_characters.append(len(summary_text))

    article.append(article_text)
    article_num_sentences.append(_count_sentences(article_text))
    article_num_characters.append(len(article_text))
    article_num_words.append(_count_words(article_text))
    

In [25]:
# Assemble the per-record EDA lists into one frame, one column per list.
d = {
    'id': id,
    'url': url,
    "title": title,
    'article': article,
    "article_num_sentences": article_num_sentences,
    "article_num_words": article_num_words,
    "article_num_characters": article_num_characters,
    'summary': summary,
    "summary_num_sentences": summary_num_sentences,
    "summary_num_words": summary_num_words,
    "summary_num_characters": summary_num_characters,
}
df = pd.DataFrame(d)
df.head()

In [26]:
# Headline statistics for the summaries in the EDA frame.
summary_words = df['summary_num_words']
for label, value in (
    ("min summary words", min(summary_words)),
    ("max summary words", max(summary_words)),
    ("mean summary sentences", np.mean(df['summary_num_sentences'])),
):
    print(label, value)

In [27]:
# Per-record compression ratio: summary word count over article word count.
ratio = df["summary_num_words"] / df["article_num_words"]

for label, stat in (
    ("min ratio", min),
    ("max ratio", max),
    ("mean ratio", np.mean),
    ("median ratio", np.median),
):
    print(label, stat(ratio))

plt.hist(ratio)
plt.title("Ratio of word count in summary to article")

In [28]:
# Scatter of summary length (x) against article length (y), with a
# least-squares straight line fitted through the points.
plt.scatter(df["summary_num_words"], df["article_num_words"])
z = np.polyfit(df["summary_num_words"], df["article_num_words"], 1)  # degree-1 fit
p = np.poly1d(z)
#add trendline to plot
plt.plot(df["summary_num_words"], p(df["summary_num_words"]))
# Label the axes so the figure shows which count is on which axis.
plt.xlabel("Summary word count")
plt.ylabel("Article word count")
plt.title("Word Count in Summary vs Article")

**Category EDA**

In [29]:
# Derive a category label for every train record from its 'id' string:
# remove all digits, drop the final character, then keep the text before the
# first '-' and before the first '.'.
categories = []

# Read only the 'id' column; indexing the split row by row would materialise
# every full record (article text included) just to read one field.
for cat in dataset['train']['id']:
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    result = re.sub(r'\d', '', cat)[:-1]
    result = result.split('-')[0].split('.')[0]
    categories.append(result)

In [30]:
def find_indices(list_to_check, item_to_find):
    """Return the positions in ``list_to_check`` whose element equals
    ``item_to_find``, in ascending order (empty list when there is no match).
    """
    return [
        position
        for position, element in enumerate(list_to_check)
        if element == item_to_find
    ]

In [31]:
# Number of train records whose derived category is 'uk'.
uk_positions = find_indices(categories, 'uk')
print(len(uk_positions))

In [32]:
# One row per train record with a constant 1.0 indicator, so summing the
# indicator per category gives the number of records in that category.
dfc = pd.DataFrame({'category': categories, 'ind': np.ones(len(categories))})

# aggfunc is passed by name ('sum') rather than as the np.sum callable, which
# recent pandas versions warn about; the sort is chained instead of in place so
# re-running the cell always starts from a fresh pivot.
table = (
    pd.pivot_table(dfc, values='ind', index=['category'], aggfunc='sum', sort=True)
    .sort_values(by='ind', ascending=False)
)

table

**Train, Val, and Test sets for each category**

In [33]:
# NOTE(review): this repeats the find_indices definition from the
# "Category EDA" section with the same behaviour.
def find_indices(list_to_check, item_to_find):
    """Return every position at which ``list_to_check`` holds a value equal
    to ``item_to_find``; positions are ascending, and the result is empty
    when nothing matches.
    """
    return [idx for idx, value in enumerate(list_to_check) if value == item_to_find]

In [34]:
# Rebuild the per-record category labels from the 'id' strings: remove all
# digits, drop the final character, then keep the text before the first '-'
# and before the first '.'.
categories = []

# Iterate the 'id' column directly instead of fetching each full record
# (article text included) only to read its id.
for cat in dataset['train']['id']:
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    result = re.sub(r'\d', '', cat)[:-1]
    result = result.split('-')[0].split('.')[0]
    categories.append(result)

**Category 1: uk**

In [41]:
# Train/val/test CSVs for the 'uk' category.
# Sample 2000 of the category's row positions with a fixed seed; the sample
# call raises if the category holds fewer than 2000 records.
uk = find_indices(categories, 'uk')
index = pd.DataFrame({"index": uk})
sample_index = index.sample(n=2000, replace=False, random_state=1004)

article, summary = data_organize(sample_index)

# Non-overlapping slices of the shuffled sample: 1000 train / 100 val / 100 test.
for split_name, rows in (
    ("train", slice(0, 1000)),
    ("val", slice(1000, 1100)),
    ("test", slice(1100, 1200)),
):
    d = {'text': article[rows], 'summary': summary[rows]}
    df = pd.DataFrame(data=d)
    df.to_csv(f'xl_sum_sample_{split_name}_uk.csv', index=False)


**Category 2: world**

In [37]:
# Train/val/test CSVs for the 'world' category.
# Sample 2000 of the category's row positions with a fixed seed; the sample
# call raises if the category holds fewer than 2000 records.
world = find_indices(categories, 'world')
index = pd.DataFrame({"index": world})
sample_index = index.sample(n=2000, replace=False, random_state=1004)

article, summary = data_organize(sample_index)

# Non-overlapping slices of the shuffled sample: 1000 train / 100 val / 100 test.
for split_name, rows in (
    ("train", slice(0, 1000)),
    ("val", slice(1000, 1100)),
    ("test", slice(1100, 1200)),
):
    d = {'text': article[rows], 'summary': summary[rows]}
    df = pd.DataFrame(data=d)
    df.to_csv(f'xl_sum_sample_{split_name}_world.csv', index=False)

**Category 3: business**

In [38]:
# Train/val/test CSVs for the 'business' category.
# Sample 2000 of the category's row positions with a fixed seed; the sample
# call raises if the category holds fewer than 2000 records.
business = find_indices(categories, 'business')
index = pd.DataFrame({"index": business})
sample_index = index.sample(n=2000, replace=False, random_state=1004)

article, summary = data_organize(sample_index)

# Non-overlapping slices of the shuffled sample: 1000 train / 100 val / 100 test.
for split_name, rows in (
    ("train", slice(0, 1000)),
    ("val", slice(1000, 1100)),
    ("test", slice(1100, 1200)),
):
    d = {'text': article[rows], 'summary': summary[rows]}
    df = pd.DataFrame(data=d)
    df.to_csv(f'xl_sum_sample_{split_name}_business.csv', index=False)

**Category 4: entertainment**

In [39]:
# Train/val/test CSVs for the 'entertainment' category.
# Sample 2000 of the category's row positions with a fixed seed; the sample
# call raises if the category holds fewer than 2000 records.
entertainment = find_indices(categories, 'entertainment')
index = pd.DataFrame({"index": entertainment})
sample_index = index.sample(n=2000, replace=False, random_state=1004)

article, summary = data_organize(sample_index)

# Non-overlapping slices of the shuffled sample: 1000 train / 100 val / 100 test.
for split_name, rows in (
    ("train", slice(0, 1000)),
    ("val", slice(1000, 1100)),
    ("test", slice(1100, 1200)),
):
    d = {'text': article[rows], 'summary': summary[rows]}
    df = pd.DataFrame(data=d)
    df.to_csv(f'xl_sum_sample_{split_name}_entertainment.csv', index=False)

**Category 5: technology**

In [35]:
# Train/val/test CSVs for the 'technology' category.
# Sample 2000 of the category's row positions with a fixed seed; the sample
# call raises if the category holds fewer than 2000 records.
technology = find_indices(categories, 'technology')
index = pd.DataFrame({"index": technology})
sample_index = index.sample(n=2000, replace=False, random_state=1004)

article, summary = data_organize(sample_index)

# Non-overlapping slices of the shuffled sample: 1000 train / 100 val / 100 test.
for split_name, rows in (
    ("train", slice(0, 1000)),
    ("val", slice(1000, 1100)),
    ("test", slice(1100, 1200)),
):
    d = {'text': article[rows], 'summary': summary[rows]}
    df = pd.DataFrame(data=d)
    df.to_csv(f'xl_sum_sample_{split_name}_technology.csv', index=False)