# Prepare summarization dataset for HuggingFace

In [None]:
import json
import os
import random
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set_style("whitegrid")

In [None]:
def read_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path of the JSON Lines file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Being explicit about UTF-8 avoids depending on the platform's default
    # encoding when the records contain non-ASCII text.
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. an extra newline at the end of the file)
        # instead of failing on them.
        return [json.loads(line) for line in f if line.strip()]


def write_jsonl(path, data):
    """Write an iterable of JSON-serializable records as a JSON Lines file.

    Missing parent directories of ``path`` are created first, and an existing
    file at ``path`` is overwritten.

    Args:
        path: Destination file path.
        data: Iterable of JSON-serializable objects; each becomes one line.

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If a record is not JSON-serializable.
    """
    parent = os.path.dirname(path)
    if parent:  # an empty dirname means the current directory: nothing to create
        os.makedirs(parent, exist_ok=True)

    # json.dump escapes non-ASCII characters by default, so the output is plain
    # ASCII; the explicit encoding only removes any dependence on the locale.
    with open(path, "w", encoding="utf-8") as f:
        for record in data:
            json.dump(record, f)
            f.write("\n")

In [None]:
# Base data directory (a relative path) shared by the input and the output.
_DATA_DIR = "../../../../mnt/data_6tb/oliver/NordjyllandNews/data/"

# Input: the raw JSON Lines file with the articles.
RAW_PATH = _DATA_DIR + "raw/summary.jsonl"
# Output directory for the train/val/test splits (keeps its trailing slash).
PROCESSED_PATH = _DATA_DIR + "processed/summary/"


In [None]:
# Load every record of the raw JSON Lines file into memory.
raw_data = read_jsonl(RAW_PATH)

In [None]:
# Number of raw records.
len(raw_data)

In [None]:
# Field names present on the first raw record.
raw_data[0].keys()

Inspect duplicate summaries

In [None]:
# Count how often each summary string occurs across the raw articles.
summary_dict = Counter(article["summary"] for article in raw_data)

# Summaries shared by more than one article, in order of first appearance.
duplicates = [summary for summary, count in summary_dict.items() if count > 1]


In [None]:
# Display the duplicated summaries for manual inspection.
duplicates

Build a blacklist of summaries based on the duplicates found above

In [None]:
# Summaries to exclude: an article whose summary is exactly one of these
# strings is dropped when the processed data is built.
blacklist_summaries = ["&nbsp;", "Opdateres..."]

Build processed data.

In [None]:
# Minimum number of characters required of both the text and the summary.
MIN_LENGTH = 10

processed_data = []
for article in raw_data:
    summary = article["summary"]
    text = article["text_content"]
    text_len = len(text)
    summary_len = len(summary)

    # Keep the article only when both fields are long enough and the summary
    # is not one of the blacklisted strings.
    long_enough = text_len >= MIN_LENGTH and summary_len >= MIN_LENGTH
    if long_enough and summary not in blacklist_summaries:
        processed_data.append(
            {
                "text": text,
                "summary": summary,
                "text_len": text_len,
                "summary_len": summary_len,
            }
        )

In [None]:
# Number of articles removed by the length and blacklist filters.
len(raw_data) - len(processed_data)

Train, val, and test split

In [None]:
# Total number of processed articles available for splitting.
n = len(processed_data)

In [None]:
# Fraction of the data given to the validation set and, separately, to the
# test set; everything else becomes the training set.
VAL_AND_TEST_FRACTION = 0.05
# Fixed seed so that running the notebook from the top reproduces the same
# split. Re-running only this cell reshuffles the already-shuffled list.
SPLIT_SEED = 42

val_size = int(n * VAL_AND_TEST_FRACTION)
test_size = int(n * VAL_AND_TEST_FRACTION)
train_size = n - val_size - test_size

# Shuffle in place with a dedicated seeded generator, leaving the global
# `random` state untouched.
random.Random(SPLIT_SEED).shuffle(processed_data)

# Consecutive, non-overlapping slices: train first, then val, then test.
train_data = processed_data[:train_size]
val_data = processed_data[train_size:train_size + val_size]
test_data = processed_data[train_size + val_size:]

In [None]:
# Sanity check: the three splits together account for every processed article.
len(train_data) + len(val_data) + len(test_data) == len(processed_data)

In [None]:
# Report the number of articles in each split.
for split_name, split in (("Train", train_data), ("Val", val_data), ("Test", test_data)):
    print(f"{split_name}: {len(split)}")

Write splits to disk

In [None]:
# Output file of each split inside PROCESSED_PATH. os.path.join gives the same
# result whether or not PROCESSED_PATH ends with a path separator.
train_path = os.path.join(PROCESSED_PATH, "train.jsonl")
val_path = os.path.join(PROCESSED_PATH, "val.jsonl")
test_path = os.path.join(PROCESSED_PATH, "test.jsonl")

In [None]:
# Persist each split as its own JSON Lines file.
for split_path, split_data in (
    (train_path, train_data),
    (val_path, val_data),
    (test_path, test_data),
):
    write_jsonl(split_path, split_data)

## Dataset statistics

Size of dataset

In [None]:
# File sizes in megabytes (1 MB = 1e6 bytes). Distinct names are used so that
# the split counts train_size / val_size / test_size are not overwritten with
# floats, which would break the slicing if the split cell were run again.
train_mb = os.path.getsize(train_path) / 1e6
val_mb = os.path.getsize(val_path) / 1e6
test_mb = os.path.getsize(test_path) / 1e6

print(f"Total size: {train_mb + val_mb + test_mb:.2f} MB")


### Text length distribution

In [None]:
# Character count of every article text, as stored under "text_len".
texts_lengths = [article["text_len"] for article in processed_data]

In [None]:
# Histogram of the text lengths, saved as a PNG under ../figures/.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(texts_lengths, bins=100, ax=ax)
ax.set_title("Text Length Distribution")
ax.set_ylabel("Frequency")
ax.set_xlabel("Number of characters in text")
fig.savefig("../figures/text_length_distribution.png")

In [None]:
# Shortest and longest text, in characters.
min(texts_lengths), max(texts_lengths)

### Summary length distribution

In [None]:
# Character count of every summary, as stored under "summary_len".
summary_lengths = [article["summary_len"] for article in processed_data]

In [None]:
# Histogram of the summary lengths, saved as a PNG under ../figures/.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(summary_lengths, bins=100, ax=ax)
ax.set_title("Summary Length Distribution")
ax.set_ylabel("Frequency")
ax.set_xlabel("Number of characters in summary")
fig.savefig("../figures/summary_length_distribution.png")

In [None]:
# Shortest and longest summary, in characters.
min(summary_lengths), max(summary_lengths)

#### Dataset problems

Count number of times that the length of the summary is greater than the length of the text.

In [None]:
# Number of processed articles whose summary has more characters than its text.
sum(
    1
    for article in processed_data
    if article["summary_len"] > article["text_len"]
)