# Imports

In [1]:
import os
import re

from datasets import load_dataset
import pandas as pd

import requests
import json

from statistics import mean

import random
import nltk
from nltk.corpus import gutenberg
import numpy as np

import random

# Datasets

## Presidential Speeches - Data
Data Source: https://millercenter.org/presidential-speeches-downloadable-data

In [3]:
# Download the Miller Center presidential-speeches corpus and store it on disk as JSON.
url = "https://millercenter.org/sites/default/files/corpus/presidential-speeches.json"
# Written below '../data' because the loading cell opens
# '../data/Speeches/presidential-speeches.json'; with './data' the download
# and the load would point at two different files.
filename = "../data/Speeches/presidential-speeches.json"

# Get request to the url; the timeout keeps a stalled connection from blocking the kernel forever
response = requests.get(url, timeout=60)

# Check if the request was successful
if response.status_code == 200:
    # Get the JSON data from the response
    json_data = response.json()

    # open() does not create missing folders, so make sure the target folder exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Save the JSON data to a file
    with open(filename, "w") as file:
        json.dump(json_data, file)

    print(f"JSON data saved to {filename} successfully.")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


JSON data saved to ./data/Speeches/presidential-speeches.json successfully.


In [10]:
# Read the stored speeches corpus back from disk
with open('../data/Speeches/presidential-speeches.json', 'r') as f:
    data = json.load(f)

# Keep only the transcripts with at least 1100 whitespace-separated words
speech_texts = []
for element in data:
    transcript = element["transcript"]
    if len(transcript.split()) >= 1100:
        speech_texts.append(transcript)
print(len(speech_texts))

# A blank line (double CRLF) becomes a single space, any remaining CRLF is removed
speech_texts = [
    transcript.replace("\r\n\r\n", " ").replace("\r\n", "")
    for transcript in speech_texts
]

801


In [4]:
# Average number of whitespace-separated words per full-length speech
array = [len(transcript.split()) for transcript in speech_texts]
print(mean(array))

4915.5830212234705


In [7]:
# Cut every speech down to a random excerpt of roughly 500-800 words made of whole sentences.
# A dedicated, seeded generator makes a re-run of this cell produce the same excerpts.
speech_rng = random.Random(42)
speech_texts_short = []

for text in speech_texts:
    target_word_count = speech_rng.randint(500, 800)
    sentences = nltk.sent_tokenize(text)
    # Word count of every sentence, computed once and reused by the selection loop below
    sentence_length = [len(sentence.split()) for sentence in sentences]

    # Largest offset (in words) that still leaves target_word_count words after it.
    # The exact total is used: len(sentences) * int(mean(...)) truncates the mean and
    # under-estimates the text length, and mean() fails on a text without sentences.
    # max(0, ...) keeps randint's range valid for texts shorter than the target.
    max_skip = max(0, sum(sentence_length) - target_word_count)
    words_to_skip = speech_rng.randint(0, max_skip)

    total_word_count = 0
    selected_sentences = []
    for sentence, n_words in zip(sentences, sentence_length):
        # Phase 1: skip whole sentences until the random offset is reached
        if total_word_count + n_words <= words_to_skip:
            total_word_count += n_words
            continue
        # Phase 2: stop as soon as the next sentence would exceed the word budget
        if total_word_count + n_words > target_word_count + words_to_skip:
            break
        selected_sentences.append(sentence)
        total_word_count += n_words
    speech_texts_short.append(' '.join(selected_sentences))

In [8]:
# One 'Political speech' label per shortened speech
speech_class = ['Political speech' for _ in speech_texts_short]

# Dataframe of the speech excerpts with their class label
df_speeches = pd.DataFrame(
    {
        'Text': speech_texts_short,
        'Class': speech_class,
    }
)

In [9]:
# Print the first shortened speech excerpt
print(speech_texts_short[0])

I have heard nothing from the Ambassador about any intention to leave. I have every reason to believe that if he had any plans, he would make them known. I fully covered, in my conference last week, my views toward the Ambassador's service, and I believe when and if he has any plans to leave the State Department service, he will communicate them to me. Q. Mr. President, in your letter to Soviet Premier Khrushchev on Wednesday regarding Cyprus you mentioned basic misunderstandings. Because of this misunderstanding and others, would a personal meeting between you and Khrushchev be desirable at this point? THE PRESIDENT. I think that we are in adequate communication with each other. I would be very happy to see the Chairman when it is indicated that there are any things that we can explore that would be helpful. I know of no reason for such a meeting at this time. Q. Mr. President, in answering an earlier question about the Soviet trade overture, did you mean to imply that trade between t

In [10]:
# Average number of whitespace-separated words per shortened speech
array = [len(excerpt.split()) for excerpt in speech_texts_short]
print(mean(array))

650.5992509363296


## CNN News - Data
Data Source: https://huggingface.co/datasets/cnn_dailymail/viewer/3.0.0/train

In [13]:
# Load the "cnn_dailymail" dataset (configuration "3.0.0") through Hugging Face `datasets`;
# the result is indexed by split name in the following cell (news_dataset['train'])
news_dataset = load_dataset("cnn_dailymail", "3.0.0")

Found cached dataset cnn_dailymail (/Users/bnnlukas/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
# Get the first 1100 articles of the training split
dataset_news = news_dataset['train']
# Slice the rows first and pick the column afterwards: dataset_news['article'][:1100]
# would first materialise the 'article' column of the whole split and only then cut it.
news_texts = dataset_news[:1100]['article']

In [15]:
# Average number of whitespace-separated words per news article
array = [len(article.split()) for article in news_texts]
print(mean(array))

594.7454545454545


In [16]:
# Keep only the articles with at least 250 whitespace-separated words
news_texts_long = [article for article in news_texts if len(article.split()) >= 250]

# Number of remaining articles, followed by one example article
print(len(news_texts_long))
print(news_texts_long[5])

1002
BAGHDAD, Iraq (CNN) -- Dressed in a Superman shirt, 5-year-old Youssif held his sister's hand Friday, seemingly unaware that millions of people across the world have been touched by his story. Nearby, his parents talked about the new future and hope they have for their boy -- and the potential for recovery from his severe burns. Youssif holds his sister's hand Friday. He's wearing a facial mask often used to help burn victims. It's the best birthday present the Iraqi family could ever have imagined for their boy: Youssif turns 6 next Friday. "I was so happy I didn't know what to do with myself," his mother, Zainab, told CNN, a broad smile across her face. "I didn't think the reaction would be this big." His father said he was on the roof of his house when CNN called him with the news about the outpouring of support for his son. "We just want to thank everyone who has come forward," he said. "We knew there was kindness out there." Like his wife, he couldn't stop smiling. He talked 

In [14]:
# Cut out the leading sequence up to the first '--' (e.g. 'BAGHDAD, Iraq (CNN) --')
# NOTE(review): the first '--' anywhere in the article is used, so an article without
# such a leading sequence loses everything before its first '--' - confirm this is intended.
news_texts_modified = []
for article in news_texts_long:
    _before, separator, remainder = article.partition('--')
    # partition() returns an empty separator when '--' does not occur at all;
    # in that case the article is kept unchanged
    news_texts_modified.append(remainder if separator else article)

In [15]:
# Print one article after the leading sequence has been cut off
print(news_texts_modified[5])

 Dressed in a Superman shirt, 5-year-old Youssif held his sister's hand Friday, seemingly unaware that millions of people across the world have been touched by his story. Nearby, his parents talked about the new future and hope they have for their boy -- and the potential for recovery from his severe burns. Youssif holds his sister's hand Friday. He's wearing a facial mask often used to help burn victims. It's the best birthday present the Iraqi family could ever have imagined for their boy: Youssif turns 6 next Friday. "I was so happy I didn't know what to do with myself," his mother, Zainab, told CNN, a broad smile across her face. "I didn't think the reaction would be this big." His father said he was on the roof of his house when CNN called him with the news about the outpouring of support for his son. "We just want to thank everyone who has come forward," he said. "We knew there was kindness out there." Like his wife, he couldn't stop smiling. He talked about how he tried in vain 

In [16]:
# One 'News' label per article
news_class = ['News' for _ in news_texts_modified]

# Dataframe of the news texts with their class label
df_news = pd.DataFrame(
    {
        'Text': news_texts_modified,
        'Class': news_class,
    }
)

In [17]:
# Average number of whitespace-separated words per article after cutting the leading sequence
array = [len(article.split()) for article in news_texts_modified]
print(mean(array))

628.3033932135728


## Jurisdictions - Data
Data Source: https://zenodo.org/record/7151679

In [10]:
# Folder with the jurisdiction text files.
# '../data' is the data root the other loading cells of this notebook use
# ('../data/Speeches', '../data/Literature', '../data/Blogs').
data_path_jurisdictions = '../data/Juristictions/'

jurisdictions_texts = []
# sorted(): os.listdir returns the entries in arbitrary order, which would make the
# order of the texts (and every later random draw per text) differ between machines
for file_name in sorted(os.listdir(data_path_jurisdictions)):
    file_path = os.path.join(data_path_jurisdictions, file_name)
    # Sub-directories cannot be opened as text files, skip them
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='latin-1') as file:
        text = file.read()
    # Only texts with more than 3000 whitespace-separated words are kept
    if len(text.split()) > 3000:
        jurisdictions_texts.append(text)
print(len(jurisdictions_texts))

780


In [11]:
# Average number of whitespace-separated words per full jurisdiction text
array = [len(document.split()) for document in jurisdictions_texts]
print(mean(array))

14464.241025641026


In [19]:
# Cut every jurisdiction text down to a random excerpt of roughly 500-800 words made of
# whole sentences. The excerpt starts at least 1000 words into the text and the offset is
# drawn so that about 1000 words are left untouched at the end.
# A dedicated, seeded generator makes a re-run of this cell produce the same excerpts.
jurisdictions_rng = random.Random(42)
jurisdictions_texts_short = []

for text in jurisdictions_texts:
    target_word_count = jurisdictions_rng.randint(500, 800)
    sentences = nltk.sent_tokenize(text)
    # Word count of every sentence, computed once and reused by the selection loop below
    sentence_length = [len(sentence.split()) for sentence in sentences]

    # Upper bound of the random offset, based on the exact word total.
    # len(sentences) * int(mean(...)) truncates the mean and under-estimates the text
    # length; for texts just above the 3000-word filter the bound could drop below 1000,
    # and random.randint(1000, max_skip) then raises ValueError (empty range).
    # max(1000, ...) keeps the range valid in every case.
    max_skip = max(1000, sum(sentence_length) - target_word_count - 1000)
    words_to_skip = jurisdictions_rng.randint(1000, max_skip)

    total_word_count = 0
    selected_sentences = []
    for sentence, n_words in zip(sentences, sentence_length):
        # Phase 1: skip whole sentences until the random offset is reached
        if total_word_count + n_words <= words_to_skip:
            total_word_count += n_words
            continue
        # Phase 2: stop as soon as the next sentence would exceed the word budget
        if total_word_count + n_words > target_word_count + words_to_skip:
            break
        selected_sentences.append(sentence)
        total_word_count += n_words
    jurisdictions_texts_short.append(' '.join(selected_sentences))

In [20]:
# One 'Jurisdiction' label per shortened text
jurisdictions_class = ['Jurisdiction' for _ in jurisdictions_texts_short]

# Dataframe of the jurisdiction excerpts with their class label
df_jurisdictions = pd.DataFrame(
    {
        'Text': jurisdictions_texts_short,
        'Class': jurisdictions_class,
    }
)

# Word count of every excerpt, then their average
array_jurisdictions = [len(excerpt.split()) for excerpt in jurisdictions_texts_short]
print(mean(array_jurisdictions))

# Show the first rows of the dataframe
df_jurisdictions.head()

650.4089743589743


Unnamed: 0,Text,Class
0,The mere fact that the defendant is challengin...,Jurisdiction
1,(3) Any rule of law the effect of which is tha...,Jurisdiction
2,I was further satisfied that it would be in th...,Jurisdiction
3,It merely means that Parliament was not willin...,Jurisdiction
4,"If, in its case law since the Zambrano decisio...",Jurisdiction


## Literature - Data
Data Source: https://github.com/pgcorpus/gutenberg

In [3]:
# Folder with the literature text files
data_path_literature = '../data/Literature/'
literature_texts = []

# sorted(): os.listdir returns the entries in arbitrary order, which would make the
# order of the texts (and every later random draw per text) differ between machines
for file_name in sorted(os.listdir(data_path_literature)):
    file_path = os.path.join(data_path_literature, file_name)
    # Sub-directories cannot be opened as text files, skip them
    if not os.path.isfile(file_path):
        continue
    # Undecodable bytes are replaced instead of aborting the whole run
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        text = file.read()

    # Only texts with more than 30000 whitespace-separated words are processed
    if len(text.split()) <= 30000:
        continue

    # Remove [...] and {...} spans; '.' does not match newlines here, so only
    # spans that open and close on the same line are removed
    cleaned_text = re.sub(r'\[.*?\]', '', text)
    cleaned_text = re.sub(r'\{.*?\}', '', cleaned_text)
    cleaned_text = cleaned_text.replace('\n', ' ')
    cleaned_text = cleaned_text.replace('\\', '')
    # Replace all multispaces with singlespaces to get texts that are more clean
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Keep only texts that contain the marker 'Language: English'
    if 'Language: English' in cleaned_text:
        literature_texts.append(cleaned_text)
print(len(literature_texts))

952


In [4]:
# Average number of whitespace-separated words per full literature text
array = [len(book.split()) for book in literature_texts]
print(mean(array))

105257.3193277311


In [5]:
# Cut every literature text down to a random excerpt of roughly 500-800 words made of
# whole sentences. The excerpt starts at least 5000 words into the text and the offset is
# drawn so that about 20000 words are left untouched at the end
# (presumably to stay clear of front matter and trailing boilerplate - TODO confirm).
# A dedicated, seeded generator makes a re-run of this cell produce the same excerpts.
literature_rng = random.Random(42)
literature_texts_short = []

for text in literature_texts:
    target_word_count = literature_rng.randint(500, 800)
    sentences = nltk.sent_tokenize(text)
    # Word count of every sentence, computed once and reused by the selection loop below
    sentence_length = [len(sentence.split()) for sentence in sentences]

    # Upper bound of the random offset, based on the exact word total.
    # len(sentences) * int(mean(...)) truncates the mean and under-estimates the text
    # length; for texts just above the 30000-word filter the bound could drop below 5000,
    # and random.randint(5000, max_skip) then raises ValueError (empty range).
    # max(5000, ...) keeps the range valid in every case.
    max_skip = max(5000, sum(sentence_length) - target_word_count - 20000)
    words_to_skip = literature_rng.randint(5000, max_skip)

    total_word_count = 0
    selected_sentences = []
    for sentence, n_words in zip(sentences, sentence_length):
        # Phase 1: skip whole sentences until the random offset is reached
        if total_word_count + n_words <= words_to_skip:
            total_word_count += n_words
            continue
        # Phase 2: stop as soon as the next sentence would exceed the word budget
        if total_word_count + n_words > target_word_count + words_to_skip:
            break
        selected_sentences.append(sentence)
        total_word_count += n_words
    literature_texts_short.append(' '.join(selected_sentences))

KeyboardInterrupt: 

In [23]:
# Average number of whitespace-separated words per shortened literature text
array = [len(excerpt.split()) for excerpt in literature_texts_short]
print(mean(array))

650.4012605042017


In [24]:
# One 'Literature' label per shortened text
literature_class = ['Literature' for _ in literature_texts_short]

# Dataframe of the literature excerpts with their class label
df_literature = pd.DataFrame(
    {
        'Text': literature_texts_short,
        'Class': literature_class,
    }
)

# Show the excerpt stored in the row labelled 10
df_literature.loc[10, 'Text']

'He sat up, yawned, sneezed, shook himself, and began to rake among the burning embers of my fire with his naked hand. Presently he found the white stone, which was now red-hot—at any rate it glowed as though it were—and after examining it for a moment finally popped it into his mouth! Then he hunted in the other fire for the black stone, which he treated in a similar fashion. The next thing I remember was that the fires, which had died away almost to nothing, were burning very brightly again, I suppose because someone had put fuel on them, and Zikali was speaking. “Come here, O Macumazana and O Son of Matiwane,” he said, “and I will repeat to you what your spirits have been telling me.” We drew near into the light of the fires, which for some reason or other was extremely vivid. Then he spat the white stone from his mouth into his big hand, and I saw that now it was covered with lines and patches like a bird’s egg. “You cannot read the signs?” he said, holding it towards me; and when 

## Blogs - Data
Data Source: https://www.kaggle.com/datasets/rtatman/blog-authorship-corpus

In [18]:
# Read the blog dataset from its CSV file; the following cell reads the posts
# from the 'text' column of this dataframe
df_blo = pd.read_csv('../data/Blogs/blogtext.csv')

In [19]:
# Collect the first 1000 blog posts that have between 1000 and 1500 words (inclusive)
n = 0
large_blog_texts = []
# Word count of every collected post
array = []
for text in df_blo['text']:
    # Stop scanning once 1000 posts are collected; without the break every remaining
    # post would still be split into words although nothing more can be added
    if n >= 1000:
        break
    num_words = len(text.split())
    if 1000 <= num_words <= 1500:
        n += 1
        large_blog_texts.append(text)
        array.append(num_words)
# Print the number of collected posts and their average number of words
print(n)
print(mean(array))

1000
1184.917


In [20]:
# Aim for around 650 words in each blog text
target_word_count = 650

# Skip the first 200 words to just get clear text and not the title, the authors, the copyright, etc.
words_to_skip = 200
blog_texts = []

for text in large_blog_texts:
    total_word_count = 0
    selected_sentences = []
    for sentence in nltk.sent_tokenize(text):
        running_total = total_word_count + len(sentence.split())
        if running_total <= words_to_skip:
            # Still inside the part that is skipped
            total_word_count = running_total
        elif running_total <= target_word_count + words_to_skip:
            # Inside the word budget: keep the whole sentence
            selected_sentences.append(sentence)
            total_word_count = running_total
        else:
            # The next sentence would exceed the budget
            break
    excerpt = ' '.join(selected_sentences)
    # Excerpts with fewer than 250 words are not added
    if len(excerpt.split()) >= 250:
        blog_texts.append(excerpt)

In [21]:
# Average number of whitespace-separated words per blog excerpt
array = [len(excerpt.split()) for excerpt in blog_texts]
print(mean(array))

649.6973947895791


In [28]:
# One 'Blog' label per blog excerpt
blog_class = ['Blog' for _ in blog_texts]

# Dataframe of the blog excerpts with their class label
df_blogs = pd.DataFrame(
    {
        'Text': blog_texts,
        'Class': blog_class,
    }
)

# Show the first rows of the dataframe
df_blogs.head()

Unnamed: 0,Text,Class
0,"i saw one (two, actually, but one appeared to ...",Blog
1,I can't help it HIM (4:38:07 PM): guess you do...,Blog
2,Italian-Australian immigrants always harken ba...,Blog
3,"Your whole family is a bit embarrassed, but yo...",Blog
4,"We each buy nothing, coming back to the realit...",Blog


# Result Dataframe

In [29]:
# Stack the per-class dataframes into one dataframe; ignore_index renumbers the rows from 0
class_frames = [df_speeches, df_news, df_jurisdictions, df_literature, df_blogs]
result_df = pd.concat(class_frames, ignore_index=True)

# Show the first rows of the resulting dataframe
result_df.head()

Unnamed: 0,Text,Class
0,I have heard nothing from the Ambassador about...,Political speech
1,I think it is in the public interest to procee...,Political speech
2,The A-11 aircraft now at Edwards Air force Bas...,Political speech
3,It is one of the most comprehensive bills in t...,Political speech
4,"So long as there remains a man without a job, ...",Political speech


In [30]:
# Print the number of rows of the combined dataframe
print(len(result_df))

4533


In [31]:
# Remove every row whose text has fewer than 230 whitespace-separated words.
# A single boolean mask replaces the iterrows()/drop() loop, which built a new copy of
# the dataframe for every dropped row. The remaining rows keep their index labels.
word_counts = pd.Series(
    [len(text.split()) for text in result_df['Text']],
    index=result_df.index,
    dtype='int64',
)
result_df = result_df[word_counts >= 230]

In [32]:
# Average number of whitespace-separated words per text in the resulting dataframe
# (`mean` comes from the import cell at the top of the notebook)
array = [len(text.split()) for text in result_df['Text']]
print(mean(array))

646.0569787985866


In [33]:
# Export the resulting dataframe to a csv-File in the 'data/Result/' folder
# to_csv does not create missing folders, so make sure the target folder exists first
os.makedirs('../data/Result', exist_ok=True)
result_df.to_csv('../data/Result/dataset.csv', index=False)