In [1]:
# !pip install requests

In [2]:
# !pip install beautifulsoup4

In [3]:
import requests
import random
import re

def generate_book_urls(start_id, end_id):
    """Build plain-text download URLs on gutenberg.org for a range of book IDs.

    Both ``start_id`` and ``end_id`` are inclusive. Every URL follows the
    ``/files/<id>/<id>-0.txt`` layout; nothing here checks that the file
    actually exists on the server. An empty list is returned when
    ``start_id`` is greater than ``end_id``.
    """
    urls = []
    for book_id in range(start_id, end_id + 1):
        urls.append(f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt')
    return urls


# Candidate URLs for book IDs 1 through 400 (both ends inclusive).
book_urls = generate_book_urls(start_id=1, end_id=400)

def fetch_text(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a stalled connection would block the notebook
            indefinitely.

    Returns:
        The response body as ``str``, decoded by ``requests``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Turn error statuses (e.g. 404 for a missing book file) into exceptions.
    response.raise_for_status()
    return response.text

def split_into_sentences(text):
    """Split ``text`` into rough sentences after ``.``, ``!`` or ``?``.

    A boundary is any run of whitespace (spaces, tabs, line breaks) that
    directly follows one of the three terminators. Matching all whitespace,
    not just spaces, matters for hard-wrapped text: a sentence that ends at
    the end of a line is followed by a newline rather than a space.

    Line breaks inside a sentence are left untouched, and the terminator stays
    attached to the sentence it ends. This is a heuristic: abbreviations such
    as "Mr. Smith" are split as well.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings; ``[""]`` for empty input.
    """
    # The look-behind keeps the punctuation with the preceding sentence.
    return re.split(r'(?<=[.!?])\s+', text)

def get_random_sentences(sentences, num_sentences=100, min_words=30, max_words=50, rng=None):
    """Pick a random sample of sentences whose word count lies in a range.

    Args:
        sentences: Iterable of sentence strings.
        num_sentences: Maximum number of sentences to return.
        min_words: Smallest accepted word count (inclusive).
        max_words: Largest accepted word count (inclusive).
        rng: Optional ``random.Random`` instance to draw from, e.g.
            ``random.Random(42)`` for a reproducible sample. When omitted,
            the module-level ``random`` generator is used.

    Returns:
        A list of at most ``num_sentences`` distinct picks from the
        qualifying sentences, in random order. If fewer sentences qualify,
        all of them are returned.
    """
    # Words are whitespace-separated tokens, so embedded line breaks are fine.
    valid_sentences = [s for s in sentences if min_words <= len(s.split()) <= max_words]
    # random.sample raises if asked for more items than exist, so cap the size.
    sample_size = min(num_sentences, len(valid_sentences))
    sampler = random if rng is None else rng
    return sampler.sample(valid_sentences, sample_size)


In [4]:
# Stop collecting once this many sentences have been gathered.
TARGET_SENTENCE_COUNT = 20000

all_sentences = []

for url in book_urls:
    print(f"Processing {url}")
    try:
        text = fetch_text(url)
        sentences = split_into_sentences(text)
        # Drop surrounding whitespace and discard fragments that are empty.
        sentences = [s.strip() for s in sentences if s.strip()]
        random_sentences = get_random_sentences(sentences)
    except Exception as e:
        # Best effort: a failing book (e.g. a 404 for a missing file) is
        # reported and skipped so one bad URL does not abort the whole run.
        print(f"Error processing {url}: {e}")
        continue

    all_sentences.extend(random_sentences)
    if len(all_sentences) >= TARGET_SENTENCE_COUNT:
        break

# The last book may overshoot the target; trim to exactly the cap.
all_sentences = all_sentences[:TARGET_SENTENCE_COUNT]

print(f"Total sentences fetched: {len(all_sentences)}")


Processing https://www.gutenberg.org/files/1/1-0.txt
Processing https://www.gutenberg.org/files/2/2-0.txt
Error processing https://www.gutenberg.org/files/2/2-0.txt: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/2/2-0.txt
Processing https://www.gutenberg.org/files/3/3-0.txt
Processing https://www.gutenberg.org/files/4/4-0.txt
Error processing https://www.gutenberg.org/files/4/4-0.txt: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/4/4-0.txt
Processing https://www.gutenberg.org/files/5/5-0.txt
Error processing https://www.gutenberg.org/files/5/5-0.txt: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/5/5-0.txt
Processing https://www.gutenberg.org/files/6/6-0.txt
Error processing https://www.gutenberg.org/files/6/6-0.txt: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/6/6-0.txt
Processing https://www.gutenberg.org/files/7/7-0.txt
Processing https://www.gutenberg.org/files/8/8-0.txt
Error processing htt

In [5]:
# !pip install nltk

In [6]:
import json

# Write the collected sentences to disk as JSON, stored under the "irrelevant" key.
with open('extracted_sentences.json', 'w') as file:
    file.write(json.dumps({"irrelevant": all_sentences}, indent=4))

print(f"Collected {len(all_sentences)} sentences and saved them in 'extracted_sentences.json'.")


Collected 20000 sentences and saved them in 'extracted_sentences.json'.


In [8]:
# Merge this run's sentences into the "irrelevant" list of synthesis.json,
# creating the file (and the key) when they do not exist yet.
# NOTE: every execution of this cell appends again, so re-running it adds
# the current contents of all_sentences to the file a second time.
try:
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open('synthesis.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    # First run: start from an empty document.
    data = {}

# setdefault gives the document its own list, so `data` never aliases all_sentences.
data.setdefault("irrelevant", []).extend(all_sentences)

with open('synthesis.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

print(f"Collected {len(all_sentences)} sentences and added them to 'synthesis.json'.")

Collected 20000 sentences and added them to 'synthesis.json'.
