# Download data

## Imports and utils

In [1]:
import json
import os
import time
from functools import lru_cache
from pathlib import Path
import re

import requests
from dotenv import load_dotenv
import openai
import bs4
from fastprogress import fastprogress

load_dotenv()  # take environment variables from .env.

openai.api_key = os.environ["OPENAI_KEY"]

def ask_chatgpt_with_pythonic_output(query, model="gpt-4o", sleep_after=0, retries=0, expected_type=None):
    """Ask ChatGPT a question and parse its answer as a Python object.

    Args:
        query: User prompt sent to the model.
        model: Name of the OpenAI chat model to query.
        sleep_after: Seconds to sleep after a request, before retrying or
            returning (crude rate limiting).
        retries: Number of additional attempts made when the answer cannot be
            evaluated or has the wrong type.
        expected_type: Optional type (or tuple of types) that the evaluated
            answer must be an instance of.

    Returns:
        The Python object obtained by evaluating the model's answer.

    Raises:
        ValueError: If the last attempt still did not yield a valid answer.
    """
    system_prompt = (
        "Each of your answers should all be valid a python object "
        "such as a list, dictionary, etc. "
        "Answer with no introduction and no markdown formatting"
    )
    for attempt in range(retries + 1):
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": query}
            ]
        )
        raw_answer = completion['choices'][0]['message']['content']
        try:
            # SECURITY: eval() executes whatever text the model returned.
            # Consider ast.literal_eval if answers are always plain literals.
            parsed = eval(raw_answer)
            # Explicit check instead of `assert`, which is stripped under -O.
            if expected_type is not None and not isinstance(parsed, expected_type):
                raise TypeError(f"expected {expected_type}, got {type(parsed)}")
        except Exception as err:
            if attempt < retries:
                print ("ChatGPT didn't produce valid python, trying again.")
                time.sleep(sleep_after)
                # Actually retry: without this the unparsed text was returned.
                continue
            raise ValueError(f"ChatGPT answer was not valid python: {raw_answer}") from err

        time.sleep(sleep_after)
        return parsed

@lru_cache
def download_page(url, filepath=None, sleep_after=0, timeout=30):
    """Download a web page and optionally save it to disk.

    Results are memoized by ``lru_cache`` on the argument values, so a repeated
    call with identical arguments returns the cached result without
    downloading or writing again. Failed downloads return None and that None
    is cached as well.

    Args:
        url: Address of the page to fetch.
        filepath: Optional path where the HTML is written (as UTF-8).
        sleep_after: Seconds to sleep after a successful request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page HTML as a string, or None if the request failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return None

    html = response.text
    time.sleep(sleep_after)
    if filepath is not None:
        # Explicit encoding: the locale default cannot represent every
        # character found in web pages on all platforms.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(html)
    return html

def chunk_list(mylist, chunk_size, overlap=0):
    """Cut ``mylist`` into slices that start every ``chunk_size`` items.

    Each slice holds up to ``chunk_size + overlap`` items, so with a non-zero
    ``overlap`` the tail of one slice repeats the head of the next one.
    """
    window = chunk_size + overlap
    chunks = []
    for start in range(0, len(mylist), chunk_size):
        chunks.append(mylist[start:start + window])
    return chunks
        
def save_to_json(data, filepath):
    """Serialize ``data`` as indented JSON into the file at ``filepath``."""
    # The context manager guarantees the file is flushed and closed.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

def load_from_json(filepath):
    """Return the Python object stored as JSON in the file at ``filepath``."""
    # The context manager closes the file instead of leaking the handle.
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)

# Root directory for every file produced by this notebook; created on first run.
DATA_PATH = Path("data")
DATA_PATH.mkdir(exist_ok=True)

## Generate the composer list

In [4]:
# Ask the model for the initial list of composer names, then print a summary.
composers_list = ask_chatgpt_with_pythonic_output(
    query="""Give me a list of the full names of the most famous classical composers.
    Only include composers born after 1500 and dead before 1976.
    Make sure the list has more than 100 composers.
    """,
    model="gpt-4o",
)
print(f"{len(composers_list)} composers: {','.join(composers_list)}")

105 composers: Johann Sebastian Bach,Ludwig van Beethoven,Wolfgang Amadeus Mozart,Franz Schubert,Johannes Brahms,Pyotr Ilyich Tchaikovsky,Richard Wagner,Franz Liszt,Giuseppe Verdi,Antonio Vivaldi,Gioachino Rossini,Frédéric Chopin,Robert Schumann,Gustav Mahler,Felix Mendelssohn,Antonín Dvořák,Camille Saint-Saëns,Georges Bizet,Hector Berlioz,Jean Sibelius,Edvard Grieg,Claude Debussy,Sergei Rachmaninoff,Maurice Ravel,Alexander Scriabin,Igor Stravinsky,Dmitri Shostakovich,Modest Mussorgsky,Nikolai Rimsky-Korsakov,Mikhail Glinka,Carl Maria von Weber,Gaetano Donizetti,Vincenzo Bellini,Gustav Holst,Ralph Vaughan Williams,Ferruccio Busoni,Edward Elgar,Arthur Sullivan,Charles Gounod,Leoš Janáček,Franz Lehar,Paul Dukas,Ottorino Respighi,Alexander Borodin,Manuel de Falla,Heitor Villa-Lobos,Aaron Copland,Samuel Barber,Benjamin Britten,Béla Bartók,Zoltán Kodály,Arnold Schoenberg,Alban Berg,Anton Webern,George Gershwin,Scott Joplin,Erik Satie,César Franck,Jules Massenet,Jacques Offenbach,Jean-Philip

In [8]:
# Persist the composer names; later cells reload them from this file.
save_to_json(composers_list, DATA_PATH / "composers_list.json")

## Add a bit of structure/metadata to the list

In [10]:
def get_composer_details(composers):
    """Ask the model for name parts and birth/death years of ``composers``.

    ``composers`` is an iterable of name strings; they are comma-joined and
    appended to the instructions. Returns whatever
    ``ask_chatgpt_with_pythonic_output`` yields for that query.
    """
    instructions = """
    For the following composers, add their birth and death years.
    Return a list where elements have the following schema:
    {
      full_name: str,
      first_names: str,
      last_name: str,
      birth_year: int,
      death_year: int
    }
    
    Composers:
    """
    names = ",".join(composers)
    return ask_chatgpt_with_pythonic_output(instructions + names)

# Enrich the composer names with metadata, querying the model 10 names at a time.
composers_list = load_from_json(DATA_PATH / "composers_list.json")
composers_with_metadata = [
    data
    # `tqdm` is never imported in this notebook; use the fastprogress bar instead.
    for composer_chunk in fastprogress.progress_bar(chunk_list(composers_list, 10))
    for data in get_composer_details(composer_chunk)
]
save_to_json(composers_with_metadata, DATA_PATH / "composer_list_with_metadata.json")
len(composers_with_metadata)

105

## Get the Wikipedia page URLs of all composers

In [11]:
import requests

def get_wikipedia_url(composer_name, sleep_after=2):
    """Look up the English Wikipedia page URL for a composer.

    Runs a search through the MediaWiki API and builds the page URL from the
    title of the first search result.

    Args:
        composer_name: Text to search for.
        sleep_after: Seconds to sleep after the request.

    Returns:
        The page URL as a string, or None when the search has no result.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an HTTP error status.
    """
    search_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": composer_name,
        "format": "json"
    }

    response = requests.get(search_url, params=params, timeout=30)
    # Fail with a clear HTTP error rather than an opaque JSON decoding error.
    response.raise_for_status()
    data = response.json()
    time.sleep(sleep_after)

    # Error payloads carry no "query" key; treat them like an empty result.
    results = data.get('query', {}).get('search')
    if not results:
        print (f"No page found for {composer_name}")
        return None

    title = results[0]['title']
    return f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
    

# Resolve one Wikipedia URL per composer and store the name -> URL mapping.
composers_list = load_from_json(DATA_PATH / "composers_list.json")
composer_pages_urls = {
    composer_name: get_wikipedia_url(composer_name)
    # `tqdm` is never imported in this notebook; use the fastprogress bar instead.
    for composer_name in fastprogress.progress_bar(composers_list)
}
save_to_json(composer_pages_urls, DATA_PATH / "composers_wikipedia_urls.json")

100%|█████████████████████████████████████████| 105/105 [04:22<00:00,  2.50s/it]


## Download the composer wikipedia pages

In [16]:
# Download each composer's Wikipedia page, skipping pages already on disk.
composer_pages_urls = load_from_json(DATA_PATH / "composers_wikipedia_urls.json")
pages_dir = DATA_PATH / "composers_wikipedia_html"
pages_dir.mkdir(exist_ok=True)
# `tqdm` is never imported in this notebook; use the fastprogress bar instead.
for composer_name, wikipedia_page in fastprogress.progress_bar(list(composer_pages_urls.items())):
    if wikipedia_page is None:
        # get_wikipedia_url stores None when the search found nothing.
        continue
    target_path = pages_dir / (composer_name + ".html")
    if not target_path.is_file():
        download_page(wikipedia_page, target_path, sleep_after=2)

100%|██████████████████████████████████████| 105/105 [00:00<00:00, 89114.11it/s]


## Summarize the composer wikipedia pages

In this step we split the pages into chunks of 2000 words, and we enforce a 15s pause after processing each chunk. As a result, this takes hours to complete.

In [2]:
# Instructions prepended to each biography chunk sent to the model.
prompt_template = """
From the biography below, return a list of the major events in the composer's life.
Do not include actions by other people after the composer's death.
The answer will be of the form [event_1, event_2, ...]
where each event has the following schema
{"event": str, "summary": str, "year": int, "city": str, "country": str}. 
The year should always be a single integer, and only an integer.

The summary should be a few sentences describing the event and context.
The summary should be very informative but also funny.
Avoid dark humor and don't joke about tragedies.
Don't mention the year. Stick to the events in the biography, do not invent.

"""


def get_events(text):
    """Query the model for the life events described in ``text``.

    The request uses the 'gpt-4o-mini' model with one retry, requires a list
    as the answer, and sleeps 15 seconds after the request.
    """
    request_options = {
        "sleep_after": 15,
        "model": 'gpt-4o-mini',
        "retries": 1,
        "expected_type": list,
    }
    return ask_chatgpt_with_pythonic_output(prompt_template + text, **request_options)

# Extract life events from every downloaded composer page, one JSON file per composer.
pages_dir = DATA_PATH / "composers_wikipedia_html"
events_dir = DATA_PATH / "composer_events"
events_dir.mkdir(exist_ok=True)
master_progress_bar = fastprogress.master_bar(list(pages_dir.glob("*.html")))
for page_path in master_progress_bar:
    # `stem` only drops the ".html" suffix; splitting on the first "." would
    # truncate names that contain a period (e.g. initials).
    composer_name = page_path.stem
    target_file = events_dir / (composer_name + ".json")
    if target_file.is_file():
        # Already processed on a previous run.
        time.sleep(0.1)
        continue
    html = page_path.read_text(encoding="utf-8")
    # Explicit parser: same result on every machine, same parser as extract_events_text.
    soup = bs4.BeautifulSoup(html, "html.parser")
    biography = soup.select("#mw-content-text")[0].get_text()
    words = biography.split(" ")
    # 2000-word chunks, each extended by 20 words shared with the next chunk.
    word_chunks = chunk_list(words, 2000, overlap=20)
    events = [
        event
        for chunk in fastprogress.progress_bar(word_chunks, parent=master_progress_bar)
        for event in get_events(" ".join(chunk))
    ]
    save_to_json(events, target_file)


ChatGPT didn't produce valid python, trying again.
ChatGPT didn't produce valid python, trying again.
ChatGPT didn't produce valid python, trying again.
ChatGPT didn't produce valid python, trying again.
ChatGPT didn't produce valid python, trying again.
ChatGPT didn't produce valid python, trying again.


In [None]:
# Instructions prepended to the event list sent to the model for deduplication.
prompt_template = """
Given the list of events below,
detect entries which describe the exact same event (same place, same year, same story),
and return a deduplicated copy of the list with redundancies removed.

"""


def deduplicate_events(events_list):
    """Ask the model for a copy of ``events_list`` without duplicate events.

    The list is sent as its ``str()`` representation to the 'gpt-4o-mini'
    model, with one retry, and the answer must be a list.
    """
    full_prompt = prompt_template + str(events_list)
    request_options = {
        "model": 'gpt-4o-mini',
        "retries": 1,
        "expected_type": list,
    }
    return ask_chatgpt_with_pythonic_output(full_prompt, **request_options)

# Deduplicate the extracted events of every composer, one JSON file per composer.
events_dir = DATA_PATH / "composer_events"
deduplicated_events_dir = DATA_PATH / "deduplicated_events"
deduplicated_events_dir.mkdir(exist_ok=True)
master_progress_bar = fastprogress.progress_bar(list(events_dir.glob("*.json")))
for events_path in master_progress_bar:
    # `stem` only drops the ".json" suffix; splitting on the first "." would
    # truncate names that contain a period (e.g. initials).
    composer_name = events_path.stem
    target_file = deduplicated_events_dir / (composer_name + ".json")
    if target_file.is_file():
        # Already processed on a previous run.
        time.sleep(0.1)
        continue
    events_list = load_from_json(events_path)
    deduplicated_events = deduplicate_events(events_list)
    save_to_json(deduplicated_events, target_file)

## Download the wikipedia pages for each year

In [None]:
# Download the Wikipedia page of every year from 1500 to 1999, skipping pages already on disk.
years_wikipedia_html_dir = DATA_PATH / "years_wikipedia_html"
# The original checked an undefined name (`year_events_pages`) here.
years_wikipedia_html_dir.mkdir(exist_ok=True)

# `tqdm` is never imported in this notebook; use the fastprogress bar instead.
for year in fastprogress.progress_bar(range(1500, 2000)):
    target_file = years_wikipedia_html_dir / f"{year}.html"
    if target_file.is_file():
        continue
    wikipedia_url = f"https://en.wikipedia.org/wiki/{year}"
    download_page(wikipedia_url, target_file, sleep_after=2)

## Get compositions from IMSLP

In [None]:
# Base address of the IMSLP site; also used to build absolute work URLs.
IMSLP_URL = "https://imslp.org"


def detect_imslp_link(html):
    """Return the first link in ``html`` that targets an IMSLP category page.

    Args:
        html: HTML source to scan.

    Returns:
        The ``href`` of the first ``<a>`` whose target starts with
        ``IMSLP_URL + "/wiki/Category"``, or None when there is no such link.
    """
    # Explicit parser: same result on every machine, same parser as extract_events_text.
    soup = bs4.BeautifulSoup(html, "html.parser")
    category_prefix = IMSLP_URL + "/wiki/Category"
    for link in soup.find_all("a", href=True):
        href = link.attrs["href"]
        if href.startswith(category_prefix):
            return href
    return None

def detect_year(txt):
    """Return the first plausible year mentioned in ``txt``.

    A candidate is a standalone run of exactly four digits; the first one whose
    value lies between 1000 and 3000 (inclusive) is returned as an int.
    Returns None when there is no such number.
    """
    candidates = (int(match.group()) for match in re.finditer(r'\b\d{4}\b', txt))
    return next((year for year in candidates if 1000 <= year <= 3000), None)

def get_publication_year(work_url):
    """Extract a publication or composition year from an IMSLP work page.

    Downloads the page, finds the first table row whose text mentions
    "First Publication" or "Composition Year" and that has a ``<td>`` cell,
    then looks for a year in that row's first cell.

    Args:
        work_url: Address of the work page.

    Returns:
        The detected year as an int, or None when the page could not be
        downloaded, no suitable row exists, or the cell holds no year.
    """
    work_html = download_page(work_url, sleep_after=0)
    if work_html is None:
        # download_page returns None when the request failed.
        return None
    soup = bs4.BeautifulSoup(work_html, "html.parser")
    indicators = ("First Publication", "Composition Year")
    for tr in soup.select("tr"):
        row_text = tr.get_text()
        if not any(indicator in row_text for indicator in indicators):
            continue
        cells = tr.select("td")
        if not cells:
            # Header-only row: nothing to read, try the next matching row.
            continue
        return detect_year(cells[0].get_text())
    return None
    

# Build the list of compositions of every composer from their IMSLP category page.
pages_dir = DATA_PATH / "composers_wikipedia_html"
compositions_dir = DATA_PATH / "compositions"
compositions_dir.mkdir(exist_ok=True)
# Only the `fastprogress` module is imported in this notebook, so the
# progress-bar helpers must be reached through it.
progress_master_bar = fastprogress.master_bar(list(pages_dir.glob("*.html")))
for page_path in progress_master_bar:
    # `stem` only drops the ".html" suffix; splitting on the first "." would
    # truncate names that contain a period (e.g. initials).
    composer_name = page_path.stem
    target_file = compositions_dir / (composer_name + ".json")
    if target_file.is_file():
        # Already processed on a previous run.
        time.sleep(0.1)
        continue
    wikipedia_html = page_path.read_text(encoding="utf-8")
    imslp_url = detect_imslp_link(wikipedia_html)
    if imslp_url is None:
        continue

    imslp_composer = imslp_url.split("Category:")[1].replace("_", " ")
    # Form of the composer name as it appears inside IMSLP work links.
    imslp_composer_slug = imslp_composer.replace(" ", "_")

    imslp_html = download_page(imslp_url)
    if imslp_html is None:
        # download_page returns None when the request failed; no file is
        # written, so the composer is retried on the next run.
        continue
    soup = bs4.BeautifulSoup(imslp_html, "html.parser")
    compositions = []
    links_to_composer_works = [
        a for section in soup.select("#mw-pages")
        for a in section.find_all("a", class_="categorypagelink")
        if imslp_composer_slug in a.attrs.get("href", "")
    ]
    for link in fastprogress.progress_bar(links_to_composer_works, parent=progress_master_bar):
        # Keep the part of the link text that precedes the first "(".
        piece_title = link.text.split("(")[0].strip()
        work_url = IMSLP_URL + link.attrs["href"].replace(" ", "_")
        year = get_publication_year(work_url)
        compositions.append({
            "imslp_url": work_url,
            "year": year,
            "title": piece_title
        })
    save_to_json(compositions, target_file)


## Compile composer data

In [11]:
# Merge metadata, Wikipedia URL, events and compositions into one file per composer.
composers_list = load_from_json(DATA_PATH / "composer_list_with_metadata.json")
wikipedia_urls = load_from_json(DATA_PATH / "composers_wikipedia_urls.json")

full_composer_data_dir = DATA_PATH / "full_composer_data"
full_composer_data_dir.mkdir(exist_ok=True)

for composer in composers_list:
    full_name = composer["full_name"]
    events_json = DATA_PATH / "deduplicated_events" / f"{full_name}.json"
    # Composers lacking either input file are left out of the compiled data.
    if not events_json.is_file():
        continue
    compositions_json = DATA_PATH / "compositions" / f"{full_name}.json"
    if not compositions_json.is_file():
        continue
    full_composer_data = dict(composer)
    full_composer_data.update(
        wikipedia_url=wikipedia_urls[full_name],
        events=load_from_json(events_json),
        compositions=load_from_json(compositions_json),
    )
    save_to_json(full_composer_data, full_composer_data_dir / f"{full_name}.json")
    # print (composer)

## Summarize the events of all years


In [None]:
# Instructions prepended to the text of a year page sent to the model.
prompt_template = """
From the text below, select the top ~10 major events.
Prefer major technical advances, or major events which
would have made the front page of European newspapers.
Prefer events which could have had an impact on citizens
and in particular music composers.

Return a list of the form [event_1, event_2, ...]
where each event has the following schema
{"event": str, "summary": str, "year": int, "city": str, "country": str}
The year should always be a single integer, and only an integer.
Never use quotation marks inside the summary.

"""


def get_world_events(text):
    """Query the model for the major events described in ``text``.

    The query is attempted up to three times; the exception raised by the
    third failed attempt is propagated to the caller.
    """
    prompt = prompt_template + text
    last_attempt = 2
    for attempt in range(last_attempt + 1):
        try:
            return ask_chatgpt_with_pythonic_output(prompt, sleep_after=1, model='gpt-4o-mini')
        except Exception as err:
            if attempt == last_attempt:
                raise err

def extract_events_text(html):
    """Return the text that follows the "Events" heading of a page.

    Finds the ``<h2>`` whose string is "Events" and walks the siblings that
    come after the heading's parent element, collecting their text. The walk
    stops at the first sibling whose first child is an ``<h2>``. Returns None
    when the page has no such heading.
    """
    soup = bs4.BeautifulSoup(html, 'html.parser')
    heading = soup.find('h2', string="Events")
    if not heading:
        return None

    collected = []
    for node in heading.parent.find_next_siblings():
        first_child = next(iter(node.children), None)
        if first_child is not None and first_child.name == 'h2':
            # A sibling that starts with an h2 opens the next section.
            break
        collected.append(node.get_text())

    return '\n'.join(collected)
                
# Summarize the events of every downloaded year page, one JSON file per year.
years_wikipedia_html_dir = DATA_PATH / "years_wikipedia_html"
year_world_events_dir = DATA_PATH / "year_world_events"
# The original checked an undefined name (`word_events_dir`) here.
year_world_events_dir.mkdir(exist_ok=True)

# `tqdm` is never imported in this notebook; use the fastprogress bar instead.
for page_path in fastprogress.progress_bar(list(years_wikipedia_html_dir.glob("*.html"))):
    target_file = year_world_events_dir / f"{page_path.stem}.json"
    if target_file.is_file():
        continue
    html = page_path.read_text(encoding="utf-8")
    events_text = extract_events_text(html)
    if events_text is None:
        # extract_events_text returns None when the page has no "Events"
        # heading; get_world_events cannot build a prompt from None.
        print(f"No Events section found in {page_path.name}, skipping.")
        continue
    selected_events_list = get_world_events(events_text)
    save_to_json(selected_events_list, target_file)

### Compile world events to a single file

In [None]:
# Gather the per-year event files into one mapping and save it as a single file.
# Years without a file (failed download or no events found) are left out
# instead of aborting the whole compilation with FileNotFoundError.
year_world_events = {
    year: load_from_json(DATA_PATH / "year_world_events" / f"{year}.json")
    for year in range(1500, 2000)
    if (DATA_PATH / "year_world_events" / f"{year}.json").is_file()
}
save_to_json(year_world_events, DATA_PATH / "year_world_events.json")