# Download data

## Imports and utils

In [186]:
import multiprocessing
import json
import time
import re

from gpt_function_decorator import gpt_function
from pathlib import Path
from functools import lru_cache
from fastprogress import progress_bar
from bs4 import BeautifulSoup


def process_files(function, source_folder, target_folder=None, replace_existing=False, num_processes=1):
    """Apply ``function`` to every file found directly inside ``source_folder``.

    Args:
        function: Callable taking a single ``Path`` (one source file). Its
            return value is ignored.
        source_folder: Folder whose entries (``glob("*")``) are processed.
        target_folder: Optional output folder. When given, it is created if
            missing. Unless ``replace_existing`` is true, source files whose
            stem matches the stem of an entry already present in this folder
            are skipped.
        replace_existing: When true, every source file is processed even if a
            same-stem entry already exists in ``target_folder``.
        num_processes: Number of worker processes. A ``multiprocessing.Pool``
            is only used when this is > 1 and more than one file is left.
    """
    source_files = list(Path(source_folder).glob("*"))
    if target_folder is not None:
        target_folder = Path(target_folder)
        # Create the folder in every mode: the original only did so when
        # replace_existing was False, and crashed on Path(None) when no
        # target folder was supplied.
        target_folder.mkdir(parents=True, exist_ok=True)
        if not replace_existing:
            already_processed = {f.stem for f in target_folder.glob("*")}
            source_files = [f for f in source_files if f.stem not in already_processed]
    if len(source_files) > 1 and num_processes > 1:
        with multiprocessing.Pool(processes=num_processes) as pool:
            # imap is lazy: iterating over it is what drives the work (and the bar).
            for _ in progress_bar(pool.imap(function, source_files), total=len(source_files)):
                pass
    else:
        for file in progress_bar(source_files):
            function(file)

@lru_cache
def download_page(url, target_file=None, sleep_after=0, timeout=30):
    """Download ``url`` and return its HTML text, or None if the request failed.

    Args:
        url: Address of the page to fetch.
        target_file: Optional path; when given, the HTML is also written there.
        sleep_after: Seconds to sleep after a successful request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the caller forever.

    Returns:
        The response text, or None when a ``RequestException`` occurred (the
        failure is printed, not raised).

    Note:
        Results are memoized by ``lru_cache`` on the full argument tuple, so
        all arguments must be hashable and a failed download (None) is
        remembered too for the lifetime of the process.
    """
    # Imported locally: the notebook only imports requests in a later cell,
    # while this helper lives in the first "utils" cell.
    import requests

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return None
    html = response.text
    time.sleep(sleep_after)
    if target_file is not None:
        Path(target_file).write_text(html)
    return html

def chunk_list(mylist, chunk_size, overlap=0):
    """Cut ``mylist`` into slices starting every ``chunk_size`` items.

    Each slice holds up to ``chunk_size + overlap`` items, so with a positive
    ``overlap`` a chunk also contains the first items of the following one.
    """
    window = chunk_size + overlap
    chunks = []
    for start in range(0, len(mylist), chunk_size):
        chunks.append(mylist[start:start + window])
    return chunks
        
def save_to_json(data, filepath):
    """Serialize ``data`` as JSON (2-space indent) into ``filepath``.

    The file is opened in a ``with`` block so the handle is flushed and
    closed as soon as the data is written, instead of being left open.
    """
    with open(filepath, "w") as json_file:
        json.dump(data, json_file, indent=2)

def load_from_json(filepath):
    """Read ``filepath`` and return the parsed JSON content.

    The file is opened in a ``with`` block so the handle is closed right
    after parsing, instead of being left open.
    """
    with open(filepath, "r") as json_file:
        return json.load(json_file)

## Generate the composer list

### Ask for a list of composers

In [89]:
# Prompt-only function: the body consists solely of a docstring. Presumably the
# gpt_function decorator sends that docstring (together with the arguments) to
# the model and parses the answer into the annotated return type; confirm
# against the gpt_function_decorator documentation.
# NOTE(review): the wording "between 1600 before 1976" looks like a typo for
# "between 1600 and 1976". It is left untouched here because the docstring is
# part of the runtime behavior.
@gpt_function(gpt_model='gpt-4o', retries=2)
def list_famous_composers(n_composers) -> list:
    """Return a list of the most famous classical composers
    who lived between 1600 before 1976"""

from collections import Counter

# Build the composer list once; the saved file acts as a cache.
target_file = Path("./data/composer_list.json")
if not target_file.is_file():
    # Ask for a list of 100 composers ten times and pool all returned names.
    composers = [
        composer
        for i in progress_bar(range(10))
        for composer in list_famous_composers(n_composers=100)
    ]
    # Count how often each name was returned. The original code referenced an
    # undefined `composer_counts` here and raised NameError.
    composer_counts = Counter(composers)
    # Keep only the names that were returned more than once.
    composers_list = [c for c in composer_counts if composer_counts[c] > 1]
    # Make sure the data folder exists before writing into it.
    target_file.parent.mkdir(parents=True, exist_ok=True)
    save_to_json(composers_list, target_file)

### Make one file per composer

In [75]:
# Write one small JSON stub per composer, named after the composer, so that
# the following steps can work file by file.
composer_list = load_from_json("./data/composer_list.json")
path = Path("data/composer_names")
path.mkdir(exist_ok=True)
for composer in composer_list:
    stub_file = path / f"{composer}.json"
    save_to_json({"name": composer}, stub_file)

### Compute basic metadata on each composer

In [100]:
# Prompt-only function: the body consists solely of a docstring. Presumably the
# gpt_function decorator uses that docstring as the prompt and returns the
# model's answer as a dict; confirm against the gpt_function_decorator docs.
# The docstring is therefore runtime text and is kept verbatim.
@gpt_function(gpt_model='gpt-4o', retries=2)
def composer_metadata(composer) -> dict:
    """Return the following metadata for the given composer:
    {
        full_name: str,
        first_names: str,
        last_name: str,
        birth_year: int,
        death_year: int
    }
    """

# Output folder of this step; the worker below reads it as a global.
target_folder = Path("./data/composer_basic_metadata/")


def compute_basic_metadata(source_file):
    """Fetch the metadata of the composer named in ``source_file`` and save it.

    The result is written into ``target_folder`` under the same file name as
    the source file.
    """
    composer_name = load_from_json(source_file)["name"]
    metadata = composer_metadata(composer=composer_name)
    save_to_json(metadata, target_folder / source_file.name)


process_files(
    compute_basic_metadata,
    source_folder="./data/composer_names",
    target_folder=target_folder,
    num_processes=3,
)

## Summarize composer wiki pages

### Find the composer page URLs via a Wikipedia search

In [108]:
import requests

def get_wikipedia_url_from_search(term, sleep_after=1, timeout=30):
    """Return the URL of the first English Wikipedia search hit for ``term``.

    Args:
        term: Text to search for.
        sleep_after: Seconds to sleep after the request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the caller forever.

    Returns:
        ``https://en.wikipedia.org/wiki/<Title>`` with spaces in the title
        replaced by underscores, or None (after printing a message) when the
        response contains no search result.

    Raises:
        requests.exceptions.RequestException: on network errors, timeouts or
            a non-2xx HTTP status.
    """
    search_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": term,
        "format": "json"
    }

    response = requests.get(search_url, params=params, timeout=timeout)
    # Fail with a clear HTTP error rather than a JSON decoding error later on.
    response.raise_for_status()
    data = response.json()
    time.sleep(sleep_after)

    # An API error payload has no "query" key; treat it like "no result"
    # instead of raising KeyError.
    results = data.get("query", {}).get("search", [])
    if not results:
        print (f"No page found for {term}")
        return None
    title = results[0]["title"]
    return f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
    
# Output folder of this step; the worker below reads it as a global.
target_folder = Path("data/composer_wikipedia_urls/")


def get_composer_wikipedia_url(source_file):
    """Search Wikipedia for the composer named in ``source_file`` and save the URL.

    The result is stored as ``{"url": ...}`` in ``target_folder`` under the
    same file name as the source file.
    """
    composer_name = load_from_json(source_file)["name"]
    found_url = get_wikipedia_url_from_search(term=composer_name)
    save_to_json({"url": found_url}, target_folder / source_file.name)


process_files(
    get_composer_wikipedia_url,
    source_folder="./data/composer_names",
    target_folder=target_folder,
    num_processes=3,
)

### Download the composer wiki pages

In [112]:
# Output folder of this step; the worker below reads it as a global.
target_folder = Path("data/composer_wikipedia_pages/")


def download_composer_wikipedia_page(source_file):
    """Download the page whose URL is stored in ``source_file``.

    The HTML is written into ``target_folder`` using the source file's name
    with an ``.html`` suffix.
    """
    page_url = load_from_json(source_file)["url"]
    html_name = source_file.with_suffix(".html").name
    download_page(page_url, target_file=target_folder / html_name, sleep_after=1)


process_files(
    download_composer_wikipedia_page,
    source_folder="./data/composer_wikipedia_urls/",
    target_folder=target_folder,
    num_processes=3,
)

### Extract sections from composer webpages

In [172]:
def extract_sections_from_wikipedia_page(html_content, section_blacklist=(), section_whitelist=()):
    """Split a Wikipedia page into its ``h2`` sections.

    Wikipedia pages have the following slightly complex schema:

    <div><h2>Section 1 title</h2></div>
    <p>...</p>...
    <div><h2>Section 2 title</h2></div>
    <p>...</p>...

    Args:
        html_content: HTML of the page.
        section_blacklist: Titles to drop. Empty or None disables the filter.
        section_whitelist: When non-empty, only these titles are kept. Empty
            or None disables the filter.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts in page order,
        where ``content`` is the space-joined text of the elements following
        the section header.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    section_data = []

    # Every h2 tag is treated as a section title.
    for heading in soup.find_all('h2'):
        title = heading.get_text(strip=True)
        if section_blacklist and title in section_blacklist:
            continue
        # The original tested `title not in section_blacklist` here, which
        # (combined with the blacklist test above) discarded every section.
        if section_whitelist and title not in section_whitelist:
            continue

        content = []
        # The h2 sits inside a wrapper element, so the section body is made of
        # the wrapper's following siblings.
        for sibling in heading.parent.find_next_siblings():
            if len(sibling.select('h2')):  # Stop where the next h2 section starts
                break
            if sibling.get_text(strip=True):  # Skip elements without any text
                content.append(sibling.get_text())

        section_data.append({"title": title, "content": " ".join(content)})
    return section_data


# Output folder of this step; the worker below reads it as a global.
target_folder = Path("data/composer_wikipedia_sections/")

# Section titles that are dropped when splitting a composer page.
section_title_blacklist = [
    "External links",
    "Further reading",
    "Sources",
    "See also",
    "References",
    "Contents",
    "Notes, references and sources",
    "Notes",
    "Notes and references",
    "Recordings",
    "Compositions",
]


def section_composer_wikipedia_page(source_file):
    """Split the HTML page ``source_file`` into sections and save them as JSON.

    The output goes into ``target_folder`` using the source file's name with
    a ``.json`` suffix.
    """
    page_sections = extract_sections_from_wikipedia_page(
        source_file.read_text(),
        section_blacklist=section_title_blacklist,
    )
    json_name = source_file.with_suffix(".json").name
    save_to_json(page_sections, target_folder / json_name)


process_files(
    section_composer_wikipedia_page,
    source_folder="./data/composer_wikipedia_pages/",
    target_folder=target_folder,
    replace_existing=False,
    num_processes=5,
)

### Summarize the composer wikipedia pages

In [171]:
# Prompt-only function: the body consists solely of a docstring. Presumably the
# gpt_function decorator uses that docstring as the prompt and parses the
# model's answer into a list of dicts; confirm against the
# gpt_function_decorator documentation. The docstring is therefore runtime
# text and is kept verbatim.
@gpt_function(retries=2)
def list_life_events(text: str, exclude_events: list) -> list[dict]:
    """Return a list of major life events in the given text.
    Exclude any event described in the exclude_events list.
    Make the summary as informative as possible but only with information
    from the text.
    Return an empty list if there are no events.
    
    The output should have this schema:
    [{title:str, summary: str, year: int, location: str}...]
    """

# Output folder of this step; the worker below reads it as a global.
target_folder = Path("data/composer_event_summaries/")


def summarize_sections(source_file):
    """Extract life events from each section of a composer page and save them.

    ``source_file`` is a JSON list of ``{"title", "content"}`` sections. Each
    section is sent to ``list_life_events`` together with the events already
    collected from earlier sections, so they can be excluded. The events,
    each tagged with its section title, are written into ``target_folder``
    under the same file name as the source file.
    """
    events = []
    sections = load_from_json(source_file)
    for section in sections:
        title = section["title"]
        # Separate the composer name (file stem) from the section title; the
        # original glued them together ("Alban BergBiography").
        text = f"{source_file.stem} - {title}.\n{section['content']}"
        # .get() keeps the run alive when a returned event lacks a key.
        past_events = [f"{e.get('year')} {e.get('title')}" for e in events]
        section_events = list_life_events(text=text, exclude_events=past_events)
        events += [{"section": title, **e} for e in section_events]
    save_to_json(events, target_folder / source_file.name)


process_files(
    summarize_sections,
    source_folder="./data/composer_wikipedia_sections/",
    target_folder=target_folder,
    num_processes=3
)

## Get compositions

### Download IMSLP pages

In [175]:
IMSLP_URL = "https://imslp.org"


def detect_imslp_link(wikipedia_html):
    """Return the first IMSLP category link found in ``wikipedia_html``.

    Scans the anchors of the page in document order and returns the first
    ``href`` starting with ``https://imslp.org/wiki/Category``, or None when
    there is no such link.
    """
    # Name the parser explicitly, like the other BeautifulSoup calls in this
    # file, so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(wikipedia_html, "html.parser")
    category_prefix = IMSLP_URL + "/wiki/Category"
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith(category_prefix):
            return href
    return None

# Output folder of this step; the worker below reads it as a global.
target_folder = Path("data/composer_imslp_pages")


def download_imslp_page(wikipedia_html_file):
    """Download the IMSLP page linked from a saved Wikipedia page.

    Nothing is downloaded when the Wikipedia page has no IMSLP link. The page
    is stored in ``target_folder`` under the same file name as the source.
    """
    imslp_url = detect_imslp_link(wikipedia_html_file.read_text())
    if imslp_url is not None:
        destination = target_folder / wikipedia_html_file.name
        download_page(imslp_url, destination, sleep_after=1)


process_files(
    download_imslp_page,
    source_folder="./data/composer_wikipedia_pages/",
    target_folder=target_folder,
    num_processes=3,
)

In [181]:
# Load the saved IMSLP page of Alban Berg into `html` (presumably for
# interactive inspection; the rest of this cell is not visible here).
html = Path("data/composer_imslp_pages/Alban Berg.html").read_text()


'Berg, Alban'

In [187]:
IMSLP_URL = "https://imslp.org"


def detect_year(txt):
    """Return the first plausible year found in ``txt``, or None.

    A year is a standalone run of exactly four digits whose value lies
    between 1000 and 3000 (inclusive).
    """
    candidates = (int(match) for match in re.findall(r'\b\d{4}\b', txt))
    return next((value for value in candidates if 1000 <= value <= 3000), None)

def get_publication_year(work_url):
    """Return the publication/composition year shown on an IMSLP work page.

    The page is downloaded, the first table row whose text mentions
    "First Publication" or "Composition Year" is located, and the first year
    found in that row's first ``<td>`` is returned.

    Returns:
        The year as an int, or None when the download failed, no such row
        exists, the row has no ``<td>``, or no year is found in it.
    """
    work_html = download_page(work_url, sleep_after=1)
    if not work_html:
        # download_page returns None when the request failed. The original
        # relied on a bare `except:` around the parser to cover this case.
        return None
    soup = BeautifulSoup(work_html, "html.parser")
    indicators = ("First Publication", "Composition Year")
    for row in soup.select("tr"):
        row_text = row.get_text()
        if any(indicator in row_text for indicator in indicators):
            cells = row.select("td")
            if not cells:
                # Matching row without any data cell: nothing to read.
                return None
            return detect_year(cells[0].get_text())
    return None

def get_list_of_work_links(imslp_html):
    """Return the work links (``<a>`` tags) listed on an IMSLP composer page.

    The composer identifier is read from the page's first ``<h1>`` (with the
    "Category:" prefix removed). Only anchors of class ``categorypagelink``
    located inside ``#mw-pages`` whose ``href`` contains that identifier
    (spaces replaced by underscores) are kept.

    Raises:
        IndexError: if the page has no ``<h1>``.
    """
    soup = BeautifulSoup(imslp_html, "html.parser")
    title = soup.select("h1")[0].get_text().strip()
    imslp_composer = title.replace("Category:", "")
    # Computed once instead of once per link.
    composer_slug = imslp_composer.replace(" ", "_")
    return [
        a for section in soup.select("#mw-pages")
        for a in section.find_all("a", class_="categorypagelink")
        # .get() tolerates anchors without an href attribute.
        if composer_slug in a.attrs.get("href", "")
    ]

def fetch_work_metadata(imslp_work_link):
    """Build the metadata record of one work from its IMSLP link tag.

    The title is the link text up to the first "(", stripped. The URL is the
    site root plus the link's ``href`` with spaces replaced by underscores.
    The year comes from ``get_publication_year``.
    """
    title_part, _, _ = imslp_work_link.text.partition("(")
    piece_title = title_part.strip()
    href = imslp_work_link.attrs["href"]
    work_url = IMSLP_URL + href.replace(" ", "_")
    return {
        "imslp_url": work_url,
        "year": get_publication_year(work_url),
        "title": piece_title,
    }

# Output folder of this step; the worker below reads it as a global.
target_folder = Path("data/composer_works")


def get_composition_list(imslp_html_file):
    """Collect the metadata of every work listed on a saved IMSLP page.

    The resulting list is saved into ``target_folder`` using the source
    file's name with a ``.json`` suffix.
    """
    work_links = get_list_of_work_links(imslp_html_file.read_text())
    works_with_metadata = []
    for link in work_links:
        works_with_metadata.append(fetch_work_metadata(link))
    json_name = imslp_html_file.with_suffix(".json").name
    save_to_json(works_with_metadata, target_folder / json_name)


process_files(
    get_composition_list,
    source_folder="./data/composer_imslp_pages/",
    target_folder=target_folder,
    num_processes=3,
)

## Download the Wikipedia page of each year

In [None]:
# Download the Wikipedia page of every year from 1500 to 1999, skipping the
# years whose page is already on disk.
# NOTE(review): DATA_PATH is not defined in the visible part of this notebook;
# confirm it is set before running this cell.
years_wikipedia_html_dir = DATA_PATH / "years_wikipedia_html"
# The original tested and created the undefined name `year_events_pages`.
years_wikipedia_html_dir.mkdir(parents=True, exist_ok=True)

# progress_bar is the progress helper this notebook imports; tqdm was never
# imported in the visible cells.
for year in progress_bar(range(1500, 2000)):
    target_file = years_wikipedia_html_dir / f"{year}.html"
    if target_file.is_file():
        continue
    wikipedia_url = f"https://en.wikipedia.org/wiki/{year}"
    download_page(wikipedia_url, target_file, sleep_after=2)

## Get compositions from IMSLP

## Compile composer data

In [None]:
# Merge, for each composer, the basic metadata, the Wikipedia URL, the life
# events and the compositions into one JSON file.
# NOTE(review): DATA_PATH is not defined in the visible part of this notebook;
# confirm it is set before running this cell.
composers_list = load_from_json(DATA_PATH / "composer_list_with_metadata.json")
wikipedia_urls = load_from_json(DATA_PATH / "composers_wikipedia_urls.json")

full_composer_data_dir = DATA_PATH / "full_composer_data"
if not full_composer_data_dir.is_dir():
    full_composer_data_dir.mkdir()

# The original looped over ./data/composer_list.json with a variable named
# `composer_metadata_file` while the body read `composer`; iterate over the
# metadata records loaded above instead.
for composer in composers_list:
    full_name = composer["full_name"]
    events_json = DATA_PATH / "deduplicated_events" / f"{full_name}.json"
    if not events_json.is_file():
        # No events available for this composer: skip it.
        continue
    compositions_json = DATA_PATH / "compositions" / f"{full_name}.json"
    if not compositions_json.is_file():
        # No compositions available for this composer: skip it.
        continue
    full_composer_data = {**composer}
    full_composer_data["wikipedia_url"] = wikipedia_urls[full_name]
    full_composer_data["events"] = load_from_json(events_json)
    full_composer_data["compositions"] = load_from_json(compositions_json)
    save_to_json(full_composer_data, full_composer_data_dir / f"{full_name}.json")

In [11]:
# Merge, for each composer, the basic metadata, the Wikipedia URL, the life
# events and the compositions into one JSON file.
composers_list = load_from_json(DATA_PATH / "composer_list_with_metadata.json")
wikipedia_urls = load_from_json(DATA_PATH / "composers_wikipedia_urls.json")

full_composer_data_dir = DATA_PATH / "full_composer_data"
full_composer_data_dir.mkdir(exist_ok=True)

for composer in composers_list:
    full_name = composer["full_name"]
    events_json = DATA_PATH / "deduplicated_events" / f"{full_name}.json"
    compositions_json = DATA_PATH / "compositions" / f"{full_name}.json"
    # Only composers having both an events file and a compositions file are kept.
    if not (events_json.is_file() and compositions_json.is_file()):
        continue
    full_composer_data = {
        **composer,
        "wikipedia_url": wikipedia_urls[full_name],
        "events": load_from_json(events_json),
        "compositions": load_from_json(compositions_json),
    }
    save_to_json(full_composer_data, full_composer_data_dir / f"{full_name}.json")

## Summarize the events of all years


In [None]:
# Instructions placed in front of the page text when asking for world events.
prompt_template = """
From the text below, select the top ~10 major events.
Prefer major technical advances, or major events which
would have made the front page of European newspapers.
Prefer events which could have had an impact on citizens
and in particular music composers.

Return a list of the form [event_1, event_2, ...]
where each event has the following schema
{"event": str, "summary": str, "year": int, "city": str, "country": str}
The year should always be a single integer, and only an integer.
Never use quotation marks inside the summary.

"""


def get_world_events(text):
    """Ask the model for the major events described in ``text``.

    The request is attempted up to three times; the exception of the third
    failed attempt is propagated to the caller.
    """
    prompt = prompt_template + text
    max_attempts = 3
    for attempts_left in range(max_attempts - 1, -1, -1):
        try:
            return ask_chatgpt_with_pythonic_output(prompt, sleep_after=1, model='gpt-4o-mini')
        except Exception:
            if attempts_left == 0:
                raise

def extract_events_text(html):
    """Return the text of the "Events" section of a page, or None.

    The section is located through an ``h2`` whose string is exactly
    "Events". The text of every element following the header's parent is
    collected until an element whose first child is an ``h2`` is reached.
    The collected pieces are joined with newlines.
    """
    soup = BeautifulSoup(html, 'html.parser')
    events_heading = soup.find('h2', string="Events")
    if not events_heading:
        return None

    collected = []
    for element in events_heading.parent.find_next_siblings():
        first_child = next(iter(element.children), None)
        # A wrapper whose first child is an h2 marks the start of the next section.
        if first_child is not None and first_child.name == 'h2':
            break
        collected.append(element.get_text())

    return '\n'.join(collected)
                
# Summarize the events of every downloaded year page, skipping the years that
# already have a result file.
# NOTE(review): DATA_PATH is not defined in the visible part of this notebook;
# confirm it is set before running this cell.
years_wikipedia_html_dir = DATA_PATH / "years_wikipedia_html"
year_world_events_dir = DATA_PATH / "year_world_events"
# The original tested and created the undefined name `word_events_dir`.
year_world_events_dir.mkdir(parents=True, exist_ok=True)

# progress_bar is the progress helper this notebook imports; tqdm was never
# imported in the visible cells.
for page_path in progress_bar(list(years_wikipedia_html_dir.glob("*.html"))):
    target_file = year_world_events_dir / f"{page_path.stem}.json"
    if target_file.is_file():
        continue
    html = page_path.read_text()
    events_text = extract_events_text(html)
    if events_text is None:
        # extract_events_text returns None when the page has no "Events"
        # header; report it instead of crashing on `prompt_template + None`.
        print(f"No Events section found in {page_path.name}")
        continue
    selected_events_list = get_world_events(events_text)
    save_to_json(selected_events_list, target_file)

### Compile world events to a single file

In [None]:
# Gather the per-year event files (1500 to 1999) into one dict keyed by year
# and save it as a single JSON file.
year_world_events_folder = DATA_PATH / "year_world_events"
year_world_events = {
    year: load_from_json(year_world_events_folder / f"{year}.json")
    for year in range(1500, 2000)
}
save_to_json(year_world_events, DATA_PATH / "year_world_events.json")