# Data collection

## Imports and utils

In [11]:
#!python3 -m pip install -U requests_cache bs4 fastprogress gpt_function_decorator

In [2]:
from pathlib import Path
import multiprocessing
import json
import time
import re

from gpt_function_decorator import gpt_function
import requests_cache
from fastprogress import progress_bar
from bs4 import BeautifulSoup

def process_files(
    function,
    source_folder,
    target_folder=None,
    replace_existing=False,
    num_processes=1
):
    """Apply ``function`` to the files found in ``source_folder``.

    Args:
        function: Callable invoked with one ``pathlib.Path`` per source file.
            Its return value is ignored.
        source_folder: Folder whose direct children are processed.
        target_folder: Optional output folder. It is created if needed and is
            used to detect files that were already processed.
        replace_existing: When False (and a target folder is given), source
            files whose stem already appears in ``target_folder`` are skipped.
        num_processes: Number of worker processes. A ``multiprocessing.Pool``
            is only used when it is above 1 and more than one file remains.
    """
    source_folder = Path(source_folder)
    source_files = list(source_folder.glob("*"))

    if target_folder is not None:
        target_folder = Path(target_folder)
        target_folder.mkdir(parents=True, exist_ok=True)
        # Skipping already-processed files only makes sense when there is a
        # target folder to compare against; without one, every file is run.
        if not replace_existing:
            already_processed = {f.stem for f in target_folder.glob("*")}
            source_files = [f for f in source_files if f.stem not in already_processed]

    if (len(source_files) > 1) and (num_processes > 1):
        with multiprocessing.Pool(processes=num_processes) as pool:
            # pool.imap is lazy: wrapping it in list() drives the processing
            # while the progress bar advances.
            list(progress_bar(pool.imap(function, source_files), total=len(source_files)))
    else:
        for file in progress_bar(source_files):
            function(file)

# Shared HTTP session (a requests_cache.CachedSession named "web-cache") used
# for the web requests made in this file.
requests_cache_session = requests_cache.CachedSession("web-cache")


def download_page(url, sleep_after=0):
    """Fetch ``url`` through the shared cached session and return its text.

    Args:
        url: Address of the page to download.
        sleep_after: Seconds to pause after a successful request.

    Returns:
        The response body as a string, or ``None`` when the request failed
        (the failure is printed instead of raised).
    """
    # ``requests`` is not imported at module level; import it here so that the
    # exception class used below can be resolved.
    import requests

    try:
        response = requests_cache_session.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return None
    html = response.text
    time.sleep(sleep_after)
    return html
        
def save_to_json(data, filepath):
    """Write ``data`` to ``filepath`` as JSON indented by two spaces.

    Args:
        data: JSON-serializable object.
        filepath: Destination path (string or path-like). An existing file is
            overwritten.
    """
    # The context manager closes (and therefore flushes) the file as soon as
    # the dump is done instead of leaving it to garbage collection.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

def load_from_json(filepath):
    """Return the object decoded from the JSON file at ``filepath``.

    Args:
        filepath: Path (string or path-like) of the JSON file to read.
    """
    # The context manager guarantees the file handle is released right after
    # parsing.
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)

## Generate the composer list

### Ask for a list of composers

In [89]:
# The function has no body besides its docstring: the gpt_function decorator is
# expected to produce the return value from that text and the arguments
# (NOTE(review): confirm against the gpt_function_decorator documentation).
# The docstring is therefore runtime text, not just documentation.
@gpt_function(gpt_model='gpt-4o', retries=2)
def list_famous_composers(n_composers) -> list:
    """Return a list of the most famous classical composers
    who lived between 1600 and 1976"""

from collections import Counter

# Ask ten times for 100 composers and keep only the names that show up more
# than once across the answers. The result is computed once and kept on disk.
target_file = Path("./data/composer_names.json")
if not target_file.is_file():
    composers = [
        composer
        for _ in progress_bar(range(10))
        for composer in list_famous_composers(n_composers=100)
    ]
    # Number of times each name was returned over all the answers.
    composer_counts = Counter(composers)
    composers_list = [c for c in composer_counts if composer_counts[c] > 1]
    # Make sure the output folder exists before writing the list.
    target_file.parent.mkdir(parents=True, exist_ok=True)
    save_to_json(sorted(composers_list), target_file)

### Make one file per composer

In [None]:
# Write one small JSON file per composer so that the following steps can be
# run file by file. The names are read from "composer_names.json", the file
# written by the composer-list step.
composer_list = load_from_json("./data/composer_names.json")
path = Path("data/composer_names")
path.mkdir(parents=True, exist_ok=True)
for composer in composer_list:
    save_to_json({"name": composer}, path / f"{composer}.json")

### Compute basic metadata on each composer

In [34]:
# The function has no body besides its docstring: the gpt_function decorator is
# expected to produce the returned dict from that text and the ``composer``
# argument (NOTE(review): confirm against the gpt_function_decorator docs).
# The docstring below is therefore runtime text and spells out the keys the
# callers read (first_names, last_name, birth_year, death_year).
@gpt_function(gpt_model='gpt-4o', retries=2)
def composer_metadata(composer) -> dict:
    """Return the following metadata for the given composer:
    {
        first_names: str,
        last_name: str,
        birth_year: int,
        death_year: int
    }
    """

target_folder = Path("./data/composer_basic_metadata/")


def compute_basic_metadata(source_file):
    """Build and save the basic metadata of the composer named in ``source_file``.

    Reads the "name" entry of the JSON file, passes it to ``composer_metadata``,
    stores the name again under "full_name" and writes the resulting dict to
    ``target_folder`` under the same file name.
    """
    composer_name = load_from_json(source_file)["name"]
    metadata = composer_metadata(composer=composer_name)
    metadata["full_name"] = composer_name
    destination = target_folder / source_file.name
    save_to_json(metadata, destination)


# Run the metadata lookup on every composer file, overwriting earlier results.
process_files(
    function=compute_basic_metadata,
    source_folder="./data/composer_names",
    target_folder=target_folder,
    num_processes=3,
    replace_existing=True,
)

## Collect life events from Wikipedia

### Download the composer wiki pages

In [12]:
target_folder = Path("data/composer_wikipedia_pages/")


def get_wikipedia_url_from_search(term):
    """Return the page URL of the first English Wikipedia search hit for ``term``.

    The search goes through the shared cached session. When the search gives
    no result, a message is printed and ``None`` is returned.
    """
    response = requests_cache_session.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "list": "search",
            "srsearch": term,
            "format": "json",
        },
    )
    hits = response.json()['query']['search']
    if not hits:
        print (f"No page found for {term}")
        return None
    # Page URLs use underscores where the title has spaces.
    best_title = hits[0]['title']
    return f"https://en.wikipedia.org/wiki/{best_title.replace(' ', '_')}"

def extract_sections_from_wikipedia_page(html_content, section_blacklist=None, section_whitelist=None):
    """Split a Wikipedia page into its h2-level sections.

    The page layout this function relies on is:

    <div><h2>Section 1 title</h2></div>
    <p>...</p>...
    <div><h2>Section 2 title</h2></div>
    <p>...</p>...

    so the text of a section is found in the siblings that follow the
    element wrapping its h2 heading.

    A section is skipped when its title is in ``section_blacklist`` or, if a
    ``section_whitelist`` is given, when its title is not in it.

    Returns a list of {"title": ..., "content": ...} dicts, in page order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    results = []
    for heading in soup.find_all('h2'):
        title = heading.get_text(strip=True)
        blacklisted = section_blacklist is not None and title in section_blacklist
        not_whitelisted = section_whitelist is not None and title not in section_whitelist
        if blacklisted or not_whitelisted:
            continue

        pieces = []
        for node in heading.parent.find_next_siblings():
            # A sibling containing an h2 marks the start of the next section.
            if node.select('h2'):
                break
            # Keep only the siblings that carry some text.
            if node.get_text(strip=True):
                pieces.append(node.get_text())

        results.append({"title": title, "content": " ".join(pieces)})
    return results


target_folder = Path("./data/composer_wikipedia_content")

# Titles of the sections that are left out when a page is split into sections.
section_title_blacklist = [
    "External links",
    "Further reading",
    "Sources",
    "See also",
    "References",
    "Contents",
    "Notes, references and sources",
    "Notes",
    "Notes and references",
    "Recordings",
    "Compositions"
]


def collect_wikipedia_page_and_separate_sections(source_file):
    """Save the Wikipedia URL and page sections of one composer.

    Reads the composer's "full_name" from ``source_file``, searches for the
    Wikipedia page, downloads it and writes
    {"wikipedia_url": ..., "sections": [...]} to ``target_folder`` under the
    same file name.

    Nothing is written when no page is found or when the download gives no
    content, so the composer is picked up again on the next run.
    """
    composer = load_from_json(source_file)["full_name"]
    wikipedia_url = get_wikipedia_url_from_search(composer)
    if wikipedia_url is None:
        # The search helper has already printed that no page was found.
        return
    html_content = download_page(wikipedia_url)
    if html_content is None:
        # Nothing was downloaded, so there is nothing to parse.
        print(f"No content downloaded for {composer} ({wikipedia_url})")
        return
    sections = extract_sections_from_wikipedia_page(
        html_content, section_blacklist=section_title_blacklist
    )
    target_file = target_folder / source_file.name
    data = {"wikipedia_url": wikipedia_url, "sections": sections}
    save_to_json(data, target_file)


process_files(
    collect_wikipedia_page_and_separate_sections,
    source_folder="./data/composer_basic_metadata/",
    target_folder=target_folder,
    replace_existing=False,
    num_processes=3
)

### Summarize the composer wikipedia pages

In [32]:
# The function has no body besides its docstring: the gpt_function decorator is
# expected to produce the list of events from that text and the arguments
# (NOTE(review): confirm against the gpt_function_decorator documentation).
# The docstring is therefore runtime text; it defines the event keys the later
# steps rely on (title, summary, year, location, emoji).
@gpt_function(retries=2)
def list_life_events(composer_biography: str, exclude_events: list) -> list[dict]:
    """Return a list of the composer's life events from the given text.
    Exclude any event described in the exclude_events list.
    Add the best emoji to describe the event.

    The output should have this schema:
    [{title:str, summary: str, year: int, location: str, emoji: str}...]

    The summary should be a few sentences, and as informative as possible
    but only with information from the text.
    The summary should not state the year.

    Return an empty list if there are no events.
    """

target_folder = Path("data/composer_event_summaries/")


def summarize_sections(source_file):
    """Extract the life events of one composer, section by section.

    ``source_file`` is a file written by the Wikipedia step, i.e. a dict
    {"wikipedia_url": ..., "sections": [{"title": ..., "content": ...}, ...]}.
    A bare list of sections is accepted as well.

    Each section is sent to ``list_life_events`` together with the events
    already found, so that later sections do not repeat them. Every event is
    tagged with the title of its section, and the full list is saved to
    ``target_folder`` under the same file name.
    """
    data = load_from_json(source_file)
    sections = data["sections"] if isinstance(data, dict) else data

    events = []
    for section in sections:
        title = section["title"]
        # The file stem is the composer name; keep it apart from the title.
        text = f"{source_file.stem}: {title}.\n{section['content']}"
        # .get() keeps one incomplete event from aborting the whole composer.
        past_events = [f"{e.get('year')} {e.get('title')}" for e in events]
        section_events = list_life_events(
            composer_biography=text, exclude_events=past_events
        )
        events += [{"section": title, **e} for e in section_events]
    save_to_json(events, target_folder / source_file.name)


# The sections are read from the folder filled by the Wikipedia step.
process_files(
    summarize_sections,
    source_folder="./data/composer_wikipedia_content/",
    target_folder=target_folder,
    num_processes=3
)

In [29]:
# The function has no body besides its docstring: the gpt_function decorator is
# expected to produce the {"emoji": ...} dict from that text and the arguments
# (NOTE(review): confirm against the gpt_function_decorator documentation).
@gpt_function(retries=2)
def most_relevant_emoji(event: str, context: dict) -> dict:
    """Return the most relevant emoji for the given event
    Output schema: {emoji: str}
    """

# The function has no body besides its docstring: the gpt_function decorator is
# expected to produce the rewritten text from the steps listed below
# (NOTE(review): confirm against the gpt_function_decorator documentation,
# including the exact effect of think_through=True).
@gpt_function(retries=2, think_through=True)
def add_fun_to_text(event_summary: str) -> str:
    """For the given event_summary, follow these steps:
    
    Step 1: Rewrite the event_summary with humor. Be funny!
    Step 2: Copy the result of Step 1 without any joke about tragic events.
    Step 3: Copy the result of Step 2 without the weakest jokes if the text is over 60 words
    Step 4: Copy the result of Step 3 with any information from the original summary that got lost.
    """

target_folder = Path("data/composer_event_summaries_with_fun/")


def add_event_fun(source_file):
    """Add an emoji and a humorous rewrite to every event of one composer.

    ``source_file`` holds a list of event dicts, each with a "summary" entry.
    For each event, the summary is passed to ``most_relevant_emoji`` (with the
    other entries of the event as context) and to ``add_fun_to_text``. The
    event is saved with the returned emoji dict merged in and the rewrite
    under "fun_version", in ``target_folder`` under the same file name.
    """
    events = load_from_json(source_file)
    events_with_fun = []
    for event in events:
        # Everything but the summary is handed over as context.
        context = {**event}
        summary = context.pop("summary")
        # The first parameter of most_relevant_emoji is named "event".
        emoji = most_relevant_emoji(event=summary, context=context)
        fun_version = add_fun_to_text(event_summary=summary)
        events_with_fun.append({**event, **emoji, "fun_version": fun_version})
    save_to_json(events_with_fun, target_folder / source_file.name)


process_files(
    add_event_fun,
    source_folder="./data/composer_event_summaries/",
    target_folder=target_folder,
    num_processes=3
)

## Get compositions

### Download IMSLP pages

In [3]:
from urllib.parse import unquote

IMSLP_URL = "https://imslp.org"


def detect_imslp_url(wikipedia_html):
    """Return the first link to an IMSLP category page found in the HTML.

    Args:
        wikipedia_html: HTML of a page, or ``None``/empty when nothing could
            be downloaded.

    Returns:
        The ``href`` of the first ``<a>`` whose target starts with
        ``IMSLP_URL + "/wiki/Category"``, or ``None`` when there is none.
    """
    if not wikipedia_html:
        return None
    # Name the parser explicitly, like the other parsing code in this file, so
    # that the result does not depend on which parsers are installed.
    soup = BeautifulSoup(wikipedia_html, "html.parser")
    category_prefix = IMSLP_URL + "/wiki/Category"
    for link in soup.select("a"):
        href = link.attrs.get("href")
        if href and href.startswith(category_prefix):
            return href
    return None

def get_work_data_from_imslp_html(imslp_html, composer):
    """List the works found in the catalogue embedded in an IMSLP category page.

    The page is searched for the JSON object that follows
    ``.extend(catpagejs,{"p1":``. Its values are lists of entries; the part
    of each entry before the first "|" is taken as the work's page name.

    Args:
        imslp_html: HTML of the category page (``None`` is treated as empty).
        composer: Composer name as written in the page names, e.g. in
            "Some work (composer)".

    Returns:
        A list of {"title": ..., "imslp_url": ...} dicts, restricted to the
        works whose URL contains the composer name (spaces replaced by
        underscores). The list is empty when the page has no catalogue.
    """
    matches = re.findall(r'.extend\(catpagejs,\{"p1":(\{[^}]+\})', imslp_html or "")
    if not matches:
        # No embedded catalogue (or nothing downloaded): no works to report.
        return []
    catalogue = json.loads(matches[0])

    composer_underscore = composer.replace(" ", "_")
    works_data = []
    for entries in catalogue.values():
        for entry in entries:
            work = entry.split("|")[0]
            imslp_url = f"{IMSLP_URL}/wiki/{work.replace(' ', '_')}"
            if composer_underscore not in imslp_url:
                continue
            works_data.append(
                {
                    "title": work.replace(f"({composer})", "").strip(),
                    "imslp_url": imslp_url,
                }
            )
    return works_data

def _detect_year(txt):
    """Return the first 4-digit number of ``txt`` within 1000..3000, else None."""
    for num in re.findall(r'\b\d{4}\b', txt):
        year = int(num)
        if 1000 <= year <= 3000:
            return year
    return None


def get_publication_year(work_html):
    """Return the year found in a work page, or ``None`` when there is none.

    The year is read from the first ``<td>`` of the first table row whose
    text mentions "First Publication" or "Composition Year".

    Args:
        work_html: HTML of the work page, or ``None``/empty when nothing
            could be downloaded.
    """
    if not work_html:
        return None
    try:
        soup = BeautifulSoup(work_html, "html.parser")
    except Exception:
        # Best effort: a page that cannot be parsed simply has no known year.
        return None

    indicators = ["First Publication", "Composition Year"]
    for row in soup.select("tr"):
        row_text = row.get_text()
        if not any(indicator in row_text for indicator in indicators):
            continue
        cells = row.select("td")
        if not cells:
            # A matching row without data cell cannot hold the year.
            continue
        return _detect_year(cells[0].get_text())
    return None


target_folder = Path("data/composer_works/")

# NOTE(review): not read anywhere in the visible code; kept in case another
# cell relies on it.
total = [0]


def get_composition_list(wikipedia_content_file):
    """Save the list of works (with years) of one composer, based on IMSLP.

    The composer's Wikipedia page is searched for a link to an IMSLP category
    page. The works listed there are then visited one by one to read their
    year. The result, a list of {"title", "imslp_url", "year"} dicts, is
    written to ``target_folder`` under the same file name.

    An empty list is saved when the Wikipedia page has no IMSLP link or the
    IMSLP page has no "Category:" heading. Nothing is saved when a page could
    not be downloaded, so the composer is tried again on the next run.
    """
    wikipedia_url = load_from_json(wikipedia_content_file)["wikipedia_url"]
    wikipedia_html = download_page(wikipedia_url)
    if wikipedia_html is None:
        print(f"No content downloaded for {wikipedia_url}")
        return

    works_data = []
    imslp_url = detect_imslp_url(wikipedia_html)
    if imslp_url is not None:
        imslp_html = download_page(imslp_url)
        if imslp_html is None:
            print(f"No content downloaded for {imslp_url}")
            return
        # The page heading gives the composer name as IMSLP writes it.
        composer_names = re.findall("Category:(.*)</h1>", imslp_html)
        if composer_names:
            composer = composer_names[0].strip()
            works_data = get_work_data_from_imslp_html(imslp_html, composer)
            for work in progress_bar(works_data):
                # Short pause after each work page.
                work_html = download_page(work["imslp_url"], sleep_after=0.1)
                work["year"] = get_publication_year(work_html)
        else:
            print(f"No composer name found in {imslp_url}")

    save_to_json(works_data, target_folder / wikipedia_content_file.name)


process_files(
    get_composition_list,
    source_folder="./data/composer_wikipedia_content/",
    target_folder=target_folder,
    num_processes=3
)


    

## Compile composer data

In [36]:
target_folder = Path("data/composer_full_data")


def _select_lifetime_items(items, birth, death):
    """Return the items whose integer "year" is within [birth, death], sorted by (year, title)."""
    selected = [
        item
        for item in items
        # Entries with a missing, null or non-integer year are left out
        # instead of breaking the comparison.
        if isinstance(item.get("year"), int) and birth <= item["year"] <= death
    ]
    return sorted(
        selected, key=lambda item: (item["year"], str(item.get("title", "")))
    )


def compile_data(source_file):
    """Merge everything collected on one composer into a single JSON file.

    Starts from the basic metadata in ``source_file`` and adds the Wikipedia
    URL, the events (with their fun versions) and the works. Events and works
    are restricted to the composer's lifetime and sorted by year, then title.
    The result is saved in ``target_folder`` under the same file name.

    The composer is skipped, with a message, when one of the input files of
    the other steps is missing.
    """
    data = load_from_json(source_file)
    birth, death = data["birth_year"], data["death_year"]

    wikipedia_file = Path("data/composer_wikipedia_content") / source_file.name
    events_file = Path("data/composer_event_summaries_with_fun") / source_file.name
    works_file = Path("data/composer_works") / source_file.name
    missing = [
        str(f) for f in (wikipedia_file, events_file, works_file) if not f.is_file()
    ]
    if missing:
        print(f"Skipping {source_file.name}, missing input: {', '.join(missing)}")
        return

    data["wikipedia_url"] = load_from_json(wikipedia_file)["wikipedia_url"]
    data["events"] = _select_lifetime_items(load_from_json(events_file), birth, death)
    data["works"] = _select_lifetime_items(load_from_json(works_file), birth, death)

    save_to_json(data, target_folder / source_file.name)


process_files(
    compile_data,
    source_folder="./data/composer_basic_metadata/",
    target_folder=target_folder,
    replace_existing=True,
    num_processes=1
)

In [35]:
# Gather the basic metadata of every composer into one list ordered by last
# name, and save it as a single file.
all_composer_basic_metadata = sorted(
    map(load_from_json, Path("./data/composer_basic_metadata/").glob("*.json")),
    key=lambda c: c["last_name"],
)
save_to_json(all_composer_basic_metadata, "data/composers.json")

## Summarize the events of all years


In [18]:
# The function has no body besides its docstring: the gpt_function decorator is
# expected to produce the list of events from that text and the ``text``
# argument (NOTE(review): confirm against the gpt_function_decorator docs).
# The docstring below is therefore runtime text and fixes the keys of each
# returned event (title, summary, year, city, country).
@gpt_function
def select_major_world_events(text) -> list:
    """Return a list of the top ~10 major events described in the text.
    
    Prefer major technical advances, or major events which
    would have made the front page of European newspapers.
    Prefer events which could have had an impact on citizens
    and in particular music composers.
    
    Schema for each event:
    {"title": str, "summary": str, "year": int, "city": str, "country": str}
    
    The year should always be a single integer, and only an integer.
    Never use quotation marks inside the summary.
    """
    
# One small JSON file per year (1500 to 1999), so that the years can be
# processed file by file like the composers.
years_dir = Path("data/years/")
# parents=True lets this section run even when "data" does not exist yet.
years_dir.mkdir(parents=True, exist_ok=True)
for year in range(1500, 2000):
    save_to_json({"year": year}, years_dir / f"{year}.json")

target_folder = Path("data/year_events")


def get_world_events(year_file):
    """Save the major world events of the year named by ``year_file``.

    The file stem is used as the year. The Wikipedia page of that year is
    downloaded, the text of its "Events" section is passed to
    ``select_major_world_events``, and the result is written to
    ``target_folder`` under the same file name.

    Nothing is written when the page could not be downloaded or has no
    "Events" section; a message is printed instead.
    """
    year = year_file.stem
    wikipedia_url = f"https://en.wikipedia.org/wiki/{year}"
    wikipedia_html = download_page(wikipedia_url)
    if wikipedia_html is None:
        print(f"No content downloaded for {wikipedia_url}")
        return
    sections = extract_sections_from_wikipedia_page(wikipedia_html, section_whitelist=["Events"])
    if not sections:
        print(f"No Events section found for {year}")
        return
    events_txt = sections[0]["content"]
    events = select_major_world_events(text=events_txt)
    save_to_json(events, target_folder / year_file.name)


process_files(
    function=get_world_events,
    source_folder=years_dir,
    target_folder=target_folder,
    num_processes=5
)


### Compile world events to a single file

In [29]:
# Map each year (taken from the file stem) to its saved list of events, then
# write the whole mapping, ordered by year, to a single file.
year_world_events = {
    int(events_file.stem): load_from_json(events_file)
    for events_file in Path("data/year_events/").glob("*.json")
}
save_to_json(dict(sorted(year_world_events.items())), "data/world_events.json")