# Data collection

This notebook:
- Asks ChatGPT for a list of ~150 candidate composers
- Runs ChatGPT on each composer's Wikipedia page to summarize the main events of their life
- Downloads work data from IMSLP
- Summarizes world events for each year from 1500 to 1999

It works in batches and by steps, storing the result of each step under `result_folder/{composer_name}.json`.

In [2]:
from pathlib import Path
import utils

from collections import Counter
from fastprogress import progress_bar
from gpt_function_decorator import gpt_function

## Generate the composer list

### Ask for a list of composers

In [4]:
# This function has no body of its own: it consists only of a docstring and is
# wrapped by `gpt_function`, configured here with the 'gpt-4o' model.
# NOTE(review): the docstring is presumably what the decorator sends to the
# model as instructions, i.e. it is runtime text rather than documentation, so
# its wording is left untouched. "between 1600 before 1976" looks like a typo
# for "between 1600 and 1976" -- confirm before changing the prompt.
# NOTE(review): `n_composers` is not mentioned in the docstring; presumably the
# decorator forwards argument values to the model on its own -- confirm.
@gpt_function(gpt_model='gpt-4o')
def list_famous_composers(n_composers) -> list:
    """Return a list of the most famous classical composers
    who lived between 1600 before 1976"""

# Ask the model ten times for 100 composers and pool every answer into one
# flat list. Repeated names are kept, since the next step counts how often
# each name was mentioned. Nothing is queried when the output file is already
# on disk.
composer_list_file = Path("data") / "raw_chatgpt_composers_list.json"
if not composer_list_file.exists():
    composers = []
    for _attempt in progress_bar(range(10)):
        composers.extend(list_famous_composers(n_composers=100))
    utils.save_to_json(composers, composer_list_file)

### Keep only composers mentioned more than once

In [5]:
# Count how many times each name appears in the pooled answers and keep only
# the names that were mentioned at least twice. Each kept name is written to
# its own one-field JSON file under data/composer_names.
composer_list = utils.load_from_json("data/raw_chatgpt_composers_list.json")
composer_counts = Counter(composer_list)
most_mentioned_composer_list = [
    name for name, mentions in composer_counts.items() if mentions > 1
]

target_dir = Path("./data/composer_names")
target_dir.mkdir(exist_ok=True)
for composer in most_mentioned_composer_list:
    composer_file = target_dir / f"{composer}.json"
    utils.save_to_json({"name": composer}, composer_file)

### Get basic metadata on each composer

In [20]:
await utils.process_folder(
    function=utils.get_basic_metadata_from_wikipedia,
    source_folder="./data2/composer_names",
    target_folder="./data2/composer_basic_metadata/",
    replace_existing=True,
    num_processes=1
)

## Collect life events from Wikipedia

### Get the sections of each composer's Wikipedia page

In [22]:
await utils.process_folder(
    function=utils.collect_wikipedia_page_and_separate_sections,
    source_folder="./data/composer_basic_metadata",
    target_folder="./data/composer_wikipedia_content/",
    replace_existing=True,
    num_processes=1
)

### Summarize the composers' Wikipedia pages

In [3]:
await utils.process_folder(
    utils.list_life_events_in_sections,
    source_folder="./data/composer_wikipedia_content/",
    target_folder="data/composer_event_summaries/",
    num_processes=1
)

### Add fun to the events

In [5]:
await utils.process_folder(
    utils.add_fun_to_events,
    source_folder="./data/composer_event_summaries/",
    target_folder="./data/composer_works/",
    num_processes=3
)

## Get compositions

In [8]:
await utils.process_folder(
    utils.get_works_data_from_imslp,
    source_folder="./data/composer_wikipedia_content/",
    target_folder="./data/composer_works/",
    num_processes=1
)

## Compile composer data

In [11]:
def compile_data(composer_metadata):
    """Assemble one composer's full record from the per-step JSON files.

    Starts from a shallow copy of ``composer_metadata`` (which must provide
    ``full_name``, ``birth_year`` and ``death_year``) and adds three keys:

    - ``wikipedia_url``: copied from the composer's Wikipedia content file;
    - ``events``: entries loaded from the "with fun" event summaries;
    - ``works``: entries loaded from the works file.

    Events and works are kept only when their ``year`` is truthy and lies
    between the birth and death years (both inclusive); they are returned
    ordered by ``(year, title)``.

    Returns the new dict; the input mapping itself is not modified.
    """
    record = dict(composer_metadata)
    json_filename = record["full_name"] + ".json"
    first_year = record["birth_year"]
    last_year = record["death_year"]

    def within_lifetime(entries):
        # Drop undated entries (falsy year) and anything outside the lifetime,
        # then order chronologically, using the title as tie-breaker.
        dated = [
            entry
            for entry in entries
            if entry["year"] and (first_year <= entry["year"] <= last_year)
        ]
        return sorted(dated, key=lambda entry: (entry["year"], entry["title"]))

    wikipedia_data = utils.load_from_json(f"data/composer_wikipedia_content/{json_filename}")
    record["wikipedia_url"] = wikipedia_data["wikipedia_url"]

    events = utils.load_from_json(f"data/composer_event_summaries_with_fun/{json_filename}")
    record["events"] = within_lifetime(events)

    works = utils.load_from_json(f"data/composer_works/{json_filename}")
    record["works"] = within_lifetime(works)

    return record
    
utils.process_folder(
    compile_data,
    source_folder="./data/composer_basic_metadata/",
    target_folder="./data/full_composer_data/",
    replace_existing=True,
    num_processes=1
)

In [13]:
# Build the composer index: load every basic metadata file, order the records
# by last name, and save them together as a single JSON list.
all_composer_basic_metadata = sorted(
    map(utils.load_from_json, Path("./data/composer_basic_metadata/").glob("*.json")),
    key=lambda composer: composer["last_name"],
)
utils.save_to_json(all_composer_basic_metadata, "data/composers.json")

## Summarize the events of all years


In [15]:
years_dir = Path("data/years/")
years_dir.mkdir(exist_ok=True)
for year in range(1500, 2000):
    utils.save_to_json({"year": year}, years_dir / f"{year}.json")

utils.process_folder(
    utils.get_world_events,
    source_folder="./data/years/",
    target_folder="./data/year_events/",
    replace_existing=True,
    num_processes=1
)

### Compile the events in a single file

In [None]:
# Merge the per-year event files into one mapping keyed by integer year (taken
# from the file name) and save it with the years in ascending order.
# The JSON helpers are reached through the `utils` module, which is the only
# name the notebook imports them under; the bare names would raise NameError.
year_world_events = {
    int(year_file.stem): utils.load_from_json(year_file)
    for year_file in Path("data/year_events/").glob("*.json")
}
utils.save_to_json(dict(sorted(year_world_events.items())), "data/world_events.json")