In [None]:
import json
import copy

## Load data

In [None]:
with open("songs-flower.json", "r", encoding='utf8') as flower_file:
    flower_full = json.load(flower_file)
with open("songs-kamai.json", "r", encoding='utf8') as kamai_file:
    kamai_full = json.load(kamai_file)

# clean data
# In the flower data a null entry means the song doesn't exist, so drop it.
# The ids are collected first because a dict can't be resized while iterating it.
to_remove = [song_id for song_id, info in flower_full.items() if info is None]
for song_id in to_remove:
    del flower_full[song_id]

## Helper functions

In [None]:
def map_remove_duplicates(data: dict, key) -> dict:
    """Invert `data` into a `key(info) -> id` mapping, keeping only unambiguous keys.

    `key` is called once on every value of `data`. Any key produced by more
    than one entry is left out of the result entirely, so each remaining key
    identifies exactly one id. Keys keep the order of their first occurrence.
    """
    unique = {}
    seen = set()
    for item_id, info in data.items():
        k = key(info)
        if k in seen:
            # Second or later occurrence: the key is ambiguous, make sure it is gone.
            unique.pop(k, None)
        else:
            seen.add(k)
            unique[k] = item_id
    return unique

def match_based_on(flower, kamai, key):
    """Pair flower ids with kamai ids whose entries share the same `key` value.

    Each dataset is first reduced with `map_remove_duplicates`, so only keys
    that are unique within their own dataset can be paired. Returns a dict
    mapping flower id -> kamai id.
    """
    flower_by_key = map_remove_duplicates(flower, key)
    kamai_by_key = map_remove_duplicates(kamai, key)
    return {
        flower_id: kamai_by_key[k]
        for k, flower_id in flower_by_key.items()
        if k in kamai_by_key
    }

def filter_ids(flower, kamai, matching):
    """Return shallow copies of `flower` and `kamai` without the matched ids.

    `matching` maps flower id -> kamai id. The inputs are not modified.
    Raises KeyError if a matched id is not present in its dataset.
    """
    remaining_flower = copy.copy(flower)
    remaining_kamai = copy.copy(kamai)
    for flower_id, kamai_id in matching.items():
        del remaining_flower[flower_id]
        del remaining_kamai[kamai_id]
    return remaining_flower, remaining_kamai

## Match based on title

In [None]:
def clean_title(title: str) -> str:
    """Normalise a song title so the same song compares equal across databases."""
    # The databases use different tilde characters, so drop both variants.
    for tilde in ('〜', '～'):
        title = title.replace(tilde, '')
    # Some song names have trailing whitespace, hence the final strip.
    return title.replace('(UPPER)', 'UPPER').strip()

# Start from the complete datasets with nothing matched yet.
flower, kamai = flower_full, kamai_full
total_matched = {}

# Pair ids whose cleaned title is unique within each dataset (flower id -> kamai id),
# then drop the paired ids from the working copies.
matched = match_based_on(flower, kamai, lambda data: clean_title(data['title']))
flower, kamai = filter_ids(flower, kamai, matched)

# Snapshot the state after this stage under *_title names. The merge below builds
# a new dict, so the snapshot is not aliased to an earlier `total_matched`.
total_matched = {**total_matched, **matched}
total_matched_title = total_matched
matched_title = matched
flower_after_title, kamai_after_title = flower, kamai

print("Ids matched:", len(matched))
print("Remaining flower ids:", len(flower))
print("Remaining kamai ids:", len(kamai))


## Grab extra flower data

For the charts we didn't manage to match with just the title, we'll have to get more information by loading the full song page.

Please update the Flower data file with song data for the ids listed below, then re-run this notebook with the new data.

In [None]:
# Flower ids still unmatched after the title pass (iterating a dict yields its keys).
print(list(flower_after_title))

## Match based on difficulty spread

In [None]:
def difficulty_spread(data) -> tuple:
    """Return the entries of `data['charts']` for Easy, Normal, Hyper and EX, in that order.

    A difficulty missing from `data['charts']` is reported as None.
    """
    charts = data['charts']
    return tuple(map(charts.get, ('Easy', 'Normal', 'Hyper', 'EX')))

# Resume from the snapshot taken after title matching, so this cell can be
# re-run on its own without compounding earlier results.
flower, kamai = flower_after_title, kamai_after_title
total_matched = total_matched_title

# match with spread: pair ids whose difficulty spread is unique within each remaining dataset
matched = match_based_on(flower, kamai, difficulty_spread)
flower, kamai = filter_ids(flower, kamai, matched)

# because spreads can be really general, try matching using spread AND title. This does eliminate a couple of songs.
# The combined (title, spread) key can be unique even when the spread alone is shared by several songs.
matched2 = match_based_on(flower, kamai, lambda data: (clean_title(data['title']), difficulty_spread(data)))
flower, kamai = filter_ids(flower, kamai, matched2)
matched = {**matched, **matched2}

# Snapshot the state after this stage under *_spread names. The merge builds a
# new dict, so `total_matched_title` is left untouched.
total_matched = {**total_matched, **matched}
total_matched_spread = total_matched
matched_spread = matched
flower_after_spread, kamai_after_spread = flower, kamai

print("Ids matched:", len(matched))
print("Remaining flower ids:", len(flower))
print("Remaining kamai ids:", len(kamai))

## Done

In [None]:
# Full flower id -> kamai id mapping accumulated by the matching stages above.
print(total_matched)

In [None]:
# Paste the result of this into the main scraper script.

# Build a list indexed by kamai id whose entry is the matching flower id;
# indices without a match stay None.
KAMAI_TO_FLOWER_ID = []
for flower_key, kamai_key in total_matched.items():
    flower_id = int(flower_key)
    kamai_id = int(kamai_key)

    # Pad with None until index `kamai_id` exists.
    shortfall = kamai_id + 1 - len(KAMAI_TO_FLOWER_ID)
    if shortfall > 0:
        KAMAI_TO_FLOWER_ID += [None] * shortfall
    KAMAI_TO_FLOWER_ID[kamai_id] = flower_id

# Compact separators keep the pasted literal short.
print(json.dumps(KAMAI_TO_FLOWER_ID, separators=(',', ':')))