In [12]:
import json
import os 
import re 
from tqdm import tqdm

# Record key names, presumably grouped by the kind of value they hold
# (free text vs. URLs).
# NOTE(review): neither list is referenced by the code visible here —
# confirm whether they are still needed.
text_keys = ['contentQuickTips', 'description', 'tips']
url_keys = ['urlQuickTip', 'url']

def load_json(fn):
    """Parse the UTF-8 encoded JSON file at ``fn`` and return the decoded object."""
    with open(fn, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
def clean_txt(txt):
    """Return ``txt`` with every underscore turned into a space and outer whitespace trimmed."""
    return ' '.join(txt.split('_')).strip()

def _normalize_images(raw_images):
    """Convert a raw image list to ``[{'src': ..., 'alt': ...}, ...]``.

    Returns ``None`` when the list is empty or its first entry is not a dict
    carrying a ``'link'`` or ``'src'`` key, so the caller can leave any
    previously collected images untouched.
    """
    first = raw_images[0] if raw_images else None
    if not isinstance(first, dict):
        return None
    # The first entry decides which key holds the image location
    # ('link' is preferred over 'src').
    src_key = next((k for k in ('link', 'src') if k in first), None)
    if src_key is None:
        return None
    return [
        # A missing caption becomes an empty alt text instead of raising
        # KeyError and aborting the whole conversion.
        {'src': img[src_key], 'alt': img.get('caption', '')}
        for img in raw_images
        # Entries without a usable source cannot be rendered; skip them.
        if isinstance(img, dict) and src_key in img
    ]


def reformat_uc_ipm(data_dir='./uc_ipm_old'):
    """Yield the old UC IPM records converted to the natural-enemies JSON layout.

    Every ``*.json`` file directly inside ``data_dir`` is loaded with
    ``load_json`` and iterated item by item. Each item is mapped onto a dict
    with the keys ``title``, ``link``, ``pubdate``, ``displaydata``,
    ``authors``, ``content`` and ``images``:

    * a string value whose key contains ``'url'`` becomes ``link``;
    * a string value whose key contains ``'name'`` becomes ``title``;
    * a list value whose key contains ``'image'`` becomes ``images``
      (entries reshaped to ``{'src', 'alt'}``);
    * any other non-empty string value is appended to ``content`` as
      ``{'header': <cleaned key>, 'text': <cleaned value>}``.

    When several keys match the same rule, the last one wins for ``link``,
    ``title`` and ``images``. ``pubdate``, ``displaydata`` and ``authors``
    are always left at their empty defaults.

    Args:
        data_dir: Directory holding the source JSON files. Defaults to the
            original hard-coded location.

    Yields:
        One converted dict per source item that has at least one content
        entry or one image.
    """
    # os.listdir returns names in arbitrary order; sort them so the output
    # is reproducible from run to run.
    fns = [f"{data_dir}/{fn}" for fn in sorted(os.listdir(data_dir)) if fn.endswith('.json')]
    for fn in tqdm(fns):
        for item in load_json(fn):
            new_item = {"title": "", "link": "", "pubdate": "", "displaydata": "", "authors": [], "content": [], "images": []}
            for key, value in item.items():
                if 'url' in key and isinstance(value, str):
                    new_item['link'] = value
                elif 'name' in key and isinstance(value, str):
                    new_item['title'] = value
                elif 'image' in key and isinstance(value, list):
                    images = _normalize_images(value)
                    if images is not None:
                        new_item['images'] = images
                elif isinstance(value, str) and value:
                    new_item['content'].append({"header": clean_txt(key), "text": clean_txt(value)})
            # Items that produced neither text nor images carry no information.
            if new_item['content'] or new_item['images']:
                yield new_item
def main():
    """Convert the old UC IPM data and save it to ``./data/old_uc_ipm.json``.

    All records produced by ``reformat_uc_ipm`` are collected into a list and
    written as indented UTF-8 JSON with non-ASCII characters kept as-is.
    """
    out_path = './data/old_uc_ipm.json'
    data = list(reformat_uc_ipm())
    # Make sure the output directory exists so the write cannot fail with
    # FileNotFoundError after all the conversion work has been done.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
# Run the conversion as soon as this cell/script is executed.
main()

100%|██████████| 15/15 [00:00<00:00, 637.91it/s]
