In [5]:
import json
import os 
import re 
from tqdm import tqdm

# Key-name lists. Presumably the item fields that carry free text and URLs in
# the old UC IPM export -- TODO confirm; nothing visible in this cell reads them.
text_keys = [
    'contentQuickTips',
    'description',
    'tips',
]
url_keys = [
    'urlQuickTip',
    'url',
]

def load_json(fn):
    """Read the UTF-8 encoded JSON file at ``fn`` and return the decoded object."""
    with open(fn, mode='r', encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)
def clean_txt(txt):
    """Swap every underscore for a space, then trim surrounding whitespace."""
    spaced = ' '.join(txt.split('_'))
    return spaced.strip()

def clean_header(txt, title):
    """Turn a camelCase key into a capitalised, space-separated header.

    Args:
        txt: Raw header text (typically a JSON key after ``clean_txt``).
        title: Item title; substituted for the words "Pest Note" when the
            header contains them.

    Returns:
        The formatted header, or ``''`` when ``txt`` is empty.
    """
    # Guard: indexing txt[0] below would raise IndexError on an empty header
    # (e.g. a key that was empty or consisted only of underscores).
    if not txt:
        return ''
    # Append a space after every run of non-uppercase characters, which puts a
    # space in front of each capital letter ("quickTips" -> "quick Tips ").
    txt = re.sub('([^A-Z]+)', r'\1 ', txt)
    txt = txt[0].upper() + txt[1:]
    if "Pest Note" in txt:
        # Replace the generic "Pest Note" wording with the item's own title.
        txt = txt.replace("Pest Note", "").strip()
        txt = f"{title} {txt}"
    # The substitution above leaves a trailing space; strip it here.
    return txt.strip()

def _convert_images(images):
    """Map a raw image list to ``[{'src': ..., 'alt': ...}]``; ``None`` if unusable."""
    if not images or not isinstance(images[0], dict):
        return None
    first = images[0]
    # The first entry decides which field holds the image address for the whole list.
    if 'src' in first:
        field = 'src'
    elif first.get('link'):
        field = 'link'
    else:
        return None
    return [
        # A missing caption becomes an empty alt text instead of a KeyError.
        {'src': img[field].replace("?src=exchbt", ""), 'alt': img.get('caption', '')}
        for img in images
    ]


def reformat_uc_ipm(src_dir='./uc_ipm_old'):
    """Loads in all the old UC IPM data and attempts to put into the same format as natural enemies json.

    Args:
        src_dir: Directory scanned (non-recursively) for ``*.json`` files.
            Defaults to the original hard-coded ``./uc_ipm_old``.

    Yields:
        One dict per source item with the keys ``title``, ``link``,
        ``pubdate``, ``displaydata``, ``authors``, ``content`` and ``images``.
        Items that end up with neither content nor images are skipped.
    """
    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make the order of the generated records differ between runs.
    fns = [os.path.join(src_dir, fn) for fn in sorted(os.listdir(src_dir)) if fn.endswith('.json')]
    for fn in tqdm(fns):
        data = load_json(fn)
        for item in data:
            new_item = {"title": "", "link": "", "pubdate": "", "displaydata": "", "authors": [], "content": [], "images": []}
            # Resolve the title before building headers so the "Pest Note"
            # prefix does not depend on where the name key sits in the dict.
            # Keys that also contain 'url' are treated as links, not names;
            # when several name keys exist the last one wins.
            title = ''
            for key, value in item.items():
                if 'name' in key and 'url' not in key and isinstance(value, str):
                    title = value
            new_item['title'] = title
            for key, value in item.items():
                if 'url' in key and isinstance(value, str):
                    new_item['link'] = value.replace("?src=exchbt", "")
                elif 'name' in key and isinstance(value, str):
                    # Already consumed by the title pre-pass above.
                    continue
                elif 'image' in key and isinstance(value, list):
                    images = _convert_images(value)
                    if images is not None:
                        new_item['images'] = images
                elif isinstance(value, str) and value:
                    # Any other non-empty string field becomes a content section
                    # whose header is derived from the key.
                    header = clean_txt(key)
                    text = clean_txt(value)
                    new_item['content'].append({"header": clean_header(header, title), "text": text})
            if new_item['content'] or new_item['images']:
                yield new_item
def main(out_path='./data/old_uc_ipm.json'):
    """Calls reformat_uc_ipm and saves the collected records as JSON.

    Args:
        out_path: Destination file. Defaults to the original hard-coded
            ``./data/old_uc_ipm.json``.
    """
    data = list(reformat_uc_ipm())
    # Create the target directory if needed; open(..., 'w') does not do so and
    # would raise FileNotFoundError on a fresh checkout.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, f, indent=2, ensure_ascii=False)
# Run the conversion immediately when this cell is executed.
main()

100%|██████████| 15/15 [00:00<00:00, 433.11it/s]


In [2]:
import json

# Read raw_ask_extension_data.json in full, then decode it into `data`.
with open('raw_ask_extension_data.json', mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

In [3]:
# Display the first loaded record.
data[0]

{'faq-id': 109900,
 'title': 'When can I plant blue spruce trees in Colorado? #109900',
 'created': '2012-12-03 15:53:47',
 'updated': '2014-09-16 18:32:47',
 'state': 'Colorado',
 'county': 'El Paso County',
 'tags': ['trees and shrubs'],
 'question': 'I need to plant two blue spruce trees that are currently in 24" diameter plastic containers with drain holes in the bottom sides.\n\nLocation: northeast side of Colorado Springs.\n\nThese trees are currently outside on the patio and susceptible to the wind and sun. The trees were watered this past Saturday and seem to be healthy.\n\nQuestion: Can these trees be planted now? Currently the soil is not frozen and night time temps are 35 to 40 degrees.\n\nI have downloaded and read CMG GardenNotes #633 as a reference.\n\nAny advice would be greatly appreciated. ',
 'answer': {'1': {'response': 'Jerry, \nyou can plant them now (a) OR temporarily "plant" them, still in containers, so that roots have some insulation from cold (b).\n\n(a) if yo