In [11]:
import os
import json

# Directory holding the source JSON files that cleaning_json_files() reads.
# The path is relative, so it resolves against the notebook's working directory.
data_folder = "./data/extracted/"

In [None]:
def cleaning_json_files(data_folder):
    """
    Process and clean shop data from multiple JSON files in a directory.

    Reads every regular ``*.json`` file in ``data_folder`` in sorted filename
    order, walks each file's top-level 'docs' array and builds one flat record
    per entry: the 'categoryTree' hierarchy is flattened into a list of category
    titles and a list of subcategory titles, and the comma-separated 'keywords'
    string is split into a cleaned list.

    Args:
        data_folder (str): Path to directory containing source JSON files.
            Entries that are not regular files ending in ``.json`` are ignored.

    Returns:
        list: Dictionaries with structured shop data containing:
            - title (str)
            - categories (list): titles of the top-level category nodes
            - subcategories (list): titles of every node under 'subs'
            - venue: copied unchanged from the source entry
            - keywords (list): stripped tokens, without empty or '&'-only ones
            - description: the source entry's 'text' value, unchanged

    Raises:
        KeyError: If a file has no 'docs' key, or an entry lacks 'title',
            'venue' or 'text' (or a category/subcategory lacks 'title').
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    # os.listdir() returns entries in arbitrary order and includes anything in
    # the folder (hidden files, sub-directories); keep only real .json files and
    # sort them so repeated runs produce the records in the same order.
    json_files = sorted(
        name
        for name in os.listdir(data_folder)
        if name.lower().endswith('.json')
        and os.path.isfile(os.path.join(data_folder, name))
    )

    shops = []

    for file_name in json_files:
        file_path = os.path.join(data_folder, file_name)
        # Decode explicitly as UTF-8 instead of relying on the platform default
        # encoding, which differs between operating systems.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # The file is fully parsed at this point, so it no longer needs to be open.
        for datapoint in data['docs']:
            # Flatten the category hierarchy into two parallel title lists.
            # A missing or null tree / 'subs' is treated as "no entries".
            categories = []
            subcategories = []
            for category in datapoint.get('categoryTree') or []:
                categories.append(category['title'])
                subcategories.extend(
                    sub['title'] for sub in category.get('subs') or []
                )

            # Split the keyword string on commas, trimming surrounding whitespace
            # so "a, b" yields 'b' rather than ' b'; drop empty and '&' tokens.
            keywords = []
            for keyword in (datapoint.get('keywords') or '').split(','):
                keyword = keyword.strip()
                if keyword and keyword != '&':
                    keywords.append(keyword)

            # Key order matches the column order of the exported records.
            shops.append({
                'title': datapoint['title'],
                'categories': categories,
                'subcategories': subcategories,
                'venue': datapoint['venue'],
                'keywords': keywords,
                'description': datapoint['text'],
            })

    return shops

In [29]:
# Run the cleaning step over the source folder, then persist the resulting
# records as JSON indented by four spaces.
shops = cleaning_json_files(data_folder=data_folder)

with open('./data/shops.json', mode='w') as shops_file:
    json.dump(obj=shops, fp=shops_file, indent=4)

In [None]:
import pandas as pd

# Reload the exported records into a DataFrame and preview the first ten rows.
df = pd.read_json(path_or_buf='./data/shops.json')
df.head(n=10)

Unnamed: 0,title,categories,subcategories,venue,keywords,description
0,GODIVA,[Food & Beverages],[Desserts],GF.Ice Rink Railing,"[beverages, chocoholic, chocolatelover, desser...",GODIVA is the world’s most iconic premium choc...
1,GORDON RAMSAY BAR & GRILL,[Food & Beverages],[Western],"Lobby Level, Sunway Resort","[bar, beef wellington, beverages, british, cla...",Gordon Ramsay Bar & Grill in Sunway City Kuala...
2,GRAND IMPERIAL RESTAURANT,[Food & Beverages],[Chinese],"Suite 101, First Floor, The Pinnacle Annex","[beverages, chinese restaurant, food & beverag...","Established since 2008, we are Malaysia’s lead..."
3,Gadget Hub,[Digital Lifestyle],"[Add-Ons, Gadgets]",F1.705-2.PushCart,"[add-ons, digital lifestyle, gadget hub, gadge...",
4,GadgetHub,[Digital Lifestyle],[],F1.28,"[bag, beauty light, cable, car grab, casing, c...",GadgetHub has always been a pioneer and at the...
5,Galaxy Ace,[Fashion],"[Accessories, Unisex]","LG2,M.057.PushCart","[accessories, ace, fashion, galaxy ace, unisex]",
6,Galaxy Collection,[Toys & Hobbies],[Gifts],"LG2,M.062.PushCart","[collection, galaxy collection, gifts, hobbies...",
7,Galaxy Gift,"[Asian Avenue, Toys & Hobbies]",[Gifts],F1.AV.32,"[accessories, asian avenue, asianavenue, avenu...","Retailing of customized plate, accessories, so..."
8,Garrett Popcorn Shops®,[Food & Beverages],"[Halal, Snack]",LG2.RINK.B,"[beverages, caramel, food & beverages, garrett...",Garrett Popcorn Shops® - A Chicago Tradition f...
9,Geb,[Fashion],[Ladies],LG1.77,"[bohemian, fashion, geb, ladies]",Retailing of fashion apparel and related acces...
