In [1]:
import os
import json
import pandas as pd

def parse_metadata_files(root_dir, max_keywords=5):
    """Collect ``dataset-metadata.json`` files found under ``root_dir`` into a DataFrame.

    Every directory below ``root_dir`` is visited. A metadata file contributes
    one row only when it has a non-empty ``description`` and at least one
    keyword; files where either is missing, empty or null are skipped.

    Parameters
    ----------
    root_dir : str or path-like
        Directory that is walked recursively.
    max_keywords : int, default 5
        Number of keyword columns to emit. Longer keyword lists are truncated,
        shorter ones are padded with empty strings.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``subtitle``, ``description`` followed by
        ``keyword 1`` .. ``keyword <max_keywords>``. A missing ``title`` or
        ``subtitle`` is stored as an empty string. The frame is empty (but
        still has all columns) when nothing qualifies.

    Raises
    ------
    json.JSONDecodeError
        If a metadata file is not valid JSON.
    """
    keyword_columns = ['keyword {}'.format(i) for i in range(1, max_keywords + 1)]
    records = []

    for subdir, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a stable order, so the row
        # order no longer depends on how the file system lists directories.
        dirs.sort()

        if 'dataset-metadata.json' not in files:
            continue

        file_path = os.path.join(subdir, 'dataset-metadata.json')
        # Decode explicitly as UTF-8 instead of the platform default encoding,
        # which differs between systems.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # .get() keeps one incomplete metadata file from aborting the whole
        # scan with a KeyError; null values are treated like missing ones.
        description = data.get('description') or ''
        keywords = data.get('keywords') or []
        if not description or not keywords:
            continue

        # Truncate / pad so every row has exactly `max_keywords` keyword cells
        keywords = list(keywords[:max_keywords])
        keywords += [''] * (max_keywords - len(keywords))
        records.append((data.get('title', ''), data.get('subtitle', ''), description, *keywords))

    return pd.DataFrame(records, columns=['title', 'subtitle', 'description', *keyword_columns])

# Top-level folder that is searched recursively for dataset-metadata.json files
root_dir = '../datasets/kaggle_datasets_parse_by_tags'

# Scan the folder tree and gather the qualifying metadata into one table
metadata_df = parse_metadata_files(root_dir=root_dir)

# Leave the frame as the cell's final expression so the notebook renders it
metadata_df

Unnamed: 0,title,subtitle,description,keyword 1,keyword 2,keyword 3,keyword 4,keyword 5
0,Eye Gaze,Simulated and real datasets of eyes looking in...,# Context\nThe main reason for making this dat...,arts and entertainment,earth and nature,social science,image,eyes and vision
1,Military Aircraft Detection Dataset,military aircraft images with aircraft type an...,## Overview\nThis dataset is designed for obje...,arts and entertainment,military,aviation,computer vision,classification
2,Bhagavad Gita Dataset,All verses in Sanskrit with their Hindi and En...,#Context\nThe Bhagavad Gita (Sanskrit: भगवद् ग...,religion and belief systems,linguistics,nlp,text,translation
3,Bin Baz Fatwas,Main Source: https://github.com/Alsarmad/binba...,"**Dataset Description**\nThe ""Fatwaas from Bin...",religion and belief systems,nlp,text,text-to-text generation,arabic
4,Nepali Cheers Liquor store product details,Alcoholic Beverages sold in one of a online li...,Data scraped from Nepali Online Liquor selling...,alcohol,python,nepali,,
...,...,...,...,...,...,...,...,...
4028,US Mass Shootings,Last 50 Years (1966-2021),### Context\n\nMass Shootings in the United St...,united states,crime,,,
4029,Lo_Vi: Machine Translation,Lao-Vietnamese Corpus (ALT version),Lao - Vietnamese Corpus from ALT:\nUsage:\nwit...,token classification,text segmentation,translation,,
4030,Water waste in Ukraine extract,Waste water treatment plants in Ukraine extrac...,HydroWASTE: Global wastewater treatment plant ...,global,environment,energy,engineering,water transport
4031,goodbooks-10k,"Ten thousand books, one million ratings. Also ...",**This version of the dataset is obsolete. It ...,literature,,,,


In [2]:
# Persist the parsed metadata table as CSV; the row index is not written out
metadata_df.to_csv(
    '../datasets/kaggle_metadata_parse_by_tags.csv',
    index=False,
)