In [1]:
import json

def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return *value* as a list: a lone dict is wrapped, non-lists become []."""
    if isinstance(value, list):
        return value
    if isinstance(value, dict):
        return [value]
    return []


def extract_relevant_info(json_data):
    """Extract the bibliographic fields of interest from one parsed record.

    The record is expected to be a dict with an
    "abstracts-retrieval-response" entry holding a "coredata" section and
    an optional "authkeywords" section (the key names look like a Scopus
    Abstract Retrieval payload -- confirm against the data source).

    Nested sections that are missing, ``null`` or of an unexpected type are
    treated as empty instead of raising, and a single author/keyword given
    as one object rather than a list is accepted.

    Args:
        json_data: The decoded JSON document for one record.

    Returns:
        A dict with the keys "title", "abstract", "publication_name",
        "publication_year", "authors", "keywords" and "doi".  Scalar fields
        default to "" when the key is absent; a value stored as ``null`` for
        "title", "abstract", "publication_name" or "doi" is passed through
        as ``None``.  "authors" and "keywords" are always lists.
    """
    response = _as_dict(_as_dict(json_data).get("abstracts-retrieval-response"))
    core_data = _as_dict(response.get("coredata"))

    # dict.get's default only applies to *absent* keys; a key present with a
    # null value would otherwise yield None and break the chained lookup.
    authors = _as_list(_as_dict(core_data.get("dc:creator")).get("author"))
    keywords = _as_list(
        _as_dict(response.get("authkeywords")).get("author-keyword")
    )

    # The year is whatever precedes the first "-" in the cover date.
    cover_date = core_data.get("prism:coverDate") or ""
    publication_year = str(cover_date).split("-")[0]

    return {
        "title": core_data.get("dc:title", ""),
        "abstract": core_data.get("dc:description", ""),
        "publication_name": core_data.get("prism:publicationName", ""),
        "publication_year": publication_year,
        "authors": [
            author.get("ce:indexed-name", "")
            for author in authors
            if isinstance(author, dict)
        ],
        "keywords": [
            keyword.get("$", "")
            for keyword in keywords
            if isinstance(keyword, dict)
        ],
        "doi": core_data.get("prism:doi", ""),
    }

def process_file(file_path):
    """Load one JSON file and extract its bibliographic fields.

    This is a best-effort wrapper for batch processing: any failure while
    opening, decoding, parsing or extracting is captured and reported as an
    error record instead of being raised, so one bad file does not stop the
    batch.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The dict produced by ``extract_relevant_info`` on success, otherwise
        ``{"file": file_path, "error": <message>}``.
    """
    try:
        # Decode explicitly as UTF-8 rather than the platform's locale
        # encoding, so non-ASCII text reads the same on every system.
        with open(file_path, 'r', encoding="utf-8") as file:
            json_data = json.load(file)
        # The file is closed before extraction; only parsing needs the handle.
        return extract_relevant_info(json_data)
    except Exception as e:
        return {"file": file_path, "error": str(e)}
    

In [None]:
import pandas as pd
import os


years = ['2023', '2022', '2021', '2020', '2019', '2018']

# Collect all file paths in the year-wise folders (data/<year>/...).
# os.walk yields nothing for a folder that does not exist, so missing years
# are simply skipped.
all_file_paths = []
for year in years:
    base_dir = os.path.join("data", year)
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            all_file_paths.append(os.path.join(root, file))

# Process the files once, after every year has been scanned. Doing this
# inside the year loop would re-parse all previously collected files on each
# iteration and rebuild the DataFrame every time.
processed_data_all = []
for file_path in all_file_paths:
    result = process_file(file_path)
    processed_data_all.append(result)

# Create a DataFrame from the processed data.
df = pd.DataFrame(processed_data_all)

# When no file was parsed successfully (or none were found) the content
# columns do not exist; add them empty so the dropna below removes the
# error-only rows instead of raising KeyError.
for column in ("title", "abstract"):
    if column not in df.columns:
        df[column] = None

# Error records carry no title/abstract, so this also discards failed files.
df.dropna(subset=["title", "abstract"], inplace=True)

# "error"/"file" only exist when at least one file failed; ignore them
# otherwise rather than raising KeyError.
df.drop(columns=["error", "file"], inplace=True, errors="ignore")