In [None]:
import pandas as pd
import json
import os

In [None]:
# Publication years covered by the project; each one has its own data folder.
years = list(range(2018, 2024))
years

In [None]:
# Give every regular file in each year folder a ".json" suffix so that the
# later cells can select the data files by extension.
# The cell is safe to re-run: files that already end in ".json" are skipped.
for year in years:
    file_rename_path = f'../Project/{year}'

    # sorted() makes the rename order (and the printed log) deterministic;
    # os.listdir() returns entries in arbitrary order.
    for file in sorted(os.listdir(file_rename_path)):
        full_path = os.path.join(file_rename_path, file)

        # Directories and already-suffixed files are left untouched.
        if file.endswith(".json") or not os.path.isfile(full_path):
            continue

        # Hidden files (e.g. .DS_Store) are OS/editor artefacts, not data.
        if file.startswith("."):
            continue

        new_name = file + ".json"
        new_full_path = os.path.join(file_rename_path, new_name)

        # On POSIX os.rename() silently replaces an existing destination file,
        # so never rename onto a file that is already there.
        if os.path.exists(new_full_path):
            print(f"Skipped {file}: {new_name} already exists")
            continue

        os.rename(full_path, new_full_path)
        print(f"Renamed the file {file} to {new_name}")

#### Test reading file data 


In [None]:
# Load a single sample record into a DataFrame to explore its structure.
df = pd.read_json('../Project/2018/201800000.json')

In [None]:
# (number of rows, number of columns) of the sample frame.
df.shape

In [None]:
# Summary statistics for the columns of the sample frame.
df.describe()

In [None]:
# Column labels of the sample frame.
df.columns

In [None]:
# Preview the first rows. A bare last expression lets the notebook render the
# frame with its rich (HTML) display instead of print()'s plain-text dump.
df.head()

In [None]:
# Print the frame summary: index, per-column non-null counts and dtypes.
df.info()

In [None]:
# Display the sample frame.
df

In [None]:
# Print the indexed name of every author entry found in the sample record.
response_column = df['abstracts-retrieval-response']
author_entries = response_column.get('authors').get('author')
for author in author_entries:
    print(author.get('ce:indexed-name'))

In [None]:
# Walk the response column and print each (label, value) pair as a tuple,
# one per line.
for label, value in df['abstracts-retrieval-response'].items():
    print((label, value))

In [None]:
# Read the '@xml:lang' attribute from the "language" entry of the sample record.
response_column = df['abstracts-retrieval-response']
lang = response_column.language.get('@xml:lang')
lang

In [None]:
# df['abstracts-retrieval-response'].coredata

In [None]:
# Accumulator for the per-file row dicts; filled by the loading loop and
# later turned into a DataFrame.
data_rows = []

In [None]:
def _as_list(value):
    """Normalise ``value`` to a list: ``None`` -> ``[]``, a list is returned as is,
    anything else (e.g. a single dict) is wrapped in a one-element list."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _dig(mapping, *keys):
    """Follow ``keys`` through nested dicts.

    Returns ``None`` as soon as a level is missing, is ``null`` or is not a
    dict, instead of raising like a chained ``.get()`` would.
    """
    current = mapping
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def _extract_countries(response):
    """Return the comma-joined affiliation countries of all author groups.

    A group without a usable affiliation/country contributes ``"Unknown"``.
    ``author-group`` and ``affiliation`` may each be a single dict or a list.
    """
    countries = []
    for group in _as_list(_dig(response, "item", "bibrecord", "head", "author-group")):
        affiliations = _as_list(group.get("affiliation")) if isinstance(group, dict) else []
        found = [
            str(affiliation.get("country") or "Unknown")
            for affiliation in affiliations
            if isinstance(affiliation, dict)
        ]
        countries.extend(found or ["Unknown"])
    return ",".join(countries)


def _extract_keywords(response):
    """Return the comma-joined author keywords, or ``"null"`` when there are none.

    ``authkeywords`` may be ``null`` and ``author-keyword`` may be a single
    dict instead of a list; both cases are handled.
    """
    entries = _as_list(_dig(response, "authkeywords", "author-keyword"))
    keywords = [str(entry.get("$") or "") for entry in entries if isinstance(entry, dict)]
    return ",".join(keywords) if keywords else "null"


def _record_to_row(data):
    """Flatten one parsed JSON document into a single row dict.

    Raises:
        ValueError: if the JSON root is not an object.
    """
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object at the root, got {type(data).__name__}")

    response = data.get("abstracts-retrieval-response")
    if not isinstance(response, dict):
        response = {}

    # json_normalize flattens nested dicts into dotted column names (one row).
    flat = pd.json_normalize(response)
    row = {col: flat[col].iloc[0] for col in flat.columns}

    # Replace the nested structures with plain comma-separated strings.
    row["item.bibrecord.head.author-group.affiliation.country"] = _extract_countries(response)
    row["authkeywords.author-keyword"] = _extract_keywords(response)
    return row


# Rebuild the accumulator here so that re-running this cell does not append
# every record a second time.
data_rows = []
failed_files = []  # paths of files that could not be parsed, kept for inspection

for year in years:
    folder_path = f"../Project/{year}"
    # sorted() gives a reproducible row order; os.listdir() order is arbitrary.
    for file_name in sorted(os.listdir(folder_path)):
        if not (file_name.startswith(str(year)) and file_name.endswith(".json")):
            continue

        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as handle:
            try:
                data = json.load(handle)
                data_rows.append(_record_to_row(data))
            except Exception as e:
                # Best effort: keep going, but report the failure instead of
                # silently dropping the record.
                failed_files.append(file_path)
                print(f"Error processing file {file_name}: {e}")

print(f"Loaded {len(data_rows)} records; {len(failed_files)} files could not be processed")

In [None]:
# Show how many records were collected rather than dumping every nested row
# dict into the cell output.
len(data_rows)

In [None]:
# Build the merged frame from the collected row dicts (one row per file).
# NOTE: this rebinds `df`, which previously held the single-file sample frame.
df = pd.DataFrame(data_rows)

In [None]:
# Persist the merged frame as CSV, without writing the index column.
df.to_csv('../MergedData.csv', index=False)

In [None]:
# Preview the merged frame; .head() keeps the output small and readable.
df.head()

In [None]:
# Names of the columns that the next cell flags as having too many missing values.
listToRemove = []

In [None]:
# Flag every column whose missing-value count exceeds 10% of the rows.
# The list is rebuilt from scratch so that re-running this cell never
# accumulates duplicate column names.
listToRemove = []

# The threshold is the same for every column, so compute it once.
max_nulls = int(0.1 * len(df))

# df.isna().sum() yields the per-column null counts in column order.
for col, nullCnt in df.isna().sum().items():
    print(nullCnt, end=" ")
    if nullCnt > max_nulls:
        listToRemove.append(col)

In [None]:
# Display the columns flagged for removal.
listToRemove