In [8]:
%pip install pyspark
%pip install logger

Note: you may need to restart the kernel to use updated packages.
Collecting logger
  Downloading logger-1.4.tar.gz (1.2 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: logger
  Building wheel for logger (setup.py): started
  Building wheel for logger (setup.py): finished with status 'done'
  Created wheel for logger: filename=logger-1.4-py3-none-any.whl size=1761 sha256=5b19809eddbf7d5e2a4fd4d0716ef470a0888b8cd2a6cef50866bced9bc721c9
  Stored in directory: c:\users\yiwahpsp\appdata\local\pip\cache\wheels\cc\ef\15\aadfb106e1cc7ac1d668efc189bcd98c444211847f7d91bd02
Successfully built logger
Installing collected packages: logger
Successfully installed logger-1.4
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import json
import glob
import os
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

In [2]:
raw_data_root_path = 'raw-data'

## Change files in raw data to .json file

In [58]:
for root, dirs, files in os.walk(raw_data_root_path):
        for file in files:
            file_path = os.path.join(root, file)
            
            if '.' not in file:
                new_path = file_path + '.json'
                os.rename(file_path, new_path)
                file_path = new_path
                print(f"Renamed {file_path} to {new_path}")

## Make Flatten file

In [5]:
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '.')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '.')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

# Example usage

spark = SparkSession.builder.appName("jsonFlatten").getOrCreate()
with open('raw-data\\2023\\202300002.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

flattened = flatten_json(json_data)
for key, value in flattened.items():
    print(f"{key}: {value}")

abstracts-retrieval-response.item.ait:process-info.ait:status.@state: update
abstracts-retrieval-response.item.ait:process-info.ait:status.@type: core
abstracts-retrieval-response.item.ait:process-info.ait:status.@stage: S300
abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@day: 07
abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@timestamp: 2023-08-07T17:46:05.000005-04:00
abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@year: 2023
abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@month: 08
abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@day: 15
abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@year: 2023
abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@month: 12
abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.@pui-match: primary
abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.@has-funding-info: 1
abstracts-retrieval-respo

In [6]:
def get_json_schema_and_data(json_data, current_path="", indent=0):
    """Recursively prints the schema and data of JSON with full path."""
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            new_path = f"{current_path}.{key}" if current_path else key
            if isinstance(value, (dict, list)):
                print(' ' * indent + f"{new_path}: {type(value).__name__}")
                get_json_schema_and_data(value, new_path, indent + 2)
            else:
                print(' ' * indent + f"{new_path}: {type(value).__name__} = {value}")
    
    elif isinstance(json_data, list):
        if len(json_data) > 0:
            print(' ' * indent + f"{current_path}: List of {type(json_data[0]).__name__}")
            for idx, item in enumerate(json_data):
                get_json_schema_and_data(item, f"{current_path}[{idx}]", indent + 2)
        else:
            print(' ' * indent + f"{current_path}: Empty List")
    
    else:
        print(' ' * indent + f"{current_path}: {type(json_data).__name__} = {json_data}")

# Example usage
json_file_path = 'raw-data/2020/202000000.json'
with open(json_file_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

print("JSON Schema and Data:")
get_json_schema_and_data(json_data)

JSON Schema and Data:
abstracts-retrieval-response: dict
  abstracts-retrieval-response.item: dict
    abstracts-retrieval-response.item.ait:process-info: dict
      abstracts-retrieval-response.item.ait:process-info.ait:status: dict
        abstracts-retrieval-response.item.ait:process-info.ait:status.@state: str = update
        abstracts-retrieval-response.item.ait:process-info.ait:status.@type: str = core
        abstracts-retrieval-response.item.ait:process-info.ait:status.@stage: str = S300
      abstracts-retrieval-response.item.ait:process-info.ait:date-delivered: dict
        abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@day: str = 14
        abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@timestamp: str = 2021-11-14T16:58:37.000037-05:00
        abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@year: str = 2021
        abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@month: str = 11
   

## Get classification of enhancement

In [36]:
def extract_classifications(json_file):
    global enhancement_count_error_files, enhancement_count_files
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    classifications = data.get('abstracts-retrieval-response', {}).get('item', {}).get('bibrecord', {}).get('head', {}).get('enhancement', {}).get('classificationgroup', {}).get('classifications', [])
    
    records = []
    for classification in classifications:
        classification_type = classification.get('@type', '')
        classification_data = classification.get('classification', {})
        
        if isinstance(classification_data, list):
            for cls in classification_data:
                record = {
                    'file': Path(json_file).name,
                    'type': classification_type,
                    'classification': cls.get('$', ''),
                    'classification_code': cls.get('classification-code', ''),
                    'classification_description': cls.get('classification-description', ''),
                }
                records.append(record)
        elif isinstance(classification_data, dict):
            record = {
                'file': Path(json_file).name,
                'type': classification_type,
                'classification': classification_data.get('$', classification_data.get('classification', '')),
                'classification_code': classification_data.get('classification-code', ''),
                'classification_description': classification_data.get('classification-description', ''),
            }
            records.append(record)
        else:
            # Handle cases where 'classification' is neither dict nor list
            record = {
                'file': Path(json_file).name,
                'type': classification_type,
                'classification': classification.get('classification', ''),
                'classification_code': classification.get('classification-code', ''),
                'classification_description': classification.get('classification-description', ''),
            }
            records.append(record)
    
    return records

def process_json_folders(root_folder_path):
    global enhancement_count_error_files, enhancement_count_files
    all_records = []
    root_path = Path(root_folder_path)
    
    for json_file in root_path.rglob('*.json'):
        try:
            enhancement_count_files += 1
            records = extract_classifications(json_file)
            all_records.extend(records)
        except Exception as e:
            print(f"Error processing {json_file}: {e}")
            enhancement_count_error_files += 1
    
    if all_records:
        df = pd.DataFrame(all_records)
        df.to_csv('classification_enhancement/classifications_separated.csv', index=False, encoding='utf-8')
        print(f"Classifications saved to classifications_separated.csv")
        print(f"Records processed with errors: {enhancement_count_error_files} from {enhancement_count_files} files")
        print(f"Approximate error: {enhancement_count_error_files / enhancement_count_files * 100:.2f}%")
    else:
        print("No records found to save.")

# Process JSON folders
enhancement_count_files = 0
enhancement_count_error_files = 0
process_json_folders(raw_data_root_path)

Classifications saved to classifications_separated.csv
Records processed with errors: 0 from 20216 files
Approximate error: 0.00%


### Narrowing down the classifications

In [58]:
df_classifications = pd.read_csv('classification_enhancement/classifications_separated.csv')
df_classifications.describe(include='all')

Unnamed: 0,file,type,classification,classification_code,classification_description
count,145669,145669,85921,59748,59748
unique,20216,8,348,1735,1751
top,202202738.json,ASJC,MEDI,902,FLUIDEX; Related Topics
freq,29,50064,5639,4290,4290


In [7]:
shape_df_classifications = df_classifications.shape
columns_df_classifications = df_classifications.columns
print('Shape:', shape_df_classifications)
print('Columns:', columns_df_classifications)

Shape: (145669, 5)
Columns: Index(['file', 'type', 'classification', 'classification_code',
       'classification_description'],
      dtype='object')


In [8]:
df_classifications.head(5)

Unnamed: 0,file,type,classification,classification_code,classification_description
0,201800000.json,ASJC,2700,,
1,201800000.json,SUBJABBR,MEDI,,
2,201800001.json,ASJC,2208,,
3,201800001.json,ASJC,2504,,
4,201800001.json,CPXCLASS,,402.0,Buildings and Towers


### Classification Description

In [96]:
df_classifications_by_descriptions = df_classifications.drop(columns=['type','file','classification_code'])
df_classifications_by_descriptions.head(5)


Unnamed: 0,classification,classification_description
0,2700,
1,MEDI,
2,2208,
3,2504,
4,,Buildings and Towers


In [97]:
df_classifications_by_descriptions['classification_description'].fillna(df_classifications_by_descriptions['classification'], inplace=True)
df_classifications_by_descriptions.head(5)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_classifications_by_descriptions['classification_description'].fillna(df_classifications_by_descriptions['classification'], inplace=True)


Unnamed: 0,classification,classification_description
0,2700,2700
1,MEDI,MEDI
2,2208,2208
3,2504,2504
4,,Buildings and Towers


In [98]:
df_classifications_by_descriptions['classification_description'].isnull().sum()
df_classifications_by_descriptions.drop(columns=['classification'], inplace=True)

In [99]:
df_classifications_by_descriptions.head(5)

Unnamed: 0,classification_description
0,2700
1,MEDI
2,2208
3,2504
4,Buildings and Towers


In [100]:
df_classifications_by_descriptions['classification_description'] = df_classifications_by_descriptions['classification_description'].apply(lambda x: '; '.join(x.split('; ')[:1]))
df_classifications_by_descriptions = df_classifications_by_descriptions.groupby(['classification_description']).size().reset_index(name='Count')
df_classifications_by_descriptions.head(5)

Unnamed: 0,classification_description,Count
0,1000,1088
1,1100,229
2,1101,123
3,1102,137
4,1103,362


In [101]:
df_classifications_by_descriptions.sort_values(by='Count', ascending=False, inplace=True)
df_classifications_by_descriptions.head(10)

Unnamed: 0,classification_description,Count
838,MEDI,5639
659,FLUIDEX,4290
564,ENGI,2994
554,Drug Literature Index,2752
393,BIOC,2436
465,Clinical and Experimental Biochemistry,2316
427,CHEM,2161
990,PHYS,2072
836,MATE,1988
323,AGRI,1901


In [102]:
shape_df_classifications_by_descriptions = df_classifications_by_descriptions.shape
columns_df_classifications_by_descriptions = df_classifications_by_descriptions.columns
print('Shape:', shape_df_classifications_by_descriptions)
print('Columns:', columns_df_classifications_by_descriptions)

Shape: (1314, 2)
Columns: Index(['classification_description', 'Count'], dtype='object')


In [103]:
df_classifications_by_descriptions.to_csv('classification_enhancement/classifications_by_description.csv', index=False, encoding='utf-8')

### Classification type

In [75]:
df_classifications_groupby_type = df_classifications.drop(columns=['classification_code','classification_description','classification'])
df_classifications_groupby_type.describe(include='all')

Unnamed: 0,file,type
count,145669,145669
unique,20216,8
top,202202738.json,ASJC
freq,29,50064


In [77]:
df_classifications_groupby_type = df_classifications_groupby_type.groupby('type').count().reset_index()
df_classifications_groupby_type.rename(columns={df_classifications_groupby_type.columns[1]: 'count'}, inplace=True)
df_classifications_groupby_type.drop_duplicates(inplace=True)
df_classifications_groupby_type.sort_values(by='count', inplace=True)
df_classifications_groupby_type.head(5)


Unnamed: 0,type,count
4,ENCOMPASSCLASS,1075
6,GEOCLASS,1828
1,CABSCLASS,3224
5,FLXCLASS,5935
3,EMCLASS,18629


In [78]:
df_classifications_groupby_type.drop_duplicates(inplace=True)

In [79]:
shape_df_classifications_groupby_type = df_classifications_groupby_type.shape
columns_df_classifications_groupby_type = df_classifications_groupby_type.columns
print('Shape:', shape_df_classifications_groupby_type)
print('Columns:', columns_df_classifications_groupby_type)

Shape: (8, 2)
Columns: Index(['type', 'count'], dtype='object')


In [80]:
df_classifications_groupby_type.to_csv('classification_enhancement/classifications_by_type.csv', index=False, encoding='utf-8')

### Classification type and descriptions

In [86]:
df_classifications_by_type_and_descriptions = df_classifications.drop(columns=['file','classification_code'])
df_classifications_by_type_and_descriptions.describe(include='all')


Unnamed: 0,type,classification,classification_description
count,145669,85921,59748
unique,8,348,1751
top,ASJC,MEDI,FLUIDEX; Related Topics
freq,50064,5639,4290


In [87]:
df_classifications_by_type_and_descriptions['classification_description'].fillna(df_classifications_by_type_and_descriptions['classification'], inplace=True)
df_classifications_by_type_and_descriptions.head(5)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_classifications_by_type_and_descriptions['classification_description'].fillna(df_classifications_by_type_and_descriptions['classification'], inplace=True)


Unnamed: 0,type,classification,classification_description
0,ASJC,2700,2700
1,SUBJABBR,MEDI,MEDI
2,ASJC,2208,2208
3,ASJC,2504,2504
4,CPXCLASS,,Buildings and Towers


In [88]:
df_classifications_by_type_and_descriptions['classification_description'].isnull().sum()
df_classifications_by_type_and_descriptions.drop(columns=['classification'], inplace=True)
df_classifications_by_type_and_descriptions.head(5)

Unnamed: 0,type,classification_description
0,ASJC,2700
1,SUBJABBR,MEDI
2,ASJC,2208
3,ASJC,2504
4,CPXCLASS,Buildings and Towers


In [89]:
df_classifications_by_type_and_descriptions['classification_description'] = df_classifications_by_type_and_descriptions['classification_description'].apply(lambda x: '; '.join(x.split('; ')[:1]))
df_classifications_by_type_and_descriptions.head(10)

Unnamed: 0,type,classification_description
0,ASJC,2700
1,SUBJABBR,MEDI
2,ASJC,2208
3,ASJC,2504
4,CPXCLASS,Buildings and Towers
5,CPXCLASS,Semiconductor Devices and Integrated Circuits
6,CPXCLASS,"Electronic Equipment, Radar, Radio and Television"
7,CPXCLASS,Television Systems and Equipment
8,CPXCLASS,"Computer Software, Data Handling and Applications"
9,CPXCLASS,Organic Polymers


In [90]:
df_classifications_by_type_and_descriptions = df_classifications_by_type_and_descriptions.groupby(['type', 'classification_description']).size().reset_index(name='Count')
df_classifications_by_type_and_descriptions.sort_values(by='Count', ascending=False, inplace=True)
df_classifications_by_type_and_descriptions.head(10)

Unnamed: 0,type,classification_description,Count
1312,SUBJABBR,MEDI,5639
1014,FLXCLASS,FLUIDEX,4290
1306,SUBJABBR,ENGI,2994
971,EMCLASS,Drug Literature Index,2752
1296,SUBJABBR,BIOC,2436
966,EMCLASS,Clinical and Experimental Biochemistry,2316
1299,SUBJABBR,CHEM,2161
1317,SUBJABBR,PHYS,2072
1310,SUBJABBR,MATE,1988
1294,SUBJABBR,AGRI,1901


In [91]:
shape_df_classifications_by_type_and_descriptions = df_classifications_by_type_and_descriptions.shape
columns_df_classifications_by_type_and_descriptions = df_classifications_by_type_and_descriptions.columns
print('Shape:', shape_df_classifications_by_type_and_descriptions)
print('Columns:', columns_df_classifications_by_type_and_descriptions)

Shape: (1321, 3)
Columns: Index(['type', 'classification_description', 'Count'], dtype='object')


In [93]:
df_classifications_by_type_and_descriptions.to_csv('classification_enhancement/classifications_by_type_and_description.csv', index=False, encoding='utf-8')

## Get authkeywords

In [None]:
def extract_author_keywords(json_file):
    """Extract author keywords from JSON file"""
    global author_keywords_count_files, author_keywords_count_error_files
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    records = []
    
    # Check if path exists and has data
    auth_data = data.get('abstracts-retrieval-response', {})
    keywords_data = auth_data.get('authkeywords')
    
    if keywords_data is not None:
        # Process keywords only if data exists
        keywords_list = keywords_data.get('author-keyword', [])
        
        if isinstance(keywords_list, dict):
            keywords_list = [keywords_list]
        
        for keyword in keywords_list:
            record = {
                'file': Path(json_file).name,
                'auth_keyword': keyword.get('$', '')
            }
            records.append(record)
    else:
        print(f"No authkeywords found in {json_file}")
        author_keywords_count_error_files += 1
    
    return records
def process_json_folders(root_folder_path):
    """Process all JSON files in root folder and subfolders."""
    global author_keywords_count_files, author_keywords_count_error_files
    all_records = []
    root_path = Path(root_folder_path)
    
    # Create output directory if it doesn't exist
    output_dir = Path('author_keywords')
    output_dir.mkdir(exist_ok=True)
    
    # Process each JSON file
    for json_file in root_path.rglob('*.json'):
        try:
            author_keywords_count_files += 1
            records = extract_author_keywords(json_file)
            all_records.extend(records)  # Add records to the main list
        except Exception as e:
            print(f"Error processing {json_file}: {e}")
            author_keywords_count_error_files += 1
    
    if all_records:
        # Convert to DataFrame
        df = pd.DataFrame(all_records)
        # Save to CSV
        output_csv_path = output_dir / 'author_keywords_separate.csv'
        df.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"Author keywords saved to {output_csv_path}")
        print(f"Total records processed: {len(all_records)}")
        print(f"Records processed with errors: {author_keywords_count_error_files} from {author_keywords_count_files}")
    else:
        print("No records found to save.")

# Usage
author_keywords_count_error_files = 0
author_keywords_count_files = 0
process_json_folders(raw_data_root_path)

No authkeywords found in raw-data\2018\201800000.json
No authkeywords found in raw-data\2018\201800001.json
No authkeywords found in raw-data\2018\201800007.json
No authkeywords found in raw-data\2018\201800009.json
No authkeywords found in raw-data\2018\201800014.json
No authkeywords found in raw-data\2018\201800016.json
No authkeywords found in raw-data\2018\201800020.json
No authkeywords found in raw-data\2018\201800021.json
No authkeywords found in raw-data\2018\201800022.json
No authkeywords found in raw-data\2018\201800023.json
No authkeywords found in raw-data\2018\201800024.json
No authkeywords found in raw-data\2018\201800025.json
No authkeywords found in raw-data\2018\201800026.json
No authkeywords found in raw-data\2018\201800027.json
No authkeywords found in raw-data\2018\201800028.json
No authkeywords found in raw-data\2018\201800029.json
No authkeywords found in raw-data\2018\201800030.json
No authkeywords found in raw-data\2018\201800031.json
No authkeywords found in raw

- Total records processed: 83074
- Records processed with errors: 3762 from 20216 files. Approximate 18.61 %

In [104]:
df_author_keywords = pd.read_csv('author_keywords/author_keywords_separate.csv')
df_author_keywords.describe(include='all')

Unnamed: 0,file,auth_keyword
count,83074,83073
unique,16454,51856
top,202300390.json,Thailand
freq,30,746


In [107]:
df_author_keywords.shape

(83073, 2)

In [106]:
df_author_keywords.dropna(inplace=True)

In [108]:
author_keywords_groupby_keys = (df_author_keywords.groupby('auth_keyword').size().reset_index(name='Count').sort_values(by='Count', ascending=False))
author_keywords_groupby_keys.head(10)

Unnamed: 0,auth_keyword,Count
33471,Thailand,746
4800,COVID-19,282
14706,Hadron-Hadron scattering (experiments),155
16845,Inflammation,130
29238,SARS-CoV-2,115
14555,HIV,114
4687,CMS,98
24119,Oxidative stress,83
2491,Asia,81
19691,Machine learning,78


In [109]:
author_keywords_groupby_keys.to_csv('author_keywords/author_keywords_groupby_keys.csv', index=False, encoding='utf-8')

## Get Links from core data

In [119]:
def extract_links(json_file):
    """Extract links from JSON file"""
    global link_count_files, link_count_error_files
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    records = []
    
    # Navigate to links section
    title = (data.get('abstracts-retrieval-response', {})
            .get('coredata', {})
            .get('dc:title',{}))
    links = (data.get('abstracts-retrieval-response', {})
            .get('coredata', {})
            .get('link', []))
    
    # Process each link
    for link in links:
        record = {
            'file': title,
            'fa': link.get('@_fa', ''),
            'rel_type': link.get('@rel', ''),
            'href': link.get('@href', '')
        }
        records.append(record)
    
    return records

def process_json_folders(root_folder_path):
    """Process all JSON files in root folder and subfolders"""
    global link_count_files, link_count_error_files
    all_records = []
    root_path = Path(root_folder_path)
    
    # Create output directory
    Path('links').mkdir(exist_ok=True)
    
    # Process each JSON file
    for json_file in root_path.rglob('*.json'):
        try:
            link_count_files += 1
            records = extract_links(json_file)
            all_records.extend(records)
        except Exception as e:
            print(f"Error processing {json_file}: {e}")
    
    if all_records:
        df = pd.DataFrame(all_records)
        output_file = 'links/links_separate.csv'
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Processed {len(all_records)} links to {output_file}")
        print(f"Records processed with errors: {link_count_error_files} from {link_count_files} files")
        print(f"Approximate error: {link_count_error_files / link_count_files * 100:.2f}%")
    else:
        print("No links found")
        link_count_error_files += 1

# Usage
link_count_files = 0
link_count_error_files = 0
process_json_folders(raw_data_root_path)

Processed 60648 links to links/links_separate.csv
Records processed with errors: 0 from 20216 files
Approximate error: 0.00%


In [141]:
df_links = pd.read_csv('links/links_separate.csv')
df_links.describe(include='all')

Unnamed: 0,file,fa,rel_type,href
count,60648,60648,60648,60648
unique,20141,1,3,60648
top,Preface,True,self,https://api.elsevier.com/content/abstract/scop...
freq,54,60648,20216,1


In [142]:
df_links.drop(columns=['fa'], inplace=True)

In [143]:
shape_df_links = df_links.shape
columns_df_links = df_links.columns
print('Shape:', shape_df_links)
print('Columns:', columns_df_links)

Shape: (60648, 3)
Columns: Index(['file', 'rel_type', 'href'], dtype='object')


In [144]:
df_links = df_links.drop_duplicates()
df_links.head(10)

Unnamed: 0,file,rel_type,href
0,Public health and international epidemiology f...,self,https://api.elsevier.com/content/abstract/scop...
1,Public health and international epidemiology f...,scopus,https://www.scopus.com/inward/record.uri?partn...
2,Public health and international epidemiology f...,scopus-citedby,https://www.scopus.com/inward/citedby.uri?part...
3,Flexible Printed Active Antenna for Digital Te...,self,https://api.elsevier.com/content/abstract/scop...
4,Flexible Printed Active Antenna for Digital Te...,scopus,https://www.scopus.com/inward/record.uri?partn...
5,Flexible Printed Active Antenna for Digital Te...,scopus-citedby,https://www.scopus.com/inward/citedby.uri?part...
6,Parametric study of hydrogen production via so...,self,https://api.elsevier.com/content/abstract/scop...
7,Parametric study of hydrogen production via so...,scopus,https://www.scopus.com/inward/record.uri?partn...
8,Parametric study of hydrogen production via so...,scopus-citedby,https://www.scopus.com/inward/citedby.uri?part...
9,Superhydrophobic coating from fluoroalkylsilan...,self,https://api.elsevier.com/content/abstract/scop...


In [146]:
df_links_self = df_links[df_links['rel_type'] == 'self']
df_links_scopus = df_links[df_links['rel_type'] == 'scopus']
df_links_scopus_citedby = df_links[df_links['rel_type'] == 'scopus-citedby']

In [148]:
df_links_self.head(10)
df_links_self.drop(columns=['rel_type'], inplace=True)
df_links_self.to_csv('links/links_self.csv', index=False, encoding='utf-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_links_self.drop(columns=['rel_type'], inplace=True)


In [149]:
df_links_scopus.head(10)
df_links_scopus.drop(columns=['rel_type'], inplace=True)
df_links_scopus.to_csv('links/links_scopus.csv', index=False, encoding='utf-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_links_scopus.drop(columns=['rel_type'], inplace=True)


In [150]:
df_links_scopus_citedby.head(10)
df_links_scopus_citedby.drop(columns=['rel_type'], inplace=True)
df_links_scopus_citedby.to_csv('links/links_scopus_citedby.csv', index=False, encoding='utf-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_links_scopus_citedby.drop(columns=['rel_type'], inplace=True)


## Get subject areas

In [None]:
def extract_subject_areas(json_file):
    """Extract subject areas from JSON file"""
    global subject_areas_count_files, sum_subject_areas_count_error_files
    records = []
    
    try:
        # Read JSON file
        with open(json_file, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from {json_file}: {e}")
                return records
    except FileNotFoundError:
        print(f"File not found: {json_file}")
        return records
    except Exception as e:
        print(f"Error reading file {json_file}: {e}")
        return records
    
    # Navigate to required data
    try:
        title = (data.get('abstracts-retrieval-response', {})
                .get('coredata', {})
                .get('dc:title', ''))
        
        subject_areas = (data.get('abstracts-retrieval-response', {})
                        .get('subject-areas', {})
                        .get('subject-area', []))
        
        # Validate data
        if not title:
            print(f"No title found in {json_file}")
        
        # Handle single subject area case
        if isinstance(subject_areas, dict):
            subject_areas = [subject_areas]
        elif not subject_areas:
            print(f"No subject areas found in {json_file}")
            return records
        
        # Process each subject area
        for subject in subject_areas:
            if not isinstance(subject, dict):
                continue
                
            record = {
                'file': Path(json_file).name,
                'title': title,
                'Subject_Name': subject.get('$', ''),
            }
            records.append(record)
            
    except Exception as e:
        print(f"Error processing data from {json_file}: {e}")
        
    return records

def process_json_folders(root_folder_path):
    """Process all JSON files in root folder and subfolders"""
    global subject_areas_count_files, sum_subject_areas_count_error_files
    all_records = []
    root_path = Path(root_folder_path)
    
    # Create output directory
    Path('subject_areas').mkdir(exist_ok=True)
    
    # Process each JSON file
    for json_file in root_path.rglob('*.json'):
        try:
            subject_areas_count_files += 1
            records = extract_subject_areas(json_file)
            all_records.extend(records)
        except Exception as e:
            sum_subject_areas_count_error_files += 1
            print(f"Error processing {json_file}: {e}")
    
    if all_records:
        df = pd.DataFrame(all_records)
        output_file = 'subject_areas/subject_areas_separate.csv'
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Processed {len(all_records)} subject areas to {output_file}")
        print(f"Records processed with errors: {sum_subject_areas_count_error_files} from {subject_areas_count_files} files")
        print(f"Approximate error: {sum_subject_areas_count_error_files / subject_areas_count_files * 100:.2f}%")
    else:
        print("No subject areas found")

subject_areas_count_files = 0
sum_subject_areas_count_error_files = 0
process_json_folders(raw_data_root_path)

No title found in raw-data\2019\201902385.json
Processed 50064 subject areas to subject_areas/subject_areas_separate.csv
Records processed with errors: 0 from 20216 files
Approximate error: 0.00%


In [155]:
df_subject_areas = pd.read_csv('subject_areas/subject_areas_separate.csv')
df_subject_areas.describe(include='all')

Unnamed: 0,file,title,Subject_Name
count,50064,50062,50064
unique,20216,20140,321
top,202001606.json,Preface,Multidisciplinary
freq,12,44,1088


In [156]:
shape_df_subject_areas = df_subject_areas.shape
columns_df_subject_areas = df_subject_areas.columns
print('Shape:', shape_df_subject_areas)
print('Columns:', columns_df_subject_areas)

Shape: (50064, 3)
Columns: Index(['file', 'title', 'Subject_Name'], dtype='object')


In [157]:
df_subject_areas.head(5)

Unnamed: 0,file,title,Subject_Name
0,201800000.json,Public health and international epidemiology f...,Medicine (all)
1,201800001.json,Flexible Printed Active Antenna for Digital Te...,Electrical and Electronic Engineering
2,201800001.json,Flexible Printed Active Antenna for Digital Te...,"Electronic, Optical and Magnetic Materials"
3,201800002.json,Parametric study of hydrogen production via so...,Chemistry (all)
4,201800002.json,Parametric study of hydrogen production via so...,Chemical Engineering (all)


In [162]:
list_subject_areas = df_subject_areas['Subject_Name'].value_counts()
list_subject_areas.sort_values(ascending=False, inplace=True)
list_subject_areas.head(10)
list_subject_areas.to_csv('subject_areas/subject_areas_list.csv', index=True, encoding='utf-8')

In [163]:
list_subject_areas.shape

(321,)

## Get references

In [24]:
def extract_references_summary(json_file):
    """Extract references and return as list per file"""
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        abs_resp = data.get('abstracts-retrieval-response', {})
        references = (abs_resp.get('item', {})
                     .get('bibrecord', {})
                     .get('tail', {})
                     .get('bibliography', {})
                     .get('reference'))
        
        if references is None:
            return {
                'file': Path(json_file).name,
                'references': []
            }
            
        if isinstance(references, dict):
            references = [references]
            
        ref_titles = []
        for ref in references:
            if isinstance(ref, dict):
                ref_title = ref.get('ref-info', {}).get('ref-sourcetitle')
                if ref_title:
                    ref_titles.append(ref_title)
        
        return {
            'file': Path(json_file).name,
            'references': ref_titles
        }
                    
    except Exception as e:
        print(f"Error processing {json_file}: {e}")
        return {
            'file': Path(json_file).name,
            'references': []
        }

def process_json_folders_summary(root_folder_path):
    """Process files and create summary DataFrame"""
    all_records = []
    root_path = Path(root_folder_path)
    
    for json_file in root_path.rglob('*.json'):
        record = extract_references_summary(json_file)
        all_records.append(record)
    
    if all_records:
        df = pd.DataFrame(all_records)
        # Convert references list to string for CSV storage
        df['references'] = df['references'].apply(lambda x: '|'.join(x) if x else '')
        
        output_file = 'references_summary.csv'
        df.to_csv(output_file, index=False)
        print(f"Processed {len(df)} files to {output_file}")
        return df
    
    return pd.DataFrame(columns=['file', 'references'])

# Usage
root_folder_path = 'raw-data'
summary_df = process_json_folders_summary(root_folder_path)

Error processing raw-data\2018\201800036.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800075.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800089.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800109.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800112.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800113.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800114.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800115.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800390.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800491.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\201800587.json: 'NoneType' object has no attribute 'get'
Error processing raw-data\2018\2

In [25]:
df_references = pd.read_csv('references/references_separate.csv')
df_references.describe(include='all')

Unnamed: 0,file,title,reference
count,886277,886234,885866
unique,20204,20128,177448
top,202103556.json,Guidelines for the use and interpretation of a...,JHEP
freq,4084,4084,6345


In [29]:
shape_df_references = df_references.shape
columns_df_references = df_references.columns
print('Shape:', shape_df_references)
print('Columns:', columns_df_references)


Shape: (886277, 2)
Columns: Index(['file', 'reference'], dtype='object')


In [30]:
df_references.head(5)

Unnamed: 0,file,reference
0,201800000.json,Science.
1,201800000.json,The future of public health
2,201800000.json,International encyclopedia of public health.
3,201800000.json,Encyclopedia of public health.
4,201800000.json,Definition of Public health


In [27]:
df_references.drop(columns=['title'], inplace=True)

In [35]:
pivot_df_references = (
    df_references.pivot_table(
        index=["file"],
        values="reference",
        aggfunc=lambda x: list(x.dropna().astype(str))  # Combine references into a list, handle NaN values
    )
    .reset_index()
)

pivot_df_references.to_csv("references/references_pivot.csv", index=False, encoding='utf-8')

## Get title

In [9]:
def extract_title_summary(json_file):
    """Extract references and return as list per file"""
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        abs_resp = data.get('abstracts-retrieval-response', {})
        title = abs_resp.get('coredata', {}).get('dc:title', np.nan)
        publish_name = abs_resp.get('coredata', {}).get('prism:publicationName', np.nan)
            
        return {
            'file': Path(json_file).name,
            'title': title,
            'publish_name': publish_name,
        }
                    
    except Exception as e:
        print(f"Error processing {json_file}: {e}")
        return {
            'file': Path(json_file).name,
            'title': np.nan,
            'publish_name': np.nan,
        }

def process_json_folders_summary(root_folder_path):
    """Process files and create summary DataFrame"""
    all_records = []
    root_path = Path(root_folder_path)
    
    for json_file in root_path.rglob('*.json'):
        record = extract_title_summary(json_file)
        all_records.append(record)
    
    if all_records:
        df = pd.DataFrame(all_records)
        output_file = 'title_publish/title_publish_name_separate.csv'
        df.to_csv(output_file, index=False)
        print(f"Processed {len(df)} files to {output_file}")
        return df
    
    return pd.DataFrame(columns=['file', 'title', 'publish_name'])

# Usage
root_folder_path = 'raw-data'
summary_df = process_json_folders_summary(root_folder_path)

Processed 20216 files to title_publish/title_publish_name_separate.csv
