In [9]:
import pandas as pd
import numpy as np
import json
import glob
import os
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

## Code

In [33]:
def get_array_or_dict_values(data, path, key):
    """Extract values from deeply nested arrays/dictionaries."""
    try:
        value = data
        for p in path:
            if isinstance(value, dict):
                value = value.get(p, None)
            elif isinstance(value, list):
                # Handle list of dicts or list of lists
                temp_values = []
                for item in value:
                    if isinstance(item, dict):
                        item_value = item.get(p, None)
                        if isinstance(item_value, list):
                            temp_values.extend(item_value)
                        elif item_value is not None:
                            temp_values.append(item_value)
                    elif isinstance(item, list):
                        temp_values.extend(item)
                value = temp_values if temp_values else None
            else:
                return [None]

        # Handle final value
        if value is None:
            return [None]
        elif isinstance(value, dict):
            # Handle dictionary case
            if key in value:
                return [value[key]]
            elif '$' in value:  # Special case for $ key
                return [value['$']]
            return [None]
        elif isinstance(value, list):
            # Handle list case
            results = []
            for item in value:
                if isinstance(item, dict):
                    if key in item:
                        results.append(item[key])
                    elif '$' in item:  # Special case for $ key
                        results.append(item['$'])
                    else:
                        results.append(None)
                else:
                    results.append(item if item is not None else None)
            return results
        else:
            return [value]
    except Exception as e:
        print(f"Error extracting from path {path}: {e}")
        return [None]

In [34]:
def extract_features(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    abs_resp = data.get('abstracts-retrieval-response', {})
    
    # Special handling for organizations
    def get_organizations(data):
        orgs = []
        author_groups = data.get('item', {}).get('bibrecord', {}).get('head', {}).get('author-group', [])
        
        if isinstance(author_groups, dict):
            author_groups = [author_groups]
            
        for group in author_groups:
            if isinstance(group, dict):
                affiliation = group.get('affiliation', {})
                if isinstance(affiliation, dict):
                    org = affiliation.get('organization', [])
                    if isinstance(org, dict):
                        org = [org]
                    if isinstance(org, list):
                        for o in org:
                            if isinstance(o, dict):
                                orgs.append(o.get('$', None))
                            else:
                                orgs.append(o)
        
        return orgs if orgs else [None]

    features = {
        'organizations': {
            'custom': get_organizations
        },
        'classifications': {
            'path': ['item', 'bibrecord', 'head', 'enhancement', 
                    'classificationgroup', 'classifications'],
            'key': '@type'
        },
        'affiliations': {
            'path': ['affiliation'],
            'key': 'affilname'
        },
        'auth-keywords': {
            'path': ['authkeywords', 'author-keyword'],
            'key': '$'
        },
        'subjects': {
            'path': ['subject-areas', 'subject-area'],
            'key': '$'
        },
        'authors': {
            'path': ['authors', 'author'],
            'key': 'ce:indexed-name'
        }
    }

    record = {'file': Path(json_file).name}

    for feature_name, config in features.items():
        if 'custom' in config:
            # Use custom extraction function for organizations
            record[feature_name] = config['custom'](abs_resp)
        elif 'subpath' in config:
            values = []
            main_array = get_array_or_dict_values(abs_resp, config['path'], None)
            for item in main_array:
                if isinstance(item, dict):
                    sub_values = get_array_or_dict_values(item, config['subpath'], config['key'])
                    values.extend(sub_values)
            record[feature_name] = values
        else:
            record[feature_name] = get_array_or_dict_values(abs_resp, config['path'], config['key'])

    return record

In [35]:
# Process all JSON files and merge features into a DataFrame
def process_json_files(root_folder_path):
    """Process all JSON files and extract features."""
    all_records = []
    root_path = Path(root_folder_path)

    # Define expected columns
    expected_columns = [
        'file', 'organizations', 'classifications', 
        'affiliations', 'auth-keywords',
        'subjects', 'authors'
    ]

    for json_file in root_path.rglob('*.json'):
        try:
            record = extract_features(json_file)
            # Ensure all expected columns exist with NaN as default
            for col in expected_columns:
                if col not in record:
                    record[col] = np.nan
                elif isinstance(record[col], list) and not record[col]:
                    record[col] = np.nan
            all_records.append(record)
        except Exception as e:
            print(f"Error processing {json_file}: {e}")
            # Add empty record with NaN values if file processing fails
            empty_record = {
                'file': Path(json_file).name,
                **{col: np.nan for col in expected_columns if col != 'file'}
            }
            all_records.append(empty_record)

    if all_records:
        df = pd.DataFrame(all_records)
        # Combine features into single rows by grouping
        grouped_df = (
            df.groupby('file').agg(
                {col: lambda x: ', '.join(filter(None, x.dropna().astype(str))) for col in df.columns if col != 'file'}
            ).reset_index()
        )
        output_file = 'data/features_separate.csv'
        grouped_df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Processed {len(all_records)} files to {output_file}")
        return grouped_df
    else:
        print("No records found")
        return pd.DataFrame(columns=expected_columns)

In [36]:
root_folder = 'raw-data'  # Replace with your JSON root folder
df = process_json_files(root_folder)
df.head()

Processed 20216 files to data/features_separate.csv


Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors
0,201800000.json,['Department of Preventive and Social Medicine...,"['ASJC', 'SUBJABBR']","['Stanford University School of Medicine', 'Ch...",[None],['Medicine (all)'],"['Pongpirul K.', 'Lungren M.P.']"
1,201800001.json,"['Department of Electrical Engineering', 'Wire...","['ASJC', 'CPXCLASS', 'FLXCLASS', 'SUBJABBR']",['Chulalongkorn University'],[None],"['Electrical and Electronic Engineering', 'Ele...","['Pratumsiri T.', 'Janpugdee P.']"
2,201800002.json,['Center of Excellence in Catalysis and Cataly...,"['CPXCLASS', 'ENCOMPASSCLASS', 'FLXCLASS', 'AS...",['Chulalongkorn University'],"['Circulating fluidized bed', 'Computational f...","['Chemistry (all)', 'Chemical Engineering (all...","['Phuakpunk K.', 'Chalermsinsuwan B.', 'Putivi..."
3,201800003.json,"['Department of Chemistry', 'Faculty of Scienc...","['CPXCLASS', 'FLXCLASS', 'ASJC', 'SUBJABBR']","['Hirosaki University', 'Chulalongkorn Univers...","['Encapsulation', 'Fluoroalkylsilane', 'Natura...","['Chemistry (all)', 'Condensed Matter Physics'...","['Saengkaew J.', 'Le D.', 'Samart C.', 'Sawada..."
4,201800004.json,"['Program in Petrochemistry', 'Faculty of Scie...","['EMCLASS', 'ASJC', 'SUBJABBR']","['Chulalongkorn University', 'Thailand Nationa...","['acpcPNA', 'Electrochemical impedance spectro...","['Analytical Chemistry', 'Biochemistry', 'Envi...","['Teengam P.', 'Siangproh W.', 'Tuantranont A...."


In [37]:
df.describe(include='all')

Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors
count,20216,20216,20216,20216,20216,20216,20216
unique,20216,17209,38,10528,16304,3105,18689
top,201800000.json,['Chulalongkorn University'],"['ASJC', 'SUBJABBR']",['Chulalongkorn University'],[None],['Multidisciplinary'],"['Ukritchon B.', 'Keawsawasvong S.']"
freq,1,180,7769,4225,3762,1049,17


In [38]:
df.info

<bound method DataFrame.info of                  file                                      organizations  \
0      201800000.json  ['Department of Preventive and Social Medicine...   
1      201800001.json  ['Department of Electrical Engineering', 'Wire...   
2      201800002.json  ['Center of Excellence in Catalysis and Cataly...   
3      201800003.json  ['Department of Chemistry', 'Faculty of Scienc...   
4      201800004.json  ['Program in Petrochemistry', 'Faculty of Scie...   
...               ...                                                ...   
20211  202302885.json  ['Department of Chemical Technology', 'Faculty...   
20212  202302886.json  ['Department of Chemistry', 'Faculty of Scienc...   
20213  202302887.json  ['Centre for Education and International Devel...   
20214  202302888.json  ['Program of Fisheries Science', 'Faculty of I...   
20215  202302889.json  ['Faculty of Applied Science and Center for Pr...   

                                         classification

In [39]:
df.shape

(20216, 7)

## Join table between features and references

### Create data frame of reference pivot

In [23]:
df_references = pd.read_csv('references/references_pivot.csv')
df_references.describe(include='all')

Unnamed: 0,file,reference
count,20204,20204
unique,20204,19788
top,201800000.json,[]
freq,1,411


In [27]:
shape_df_references = df_references.shape
print("Shape df references: ",shape_df_references)
df_references.head(5)

Shape df references:  (20204, 2)


Unnamed: 0,file,reference
0,201800000.json,"['Science.', 'The future of public health', 'I..."
1,201800001.json,"['Proc. CAMA 2015', 'Proc. 2015 Thailand-Japan..."
2,201800002.json,"['AICHE J.', 'Int. J. Hydrog. Energy', 'Chem. ..."
3,201800003.json,"['Desalination', 'J. Membr. Sci.', 'Appl. Cata..."
4,201800004.json,"['Int. J. Tubercul. Lung Dis.', 'Lancet Infect..."


In [25]:
df_references.info

<bound method DataFrame.info of                  file                                          reference
0      201800000.json  ['Science.', 'The future of public health', 'I...
1      201800001.json  ['Proc. CAMA 2015', 'Proc. 2015 Thailand-Japan...
2      201800002.json  ['AICHE J.', 'Int. J. Hydrog. Energy', 'Chem. ...
3      201800003.json  ['Desalination', 'J. Membr. Sci.', 'Appl. Cata...
4      201800004.json  ['Int. J. Tubercul. Lung Dis.', 'Lancet Infect...
...               ...                                                ...
20199  202302885.json  ['ChemSusChem', 'Chemical Economics Handbook. ...
20200  202302886.json  ['Imported and Manufactured Food Program Inspe...
20201  202302887.json  ['Leaving Terrorism Behind: Individual and Col...
20202  202302888.json  ['International Journal of Fisheries and Aquac...
20203  202302889.json  ['World Development', 'International Migration...

[20204 rows x 2 columns]>

### Create data frame of features

In [40]:
df_features = pd.read_csv('data/features_separate.csv')
df_features.describe(include='all')

Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors
count,20216,20216,20216,20216,20216,20216,20216
unique,20216,17209,38,10528,16304,3105,18689
top,201800000.json,['Chulalongkorn University'],"['ASJC', 'SUBJABBR']",['Chulalongkorn University'],[None],['Multidisciplinary'],"['Ukritchon B.', 'Keawsawasvong S.']"
freq,1,180,7769,4225,3762,1049,17


In [41]:
shape_df_features = df_features.shape
print("Shape df features: ",shape_df_features)
df_features.head(10)

Shape df features:  (20216, 7)


Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors
0,201800000.json,['Department of Preventive and Social Medicine...,"['ASJC', 'SUBJABBR']","['Stanford University School of Medicine', 'Ch...",[None],['Medicine (all)'],"['Pongpirul K.', 'Lungren M.P.']"
1,201800001.json,"['Department of Electrical Engineering', 'Wire...","['ASJC', 'CPXCLASS', 'FLXCLASS', 'SUBJABBR']",['Chulalongkorn University'],[None],"['Electrical and Electronic Engineering', 'Ele...","['Pratumsiri T.', 'Janpugdee P.']"
2,201800002.json,['Center of Excellence in Catalysis and Cataly...,"['CPXCLASS', 'ENCOMPASSCLASS', 'FLXCLASS', 'AS...",['Chulalongkorn University'],"['Circulating fluidized bed', 'Computational f...","['Chemistry (all)', 'Chemical Engineering (all...","['Phuakpunk K.', 'Chalermsinsuwan B.', 'Putivi..."
3,201800003.json,"['Department of Chemistry', 'Faculty of Scienc...","['CPXCLASS', 'FLXCLASS', 'ASJC', 'SUBJABBR']","['Hirosaki University', 'Chulalongkorn Univers...","['Encapsulation', 'Fluoroalkylsilane', 'Natura...","['Chemistry (all)', 'Condensed Matter Physics'...","['Saengkaew J.', 'Le D.', 'Samart C.', 'Sawada..."
4,201800004.json,"['Program in Petrochemistry', 'Faculty of Scie...","['EMCLASS', 'ASJC', 'SUBJABBR']","['Chulalongkorn University', 'Thailand Nationa...","['acpcPNA', 'Electrochemical impedance spectro...","['Analytical Chemistry', 'Biochemistry', 'Envi...","['Teengam P.', 'Siangproh W.', 'Tuantranont A...."
5,201800005.json,"['Chulalongkorn Business School', 'Chulalongko...","['ASJC', 'SUBJABBR']",['Chulalongkorn Business School'],"['Design of experiment', 'Optimal fleet size',...","['Business and International Management', 'Str...",['Setamanit S.-O.']
6,201800006.json,"['Department of Pediatrics', 'Faculty of Medic...","['CABSCLASS', 'EMCLASS', 'ASJC', 'SUBJABBR']","['Chulalongkorn University', 'King Chulalongko...","['Female', 'Hyperammonemia', 'Novel mutations'...",['Genetics'],"['Chongsrisawat V.', 'Damrongphol P.', 'Ittiwu..."
7,201800007.json,"['Department of Radiation Oncology', 'Medical ...","['ASJC', 'SUBJABBR']","['Chulalongkorn University', 'The University o...",[None],['Medicine (all)'],"['Johnstone C.', 'Ghia A.J.', 'Prayongrat A.']"
8,201800008.json,"['Department of Computer Engineering', 'Facult...","['ASJC', 'CPXCLASS', 'FLXCLASS', 'SUBJABBR']","['Chulalongkorn University', 'Thailand Nationa...","['Attention mechanism', 'Bi-directional GRU', ...","['Decision Sciences (miscellaneous)', 'Informa...","['Kowsrihawat K.', 'Vateekul P.', 'Boonkwan P.']"
9,201800009.json,"['Petroleum and Petrochemical College', 'Chula...","['ASJC', 'SUBJABBR']",['Chulalongkorn University'],[None],"['Materials Science (all)', 'Condensed Matter ...","['Pitakchatwong C.', 'Chirachanchai S.']"


### Join data frame

In [43]:
df_join_features = pd.merge(df_features, df_references, on='file', how='left')
df_join_features.head(5)

Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors,reference
0,201800000.json,['Department of Preventive and Social Medicine...,"['ASJC', 'SUBJABBR']","['Stanford University School of Medicine', 'Ch...",[None],['Medicine (all)'],"['Pongpirul K.', 'Lungren M.P.']","['Science.', 'The future of public health', 'I..."
1,201800001.json,"['Department of Electrical Engineering', 'Wire...","['ASJC', 'CPXCLASS', 'FLXCLASS', 'SUBJABBR']",['Chulalongkorn University'],[None],"['Electrical and Electronic Engineering', 'Ele...","['Pratumsiri T.', 'Janpugdee P.']","['Proc. CAMA 2015', 'Proc. 2015 Thailand-Japan..."
2,201800002.json,['Center of Excellence in Catalysis and Cataly...,"['CPXCLASS', 'ENCOMPASSCLASS', 'FLXCLASS', 'AS...",['Chulalongkorn University'],"['Circulating fluidized bed', 'Computational f...","['Chemistry (all)', 'Chemical Engineering (all...","['Phuakpunk K.', 'Chalermsinsuwan B.', 'Putivi...","['AICHE J.', 'Int. J. Hydrog. Energy', 'Chem. ..."
3,201800003.json,"['Department of Chemistry', 'Faculty of Scienc...","['CPXCLASS', 'FLXCLASS', 'ASJC', 'SUBJABBR']","['Hirosaki University', 'Chulalongkorn Univers...","['Encapsulation', 'Fluoroalkylsilane', 'Natura...","['Chemistry (all)', 'Condensed Matter Physics'...","['Saengkaew J.', 'Le D.', 'Samart C.', 'Sawada...","['Desalination', 'J. Membr. Sci.', 'Appl. Cata..."
4,201800004.json,"['Program in Petrochemistry', 'Faculty of Scie...","['EMCLASS', 'ASJC', 'SUBJABBR']","['Chulalongkorn University', 'Thailand Nationa...","['acpcPNA', 'Electrochemical impedance spectro...","['Analytical Chemistry', 'Biochemistry', 'Envi...","['Teengam P.', 'Siangproh W.', 'Tuantranont A....","['Int. J. Tubercul. Lung Dis.', 'Lancet Infect..."


In [44]:
df_join_features.info

<bound method DataFrame.info of                  file                                      organizations  \
0      201800000.json  ['Department of Preventive and Social Medicine...   
1      201800001.json  ['Department of Electrical Engineering', 'Wire...   
2      201800002.json  ['Center of Excellence in Catalysis and Cataly...   
3      201800003.json  ['Department of Chemistry', 'Faculty of Scienc...   
4      201800004.json  ['Program in Petrochemistry', 'Faculty of Scie...   
...               ...                                                ...   
20211  202302885.json  ['Department of Chemical Technology', 'Faculty...   
20212  202302886.json  ['Department of Chemistry', 'Faculty of Scienc...   
20213  202302887.json  ['Centre for Education and International Devel...   
20214  202302888.json  ['Program of Fisheries Science', 'Faculty of I...   
20215  202302889.json  ['Faculty of Applied Science and Center for Pr...   

                                         classification

In [45]:
df_join_features.describe(include='all')

Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors,reference
count,20216,20216,20216,20216,20216,20216,20216,20204
unique,20216,17209,38,10528,16304,3105,18689,19788
top,201800000.json,['Chulalongkorn University'],"['ASJC', 'SUBJABBR']",['Chulalongkorn University'],[None],['Multidisciplinary'],"['Ukritchon B.', 'Keawsawasvong S.']",[]
freq,1,180,7769,4225,3762,1049,17,411


In [46]:
shape_df_join_features = df_join_features.shape
column_df_join_features = df_join_features.columns
print("Shape df join features: ",shape_df_join_features)
print("Columns df join features: ",column_df_join_features)

Shape df join features:  (20216, 8)
Columns df join features:  Index(['file', 'organizations', 'classifications', 'affiliations',
       'auth-keywords', 'subjects', 'authors', 'reference'],
      dtype='object')


In [47]:
df_join_features.to_csv('data/features_summation.csv', index=False, encoding='utf-8')