In [None]:
def get_array_or_dict_values(data, path, key):
    """Collect the values stored under ``key`` at the end of ``path``.

    ``path`` is followed one step at a time, starting from ``data``:

    * a dict is descended with ``dict.get``;
    * a list is flattened one level: for every dict element the value under
      the current step is gathered (list values are spliced in, ``None``
      values are dropped), nested lists are spliced in unchanged, and any
      other element is ignored; an empty outcome becomes ``None``;
    * anything else (including ``None``) stops the walk with ``[None]``.

    The node reached after the last step decides the result:

    * ``None`` -> ``[None]``
    * dict -> a one-element list with ``node[key]``, else ``node['$']``,
      else ``None``
    * list -> one entry per element; dict elements are resolved like the
      dict case, every other element is kept unchanged
    * any other value -> ``[node]``

    Any exception raised while walking or resolving is reported with
    ``print`` and converted into ``[None]``.
    """

    def resolve(mapping):
        # Prefer the requested key and fall back to the '$' text key.
        if key in mapping:
            return mapping[key]
        if '$' in mapping:
            return mapping['$']
        return None

    try:
        node = data
        for step in path:
            if isinstance(node, dict):
                node = node.get(step)
                continue
            if not isinstance(node, list):
                # Scalars and None cannot be descended into.
                return [None]

            gathered = []
            for element in node:
                if isinstance(element, list):
                    # Nested lists are spliced in without applying the step.
                    gathered.extend(element)
                elif isinstance(element, dict):
                    child = element.get(step)
                    if isinstance(child, list):
                        gathered.extend(child)
                    elif child is not None:
                        gathered.append(child)
            node = gathered or None

        if node is None:
            return [None]
        if isinstance(node, dict):
            return [resolve(node)]
        if isinstance(node, list):
            return [
                resolve(element) if isinstance(element, dict) else element
                for element in node
            ]
        return [node]
    except Exception as e:
        print(f"Error extracting from path {path}: {e}")
        return [None]

def extract_features(json_file):
    """Read one JSON file and return its features as a flat record.

    The file is parsed as UTF-8 JSON and everything is looked up below its
    ``'abstracts-retrieval-response'`` entry.  The returned dict holds the
    file's base name under ``'file'`` plus one list per feature:
    ``organizations``, ``classifications``, ``affiliations``,
    ``auth-keywords``, ``subjects`` and ``authors``.

    ``organizations`` uses a dedicated extractor; the other features are
    resolved with ``get_array_or_dict_values`` from a ``path``/``key`` pair.
    Errors from opening or parsing the file propagate to the caller.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    abs_resp = payload.get('abstracts-retrieval-response', {})

    def get_organizations(data):
        """Collect organization names from every author group's affiliation."""
        head = data.get('item', {}).get('bibrecord', {}).get('head', {})
        groups = head.get('author-group', [])
        if isinstance(groups, dict):
            # A single author group is stored as a bare dict.
            groups = [groups]

        found = []
        for group in groups:
            if not isinstance(group, dict):
                continue
            affiliation = group.get('affiliation', {})
            if not isinstance(affiliation, dict):
                continue
            entries = affiliation.get('organization', [])
            if isinstance(entries, dict):
                entries = [entries]
            if not isinstance(entries, list):
                continue
            # Dict entries carry their text under '$'; others are kept as is.
            found.extend(
                entry.get('$') if isinstance(entry, dict) else entry
                for entry in entries
            )

        return found or [None]

    # Per-feature extraction recipe: either a custom callable or the
    # path/key pair handed to get_array_or_dict_values.
    features = {
        'organizations': {
            'custom': get_organizations,
        },
        'classifications': {
            'path': [
                'item', 'bibrecord', 'head', 'enhancement',
                'classificationgroup', 'classifications',
            ],
            'key': '@type',
        },
        'affiliations': {
            'path': ['affiliation'],
            'key': 'affilname',
        },
        'auth-keywords': {
            'path': ['authkeywords', 'author-keyword'],
            'key': '$',
        },
        'subjects': {
            'path': ['subject-areas', 'subject-area'],
            'key': '$',
        },
        'authors': {
            'path': ['authors', 'author'],
            'key': 'ce:indexed-name',
        },
    }

    record = {'file': Path(json_file).name}

    for feature_name, config in features.items():
        if 'custom' in config:
            extracted = config['custom'](abs_resp)
        elif 'subpath' in config:
            # Two-stage lookup: resolve the parent items first, then read the
            # sub-path inside each parent that is a dict.
            extracted = []
            parents = get_array_or_dict_values(abs_resp, config['path'], None)
            for parent in parents:
                if isinstance(parent, dict):
                    extracted.extend(
                        get_array_or_dict_values(parent, config['subpath'], config['key'])
                    )
        else:
            extracted = get_array_or_dict_values(abs_resp, config['path'], config['key'])
        record[feature_name] = extracted

    return record
  

# Process all JSON files and merge features into a DataFrame
def process_json_files(root_folder_path, output_file='data/features_separate.csv'):
    """Extract features from every ``*.json`` file below a folder.

    Args:
        root_folder_path: Folder searched recursively for ``*.json`` files.
        output_file: Destination of the resulting CSV.  Missing parent
            directories are created before writing, so a fresh checkout
            without the output folder no longer fails in ``to_csv``.

    Returns:
        A DataFrame with one row per file name and one comma-joined string
        column per feature.  When no JSON file is found, an empty DataFrame
        with the expected columns is returned and nothing is written.

    Files that cannot be processed are reported with ``print`` and kept as a
    row whose feature values are all NaN.
    """
    # Define expected columns
    expected_columns = [
        'file', 'organizations', 'classifications',
        'affiliations', 'auth-keywords',
        'subjects', 'authors'
    ]

    def join_values(series):
        # Stringify the non-missing entries and join the non-empty ones.
        return ', '.join(filter(None, series.dropna().astype(str)))

    all_records = []
    for json_file in Path(root_folder_path).rglob('*.json'):
        try:
            record = extract_features(json_file)
            # Ensure all expected columns exist with NaN as default
            for col in expected_columns:
                if col not in record:
                    record[col] = np.nan
                elif isinstance(record[col], list) and not record[col]:
                    record[col] = np.nan
        except Exception as e:
            print(f"Error processing {json_file}: {e}")
            # Add empty record with NaN values if file processing fails
            record = {
                'file': Path(json_file).name,
                **{col: np.nan for col in expected_columns if col != 'file'}
            }
        all_records.append(record)

    if not all_records:
        print("No records found")
        return pd.DataFrame(columns=expected_columns)

    df = pd.DataFrame(all_records)
    # Combine features into single rows by grouping; files that share a base
    # name (e.g. in different sub-folders) end up in the same row.
    grouped_df = (
        df.groupby('file')
        .agg({col: join_values for col in df.columns if col != 'file'})
        .reset_index()
    )

    # to_csv does not create missing folders, so make sure the target exists.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    grouped_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Processed {len(all_records)} files to {output_file}")
    return grouped_df

# Run the extraction over every JSON file found below the root folder and
# keep the resulting DataFrame in `df`.
root_folder = 'raw-data'  # Replace with your JSON root folder
df = process_json_files(root_folder)
# Preview the first rows of the result.
df.head()

In [None]:
# Load a feature table from CSV and inspect it.
# NOTE(review): process_json_files writes 'data/features_separate.csv' by
# default, while this reads 'data/features_summation.csv' — confirm that the
# latter is produced elsewhere and is the intended input.
df_data_file = pd.read_csv("data/features_summation.csv", encoding='utf-8')
df_data_file.info()  # prints the column summary itself
# NOTE(review): if this runs as a single notebook cell, presumably only the
# last bare expression (head) is displayed; describe/shape would need print()
# or display() to show up — confirm.
df_data_file.describe(include='all')
df_data_file.shape
df_data_file.head(5)

In [None]:
# Remove the columns that are not used from here on (raises KeyError if one
# of them is missing from the loaded table).
df_data_file.drop(columns=['file', 'authors', 'reference','title'], inplace=True)
# Derive the cleaned frame from df_data_file.  The right-hand side used to
# read df_data_file_null_out itself, which is not assigned anywhere before
# this line and therefore raised NameError on a fresh run.
# Placeholder strings left over from stringified empty results are turned
# into None so that dropna() below can remove those rows.
df_data_file_null_out = df_data_file.applymap(lambda x: None if x == '[None]' or x == '[]' or x == 'None' else x)
# Keep only fully populated, unique rows.
df_data_file_null_out.dropna(inplace=True)
df_data_file_null_out.drop_duplicates(inplace=True)

In [None]:
# Apply -log transformation to all columns
# A cell is replaced by -log(x) only when its Python/NumPy type is numeric
# and its value is strictly positive; every other cell (strings, NaN, zero,
# negative numbers) is passed through unchanged.
# NOTE(review): df_data_file_cutoff is not assigned in the code visible before
# this point — confirm which cell creates it and that it has run first.
df_transformed = df_data_file_cutoff.applymap(lambda x: -np.log(x) if np.issubdtype(type(x), np.number) and x > 0 else x)

# Print the first rows of the transformed DataFrame
print(df_transformed.head())

In [None]:
# How often each distinct 'auth-keywords' value occurs, computed once on the
# unfiltered frame and reused for both the bounds and the row filter.
keyword_counts = df_data_file_cutoff['auth-keywords'].value_counts()

# 1st and 99th percentiles of those occurrence counts
lower_percentile = keyword_counts.quantile(0.01)
upper_percentile = keyword_counts.quantile(0.99)

# Occurrence count of each row's own 'auth-keywords' value
row_frequency = df_data_file_cutoff['auth-keywords'].map(keyword_counts)

# Keep only the rows whose count lies within the two bounds (inclusive)
df_data_file_cutoff = df_data_file_cutoff[
  row_frequency.between(lower_percentile, upper_percentile)
]

# Shape of the filtered DataFrame
df_data_file_cutoff.shape

In [None]:
# Load the table used for the subject analysis.
# NOTE(review): judging by the file name this is presumably the output of the
# null-dropping step; the code that writes it is not visible here — confirm.
df_data_file_subject = pd.read_csv('data_pandas/1_features_drop_null.csv', encoding='utf-8')
# Quick inspection of size and summary statistics.
df_data_file_subject.shape
df_data_file_subject.describe(include='all')
# Ensure the subjects column contains valid string representations of lists
def safe_literal_eval(val):
    """Parse ``val`` as a Python literal, returning ``[]`` when that fails.

    Args:
        val: Usually the string form of a list, e.g. ``"['a', 'b']"``.
            Non-string input such as NaN is treated as unparsable.

    Returns:
        The evaluated literal (whatever type it denotes), or an empty list
        when ``ast.literal_eval`` rejects the input.
    """
    try:
        return ast.literal_eval(val)
    # literal_eval is documented to raise any of these on malformed input;
    # catching only ValueError/SyntaxError let e.g. "{[1]: 2}" (TypeError)
    # escape and abort the whole column conversion.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []

# Parse each 'subjects' cell from its string form into a Python object
# (unparsable cells become an empty list).
df_data_file_subject['subjects'] = df_data_file_subject['subjects'].apply(safe_literal_eval)

# Split the subjects column into multiple rows: one row per list element.
# Rows whose list is empty keep a single row with a missing value.
df_data_file_subject = df_data_file_subject.explode('subjects').reset_index(drop=True)

# Clean subject values
# NOTE: whitespace is trimmed before the parenthesised part is removed, so a
# value such as "physics (all)" ends up with a trailing space after this step.
df_data_file_subject['subject'] = (
    df_data_file_subject['subjects']
    .str.replace(r'[\[\]\"\']', '', regex=True)  # Remove unwanted characters
    .str.strip()                                # Trim whitespace
    .str.replace(r'\(.*?\)', '', regex=True)    # Remove parentheses and their contents
    .str.lower()                                # Convert to lowercase
)

def is_valid_utf8(text):
    """Return True when ``text`` is a string that can be encoded as UTF-8.

    Non-string input (NaN, None, numbers, ...) yields False instead of
    raising AttributeError, because missing values reach this check once the
    exploded subject column contains empty entries.

    Args:
        text: The value to check.

    Returns:
        bool: True for an encodable ``str``; False for anything else,
        including strings with lone surrogates.
    """
    if not isinstance(text, str):
        return False
    try:
        text.encode('utf-8')
    except UnicodeEncodeError:
        return False
    return True

# Keep only subjects that are strings encodable as UTF-8; everything else
# (including missing values) becomes NaN so that dropna() below removes it.
# The isinstance guard keeps non-string cells away from is_valid_utf8.
df_data_file_subject['subject'] = df_data_file_subject['subject'].apply(
    lambda x: x if isinstance(x, str) and is_valid_utf8(x) else np.nan
)

# Drop the original subjects column (if required)
df_data_file_subject.drop(columns=['subjects'], inplace=True)

df_data_file_subject.dropna(inplace=True)
# DataFrame has no `drop_duplicate` method (AttributeError); the
# de-duplicating call is `drop_duplicates`.
df_data_file_subject.drop_duplicates(inplace=True)

import json
# Load the cluster definitions.  The code below iterates over the result and
# reads a 'Cluster' and a 'Subjects' entry from every item.
with open('clustering_subject.json', encoding='utf-8') as file:
  mapping_subjects = json.load(file)
  
import nltk
from nltk.stem import PorterStemmer
from fuzzywuzzy import fuzz

# Cluster name -> set of the subjects listed for that cluster.  If two
# entries share the same 'Cluster' value, the later one replaces the earlier.
subject_cluster_mapping = {cluster_data['Cluster']: set(cluster_data['Subjects']) for cluster_data in mapping_subjects}
# Shared stemmer instance used by map_subjects.
stemmer = PorterStemmer()

def map_subjects(subject):
    """Return the name of the first cluster that matches ``subject``.

    The subject is lower-cased and compared against every subject of every
    cluster in ``subject_cluster_mapping``, in mapping order.  A cluster
    subject matches when either

    * both have the same set of stemmed, whitespace-separated tokens, or
    * ``fuzz.token_sort_ratio`` between the lower-cased subject and the
      cluster subject is above 80.

    The search stops at the first match, so a fuzzy hit in an earlier cluster
    wins over an exact hit in a later one.  ``'Other'`` is returned when
    nothing matches.
    """
    lowered = subject.lower()
    query_stems = {stemmer.stem(word) for word in lowered.split()}

    for cluster_name, members in subject_cluster_mapping.items():
        for member in members:
            member_stems = {stemmer.stem(word) for word in member.lower().split()}

            # Same stemmed token set, or close enough under fuzzy comparison.
            if query_stems == member_stems or fuzz.token_sort_ratio(lowered, member) > 80:
                return cluster_name

    return 'Other'

# Assign every row to a subject cluster: each 'subject' value is trimmed and
# passed to map_subjects.  A non-string value (e.g. NaN) would fail on
# x.strip().
# NOTE(review): df_data_file_subject_clustering is not assigned in the code
# visible here; presumably it is derived from df_data_file_subject — confirm.
df_data_file_subject_clustering['subject-cluster'] = df_data_file_subject_clustering['subject'].apply(lambda x: map_subjects(x.strip()))

# The raw subject text is no longer needed once the cluster is assigned.
df_data_file_subject_clustering.drop(columns=['subject'], inplace=True)

In [None]:
# Classification type code -> readable label.
# NOTE(review): several expansions look like guesses (ASJC, for instance, is
# commonly expanded as "All Science Journal Classification") — confirm the
# labels against the data provider's documentation before relying on them.
human_readable_mapping = {
    'ASJC': 'all subject journal classification',
    'SUBJABBR': 'subject abbreviation',
    'EMCLASS': 'emerging class',
    'FLXCLASS': 'flexible classification',
    'CPXCLASS': 'complex classification',
    'CABSCLASS': 'cabinet classification',
    'GEOCLASS': 'geographic classification',
    'ENCOMPASSCLASS': 'encompass classification'
}

# Parse the stringified lists and produce one row per classification code.
df_data_file_classification['classifications'] = df_data_file_classification['classifications'].apply(ast.literal_eval)
df_data_file_classification = df_data_file_classification.explode('classifications').reset_index(drop=True)

# Replace abbreviations with human-readable terms.  This has to happen after
# the explode: Series.replace with a dict only matches whole cell values, so
# on the stringified lists it never matched a code, and any cell it did
# rewrite would no longer have been a valid literal for ast.literal_eval.
df_data_file_classification['classifications'] = df_data_file_classification['classifications'].replace(human_readable_mapping)
