In [21]:
import pandas as pd
import numpy as np
import json
import glob
import os
import logging
import ast
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

In [3]:
# Load the feature table from CSV into a DataFrame.
df = pd.read_csv('data/features_summation.csv')
# Summary statistics for every column; include='all' also covers the
# non-numeric (object) columns, giving count / unique / top / freq.
df.describe(include='all')

Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors,reference,title,publish_name
count,20216,20216,20216,20216,20216,20216,20216,20204,20215,20216
unique,20216,17209,38,10528,16304,3105,18689,19788,20140,5501
top,201800000.json,['Chulalongkorn University'],"['ASJC', 'SUBJABBR']",['Chulalongkorn University'],[None],['Multidisciplinary'],"['Ukritchon B.', 'Keawsawasvong S.']",[],Preface,Scientific Reports
freq,1,180,7769,4225,3762,1049,17,411,18,438


In [4]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216 entries, 0 to 20215
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   file             20216 non-null  object
 1   organizations    20216 non-null  object
 2   classifications  20216 non-null  object
 3   affiliations     20216 non-null  object
 4   auth-keywords    20216 non-null  object
 5   subjects         20216 non-null  object
 6   authors          20216 non-null  object
 7   reference        20204 non-null  object
 8   title            20215 non-null  object
 9   publish_name     20216 non-null  object
dtypes: object(10)
memory usage: 1.5+ MB


In [5]:
# Capture the frame's dimensions and column labels in one step;
# columns_df is reused by the value-counts loop further down.
shape_df, columns_df = df.shape, df.columns

print("Shape of features_summation.csv: ", shape_df)
print("Columns of features_summation.csv: ", columns_df)

Shape of features_summation.csv:  (20216, 10)
Columns of features_summation.csv:  Index(['file', 'organizations', 'classifications', 'affiliations',
       'auth-keywords', 'subjects', 'authors', 'reference', 'title',
       'publish_name'],
      dtype='object')


In [6]:
# Preview the first rows (head() defaults to five).
df.head()

Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors,reference,title,publish_name
0,201800000.json,['Department of Preventive and Social Medicine...,"['ASJC', 'SUBJABBR']","['Stanford University School of Medicine', 'Ch...",[None],['Medicine (all)'],"['Pongpirul K.', 'Lungren M.P.']","['Science.', 'The future of public health', 'I...",Public health and international epidemiology f...,"Radiology in Global Health: Strategies, Implem..."
1,201800001.json,"['Department of Electrical Engineering', 'Wire...","['ASJC', 'CPXCLASS', 'FLXCLASS', 'SUBJABBR']",['Chulalongkorn University'],[None],"['Electrical and Electronic Engineering', 'Ele...","['Pratumsiri T.', 'Janpugdee P.']","['Proc. CAMA 2015', 'Proc. 2015 Thailand-Japan...",Flexible Printed Active Antenna for Digital Te...,Progress in Electromagnetics Research Symposium
2,201800002.json,['Center of Excellence in Catalysis and Cataly...,"['CPXCLASS', 'ENCOMPASSCLASS', 'FLXCLASS', 'AS...",['Chulalongkorn University'],"['Circulating fluidized bed', 'Computational f...","['Chemistry (all)', 'Chemical Engineering (all...","['Phuakpunk K.', 'Chalermsinsuwan B.', 'Putivi...","['AICHE J.', 'Int. J. Hydrog. Energy', 'Chem. ...",Parametric study of hydrogen production via so...,Chemical Engineering Science
3,201800003.json,"['Department of Chemistry', 'Faculty of Scienc...","['CPXCLASS', 'FLXCLASS', 'ASJC', 'SUBJABBR']","['Hirosaki University', 'Chulalongkorn Univers...","['Encapsulation', 'Fluoroalkylsilane', 'Natura...","['Chemistry (all)', 'Condensed Matter Physics'...","['Saengkaew J.', 'Le D.', 'Samart C.', 'Sawada...","['Desalination', 'J. Membr. Sci.', 'Appl. Cata...",Superhydrophobic coating from fluoroalkylsilan...,Applied Surface Science
4,201800004.json,"['Program in Petrochemistry', 'Faculty of Scie...","['EMCLASS', 'ASJC', 'SUBJABBR']","['Chulalongkorn University', 'Thailand Nationa...","['acpcPNA', 'Electrochemical impedance spectro...","['Analytical Chemistry', 'Biochemistry', 'Envi...","['Teengam P.', 'Siangproh W.', 'Tuantranont A....","['Int. J. Tubercul. Lung Dis.', 'Lancet Infect...",Electrochemical impedance-based DNA sensor usi...,Analytica Chimica Acta


In [7]:
# Number of missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

file                0
organizations       0
classifications     0
affiliations        0
auth-keywords       0
subjects            0
authors             0
reference          12
title               1
publish_name        0
dtype: int64

In [8]:
# Drop every row that has at least one missing value. Rebinding the name
# instead of using inplace=True follows current pandas guidance and keeps the
# cell safe to re-run; the original row labels are kept (no index reset).
df = df.dropna()

In [9]:
# Print the frequency of each distinct value, one column at a time.
# A single print with sep="\n" emits the header, the counts and the blank
# separator exactly as three consecutive print calls would.
for column in columns_df:
    print(f"Value counts for column: {column}", df[column].value_counts(), "\n", sep="\n")

Value counts for column: file
file
201800000.json    1
202200396.json    1
202200403.json    1
202200402.json    1
202200401.json    1
                 ..
202000867.json    1
202000866.json    1
202000865.json    1
202000864.json    1
202302889.json    1
Name: count, Length: 20203, dtype: int64


Value counts for column: organizations
organizations
['Chulalongkorn University']                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [10]:
# Show the title column as a NumPy array (to_numpy() is the recommended accessor over .values).
df['title'].to_numpy()

array(['Public health and international epidemiology for radiology',
       'Flexible Printed Active Antenna for Digital Television Reception',
       'Parametric study of hydrogen production via sorption enhanced steam methane reforming in a circulating fluidized bed riser',
       ...,
       'Social justice, education and peacebuilding: conflict transformation in Southern Thailand',
       'Effects of black soldier fly (Hermetia illucens) larvae as a fish meal replacement on growth performance, feed utilisation, morphological characters and carcass composition of Thai climbing perch (Anabas testudineus)',
       'Effects of remittances on household poverty and inequality in Cambodia'],
      dtype=object)

In [11]:
# Work on an independent copy. Plain assignment (df_temp = df) only creates a
# second name for the same object, so the in-place column cleaning applied to
# df_temp further down would silently rewrite df as well.
df_temp = df.copy()
columns_df_temp = df_temp.columns

In [22]:
# Module-level logger used by the cleaning helpers below.
logger = logging.getLogger(__name__)

# Emit INFO-and-above records through the root logger's default handler.
logging.basicConfig(level=logging.INFO)

In [23]:
def convert_to_comma_separated(x):
    """
    Convert a list, or a string representation of a list, into a comma-separated string.

    A string is treated as list-like when, ignoring surrounding whitespace, it
    starts with '['. This is the same test clean_dataframe_columns uses to
    decide whether a column needs cleaning, so every cell that triggers the
    cleaning is also eligible for conversion.

    Args:
        x: The cell value to process.

    Returns:
        ', '.join of the stringified items if x is a list or a string that
        parses (as JSON, else as a Python literal) to a list. Note that a
        None item is rendered as the text 'None'.
        The original value, unchanged, in every other case: non-list and
        non-string inputs, strings that are not list-like, strings that parse
        to something other than a list, and strings that cannot be parsed
        (a warning is logged for the last case).
    """
    if isinstance(x, list):
        return ', '.join(str(item) for item in x)
    if not isinstance(x, str):
        return x

    text = x.strip()
    if not text.startswith('['):
        return x

    try:
        # JSON first: handles double-quoted items and null/true/false.
        parsed = json.loads(text)
    except json.JSONDecodeError:
        try:
            # Fallback for Python-style literals such as "['a', 'b']" or "[None]".
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            # literal_eval raises TypeError for inputs like "[{[1]}]"
            # (unhashable set/dict members); treat it as unparsable rather
            # than letting one bad cell abort the whole column.
            logger.warning(f"Failed to parse list for value: {x}")
            return x

    if isinstance(parsed, list):
        return ', '.join(str(item) for item in parsed)
    # Parsed successfully but not to a list (e.g. a tuple): leave untouched.
    return x

In [24]:
def clean_dataframe_columns(df_temp: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten list-like columns of a DataFrame into comma-separated strings.

    A column is cleaned when at least one of its cells is a list, or a string
    that starts with '[' once surrounding whitespace is ignored. Every cell of
    such a column is then passed through convert_to_comma_separated.

    Note: the columns are reassigned on the DataFrame that was passed in, so
    the caller's object is modified; the same object is returned.

    Args:
        df_temp: The DataFrame to process.

    Returns:
        The same DataFrame, with its list-like columns transformed.
    """
    def _is_list_like(cell) -> bool:
        # True for real lists and for strings that look like a serialized list.
        if isinstance(cell, list):
            return True
        return isinstance(cell, str) and cell.strip().startswith('[')

    for col in df_temp.columns:
        needs_cleaning = df_temp[col].apply(_is_list_like).any()
        if not needs_cleaning:
            continue
        logger.info(f"Cleaning column: {col}")
        df_temp[col] = df_temp[col].apply(convert_to_comma_separated)

    return df_temp

In [25]:
# Flatten every list-like column; pipe(f) simply calls f(df_temp).
df_temp = df_temp.pipe(clean_dataframe_columns)

INFO:__main__:Cleaning column: organizations


INFO:__main__:Cleaning column: classifications
INFO:__main__:Cleaning column: affiliations
INFO:__main__:Cleaning column: auth-keywords
INFO:__main__:Cleaning column: subjects
INFO:__main__:Cleaning column: authors
INFO:__main__:Cleaning column: reference


In [26]:
# Preview the cleaned frame (head() defaults to five rows).
df_temp.head()

Unnamed: 0,file,organizations,classifications,affiliations,auth-keywords,subjects,authors,reference,title,publish_name
0,201800000.json,"Department of Preventive and Social Medicine, ...","ASJC, SUBJABBR","Stanford University School of Medicine, Chulal...",,Medicine (all),"Pongpirul K., Lungren M.P.","Science., The future of public health, Interna...",Public health and international epidemiology f...,"Radiology in Global Health: Strategies, Implem..."
1,201800001.json,"Department of Electrical Engineering, Wireless...","ASJC, CPXCLASS, FLXCLASS, SUBJABBR",Chulalongkorn University,,"Electrical and Electronic Engineering, Electro...","Pratumsiri T., Janpugdee P.","Proc. CAMA 2015, Proc. 2015 Thailand-Japan Mic...",Flexible Printed Active Antenna for Digital Te...,Progress in Electromagnetics Research Symposium
2,201800002.json,Center of Excellence in Catalysis and Catalyti...,"CPXCLASS, ENCOMPASSCLASS, FLXCLASS, ASJC, SUBJ...",Chulalongkorn University,"Circulating fluidized bed, Computational fluid...","Chemistry (all), Chemical Engineering (all), I...","Phuakpunk K., Chalermsinsuwan B., Putivisutisa...","AICHE J., Int. J. Hydrog. Energy, Chem. Eng. J...",Parametric study of hydrogen production via so...,Chemical Engineering Science
3,201800003.json,"Department of Chemistry, Faculty of Science an...","CPXCLASS, FLXCLASS, ASJC, SUBJABBR","Hirosaki University, Chulalongkorn University,...","Encapsulation, Fluoroalkylsilane, Natural rubb...","Chemistry (all), Condensed Matter Physics, Phy...","Saengkaew J., Le D., Samart C., Sawada H., Nis...","Desalination, J. Membr. Sci., Appl. Catal. B.,...",Superhydrophobic coating from fluoroalkylsilan...,Applied Surface Science
4,201800004.json,"Program in Petrochemistry, Faculty of Science,...","EMCLASS, ASJC, SUBJABBR","Chulalongkorn University, Thailand National El...","acpcPNA, Electrochemical impedance spectroscop...","Analytical Chemistry, Biochemistry, Environmen...","Teengam P., Siangproh W., Tuantranont A., Vila...","Int. J. Tubercul. Lung Dis., Lancet Infect. Di...",Electrochemical impedance-based DNA sensor usi...,Analytica Chimica Acta


In [27]:
# Write the flattened table to the data directory as UTF-8, without the index column.
df_temp.to_csv(
    path_or_buf='data/features_summation_not_list.csv',
    encoding='utf-8',
    index=False,
)