In [17]:
import pandas as pd
import os

In [18]:
path_to_your_dataset = '/Users/arup/Documents/ISB/Datasets/MAG'

In [19]:
def get_nan_info(dataframe):
    # Step 1: Get columns with NaN values
    nan_columns = dataframe.columns[dataframe.isna().any()].tolist()

    # Step 2: Calculate percentage of NaN values in each column
    nan_percentage = (dataframe.isna().sum() / len(dataframe)) * 100

    # Step 3: Combine the results into a DataFrame
    nan_info = pd.DataFrame({'NaN Count': dataframe.isna().sum(), 'NaN Percentage': nan_percentage})
    nan_info = nan_info.loc[nan_info['NaN Count'] > 0]  # Filter out columns with no NaNs

    return nan_info

In [20]:
# Path to the folder containing CSV files
folder_path = '/Users/arup/Documents/ISB/Datasets/MAG'

# Dictionary specifying which columns to select from each CSV file
columns_to_select = {
    'papers.csv': ['entity_id', 'dcterms_title'],
    'Authors_disambiguated.csv': ['entity_id', 'foaf_name'],
    'Affiliations.csv': ['entity_id', 'foaf_name', 'city_name', 'state_name', 'country_name', 'country_official_name'],
    'ConferenceSeries.csv': ['entity_id', 'foaf_name'],
    'FieldOfStudyLabeled.csv': ['entity_id', 'fos_list'],
    'FieldsOfStudy.csv': ['entity_id', 'foaf_name'],
    'Journals.csv': ['entity_id', 'foaf_name']
}

# Initialize an empty DataFrame to store the merged data
merged_df = pd.DataFrame()

In [21]:
# Iterate over each CSV file
for file_name, columns in columns_to_select.items():
    file_path = os.path.join(folder_path, file_name)
    
    # Read only selected columns from the CSV file
    df = pd.read_csv(file_path, usecols=columns)
    
    # Convert 'patent_id' column to string to ensure compatibility for merging
    df['entity_id'] = df['entity_id'].astype(str)

    # Merge data based on patent_id column
    if not merged_df.empty:
        merged_df = pd.merge(merged_df, df, on='entity_id', how='outer', suffixes=('_' + file_name.split('.')[0], ''))
    else:
        merged_df = df

In [22]:
# Print the merged DataFrame
merged_df

Unnamed: 0,entity_id,dcterms_title,foaf_name_Affiliations,foaf_name_ConferenceSeries,city_name,state_name,country_name,country_official_name,foaf_name_FieldsOfStudy,fos_list,foaf_name_Journals,foaf_name
0,87802,The need for data based evaluation of biogeoch...,,,,,,,,,,
1,106995,Jewelry pin or pendant,,,,,,,,,,
2,185276,Integral glass encapsulation for solar arrays....,,,,,,,,,,
3,188640,CALDERAS DE VAPOR CON CIRCULACION FORZADA LA M...,,,,,,,,,,
4,198536,ICPP,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
391571,2738911231,,,,,,,,,,,Castilla: Estudios de Literatura
391572,2736617174,,,,,,,,,,,European Medical Journal Rheumatology
391573,2737449111,,,,,,,,,,,Cinta de Moebio: Revista de Epistemologia de C...
391574,2494728167,,,,,,,,,,,Stem Cells Translational Medicine


In [23]:
get_nan_info(merged_df)

Unnamed: 0,NaN Count,NaN Percentage
dcterms_title,152911,39.050146
foaf_name_Affiliations,240221,61.347223
foaf_name_ConferenceSeries,391551,99.993616
city_name,391551,99.993616
state_name,391551,99.993616
country_name,391551,99.993616
country_official_name,391554,99.994382
foaf_name_FieldsOfStudy,391572,99.998978
fos_list,390836,99.81102
foaf_name_Journals,390836,99.81102


In [24]:
merged_df.to_csv(path_to_your_dataset + '/MAG_clean_data.csv', index=False)

## Common entity_id for the entire MAG dataset

In [3]:
import os
import pandas as pd

# Function to read and process a CSV file
def process_csv(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Check if 'entity_id' column exists
    if 'entity_id' in df.columns:
        # Extract entity IDs and convert them to strings
        entity_ids = df['entity_id'].astype(str).tolist()
        return entity_ids
    else:
        # If 'entity_id' column doesn't exist, return an empty list
        return []

# Directory containing CSV files
folder_path = '/Users/arup/Documents/ISB/Datasets/MAG'

# List to store entity IDs from each CSV file
all_entity_ids = []

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        entity_ids = process_csv(file_path)
        all_entity_ids.append(entity_ids)

# Find common entity IDs
common_entity_ids = set(all_entity_ids[0]).intersection(*all_entity_ids[1:])

# Count the number of common entity IDs
num_common_entity_ids = len(common_entity_ids)

  df = pd.read_csv(file_path)


In [4]:
print("Number of common entity IDs:", num_common_entity_ids)
print("Common entity IDs:", common_entity_ids)

Number of common entity IDs: 0
Common entity IDs: set()
