### Checkpoint 1: Elimino le colonne non necessarie (tutte tranne id, title, author, venue e pub_date) e le righe con valori nulli in queste colonne

In [3]:
import os
import pandas as pd

# Checkpoint 1 I/O locations: CSVs are read from the source folder and the
# trimmed copies are written, under the same file name, to the destination.
destination_folder_path = '../citations/checkpoint_1'
source_folder_path = '../citations/update_metadata'

# Make sure the output directory is present; an existing one is reused as-is.
os.makedirs(name=destination_folder_path, exist_ok=True)

In [4]:
# Checkpoint 1: for every CSV in the source folder, keep only the five
# bibliographic columns (dropping rows where any of them is null) and write
# the result to the destination folder under the same file name.
for file_name in os.listdir(source_folder_path):
    # Anything that is not a CSV is ignored.
    if not file_name.endswith(".csv"):
        continue

    source_file_path = os.path.join(source_folder_path, file_name)
    destination_file_path = os.path.join(destination_folder_path, file_name)

    # The only columns carried forward to the next checkpoints.
    columns_to_keep = ['id', 'title', 'author', 'venue', 'pub_date']

    # Read, drop incomplete rows, then project onto the kept columns.
    df = (
        pd.read_csv(source_file_path)
        .dropna(subset=columns_to_keep)
        .loc[:, columns_to_keep]
    )

    # Persist without the pandas index so the output has exactly the kept columns.
    df.to_csv(destination_file_path, index=False)

print('Processo di pulizia completato e file salvati nella cartella "checkpoint_1".')

Processo di pulizia completato e file salvati nella cartella "checkpoint_1".


### Checkpoint 2: elimino le righe in cui compaiono valori nulli in id, title, author, venue o pub_date

In [5]:
import os
import pandas as pd

# Checkpoint 2 I/O locations: input is the checkpoint 1 output; cleaned files
# are written to a separate checkpoint 2 folder.
destination_folder_path = '../citations/checkpoint_2'
source_folder_path = '../citations/checkpoint_1'

# Make sure the output directory is present; an existing one is reused as-is.
os.makedirs(name=destination_folder_path, exist_ok=True)

In [6]:
# Checkpoint 2: copy every CSV from the source folder to the destination
# folder, dropping the rows that have a null in any of the required columns.
required_columns = ['id', 'title', 'author', 'venue', 'pub_date']

for file_name in os.listdir(source_folder_path):
    # Only CSV files are datasets. Without this filter any other directory
    # entry (a hidden file, a sub-folder, ...) would be handed to read_csv
    # and abort the whole run; the other checkpoint loops apply the same filter.
    if not file_name.endswith(".csv"):
        continue

    source_file_path = os.path.join(source_folder_path, file_name)

    # Read the dataset and drop incomplete rows. A new frame is produced
    # rather than mutating in place, so re-running the cell is safe.
    df = pd.read_csv(source_file_path).dropna(subset=required_columns)

    # Same file name, new folder.
    destination_file_path = os.path.join(destination_folder_path, file_name)

    # Save the cleaned DataFrame without the pandas index column.
    df.to_csv(destination_file_path, index=False)

print('Cleaning process completed and files saved in "checkpoint_2" folder.')

Cleaning process completed and files saved in "checkpoint_2" folder.


### Checkpoint 2.5: Elimino csv vuoti

In [12]:
import os
import pandas as pd

# Folder to scan; CSV files with no data rows are deleted from it in place.
folder_path = '../citations/checkpoint_2'

# Walk every entry of the folder.
for file_name in os.listdir(folder_path):
    # Only CSV files are considered.
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(folder_path, file_name)

    try:
        # A single data row is enough to decide whether the dataset is empty,
        # so the rest of the file does not need to be parsed.
        df = pd.read_csv(file_path, nrows=1)
        is_empty = df.empty
    except pd.errors.EmptyDataError:
        # Raised when the file has no header and no rows at all; treated as
        # empty, the same way the checkpoint 3.5 cell handles it.
        is_empty = True

    # Header-only and zero-byte files are both removed.
    if is_empty:
        os.remove(file_path)

print('Process completed. Empty datasets have been removed.')


Process completed. Empty datasets have been removed.


### Checkpoint 3: Elimino righe in cui compaiono caratteri cinesi, giapponesi, ideogrammi in generale

In [1]:
import os
import pandas as pd
import re

# Checkpoint 3 I/O locations: input is the checkpoint 2 output; filtered files
# are written to a separate checkpoint 3 folder.
destination_folder_path = '../citations/checkpoint_3'
source_folder_path = '../citations/checkpoint_2'

# Make sure the output directory is present; an existing one is reused as-is.
os.makedirs(name=destination_folder_path, exist_ok=True)

# Character class spanning the Unicode ranges U+2E80-2FDF (CJK/Kangxi radicals),
# U+3000-30FF (CJK symbols and punctuation, Hiragana, Katakana),
# U+3100-312F (Bopomofo), U+3200-32FF (enclosed CJK letters),
# U+3400-4DBF and U+4E00-9FFF (CJK unified ideographs, incl. Extension A)
# and U+F900-FAFF (CJK compatibility ideographs).
ideograph_regex = r'[\u2E80-\u2EFF\u2F00-\u2FDF\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\u3100-\u312F\u3200-\u32FF\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]'


def contains_ideograph(text):
    """Tell whether ``text`` holds at least one character in ``ideograph_regex``.

    Args:
        text: Any scalar value. Non-string values are converted with ``str()``
            before matching.

    Returns:
        bool: ``False`` for null values (as judged by ``pd.isnull``) and for
        text without a matching character, ``True`` otherwise.
    """
    if not pd.isnull(text):
        # A match object is always truthy, so bool() mirrors "match found".
        return bool(re.search(ideograph_regex, str(text)))
    return False

In [2]:
# Checkpoint 3: for every CSV in the source folder, drop the rows in which any
# kept column contains an ideographic character, and save the non-empty results.
columns_to_keep = ['id', 'title', 'author', 'venue', 'pub_date']

for file_name in os.listdir(source_folder_path):
    # Ensure the file is a CSV.
    if not file_name.endswith(".csv"):
        continue

    source_file_path = os.path.join(source_folder_path, file_name)

    # Read the dataset into a DataFrame.
    df = pd.read_csv(source_file_path)

    # Work only with the expected columns that are actually present.
    existing_columns_to_keep = [col for col in columns_to_keep if col in df.columns]

    # Drop rows with nulls in those columns, then discard every other column.
    df = df.dropna(subset=existing_columns_to_keep)[existing_columns_to_keep]

    # Build ONE explicit boolean row mask across all columns. Filtering the
    # frame once per column (as before) breaks as soon as it becomes empty:
    # `apply` on an empty column does not yield a boolean Series, so the
    # result is no longer a valid row mask for the following columns.
    has_ideograph = pd.Series(False, index=df.index, dtype=bool)
    for column in existing_columns_to_keep:
        has_ideograph |= df[column].map(contains_ideograph).astype(bool)
    df = df[~has_ideograph]

    # Datasets left without rows are not written at all.
    if not df.empty:
        # Same file name, new folder.
        destination_file_path = os.path.join(destination_folder_path, file_name)

        # Save the cleaned DataFrame without the pandas index column.
        df.to_csv(destination_file_path, index=False)

print('Cleaning process completed and files saved in "checkpoint_3" folder. Empty datasets were not saved.')


Cleaning process completed and files saved in "checkpoint_3" folder. Empty datasets were not saved.


In [10]:
### Checkpoint 3.5: Elimino csv vuoti

In [9]:
import os
import pandas as pd

# Folder to scan; CSV files with no data rows are deleted from it in place.
folder_path = '../citations/checkpoint_3'
deleted_files = 0  # Running count of removed files

# Walk every entry of the folder.
for file_name in os.listdir(folder_path):
    # Only CSV files are considered.
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(folder_path, file_name)

    try:
        # Load the dataset to find out whether it has any rows.
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # The file has no content at all: remove it (no per-file message here).
        os.remove(file_path)
        deleted_files += 1
    else:
        # Parsed fine but holds no rows: remove it and report the file name.
        if df.empty:
            os.remove(file_path)
            print(f"Removed empty dataset: {file_name}")
            deleted_files += 1

# Final summary with the number of deleted files.
print(f'Process completed. {deleted_files} empty datasets have been removed from checkpoint_3.')


Process completed. 22 empty datasets have been removed from checkpoint_3.


### Checkpoint 4: "Espando" le righe in modo che ogni autore abbia la propria riga mantenendo le informazioni originali delle altre colonne (come l'ID, il titolo, ecc.) invariate. ("unpivoting" o "melting" dei dati)

In [1]:
import os
import pandas as pd

# Checkpoint 4 I/O locations: input is the checkpoint 3 output; expanded files
# are written to a separate checkpoint 4 folder.
destination_folder_path = '../citations/checkpoint_4'
source_folder_path = '../citations/checkpoint_3'

# Make sure the output directory is present; an existing one is reused as-is.
os.makedirs(name=destination_folder_path, exist_ok=True)

In [None]:
# Checkpoint 4: for every CSV in the source folder, give each author its own
# row while repeating the other columns unchanged.
for file_name in os.listdir(source_folder_path):
    # Only CSV files are considered.
    if not file_name.endswith(".csv"):
        continue

    source_file_path = os.path.join(source_folder_path, file_name)
    destination_file_path = os.path.join(destination_folder_path, file_name)

    # Columns carried through this checkpoint.
    columns_to_keep = ['id', 'title', 'author', 'venue', 'pub_date']

    # Read the dataset, drop rows with nulls in the kept columns and discard
    # every other column.
    df = pd.read_csv(source_file_path)
    df = df.dropna(subset=columns_to_keep)[columns_to_keep]

    # Turn the "author" string into a list (the separator is exactly "; ",
    # semicolon followed by one space), emit one row per list element and
    # renumber the index.
    df = (
        df.assign(author=df['author'].str.split('; '))
        .explode('author')
        .reset_index(drop=True)
    )

    # Save the expanded DataFrame without the pandas index column.
    df.to_csv(destination_file_path, index=False)

print('Processo di pulizia completato e file salvati nella cartella "checkpoint_4".')

In [1]:
import os

# Folders whose CSV file counts are compared.
folder_path_checkpoint_3 = '../citations/checkpoint_3'
folder_path_checkpoint_4 = '../citations/checkpoint_4'

# Number of entries ending in ".csv" in each folder.
count_checkpoint_3 = sum(
    1 for entry in os.listdir(folder_path_checkpoint_3) if entry.endswith(".csv")
)
count_checkpoint_4 = sum(
    1 for entry in os.listdir(folder_path_checkpoint_4) if entry.endswith(".csv")
)

# Report both counts, then which folder (if any) holds more files.
print(f"Numero di file .csv in checkpoint_3: {count_checkpoint_3}")
print(f"Numero di file .csv in checkpoint_4: {count_checkpoint_4}")

if count_checkpoint_3 == count_checkpoint_4:
    print("Il numero di file in checkpoint_3 e checkpoint_4 è uguale.")
elif count_checkpoint_3 > count_checkpoint_4:
    print("Ci sono più file in checkpoint_3 rispetto a checkpoint_4.")
else:
    print("Ci sono più file in checkpoint_4 rispetto a checkpoint_3.")


Numero di file .csv in checkpoint_3: 27136
Numero di file .csv in checkpoint_4: 27136
Il numero di file in checkpoint_3 e checkpoint_4 è uguale.


### Checkpoint 4_processed: Estraggo gli identificatori ISSN e ISBN dalla colonna "venue", creo un nuovo campo "issn/isbn" ed "espando" le righe in modo che ogni riga sia associata a un solo valore di ISSN o di ISBN.

In [None]:
import pandas as pd
import os
import re

# Path settings. The previous '/path/to/...' values were unfilled absolute
# placeholders; these follow the same '../citations/<checkpoint>' layout used
# by every other cell of the notebook.
folder_path = '../citations/checkpoint_4'  # Input: output of checkpoint 4
output_folder = '../citations/checkpoint_4_processed'  # New folder for the processed files

# Create the output folder if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Matches either "issn:" followed by NNNN-NNN plus a digit or X/x (group 1),
# or "isbn:" followed by exactly 13 digits (group 2).
issn_isbn_pattern = re.compile(r'issn:\s*(\d{4}-\d{3}[0-9Xx])|isbn:\s*(\d{13})')


def process_file(file_path, output_path):
    """Add an "issn/isbn" column to one CSV, with one row per identifier found.

    The CSV at ``file_path`` is read; every ISSN/ISBN matched by
    ``issn_isbn_pattern`` in the "venue" column produces its own copy of the
    row, and rows without any match keep an empty "issn/isbn" value. The
    result is written to ``output_path`` without the pandas index.

    Args:
        file_path: Path of the CSV to read.
        output_path: Path of the CSV to write.

    Returns:
        None. When the input has no "venue" column nothing is written and a
        message is printed instead.
    """
    df = pd.read_csv(file_path)

    # Nothing to extract from: report and leave without writing a file.
    if 'venue' not in df.columns:
        print(f"No 'venue' column in {file_path}")
        return

    def find_identifiers(venue):
        # findall yields one (issn, isbn) tuple per match; nulls give no matches.
        if pd.isna(venue):
            return []
        return issn_isbn_pattern.findall(str(venue))

    def pick_identifier(match):
        # After explode, rows without matches hold a non-tuple placeholder.
        if not isinstance(match, tuple):
            return None
        issn, isbn = match
        # Exactly one of the two groups is filled for any given match.
        return issn if issn else isbn

    # Collect the matches per row, then spread them over separate rows.
    df['issn/isbn'] = df['venue'].apply(find_identifiers)
    df = df.explode('issn/isbn')
    df['issn/isbn'] = df['issn/isbn'].apply(pick_identifier)

    # Save the modified DataFrame.
    df.to_csv(output_path, index=False)
    print(f"Processed and saved: {output_path}")

# Run process_file on every CSV of the input folder, writing each result to
# the output folder under the same file name.
for filename in os.listdir(folder_path):
    # Entries that are not CSV files are ignored.
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, filename)
    output_path = os.path.join(output_folder, filename)
    process_file(file_path, output_path)
