In [20]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Verification Functions

## Check IDs format

In [21]:
def verif_format_ids(data, data_id_column):
    """Report whether every ID in a column follows the ``LIDC-IDRI-NNNN`` format.

    Each value is converted with ``str()`` and must consist of exactly the
    prefix ``LIDC-IDRI-`` followed by four ASCII digits. The outcome is
    printed; nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the ID column.
    data_id_column : hashable
        Label of the column whose values are checked.
    """
    # fullmatch() replaces match() with '^...$': '$' also matches just before a
    # trailing newline, so 'LIDC-IDRI-0001\n' used to be accepted as valid.
    # re.ASCII limits \d to 0-9 so non-ASCII digit characters are rejected too.
    expected_format = re.compile(r'LIDC-IDRI-\d{4}', re.ASCII)

    # Collect the values as stored (not their str() form) so the report shows
    # exactly what is in the column.
    different_format_ids = [
        id_value
        for id_value in data[data_id_column]
        if not expected_format.fullmatch(str(id_value))
    ]

    if not different_format_ids:
        print(f"The format of IDs in column '{data_id_column}' is the same (LIDC-IDRI- followed by a four-digit number).")
    else:
        print(f"The format of IDs in column '{data_id_column}' is not the same.")
        print("IDs with different formats:")
        for id_value in different_format_ids:
            print(id_value)
            

## Check duplicated IDs

In [22]:
def duplicated(data, data_id_column):
    """Print the values that occur more than once in an ID column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the ID column.
    data_id_column : hashable
        Label of the column to inspect.

    Prints the distinct repeated values, or a message saying that none were
    found. Nothing is returned.
    """
    id_values = data[data_id_column]

    # keep=False flags every occurrence of a repeated value, not only the
    # second and later ones.
    duplicated_ids = id_values[id_values.duplicated(keep=False)]

    # f-strings instead of '+' concatenation so that non-string column labels
    # (e.g. integer labels) do not raise TypeError while building the message.
    if not duplicated_ids.empty:
        print(f"Duplicated IDs in column '{data_id_column}':")
        print(duplicated_ids.unique())  # each repeated value listed once
    else:
        print(f"No duplicated IDs found in column '{data_id_column}'.")
        

## Check whether two datasets contain different IDs

In [23]:
def verify_different_ids(data1, data2, column1, column2):
    """Tell whether two ID columns hold different sets of distinct values.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two tables to compare.
    column1, column2 : hashable
        Label of the ID column in ``data1`` and ``data2`` respectively.

    Returns
    -------
    bool
        True when at least one value appears in only one of the two columns,
        False when both columns contain the same distinct values.
    """
    ids_left = set(data1[column1].unique())
    ids_right = set(data2[column2].unique())

    # '^' is the symmetric difference: values present in exactly one side.
    present_in_one_only = ids_left ^ ids_right
    return bool(present_in_one_only)


## Check if there are still IDs with white spaces after removal

In [24]:
def ids_with_whitespace(data, data_id_column):
    """Print the rows whose ID contains a space character.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the ID column.
    data_id_column : hashable
        Label of a string column to inspect.

    Prints the matching rows, or a message saying that none were found.
    Nothing is returned.
    """
    # na=False: a missing ID counts as "contains no space". Without it,
    # str.contains yields NaN for missing entries and using that result as a
    # boolean mask raises ValueError.
    # regex=False: ' ' is searched as a literal substring.
    has_space = data[data_id_column].str.contains(' ', na=False, regex=False)
    ids_with_whitespace_data = data[has_space]

    if not ids_with_whitespace_data.empty:
        print("IDs with whitespace in :")
        print(ids_with_whitespace_data)
    else:
        print("No IDs with whitespace")


## Check for missing data

In [25]:
def missing_data(data):
    """Print the number of missing values for every column that has any.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    One line is printed per affected column, in column order. Columns without
    missing values produce no output. Nothing is returned.
    """
    per_column_missing = data.isna().sum()

    for column, count in per_column_missing.items():
        if count > 0:
            print(f"Missing values in {column}: {count}")


In [26]:
def find_missing_values(dataframe):
    """Locate missing values column by column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    dict
        Maps each column label that has missing values to the list of index
        labels of the affected rows. Columns without missing values are left
        out. When the dict is empty, a message is also printed.
    """
    # Index labels of the missing entries for every column...
    rows_by_column = {
        column: dataframe.index[dataframe[column].isna()].tolist()
        for column in dataframe.columns
    }
    # ...then keep only the columns where something is actually missing.
    missing_values_dict = {
        column: rows for column, rows in rows_by_column.items() if rows
    }

    if len(missing_values_dict) == 0:
        print("No missing values found in the DataFrame.")

    return missing_values_dict

## Check for null values

In [27]:
def check_null_values(data):
    """Count the null values in a table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    tuple
        ``(has_null_values, null_count)``: whether any null value exists, and
        the total number of null values over all columns.
    """
    # First sum() gives per-column counts, the second adds them up.
    per_column_nulls = data.isnull().sum()
    total_nulls = per_column_nulls.sum()

    return total_nulls > 0, total_nulls


## Check for common IDs after processing

In [28]:
def check_common_ids(dataset1, column_id_dataset1, dataset2, column_id_dataset2):
    """Print the IDs that appear in both datasets.

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        The two tables to compare.
    column_id_dataset1, column_id_dataset2 : hashable
        Label of the ID column in ``dataset1`` and ``dataset2`` respectively.

    Prints the set of shared IDs, or a message saying that there are none.
    Nothing is returned.
    """
    ids_first = set(dataset1[column_id_dataset1].unique())
    ids_second = set(dataset2[column_id_dataset2].unique())
    common_ids = ids_first & ids_second

    if common_ids:
        print("Common IDs found between the two datasets:")
        print(common_ids)
        return

    print("No common IDs found between the two datasets.")


## Check for differences in IDs

In [29]:
def diff_ids(dataset1, column_id_dataset1, dataset2, column_id_dataset2):
    """Print whether two ID columns contain exactly the same set of values.

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        The two tables to compare.
    column_id_dataset1, column_id_dataset2 : hashable
        Label of the ID column in ``dataset1`` and ``dataset2`` respectively.

    Repetitions and row order are ignored because the columns are compared as
    sets. Nothing is returned.
    """
    ids_first = set(dataset1[column_id_dataset1])
    ids_second = set(dataset2[column_id_dataset2])

    message = (
        "All IDs in both datasets match."
        if ids_first == ids_second
        else "IDs in the datasets do not match."
    )
    print(message)


## Check for duplicated columns

In [30]:

def check_duplicate_columns(data):
    """List the columns whose contents are identical to another column.

    Every pair of columns is compared by position with ``Series.equals``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    list
        Labels of all columns that take part in at least one matching pair.
        The labels are gathered in a set first, so their order in the list is
        not defined.
    """
    labels = data.columns
    n_columns = len(labels)
    matched = set()

    for left in range(n_columns):
        left_values = data.iloc[:, left]
        # Only look to the right of `left` so each pair is compared once.
        for right in range(left + 1, n_columns):
            if left_values.equals(data.iloc[:, right]):
                matched.update((labels[left], labels[right]))

    return list(matched)


## IDs Inspection

In [31]:
def inspect_ids(dataset1, column_id_dataset1, dataset2, column_id_dataset2):
    """Print the IDs that are present in only one of the two datasets.

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        The two tables to compare. The printed labels refer to ``dataset1`` as
        "metadata" and to ``dataset2`` as "nodule".
    column_id_dataset1, column_id_dataset2 : hashable
        Label of the ID column in ``dataset1`` and ``dataset2`` respectively.

    Two lines are printed, one per direction of the comparison. Nothing is
    returned.
    """
    ids_first = set(dataset1[column_id_dataset1])
    ids_second = set(dataset2[column_id_dataset2])

    print("IDs only in metadata:", ids_first.difference(ids_second))
    print("IDs only in nodule:", ids_second.difference(ids_first))


# Data Processing Functions

## Remove WhiteSpaces

In [32]:
def remove_whiteSpace(data, data_id_column):
    """Strip leading and trailing whitespace from a string column, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify; the column is overwritten on this object.
    data_id_column : hashable
        Label of the string column to clean.

    Whitespace inside a value is left untouched. Nothing is returned.
    """
    stripped_ids = data[data_id_column].str.strip()
    data[data_id_column] = stripped_ids
    

## Remove missing data

In [33]:
# Drop the rows that have no value in one given column
def remove_missing_data(data, data_column):
    """Remove, in place, every row whose value in ``data_column`` is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify; rows are dropped from this object.
    data_column : hashable
        Label of the column that must not be missing.

    Missing values in other columns do not cause a row to be dropped.
    Nothing is returned.
    """
    required_columns = [data_column]
    data.dropna(subset=required_columns, inplace=True)

# Fill missing values in the specified column with an empty string
def fill_missing_data(data, data_id_column):
    """Replace missing values in one column with ``''``, modifying ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify; the column is overwritten on this object.
    data_id_column : hashable
        Label of the column whose missing values are filled.

    Nothing is returned.
    """
    # Assign the filled column back onto the frame. The earlier form,
    # data[col].fillna('', inplace=True), acted on the intermediate Series
    # returned by data[col]: pandas 2.2 warns about this chained in-place
    # call, and under Copy-on-Write (the default from pandas 3.0) the change
    # never reaches `data`.
    data[data_id_column] = data[data_id_column].fillna('')


## Exclude Unknown rows

In [43]:
def remove_unknown_rows(data, data_column, value_to_remove=0):
    """Return the rows whose value in ``data_column`` differs from a given value.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; it is not modified.
    data_column : hashable
        Label of the column to test.
    value_to_remove : scalar, default 0
        Rows where ``data_column`` equals this value are excluded.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels.
    """
    keep_row = data[data_column] != value_to_remove
    return data.loc[keep_row]


## Process duplicated data using mean

In [35]:
def process_duplicated(data, data_id_column):
    """Collapse rows that share an ID into the per-ID mean of the numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that may contain several rows per ID.
    data_id_column : hashable
        Label of the ID column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ID, holding the ID column and the mean of every
        numeric column. Non-numeric columns are left out
        (``numeric_only=True``).

    Side effect
    -----------
    ``data`` is still de-duplicated in place, keeping the first row of each
    ID. This is the only effect the function used to have, so it is preserved
    for callers that rely on it and ignore the return value.
    """
    # Average BEFORE dropping duplicates. Previously the duplicates were
    # dropped first, so every group held a single row and the "mean" was just
    # that row; the result was then bound to a local name and thrown away.
    averaged = data.groupby(data_id_column).mean(numeric_only=True).reset_index()

    data.drop_duplicates(subset=data_id_column, keep='first', inplace=True)

    return averaged


## Get common IDs only

In [36]:
def get_common_ids(dataset1, column_id_dataset1, dataset2, column_id_dataset2):
    """Return the IDs that appear in both datasets.

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        The two tables to compare.
    column_id_dataset1, column_id_dataset2 : hashable
        Label of the ID column in ``dataset1`` and ``dataset2`` respectively.

    Returns
    -------
    list
        The shared IDs, each listed once. The list is built from a set, so its
        order is not defined.
    """
    ids_first = set(dataset1[column_id_dataset1].unique())
    ids_second = set(dataset2[column_id_dataset2].unique())

    return list(ids_first.intersection(ids_second))


## Change the ID format

In [37]:
def remove_prefix(data, data_column_id, prefix):
    """Strip a leading ``prefix`` from every string in a column, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify; the column is overwritten on this object.
    data_column_id : hashable
        Label of the string column to rewrite.
    prefix : str
        Literal text to remove from the start of each value.

    Values that do not start with ``prefix`` are left unchanged. Nothing is
    returned.
    """
    # Anchor the pattern at the start so only a real prefix is removed; the
    # plain str.replace(prefix, '') also deleted the text wherever it appeared
    # inside a value. re.escape() plus an explicit regex=True makes the prefix
    # literal regardless of the pandas version (the default of `regex` for
    # Series.str.replace changed between major versions).
    leading_prefix = '^' + re.escape(prefix)
    data[data_column_id] = data[data_column_id].str.replace(leading_prefix, '', regex=True)

## Remove null values

In [38]:
def remove_null_values(data):
    """Return a copy of the table without the rows that contain any null value.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows in which every column has a value.
    """
    # axis=0 / how='any' spell out the dropna() defaults: drop a row as soon
    # as one of its cells is null.
    return data.dropna(axis=0, how='any')


## Replace NaN with Zero

In [39]:
def replace_nan_with_zero(df):
    """Return a copy of the table in which every missing value is replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Table of the same shape with missing values set to 0.
    """
    zero_filled = df.fillna(value=0)
    return zero_filled


## Get Columns with dtype string

In [40]:
def get_string_columns(dataframe):
    """Return the labels of the columns stored with the ``object`` dtype.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.Index
        Labels of the ``object``-dtype columns, in their original order.
    """
    object_typed = dataframe.select_dtypes(include=['object'])
    return object_typed.columns
