In [13]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Verification Functions

### Check IDs format

In [14]:
def verif_format_ids(data, data_id_column):
    """Report whether every ID in a column follows the ``LIDC-IDRI-NNNN`` format.

    Each value is converted to ``str`` and must consist of exactly the literal
    ``LIDC-IDRI-`` followed by four digits. The outcome is printed; nothing is
    returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ID column.
    data_id_column : str
        Name of the column whose values are checked.
    """
    # fullmatch is used instead of match with a trailing '$': '$' also matches
    # just before a final newline, so "LIDC-IDRI-0001\n" used to be accepted.
    expected_format = re.compile(r'LIDC-IDRI-\d{4}')

    # Keep the original (unconverted) values so they are printed as stored
    different_format_ids = [
        id_value
        for id_value in data[data_id_column]
        if not expected_format.fullmatch(str(id_value))
    ]

    if not different_format_ids:
        print(f"The format of IDs in column '{data_id_column}' is the same (LIDC-IDRI- followed by a four-digit number).")
    else:
        print(f"The format of IDs in column '{data_id_column}' is not the same.")
        print("IDs with different formats:")
        for id_value in different_format_ids:
            print(id_value)


### Check duplicated IDs

In [15]:
def duplicated(data, data_id_column):
    """Print the values that appear more than once in ``data_id_column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ID column.
    data_id_column : str
        Name of the column to inspect.
    """
    id_series = data[data_id_column]
    # keep=False flags every occurrence of a repeated value, not only the later ones
    repeated_mask = id_series.duplicated(keep=False)
    repeated_ids = id_series[repeated_mask]

    if repeated_ids.empty:
        print("No duplicated IDs found in column '" + data_id_column + "'.")
        return

    print("Duplicated IDs in column '" + data_id_column + "':")
    # Show each duplicated value once
    print(repeated_ids.unique())


### Check whether the two datasets contain different IDs

In [16]:
def verify_different_ids(data1, data2, column1, column2):
    """Tell whether the two ID columns hold different sets of values.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Frames to compare.
    column1, column2 : str
        ID column name in ``data1`` and ``data2`` respectively.

    Returns
    -------
    bool
        ``True`` when at least one ID is present in only one of the columns,
        ``False`` when both columns contain the same set of IDs.
    """
    ids_first = set(data1[column1].unique())
    ids_second = set(data2[column2].unique())

    # A non-empty symmetric difference means some ID is missing on one side
    return bool(ids_first ^ ids_second)


### Check if there are still IDs with white spaces after removal

In [17]:
def ids_with_whitespace(data, data_id_column):
    """Print the rows whose ID contains a space character.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ID column.
    data_id_column : str
        Name of the (string) ID column to inspect.
    """
    # na=False: a missing ID is treated as "no space". Without it the mask can
    # contain NaN and boolean indexing raises instead of reporting.
    # regex=False: the needle is a literal space, no pattern matching needed.
    has_space = data[data_id_column].str.contains(' ', regex=False, na=False)
    ids_with_whitespace_data = data[has_space]

    # Print IDs with whitespace, if any
    if not ids_with_whitespace_data.empty:
        print("IDs with whitespace in :")
        print(ids_with_whitespace_data)
    else:
        print("No IDs with whitespace")




### check for missing data

In [18]:
def missing_data(data):
    """Print the number of missing values for every column that has any.

    Columns without missing values produce no output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    """
    na_counts = data.isna().sum()

    for column, count in na_counts.items():
        # Skip columns that are fully populated
        if count > 0:
            print(f"Missing values in {column}: {count}")


In [None]:
def find_missing_values(dataframe):
    """Locate missing values column by column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        Maps each column that has missing values to the list of index labels
        where they occur. Columns without missing values are omitted. When the
        result is empty a message is printed as well.
    """
    # Index labels of the NaN rows for every column, in column order
    rows_by_column = {
        column: dataframe.index[dataframe[column].isna()].tolist()
        for column in dataframe.columns
    }

    # Only keep the columns that actually have something missing
    missing_values_dict = {
        column: rows for column, rows in rows_by_column.items() if rows
    }

    if len(missing_values_dict) == 0:
        print("No missing values found in the DataFrame.")

    return missing_values_dict

### Check for null values

In [19]:
def check_null_values(data):
    """Count the null cells of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple
        ``(has_null_values, null_count)``: whether any cell is null, and the
        total number of null cells across all columns.
    """
    nulls_per_column = data.isnull().sum()
    null_total = nulls_per_column.sum()

    return null_total > 0, null_total



## check for common IDs after processing

In [20]:
def check_common_ids(dataset1, column_id_dataset1, dataset2, column_id_dataset2):
    """Print the IDs that appear in both datasets.

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        Frames to compare.
    column_id_dataset1, column_id_dataset2 : str
        ID column name in ``dataset1`` and ``dataset2`` respectively.
    """
    ids_first = set(dataset1[column_id_dataset1].unique())
    ids_second = set(dataset2[column_id_dataset2].unique())
    common_ids = ids_first & ids_second

    if common_ids:
        print("Common IDs found between the two datasets:")
        print(common_ids)
    else:
        print("No common IDs found between the two datasets.")


## Check for differences in IDs

In [None]:
def diff_ids(dataset1, column_id_dataset1, dataset2, column_id_dataset2):
    """Print whether both ID columns contain exactly the same set of IDs.

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        Frames to compare.
    column_id_dataset1, column_id_dataset2 : str
        ID column name in ``dataset1`` and ``dataset2`` respectively.
    """
    ids_first = set(dataset1[column_id_dataset1])
    ids_second = set(dataset2[column_id_dataset2])

    # Sets ignore row order and repetitions; only membership is compared
    if ids_first != ids_second:
        print("IDs in the datasets do not match.")
        return

    print("All IDs in both datasets match.")


## Check for duplicated columns

In [None]:
def check_duplicate_columns(data):
    """Find columns whose contents are identical to another column.

    Columns are compared pairwise with ``Series.equals``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Labels of every column that has at least one identical twin, each label
        once, in the order the columns appear in ``data``. The order used to
        come from ``list(set(...))`` and could change between kernel restarts.
    """
    column_count = len(data.columns)
    # Track positions rather than labels so the final order follows the frame
    duplicated_positions = set()

    # Compare each unordered pair of columns once
    for i in range(column_count):
        col1 = data.iloc[:, i]
        for j in range(i + 1, column_count):
            if col1.equals(data.iloc[:, j]):
                duplicated_positions.add(i)
                duplicated_positions.add(j)

    duplicate_columns = []
    for position in sorted(duplicated_positions):
        label = data.columns[position]
        # A repeated label is reported once, as before
        if label not in duplicate_columns:
            duplicate_columns.append(label)

    return duplicate_columns

## IDs Inspection

In [None]:
def inspect_ids(dataset1, column_id_dataset1, dataset2, column_id_dataset2):
    """Print the IDs that are exclusive to each of the two datasets.

    The first dataset is reported under the label "metadata" and the second
    under the label "nodule".

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        Frames to compare.
    column_id_dataset1, column_id_dataset2 : str
        ID column name in ``dataset1`` and ``dataset2`` respectively.
    """
    ids_first = set(dataset1[column_id_dataset1])
    ids_second = set(dataset2[column_id_dataset2])

    print("IDs only in metadata:", ids_first.difference(ids_second))
    print("IDs only in nodule:", ids_second.difference(ids_first))


# Data Processing Functions

### Remove WhiteSpaces

In [21]:
def remove_whiteSpace(data, data_id_column):
    """Strip leading and trailing whitespace from a string column, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    data_id_column : str
        Name of the string column to clean.
    """
    stripped_ids = data[data_id_column].str.strip()
    data[data_id_column] = stripped_ids

### Remove missing data

In [22]:
# Remove rows with missing values
def remove_missing_data(data, data_column):
    """Drop, in place, the rows whose value in ``data_column`` is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; rows are removed from this object.
    data_column : str
        Column that must be populated for a row to be kept.
    """
    required_columns = [data_column]
    data.dropna(subset=required_columns, inplace=True)

# Fill missing values in the column with an empty string
def fill_missing_data(data, data_id_column):
    """Replace missing values in ``data_id_column`` with ``''``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    data_id_column : str
        Name of the column whose missing values are filled.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # data[col]: that chained form acts on a temporary object, so under pandas
    # Copy-on-Write the frame is left untouched.
    data[data_id_column] = data[data_id_column].fillna('')


### Exclude Unknown rows

In [23]:

def remove_unknown_rows(data, data_column, value_to_remove=0):
    """Return the rows whose ``data_column`` differs from ``value_to_remove``.

    The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    data_column : str
        Column compared against ``value_to_remove``.
    value_to_remove : scalar, default 0
        Rows holding this value in ``data_column`` are excluded.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index labels.
    """
    keep_mask = data[data_column] != value_to_remove
    return data[keep_mask]


#### Process duplicated data using mean

In [24]:
def process_duplicated(data, data_id_column):
    """Collapse repeated IDs: dedupe ``data`` in place and return per-ID means.

    Previously the per-ID mean was computed after the duplicates had already
    been dropped and was assigned to a local name, so it was never visible to
    the caller. The mean is now taken over all rows of each ID and returned.
    The in-place effect on ``data`` is unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to process. It is modified in place: only the first row of each
        ID is kept, with all of its columns.
    data_id_column : str
        Name of the ID column used to detect duplicates and to group rows.

    Returns
    -------
    pandas.DataFrame
        One row per ID holding the mean of every numeric column, computed from
        the rows present before de-duplication. Non-numeric columns are not
        included.
    """
    # Must run before drop_duplicates; afterwards each group has a single row
    # and the "mean" would just be that row.
    averaged = data.groupby(data_id_column).mean(numeric_only=True).reset_index()

    # In-place side effect existing callers depend on
    data.drop_duplicates(subset=data_id_column, keep='first', inplace=True)

    return averaged


## Get only the common IDs

In [None]:
def get_common_ids(dataset1, column_id_dataset1, dataset2, column_id_dataset2):
    """Return the IDs present in both datasets.

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        Frames to compare.
    column_id_dataset1, column_id_dataset2 : str
        ID column name in ``dataset1`` and ``dataset2`` respectively.

    Returns
    -------
    list
        Each common ID once, in the order of its first appearance in
        ``dataset1``. The order used to come from ``list(set & set)`` and could
        change between kernel restarts.
    """
    # unique() keeps first-appearance order, which fixes the output order
    ids_first = dataset1[column_id_dataset1].unique()
    ids_second = set(dataset2[column_id_dataset2].unique())

    common_ids = [identifier for identifier in ids_first if identifier in ids_second]

    return common_ids


## Change the id format

In [None]:
def remove_prefix(data, data_column_id, prefix):
    """Strip a leading ``prefix`` from the values of a string column, in place.

    Only an occurrence at the very start of a value is removed, and ``prefix``
    is always treated as literal text. The earlier ``str.replace`` call removed
    every occurrence anywhere in the value, and on pandas versions where
    ``regex`` defaults to ``True`` it interpreted ``prefix`` as a pattern.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    data_column_id : str
        Name of the string column to clean.
    prefix : str
        Text to remove from the start of each value.
    """
    ids = data[data_column_id]
    # na=False: missing values have no prefix and are left as they are
    has_prefix = ids.str.startswith(prefix, na=False)
    # Keep values without the prefix; slice the prefix off the others
    data[data_column_id] = ids.where(~has_prefix, ids.str[len(prefix):])

## Remove null Values

In [None]:
def remove_null_values(data):
    """Return a copy of ``data`` without the rows that contain any null value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` in which every column is populated.
    """
    return data.dropna()

## Replace Nan with Zero

In [None]:
def replace_nan_with_zero(df):
    """Return a copy of ``df`` in which every missing value is replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with missing values filled with 0.
    """
    zero_filled = df.fillna(0)
    return zero_filled

## Get Columns with dtype string

In [None]:
def get_string_columns(dataframe):
    """Return the labels of the columns stored with the ``object`` dtype.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Index
        Labels of the ``object``-dtype columns, in frame order.
    """
    object_frame = dataframe.select_dtypes(include=['object'])
    return object_frame.columns


