In [2]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Verification Functions

### Check IDs format

In [3]:
def verif_format_ids(data, data_id_column):
    """Report whether every ID in a column follows the LIDC-IDRI-NNNN format.

    Each value is converted to ``str`` and checked against the pattern
    ``LIDC-IDRI-`` followed by exactly four digits. The outcome is printed;
    nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that holds the ID column.
    data_id_column : str
        Name of the column whose values are validated.
    """
    expected_format = re.compile(r'^LIDC-IDRI-\d{4}$')

    # fullmatch is used instead of match: with match, the '$' anchor also
    # matches just before a trailing newline, so 'LIDC-IDRI-0001\n' would be
    # accepted as well-formed.
    different_format_ids = [
        id_value
        for id_value in data[data_id_column]
        if not expected_format.fullmatch(str(id_value))
    ]

    if not different_format_ids:
        print(f"The format of IDs in column '{data_id_column}' is the same (LIDC-IDRI- followed by a four-digit number).")
    else:
        print(f"The format of IDs in column '{data_id_column}' is not the same.")
        print("IDs with different formats:")
        for id_value in different_format_ids:
            print(id_value)


### Check duplicated IDs

In [4]:
def duplicated(data, data_id_column):
    """Print the ID values that occur more than once in a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that holds the ID column.
    data_id_column : str
        Name of the column to inspect.
    """
    id_series = data[data_id_column]
    # keep=False flags every occurrence of a repeated ID, not only the later ones
    repeated = id_series[id_series.duplicated(keep=False)]

    if repeated.empty:
        print("No duplicated IDs found in column '" + data_id_column + "'.")
        return

    print("Duplicated IDs in column '" + data_id_column + "':")
    # Show each duplicated value once
    print(repeated.unique())


### Check whether any IDs differ between two datasets

In [5]:
def verify_different_ids(data1, data2, column1, column2):
    """Tell whether two ID columns contain different sets of values.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two tables to compare.
    column1, column2 : str
        Name of the ID column in ``data1`` and ``data2`` respectively.

    Returns
    -------
    bool
        True when at least one ID appears in only one of the two columns,
        False when both columns hold exactly the same set of IDs.
    """
    ids_first = set(data1[column1].unique())
    ids_second = set(data2[column2].unique())

    # IDs present on exactly one side
    only_on_one_side = ids_first ^ ids_second
    return bool(only_on_one_side)


### Check whether any IDs still contain whitespace after removal

In [6]:
def ids_with_whitespace(data, data_id_column):
    """Print the rows whose ID contains a space character.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that holds the ID column.
    data_id_column : str
        Name of the string column to inspect.
    """
    # na=False makes missing IDs count as "no whitespace". Without it the mask
    # holds NaN for missing values and boolean indexing raises ValueError.
    has_space = data[data_id_column].str.contains(' ', na=False)
    ids_with_whitespace_data = data[has_space]

    # Print IDs with whitespace, if any
    if not ids_with_whitespace_data.empty:
        print("IDs with whitespace in :")
        print(ids_with_whitespace_data)
    else:
        print("No IDs with whitespace")




### Check for missing data

In [7]:
def missing_data(data):
    """Print, for every column, how many of its values are missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.
    """
    na_per_column = data.isna().sum()
    for column_name, na_count in na_per_column.items():
        print(f"Missing values in {column_name}: {na_count}")




### Check for null values

In [8]:
def check_null_values(data):
    """Count the null cells in a table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    tuple
        ``(has_null_values, null_count)``: whether any cell is null, and the
        total number of null cells across all columns.
    """
    # First sum gives per-column counts, second sum collapses them to a total
    null_total = data.isnull().sum().sum()
    return null_total > 0, null_total



# Data Processing Functions

### Remove whitespace

In [9]:
def remove_whiteSpace(data, data_id_column):
    """Strip leading and trailing whitespace from a string column, in place.

    The column of ``data`` is overwritten with the stripped values; nothing
    is returned. Whitespace inside a value is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify.
    data_id_column : str
        Name of the string column to clean.
    """
    stripped_ids = data[data_id_column].str.strip()
    data[data_id_column] = stripped_ids

### Remove missing data

In [10]:
def remove_missing_data(data, data_column):
    """Drop, in place, the rows whose value in ``data_column`` is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify; it is changed directly and nothing is returned.
    data_column : str
        Name of the column that must not be missing.
    """
    required_columns = [data_column]
    data.dropna(subset=required_columns, inplace=True)

def fill_missing_data(data, data_id_column):
    """Replace missing values in a column with the empty string, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify; the column is overwritten and nothing is returned.
    data_id_column : str
        Name of the column whose missing values are filled.
    """
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on data[col]: that chained in-place call acts on a temporary object and
    # leaves `data` unchanged under pandas Copy-on-Write.
    data[data_id_column] = data[data_id_column].fillna('')


### Exclude Unknown rows

In [11]:

def remove_unknown_rows(data, data_column, value_to_remove=0):
    """Return the rows whose value in ``data_column`` differs from a given value.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter; it is not modified.
    data_column : str
        Name of the column to test.
    value_to_remove : optional
        Rows where the column equals this value are excluded (default 0).

    Returns
    -------
    pandas.DataFrame
        The filtered table.
    """
    keep_mask = data[data_column] != value_to_remove
    return data[keep_mask]


### Process duplicated data using the mean

In [12]:
def process_duplicated(data, data_id_column):
    """Collapse rows that share an ID by averaging their numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that may contain several rows per ID. As a side effect it is
        de-duplicated in place, keeping the first row of each ID.
    data_id_column : str
        Name of the ID column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per ID holding the mean of every numeric column; non-numeric
        columns are excluded because of ``numeric_only=True``.
    """
    # Average BEFORE dropping duplicates. If duplicates are dropped first,
    # every group holds a single row and the mean is a no-op.
    averaged = data.groupby(data_id_column).mean(numeric_only=True).reset_index()

    # Kept so that callers relying on the in-place de-duplication still work
    data.drop_duplicates(subset=data_id_column, keep='first', inplace=True)

    # The aggregated frame used to be assigned to a local and discarded
    return averaged
