In [1]:
# Importing libraries required to clean, standardize, and prepare the dataset for further analysis.

import numpy as np
import pandas as pd
import zipfile
import os
from datetime import datetime

import time
start_time  = time.time()

In [2]:
# Define the directory path where datasets will be stored
data_directory = "datasets"

# Create the directory if it doesn't exist, avoiding errors if it already exists
os.makedirs(data_directory, exist_ok=True)

# SIMPLIFIED VERSION - Basic loop for loading ITBI datasets

# Define dataset URLs as (year label, CSV download URL) pairs
dataset_sources = [
    ("2023", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/d0c08a6f-4c27-423c-9219-8d13403816f4/download/itbi_2023.csv"),
    ("2024", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/a36d548b-d705-496a-ac47-4ec36f068474/download/itbi_2024.csv"),
    ("2025", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/5b582147-3935-459a-bbf7-ee623c22c97b/download/itbi_2025.csv")
]

print("üè† LOADING ITBI DATASETS - RECIFE")
print("=" * 40)

# Running totals and the per-year storage filled by the loop below
load_success_count = 0
all_records_total = 0
all_columns_total = 0
years_loaded = []
data_storage = {}  # Dictionary to store the datasets, keyed by year label

# Columns each yearly file is checked for (loop-invariant, so built once)
required_columns = ['bairro', 'tipo_imovel', 'valor_avaliacao', 'data_transacao']

# Pre-bind the names removed by the final `del`: if every download fails before
# they are assigned inside the loop, the cleanup would otherwise raise NameError.
temp_dataframe = None
missing_columns = []

for load_year, data_url in dataset_sources:
    print(f"\nüìÖ Loading ITBI data {load_year}...")
    print(f"   üîó URL: {data_url[:80]}...")

    try:
        # Try to load the CSV (semicolon-separated, read as UTF-8)
        print(f"   ‚è≥ Downloading file...")
        temp_dataframe = pd.read_csv(data_url, sep=';', encoding='utf-8')

        # An empty file is treated as a failed load for this year
        if temp_dataframe.empty:
            raise ValueError("Dataset loaded is empty")

        # Missing expected columns only produce a warning; the dataset is still kept
        missing_columns = [col for col in required_columns if col not in temp_dataframe.columns]

        if missing_columns:
            print(f"   ‚ö†Ô∏è  Warning: Missing columns: {missing_columns}")

        # Add year column
        temp_dataframe['year'] = int(load_year)

        # Basic size information (column count includes the added 'year' column)
        current_records = len(temp_dataframe)
        current_columns = len(temp_dataframe.columns)

        # Add to general totals
        all_records_total += current_records
        all_columns_total = current_columns  # Assume all have the same number of columns
        years_loaded.append(load_year)

        # Save dataset in dictionary for later manipulation
        data_storage[load_year] = temp_dataframe.copy()  # Create an independent copy

        print(f"   ‚úÖ Success: {current_records:,} records, {current_columns} columns")
        print(f"   üìä Data sample:")

        # Check if 'bairro' column exists before showing
        if 'bairro' in temp_dataframe.columns:
            sample_neighborhoods = temp_dataframe['bairro'].head(3).tolist()
            print(f"      First neighborhoods: {sample_neighborhoods}")
            del sample_neighborhoods
        else:
            first_column_sample = temp_dataframe.iloc[:3, 0].tolist()
            print(f"      First 3 rows of first column: {first_column_sample}")
            del first_column_sample

        load_success_count += 1
        del current_records, current_columns

    except Exception as load_error:
        # Best effort: report the failure and continue with the next year
        print(f"   ‚ùå Error loading data for {load_year}: {type(load_error).__name__}")
        print(f"      Details: {str(load_error)}")

# Clean up loop variables (all are guaranteed to be bound at this point)
del load_year, data_url, temp_dataframe, required_columns, missing_columns

print(f"\nüîç VERIFication")
print("-" * 20)
print(f"   ‚Ä¢ Total datasets loaded: {load_success_count}")
print(f"   ‚Ä¢ Years included: {years_loaded}")
print(f"   ‚Ä¢ Expected datasets: {len(dataset_sources)}")
print()

print(f"üìä FINAL DATASET SUMMARY")
print("=" * 30)
print(f"   ‚Ä¢ Total records: {all_records_total:,}")
print(f"   ‚Ä¢ Total columns: {all_columns_total}")
print(f"   ‚Ä¢ Years included: {years_loaded}")

# Report the real per-year outcome instead of unconditionally claiming success
for expected_year, _expected_url in dataset_sources:
    if expected_year in data_storage:
        print(f"   ‚Ä¢ {expected_year}: Dataset loaded successfully")
    else:
        print(f"   ‚Ä¢ {expected_year}: Dataset NOT loaded")
del expected_year, _expected_url

# Show a sample from the most recently loaded year (2025 when every load succeeded).
# Guarded so that a failed download does not turn into a KeyError here.
if years_loaded:
    latest_dataset = data_storage[years_loaded[-1]]
    sample_columns = [
        col for col in ['bairro', 'tipo_imovel', 'valor_avaliacao', 'data_transacao']
        if col in latest_dataset.columns
    ]
    print(f"\nüìã Sample data (first 3 rows):")
    # head(3) so the output matches the "first 3 rows" label
    sample_data = latest_dataset[sample_columns].head(3)
    print(sample_data)
    del latest_dataset, sample_columns, sample_data

print(f'\n‚úÖ Directory "{data_directory}" is ready for use.')
if load_success_count == len(dataset_sources):
    print("‚úÖ ETL Extract phase completed successfully!")
else:
    print(f"‚ö†Ô∏è  ETL Extract phase incomplete: {load_success_count} of {len(dataset_sources)} datasets loaded")

# Clean up all intermediate variables
del load_success_count, all_records_total, all_columns_total, years_loaded

# Rename final variables for consistency
dataset_directory = data_directory
datasets_dict = data_storage
del data_directory, data_storage



üè† LOADING ITBI DATASETS - RECIFE

üìÖ Loading ITBI data 2023...
   üîó URL: http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resou...
   ‚è≥ Downloading file...
   ‚úÖ Success: 12,669 records, 23 columns
   üìä Data sample:
      First neighborhoods: ['Encruzilhada', 'Encruzilhada', 'Encruzilhada']

üìÖ Loading ITBI data 2024...
   üîó URL: http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resou...
   ‚è≥ Downloading file...
   ‚úÖ Success: 15,242 records, 23 columns
   üìä Data sample:
      First neighborhoods: ['Encruzilhada', 'Encruzilhada', 'Encruzilhada']

üìÖ Loading ITBI data 2025...
   üîó URL: http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resou...
   ‚è≥ Downloading file...
   ‚úÖ Success: 7,206 records, 23 columns
   üìä Data sample:
      First neighborhoods: ['Encruzilhada', 'Encruzilhada', 'Encruzilhada']

üîç VERIFication
--------------------
   ‚Ä¢ Total datasets loaded: 3
   ‚Ä

In [3]:
# Save dataframes as CSV files and create ZIP archive

print("üíæ SAVING DATASETS TO FILES AND CREATING ZIP ARCHIVE")
print("=" * 55)

# Initialize control variables
csv_files_list = []      # paths of the CSV files written successfully
save_successful = True   # flips to False as soon as one dataset fails to save

# Pre-bind the loop variables: the cleanup `del` statements below would raise
# NameError if a loop body never ran (empty `datasets_dict`, or no file saved).
dataset_year = dataset_df = file_path = None

# Write one CSV per year into the dataset directory
for dataset_year, dataset_df in datasets_dict.items():
    csv_filename = f"itbi_{dataset_year}.csv"
    csv_filepath = os.path.join(dataset_directory, csv_filename)

    try:
        dataset_df.to_csv(csv_filepath, sep=';', encoding='utf-8', index=False)
        csv_files_list.append(csv_filepath)
    except Exception as save_error:
        # Keep going with the remaining years, but say why this one failed
        print(f"   ‚ùå Failed to save: {csv_filename}")
        print(f"      Details: {type(save_error).__name__}: {save_error}")
        save_successful = False

    # Clean up per-iteration variables
    del csv_filename, csv_filepath

# Clean up loop variables completely
del dataset_year, dataset_df

# Print success messages outside the loop to avoid duplicates
for file_path in csv_files_list:
    print(f"   ‚úÖ Saved: {os.path.basename(file_path)}")

del file_path

# Create ZIP archive only if every CSV file was written
if csv_files_list and save_successful:
    zip_filename = "itbi_datasets_recife.zip"
    zip_filepath = os.path.join(dataset_directory, zip_filename)

    try:
        with zipfile.ZipFile(zip_filepath, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for source_file in csv_files_list:
                # Store each file under its bare name (no directory prefix)
                zip_file.write(source_file, os.path.basename(source_file))
            del source_file

        # Verify the archive and report size / content count
        if os.path.exists(zip_filepath):
            file_size_mb = os.path.getsize(zip_filepath) / (1024 * 1024)

            with zipfile.ZipFile(zip_filepath, 'r') as zip_reader:
                files_in_zip = len(zip_reader.namelist())

            print("\n‚úÖ ZIP ARCHIVE CREATED SUCCESSFULLY!")
            print(f"   üì¶ Filename: {zip_filename}")
            print(f"   üìÅ Size: {file_size_mb:.2f} MB")
            print(f"   üóÉÔ∏è  Files in ZIP: {files_in_zip}")
            print(f"   üìÇ Location: {zip_filepath}")

            del file_size_mb, files_in_zip
        else:
            print("   ‚ùå Error: ZIP file was not created")

    except Exception as zip_error:
        print(f"   ‚ùå Error creating ZIP: {zip_error}")

    # Clean up ZIP variables immediately
    del zip_filename, zip_filepath
else:
    print("\n‚ùå Cannot create ZIP: No CSV files or save errors occurred")

# Final comprehensive cleanup
del csv_files_list, save_successful




üíæ SAVING DATASETS TO FILES AND CREATING ZIP ARCHIVE
   ‚úÖ Saved: itbi_2023.csv
   ‚úÖ Saved: itbi_2024.csv
   ‚úÖ Saved: itbi_2025.csv

‚úÖ ZIP ARCHIVE CREATED SUCCESSFULLY!
   üì¶ Filename: itbi_datasets_recife.zip
   üìÅ Size: 0.91 MB
   üóÉÔ∏è  Files in ZIP: 3
   üìÇ Location: datasets\itbi_datasets_recife.zip


In [4]:
# 🔧 ETL FUNCTION LIBRARY - ITBI TRANSFORMATIONS
# =================================================
# All transformation functions consolidated in one place
# Each function receives a DataFrame and returns the transformed DataFrame

import pandas as pd
import numpy as np

def rename_sfh_column(df):
    """
    Rename the 'sfh' column to 'valores_financiados_sfh' for clarity.

    The input DataFrame is never modified; a new DataFrame is always returned.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with the column renamed when 'sfh' is
        present, otherwise an unchanged copy.
    """
    if 'sfh' not in df.columns:
        # Warn instead of reporting a rename that never happened
        print("‚ö†Ô∏è Coluna 'sfh' n√£o encontrada")
        return df.copy()

    df_transformed = df.rename(columns={'sfh': 'valores_financiados_sfh'})
    print(f"‚úÖ Coluna 'sfh' renomeada para 'valores_financiados_sfh'")
    return df_transformed

def remove_redundant_geographic_columns(df):
    """
    Drop the geographic columns 'cidade' and 'uf' when they are present.

    Per the original author's note these columns are redundant because they
    hold a single value each (Recife / PE); that is not verified here.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: New DataFrame without the columns that were found, or a
        plain copy when neither column exists.
    """
    present = [name for name in ('cidade', 'uf') if name in df.columns]

    if not present:
        trimmed = df.copy()
        print("‚ö†Ô∏è Colunas 'cidade' e 'uf' n√£o encontradas")
        return trimmed

    trimmed = df.drop(columns=present)
    print(f"‚úÖ Colunas removidas: {present}")
    return trimmed

def standardize_decimal_format(value):
    """
    Replace decimal commas with decimal points in the text form of a value.

    Args:
        value: Value to convert; missing values (as detected by ``pd.isna``)
            are passed through untouched.

    Returns:
        The original missing value, or ``str(value)`` with every ',' replaced
        by '.'.
    """
    return value if pd.isna(value) else str(value).replace(',', '.')

def convert_valor_avaliacao_to_float(df):
    """
    Convert the 'valor_avaliacao' column to float after normalising the
    decimal separator with ``standardize_decimal_format``.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with 'valor_avaliacao' as float; an
        unchanged copy when the column does not exist.
    """
    df_transformed = df.copy()

    if 'valor_avaliacao' not in df_transformed.columns:
        print("‚ö†Ô∏è Coluna 'valor_avaliacao' n√£o encontrada")
        return df_transformed

    # Comma -> point on every value, then cast the whole column to float
    normalized_values = df_transformed['valor_avaliacao'].apply(standardize_decimal_format)
    df_transformed['valor_avaliacao'] = normalized_values.astype('float')

    print(f"‚úÖ Coluna 'valor_avaliacao' convertida para float")
    return df_transformed

def convert_to_category(df,year):
    """
    Cast a fixed set of descriptive columns to the pandas 'category' dtype and
    print the memory usage before and after.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data
        year: Accepted for call compatibility; not used by this function.

    Returns:
        pd.DataFrame: Copy of ``df`` in which every target column that exists
        has been converted to 'category'.
    """
    print("üîÑ INITIALIZING TYPE CONVERSION - OBJECT TO CATEGORY")
    print("=" * 55)

    df_transformed = df.copy()

    # Columns that should be categorical
    categorical_columns = ['padrao_acabamento', 'tipo_construcao', 'tipo_ocupacao', 'estado_conservacao', 'tipo_imovel']

    bytes_per_mb = 1024**2
    memory_before = df_transformed.memory_usage(deep=True).sum()

    print(f"üíæ Memory usage before conversion: {memory_before / bytes_per_mb:.2f} MB")
    print(f"‚è≥ Processing {len(categorical_columns)} target columns...")

    # Only the target columns that actually exist are converted
    converted_columns = [name for name in categorical_columns if name in df_transformed.columns]
    for name in converted_columns:
        df_transformed[name] = df_transformed[name].astype('category')

    memory_after = df_transformed.memory_usage(deep=True).sum()
    memory_saved = memory_before - memory_after
    if memory_before > 0:
        memory_reduction = (memory_saved / memory_before) * 100
    else:
        memory_reduction = 0

    print()
    print(f"üìä MEMORY OPTIMIZATION RESULTS:")
    print("-" * 32)

    if not converted_columns:
        print("   ‚ö†Ô∏è  No categorical columns found to convert")
        return df_transformed

    print(f"   ‚úÖ Successfully converted {len(converted_columns)} columns to category type")
    print(f"   üíæ Memory after conversion: {memory_after / bytes_per_mb:.2f} MB")
    print(f"   üí∞ Memory saved: {memory_saved / bytes_per_mb:.2f} MB ({memory_reduction:.1f}% reduction)")

    return df_transformed
    

    




def convert_area_terreno_to_float(df):
    """
    Convert the 'area_terreno' column to float, replacing decimal commas with
    decimal points first.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with 'area_terreno' as float; an unchanged
        copy when the column does not exist.
    """
    df_transformed = df.copy()

    if 'area_terreno' not in df_transformed.columns:
        print("‚ö†Ô∏è Coluna 'area_terreno' n√£o encontrada")
        return df_transformed

    # Text form -> comma replaced by point -> float
    area_as_text = df_transformed['area_terreno'].astype(str)
    area_with_point = area_as_text.str.replace(',', '.')
    df_transformed['area_terreno'] = area_with_point.astype(float)

    print(f"‚úÖ Coluna 'area_terreno' convertida para float")
    return df_transformed

def convert_area_construida_to_float(df):
    """
    Convert the 'area_construida' column to float, replacing decimal commas
    with decimal points first.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with 'area_construida' as float; an
        unchanged copy when the column does not exist.
    """
    df_transformed = df.copy()

    if 'area_construida' not in df_transformed.columns:
        print("‚ö†Ô∏è Coluna 'area_construida' n√£o encontrada")
        return df_transformed

    # Text form -> comma replaced by point -> float
    built_area_text = df_transformed['area_construida'].astype(str)
    built_area_with_point = built_area_text.str.replace(',', '.')
    df_transformed['area_construida'] = built_area_with_point.astype(float)

    print(f"‚úÖ Coluna 'area_construida' convertida para float")
    return df_transformed

def convert_fracao_ideal_to_float(df):
    """
    Convert the 'fracao_ideal' column to float, replacing decimal commas with
    decimal points first.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with 'fracao_ideal' as float; an unchanged
        copy when the column does not exist.
    """
    df_transformed = df.copy()

    if 'fracao_ideal' not in df_transformed.columns:
        print("‚ö†Ô∏è Coluna 'fracao_ideal' n√£o encontrada")
        return df_transformed

    # Text form -> comma replaced by point -> float
    fraction_text = df_transformed['fracao_ideal'].astype(str)
    fraction_with_point = fraction_text.str.replace(',','.')
    df_transformed['fracao_ideal'] = fraction_with_point.astype(float)

    print(f"‚úÖ Coluna 'fracao_ideal' convertida para float")
    return df_transformed


def covnert_to_timestamp(df):
    """
    Convert the 'data_transacao' column to pandas datetime values.

    The function name is misspelled ("covnert"); it is kept for existing
    callers, and ``convert_to_timestamp`` is provided below as a correctly
    spelled alias.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with 'data_transacao' parsed by
        ``pd.to_datetime``; an unchanged copy when the column does not exist.
    """
    df_transformed = df.copy()

    if 'data_transacao' in df_transformed.columns:
        # NOTE(review): no explicit format is passed, so pandas infers it;
        # confirm the source dates are unambiguous (e.g. ISO year-month-day).
        df_transformed['data_transacao'] = pd.to_datetime(df_transformed['data_transacao'])
        print(f"‚úÖ Column 'data_transacao' successfully converted to datetime")
    else:
        print("‚ö†Ô∏è Unable to perform conversion - 'data_transacao' column not found")

    return df_transformed


# Correctly spelled alias; the original name above stays available.
convert_to_timestamp = covnert_to_timestamp



def convert_valores_financiados_sfh_to_float(df):
    """
    Convert the 'valores_financiados_sfh' column to float, replacing decimal
    commas with decimal points first.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with 'valores_financiados_sfh' as float;
        an unchanged copy when the column does not exist.
    """
    df_transformed = df.copy()

    if 'valores_financiados_sfh' not in df_transformed.columns:
        print("‚ö†Ô∏è Coluna 'valores_financiados_sfh' n√£o encontrada")
        return df_transformed

    # Text form -> comma replaced by point -> float
    financed_text = df_transformed['valores_financiados_sfh'].astype(str)
    financed_with_point = financed_text.str.replace(',', '.')
    df_transformed['valores_financiados_sfh'] = financed_with_point.astype(float)

    print(f"‚úÖ Coluna 'valores_financiados_sfh' convertida para float")
    return df_transformed

def fix_encoding_issues(text_value):
    """
    Repair text whose UTF-8 bytes were decoded as Latin-1.

    The string is re-encoded as Latin-1 and decoded as UTF-8. When either step
    fails the text is returned as it was.

    Args:
        text_value: Value to repair; non-string values are returned untouched.

    Returns:
        The repaired string, or the original value when it is not a string or
        cannot be round-tripped.
    """
    if isinstance(text_value, str):
        try:
            repaired = bytes(text_value, 'latin1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Round trip impossible: keep the text exactly as received
            repaired = text_value
        return repaired

    return text_value

def fix_text_encoding(df):
    """
    Apply ``fix_encoding_issues`` to every object-dtype column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with each object column passed through
        the encoding fix; an unchanged copy when there are no such columns.
    """
    df_transformed = df.copy()

    # Columns stored with the generic 'object' dtype are treated as text
    text_columns = df_transformed.select_dtypes(include=['object']).columns

    if len(text_columns) == 0:
        print("‚ö†Ô∏è Nenhuma coluna de texto encontrada")
        return df_transformed

    for column_name in text_columns:
        df_transformed[column_name] = df_transformed[column_name].apply(fix_encoding_issues)

    print(f"‚úÖ Encoding corrigido em {len(text_columns)} colunas de texto")
    return df_transformed


def check_duplicate_rows(df, dataset_name=None):
    """
    Count fully duplicated rows in a DataFrame and print a short report.

    Args:
        df (pd.DataFrame): DataFrame to analyse
        dataset_name (str): Optional label appended to the report header

    Returns:
        dict: 'total_records', 'unique_records', 'duplicate_records' and
        'duplicate_percentage' (0.0 for an empty DataFrame).
    """
    print(f"üîç DUPLICATE ROWS ANALYSIS{' - ' + dataset_name if dataset_name else ''}")
    print("=" * 50)

    total_records = len(df)
    unique_records = len(df.drop_duplicates())
    duplicate_records = total_records - unique_records
    # An empty frame has no duplicates; avoid dividing by zero
    if total_records:
        duplicate_percentage = (duplicate_records / total_records) * 100
    else:
        duplicate_percentage = 0.0

    # Display results
    print(f"   üìä Total records: {total_records:,}")
    print(f"   ‚úÖ Unique records: {unique_records:,}")

    if duplicate_records > 0:
        print(f"   ‚ö†Ô∏è Duplicate records: {duplicate_records:,} ({duplicate_percentage:.1f}%)")
    else:
        print(f"   ‚úÖ No duplicate rows found")

    return {
        'total_records': total_records,
        'unique_records': unique_records,
        'duplicate_records': duplicate_records,
        'duplicate_percentage': duplicate_percentage
    }

# Check for duplicate rows across all datasets.
# NOTE: this runs as soon as the cell executes, between the function
# definitions, and needs `datasets_dict` to exist already (it is created by
# the loading cell). The loop variables `year` and `df` stay bound at module
# level after the loop finishes.
print("üîç CHECKING DUPLICATE ROWS ACROSS ALL DATASETS")
print("=" * 48)

for year, df in datasets_dict.items():
    check_duplicate_rows(df, f"ITBI {year}")
    print()  # Empty line between datasets

def generate_unique_itbi_identifiers(df, year):
    """
    Add an 'id' column of the form ``ITBI_<year>_<6-digit sequence>`` as the
    first column of the DataFrame.

    Sequence numbers start at 1 and follow the row order.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data
        year: Value embedded in every identifier

    Returns:
        pd.DataFrame: Copy of ``df`` with the new 'id' column, or an unchanged
        copy when an 'id' column already exists.
    """
    df_transformed = df.copy()

    if 'id' in df_transformed.columns:
        print("‚ö†Ô∏è 'id' column already exists in the DataFrame")
        return df_transformed

    row_count = len(df_transformed)
    identifiers = [f'ITBI_{year}_{sequence:06d}' for sequence in range(1, row_count + 1)]
    df_transformed.insert(0, 'id', identifiers)

    print(f"‚úÖ Unique ID generation process completed for {year}")
    return df_transformed

def extract_and_normalize_floor_information(df):
    """
    Move floor-count markers out of 'tipo_construcao' into 'total_pavimentos'.

    Four literal markers are recognised, in this order: '> 4 Pavimentos',
    '<= 4 Pavimentos', '> 2 Pavimentos' and '<= 2 Pavimentos'. Rows containing
    a marker get the matching label ('mais de 4', 'ate 4', 'mais de 2',
    'ate 2') and the marker is removed from 'tipo_construcao'. All other rows
    are labelled 'nao informado'. Missing 'tipo_construcao' values stay
    missing instead of becoming the text 'nan'.

    Any existing 'total_pavimentos' or 'floor_count' column is discarded
    before the new column is built. The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): DataFrame with ITBI data

    Returns:
        pd.DataFrame: Copy of ``df`` with the 'total_pavimentos' column and the
        cleaned 'tipo_construcao'; an unchanged copy when 'tipo_construcao'
        does not exist.
    """
    if 'tipo_construcao' not in df.columns:
        print("‚ö†Ô∏è Column 'tipo_construcao' not found")
        # Return a copy, like the other transformations, never the caller's object
        return df.copy()

    df_transformed = df.copy()

    # Start fresh: drop previously derived floor columns if they exist
    df_transformed = df_transformed.drop(columns=['total_pavimentos', 'floor_count'], errors='ignore')

    # Work on the text form, but keep missing values missing
    # (a plain astype(str) would turn them into the literal text 'nan'/'None').
    raw_values = df_transformed['tipo_construcao']
    construction_text = raw_values.astype(str).where(raw_values.notna())

    # Every row starts as "not informed" and is overwritten when a marker matches
    floor_labels = pd.Series('nao informado', index=df_transformed.index, dtype=object)

    # (literal marker in the text, label stored in 'total_pavimentos')
    floor_patterns = (
        ('> 4 Pavimentos', 'mais de 4'),
        ('<= 4 Pavimentos', 'ate 4'),
        ('> 2 Pavimentos', 'mais de 2'),
        ('<= 2 Pavimentos', 'ate 2'),
    )

    floor_records_count = 0
    for marker, label in floor_patterns:
        # regex=False: the markers are plain text, not regular expressions
        mask = construction_text.str.contains(marker, na=False, regex=False)
        if not mask.any():
            continue
        floor_labels.loc[mask] = label
        construction_text.loc[mask] = (
            construction_text.loc[mask].str.replace(marker, '', regex=False).str.strip()
        )
        floor_records_count += int(mask.sum())

    df_transformed['tipo_construcao'] = construction_text
    df_transformed['total_pavimentos'] = floor_labels

    if floor_records_count > 0:
        # Statistics of the extracted floor information
        floor_info_counts = df_transformed['total_pavimentos'].value_counts()
        print(f"‚úÖ Extracted floor information from {floor_records_count:,} records")
        print(f"   üìä Floor categories found:")
        for category, count in floor_info_counts.items():
            if category != 'nao informado':
                print(f"      ‚Ä¢ {category}: {count:,} records")

        # Count records without floor information
        no_info_count = (df_transformed['total_pavimentos'] == 'nao informado').sum()
        print(f"   ‚ÑπÔ∏è  Records without floor info: {no_info_count:,} ('nao informado')")
        print(f"   üßπ Cleaned 'tipo_construcao' column by removing floor references")
    else:
        total_records = len(df_transformed)
        print(f"‚ö†Ô∏è No records with floor information found")
        print(f"   ‚ÑπÔ∏è  All {total_records:,} records marked as 'nao informado'")

    return df_transformed


def get_transformation_summary(df_original, df_transformed):
    """
    Summarise the structural differences between two DataFrames.

    Args:
        df_original (pd.DataFrame): DataFrame before the transformations
        df_transformed (pd.DataFrame): DataFrame after the transformations

    Returns:
        dict: 'original_shape', 'transformed_shape', 'columns_removed' (set),
        'columns_added' (set) and 'data_types_changed', which maps each shared
        column whose dtype differs to ``{'from': ..., 'to': ...}`` strings.
    """
    original_names = set(df_original.columns)
    transformed_names = set(df_transformed.columns)

    # Compare dtypes only for columns present in both frames
    dtype_changes = {}
    for col in df_transformed.columns:
        if col not in df_original.columns:
            continue
        dtype_before = df_original[col].dtype
        dtype_after = df_transformed[col].dtype
        if dtype_before != dtype_after:
            dtype_changes[col] = {'from': str(dtype_before), 'to': str(dtype_after)}

    return {
        'original_shape': df_original.shape,
        'transformed_shape': df_transformed.shape,
        'columns_removed': original_names - transformed_names,
        'columns_added': transformed_names - original_names,
        'data_types_changed': dtype_changes,
    }

def analyze_property_and_construction_types(datasets_dict):
    """
    Analyze property and construction types across all ITBI datasets

    Prints a report (unique types, total record count, top 5 property types)
    and returns the same information as a dictionary.

    Missing values (NaN/None) in 'tipo_imovel' / 'tipo_construcao' are ignored
    when collecting the unique types. Without this, NaN ends up in the same set
    as the string labels and sorted() raises TypeError. This also matches
    value_counts(), which excludes missing values by default.

    Args:
        datasets_dict (dict): Dictionary containing all datasets by year

    Returns:
        dict: Analysis results with keys 'property_types' (sorted list),
            'construction_types' (sorted list), 'total_records' (int) and
            'property_distribution' (dict mapping property type -> count)
    """
    print("üè† PROPERTY AND CONSTRUCTION TYPES ANALYSIS")
    print("=" * 50)

    # Consolidate all unique property types across datasets
    all_property_types = set()
    all_construction_types = set()

    for year, df in datasets_dict.items():
        if 'tipo_imovel' in df.columns:
            # dropna() keeps NaN out of the set so it stays sortable
            all_property_types.update(df['tipo_imovel'].dropna().unique())
        if 'tipo_construcao' in df.columns:
            all_construction_types.update(df['tipo_construcao'].dropna().unique())

    # Sort once; the same lists are printed and returned
    sorted_property_types = sorted(all_property_types)
    sorted_construction_types = sorted(all_construction_types)

    # Display consolidated results
    print(f"\nüìä SUMMARY STATISTICS:")
    print("-" * 25)
    print(f"   ‚Ä¢ Total unique property types: {len(all_property_types)}")
    print(f"   ‚Ä¢ Total unique construction types: {len(all_construction_types)}")

    print(f"\nüè¢ PROPERTY TYPES FOUND:")
    print("-" * 30)
    for i, prop_type in enumerate(sorted_property_types, 1):
        print(f"   {i:2d}. {prop_type}")

    print(f"\nüèóÔ∏è  CONSTRUCTION TYPES FOUND:")
    print("-" * 35)
    for i, const_type in enumerate(sorted_construction_types, 1):
        print(f"   {i:2d}. {const_type}")

    # Calculate distribution across all years
    print(f"\nüìà DISTRIBUTION ANALYSIS:")
    print("-" * 28)

    # Total number of rows over every dataset (denominator for the percentages)
    total_records = sum(len(df) for df in datasets_dict.values())
    print(f"   ‚Ä¢ Total records analyzed: {total_records:,}")

    # Accumulate per-type counts over all years
    prop_counts = {}
    for year, df in datasets_dict.items():
        if 'tipo_imovel' in df.columns:
            for prop_type, count in df['tipo_imovel'].value_counts().items():
                prop_counts[prop_type] = prop_counts.get(prop_type, 0) + count

    print(f"\n   üîù Top 5 Property Types:")
    # prop_counts is only non-empty when at least one row exists, so
    # total_records cannot be zero inside this loop
    for prop_type, count in sorted(prop_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
        percentage = (count / total_records) * 100
        print(f"      ‚Ä¢ {prop_type}: {count:,} ({percentage:.1f}%)")

    print(f"\n‚úÖ Property and construction types analysis completed!")
    print(f"üìã Data quality: Standardized naming convention confirmed across all datasets")

    return {
        'property_types': sorted_property_types,
        'construction_types': sorted_construction_types,
        'total_records': total_records,
        'property_distribution': prop_counts
    }

# Confirmation message printed once the ETL helper functions in this cell have been defined
print("‚úÖ Biblioteca de fun√ß√µes ETL carregada com sucesso!")

üîç CHECKING DUPLICATE ROWS ACROSS ALL DATASETS
üîç DUPLICATE ROWS ANALYSIS - ITBI 2023
   üìä Total records: 12,669
   ‚úÖ Unique records: 12,464
   ‚ö†Ô∏è Duplicate records: 205 (1.6%)

üîç DUPLICATE ROWS ANALYSIS - ITBI 2024
   üìä Total records: 15,242
   ‚úÖ Unique records: 15,026
   ‚ö†Ô∏è Duplicate records: 216 (1.4%)

üîç DUPLICATE ROWS ANALYSIS - ITBI 2025
   üìä Total records: 7,206
   ‚úÖ Unique records: 7,043
   ‚ö†Ô∏è Duplicate records: 163 (2.3%)

‚úÖ Biblioteca de fun√ß√µes ETL carregada com sucesso!


In [5]:
# Inspect the column names of the datasets and their naming conventions.
# The columns follow good naming standards: snake_case convention, descriptive names,
# no special characters, logical grouping, and standardized separators. These conventions
# ensure database compatibility, readability, and maintainability across different systems
# and programming environments.
# However, the 'sfh' acronym lacks clarity and context, making it difficult for users to understand
# its meaning without domain knowledge. To improve data documentation and usability, we will rename
# this column to 'valores_financiados_sfh' providing explicit context about financed values.
# NOTE(review): the column list also contains an English-named 'year' column next to 'ano';
# it looks like a duplicate of 'ano' - confirm whether both are needed.
datasets_dict['2023'].columns


Index(['logradouro', 'numero', 'complemento', 'valor_avaliacao', 'bairro',
       'cidade', 'uf', 'ano_construcao', 'area_terreno', 'area_construida',
       'fracao_ideal', 'padrao_acabamento', 'tipo_construcao', 'tipo_ocupacao',
       'data_transacao', 'estado_conservacao', 'tipo_imovel', 'sfh',
       'cod_logradouro', 'latitude', 'longitude', 'ano', 'year'],
      dtype='object')

In [6]:
# Rename the ambiguous 'sfh' column to 'valores_financiados_sfh' in every yearly dataset
# using the rename_sfh_column function from our ETL library.
# Each dataset is replaced in datasets_dict by the frame the helper returns.
for year, df in datasets_dict.items():
    datasets_dict[year] = rename_sfh_column(df)
    

‚úÖ Coluna 'sfh' renomeada para 'valores_financiados_sfh'
‚úÖ Coluna 'sfh' renomeada para 'valores_financiados_sfh'
‚úÖ Coluna 'sfh' renomeada para 'valores_financiados_sfh'


In [7]:
# Null values analysis: report, for each yearly dataset, which columns contain
# missing values and how many, then summarize how many datasets are affected.
print("ü©∫ Data Health Check - Missing Values Diagnostic & Investigation")
print("=" * 65)
missing_datasets = 0  # datasets that have at least one column with nulls
for year, df in datasets_dict.items():
    print(f"\nüìÖ Dataset {year}:")
    print("-" * 20)

    # Null count per column, filtered down to the columns that actually have nulls
    null_summary = df.isna().sum()
    columns_with_nulls = null_summary[null_summary > 0]

    # Series.empty states the intent directly (no intermediate list needed)
    if not columns_with_nulls.empty:
        missing_datasets += 1
        print(f"  üîç Found {len(columns_with_nulls)} columns with missing values:")

        for column_name, null_count in columns_with_nulls.items():
            print(f"      ‚Ä¢ {column_name}: {null_count:,} nulls ")
    else:
        print("   ‚úÖ No missing values found - Dataset is complete!")


print("\nüìã Final diagnosis:")
print(f'There is a total of {missing_datasets} datasets with missing values out of {len(datasets_dict)} total datasets.')

# NEXT STEP: DATA CLEANING AND NULL VALUES TREATMENT
# Now that we've identified null values in some datasets, they must be treated
# before subsequent analysis: null values can cause errors in statistical
# calculations, visualizations, and data modeling.
# Proper treatment of these values is essential for ETL pipeline integrity and reliability.



ü©∫ Data Health Check - Missing Values Diagnostic & Investigation

üìÖ Dataset 2023:
--------------------
  üîç Found 3 columns with missing values:
      ‚Ä¢ complemento: 1,320 nulls 
      ‚Ä¢ latitude: 3,402 nulls 
      ‚Ä¢ longitude: 3,402 nulls 

üìÖ Dataset 2024:
--------------------
  üîç Found 3 columns with missing values:
      ‚Ä¢ complemento: 1,443 nulls 
      ‚Ä¢ latitude: 5,619 nulls 
      ‚Ä¢ longitude: 5,619 nulls 

üìÖ Dataset 2025:
--------------------
  üîç Found 3 columns with missing values:
      ‚Ä¢ complemento: 576 nulls 
      ‚Ä¢ latitude: 2,623 nulls 
      ‚Ä¢ longitude: 2,623 nulls 

üìã Final diagnosis:
There is a total of 3 datasets with missing values out of 3 total datasets.


In [8]:
# COLUMN OPTIMIZATION: REMOVING REDUNDANT GEOGRAPHIC COLUMNS
# We will drop the 'cidade' and 'uf' columns as they contain only uniform values across all records
# (Recife and PE respectively). Since our analysis focuses specifically on ITBI data from Recife's
# urban region within Pernambuco state, these columns provide no analytical value or variation.
# Removing these redundant columns optimizes memory usage and simplifies the dataset structure
# without losing any meaningful information for our geographic scope of analysis.
# NOTE(review): this cell does not itself verify that 'cidade' and 'uf' hold a single value;
# confirm (e.g. with nunique()) before relying on the claim above.

# Using the remove_redundant_geographic_columns function from our ETL library
for year, df in datasets_dict.items():
    datasets_dict[year] = remove_redundant_geographic_columns(df)


‚úÖ Colunas removidas: ['cidade', 'uf']
‚úÖ Colunas removidas: ['cidade', 'uf']
‚úÖ Colunas removidas: ['cidade', 'uf']


In [9]:
# Preview the first 10 values of every column in every yearly dataset
# to inspect the raw value formats (e.g. comma-decimal strings).
for year, df in datasets_dict.items():
    # Iterate over the column labels directly instead of indexing by position
    for col_name in df.columns:
        print(df[col_name].head(10))

0    av norte miguel arraes de alencar
1    av norte miguel arraes de alencar
2                   rua belmiro corr√™a
3                   rua belmiro corr√™a
4                   rua belmiro corr√™a
5                   rua belmiro corr√™a
6                   rua belmiro corr√™a
7                   rua belmiro corr√™a
8                   rua belmiro corr√™a
9                   rua belmiro corr√™a
Name: logradouro, dtype: object
0    3071
1    3029
2     133
3     133
4     133
5     133
6     133
7     133
8     109
9     109
Name: numero, dtype: int64
0          NaN
1          NaN
2    apto 0001
3    apto 0001
4    apto 0002
5    apto 0003
6    apto 0004
7    apto 0005
8          NaN
9          NaN
Name: complemento, dtype: object
0    1068562,63
1    1500000,00
2     110000,00
3     110000,00
4     110000,00
5     110000,00
6     110000,00
7     110000,00
8    4900000,00
9    4900000,00
Name: valor_avaliacao, dtype: object
0    Encruzilhada
1    Encruzilhada
2    Encruzilhada
3    Encr

In [10]:
# DATA TYPE CONVERSION: VALOR_AVALIACAO TO FLOAT
# We will convert the 'valor_avaliacao' column from object type to float to enable proper
# numerical operations and statistical analysis. Currently stored as object (string), this
# prevents mathematical calculations, aggregations, and numeric comparisons essential for
# financial analysis of property values. Converting to float ensures data integrity and
# enables accurate computation of means, sums, and other statistical measures for ITBI values.

# Using the convert_valor_avaliacao_to_float function from our ETL library
for year, df in datasets_dict.items():
    datasets_dict[year] = convert_valor_avaliacao_to_float(df)

‚úÖ Coluna 'valor_avaliacao' convertida para float
‚úÖ Coluna 'valor_avaliacao' convertida para float
‚úÖ Coluna 'valor_avaliacao' convertida para float


In [11]:
# AREA_TERRENO CONVERSION: APPLYING SAME DECIMAL STANDARDIZATION PROCESS
# The 'area_terreno' column requires identical treatment as 'valor_avaliacao' - converting
# Brazilian decimal format (comma) to international format (dot) before float conversion.
# This ensures consistent numerical data types across all measurement columns for analysis.

# Using the convert_area_terreno_to_float function from our ETL library
for year, df in datasets_dict.items():
    datasets_dict[year] = convert_area_terreno_to_float(df)


‚úÖ Coluna 'area_terreno' convertida para float
‚úÖ Coluna 'area_terreno' convertida para float
‚úÖ Coluna 'area_terreno' convertida para float


In [12]:
# AREA_CONSTRUIDA CONVERSION: SAME DECIMAL STANDARDIZATION PROCESS
# Converting 'area_construida' from Brazilian decimal format (comma) to international format (dot)

# Using the convert_area_construida_to_float function from our ETL library
for year, df in datasets_dict.items():
    datasets_dict[year] = convert_area_construida_to_float(df)

‚úÖ Coluna 'area_construida' convertida para float
‚úÖ Coluna 'area_construida' convertida para float
‚úÖ Coluna 'area_construida' convertida para float


In [13]:
# FRACAO_IDEAL CONVERSION: SAME DECIMAL STANDARDIZATION PROCESS
# Converting 'fracao_ideal' from Brazilian decimal format (comma) to international format (dot)
# The 'fracao_ideal' column represents the ideal fraction of ownership in condominium properties,
# requiring identical treatment as other numerical columns - converting Brazilian decimal format 
# to enable proper mathematical operations and statistical analysis for property ownership calculations.

# Using the convert_fracao_ideal_to_float function from our ETL library
for year, df in datasets_dict.items():
    datasets_dict[year] = convert_fracao_ideal_to_float(df)

‚úÖ Coluna 'fracao_ideal' convertida para float
‚úÖ Coluna 'fracao_ideal' convertida para float
‚úÖ Coluna 'fracao_ideal' convertida para float


In [14]:
# VALORES_FINANCIADOS_SFH CONVERSION: SAME DECIMAL STANDARDIZATION PROCESS
# Converting 'valores_financiados_sfh' from Brazilian decimal format (comma) to international format (dot)
# The 'valores_financiados_sfh' column contains financial values related to the Sistema Financeiro de Habita√ß√£o,
# requiring decimal format standardization to enable proper financial calculations, aggregations, and analysis
# of housing financing data within the ITBI property transaction records.

# Using the convert_valores_financiados_sfh_to_float function from our ETL library
for year, df in datasets_dict.items():
    datasets_dict[year] = convert_valores_financiados_sfh_to_float(df)

‚úÖ Coluna 'valores_financiados_sfh' convertida para float
‚úÖ Coluna 'valores_financiados_sfh' convertida para float
‚úÖ Coluna 'valores_financiados_sfh' convertida para float


In [15]:
# ENCODING CORRECTION: FIXING INCORRECTLY ENCODED CHARACTERS
# Brazilian datasets often contain encoding issues where Portuguese characters (√£, √ß, √™, √µ, etc.) 
# are incorrectly displayed due to mismatched character encoding during data extraction.
# This commonly occurs when CSV files are saved with Latin1 (ISO-8859-1) encoding but read as UTF-8,
# causing characters like "√ß√£o" to appear as "√É¬ß√É¬£o" or similar garbled text.
# We fix this by re-encoding the text: first encode as Latin1 then decode as UTF-8 to restore
# the original Portuguese characters for proper data analysis and visualization.

# Using the fix_text_encoding function from our ETL library
for year, df in datasets_dict.items():
    datasets_dict[year] = fix_text_encoding(df)



‚úÖ Encoding corrigido em 9 colunas de texto
‚úÖ Encoding corrigido em 9 colunas de texto
‚úÖ Encoding corrigido em 9 colunas de texto


In [16]:
datasets_dict['2023'].head(50)

Unnamed: 0,logradouro,numero,complemento,valor_avaliacao,bairro,ano_construcao,area_terreno,area_construida,fracao_ideal,padrao_acabamento,...,tipo_ocupacao,data_transacao,estado_conservacao,tipo_imovel,valores_financiados_sfh,cod_logradouro,latitude,longitude,ano,year
0,av norte miguel arraes de alencar,3071,,1068562.63,Encruzilhada,1997,438.0,511.0,1.0,M√©dio,...,COMERCIAL COM LIXO ORGANICO,2023-12-21,Regular,Galp√£o,0.0,46540,-8.034273,-34.896337,2023,2023
1,av norte miguel arraes de alencar,3029,,1500000.0,Encruzilhada,1957,779.33,582.44,1.0,M√©dio,...,COMERCIAL SEM LIXO ORGANICO,2023-11-17,Regular,Casa,0.0,46540,-8.034435,-34.896335,2023,2023
2,rua belmiro corr√™a,133,apto 0001,110000.0,Encruzilhada,1970,562.05,121.0,0.27191,Simples,...,RESIDENCIAL,2023-09-26,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023
3,rua belmiro corr√™a,133,apto 0001,110000.0,Encruzilhada,1970,562.05,121.0,0.27191,Simples,...,RESIDENCIAL,2023-09-22,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023
4,rua belmiro corr√™a,133,apto 0002,110000.0,Encruzilhada,1970,562.05,81.0,0.18202,Simples,...,RESIDENCIAL,2023-09-22,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023
5,rua belmiro corr√™a,133,apto 0003,110000.0,Encruzilhada,1970,562.05,81.0,0.18202,Simples,...,RESIDENCIAL,2023-09-22,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023
6,rua belmiro corr√™a,133,apto 0004,110000.0,Encruzilhada,1970,562.05,81.0,0.18202,Simples,...,RESIDENCIAL,2023-09-26,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023
7,rua belmiro corr√™a,133,apto 0005,110000.0,Encruzilhada,1970,562.05,81.0,0.18202,Simples,...,RESIDENCIAL,2023-09-22,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023
8,rua belmiro corr√™a,109,,4900000.0,Encruzilhada,1951,439.28,343.23,1.0,M√©dio,...,RESIDENCIAL,2023-09-26,Bom,Casa,0.0,10715,-8.035165,-34.895961,2023,2023
9,rua belmiro corr√™a,109,,4900000.0,Encruzilhada,1951,439.28,343.23,1.0,M√©dio,...,RESIDENCIAL,2023-09-26,Bom,Casa,0.0,10715,-8.035165,-34.895961,2023,2023


# 🔍 UNIQUE IDENTIFIER ANALYSIS - ITBI DATASETS

## Evaluating Native vs Generated Identifiers
After ETL transformations, we need to analyze if any existing attribute can serve as a unique identifier or if we should generate a new ID column for each property transaction record.

In [17]:
# Check for duplicate rows across all datasets
print("üîç CHECKING DUPLICATE ROWS ACROSS ALL DATASETS")
print("=" * 48)

for year, df in datasets_dict.items():
    check_duplicate_rows(df, f"ITBI {year}")
    print()  # Empty line between datasets



üîç CHECKING DUPLICATE ROWS ACROSS ALL DATASETS
üîç DUPLICATE ROWS ANALYSIS - ITBI 2023
   üìä Total records: 12,669
   ‚úÖ Unique records: 12,464
   ‚ö†Ô∏è Duplicate records: 205 (1.6%)

üîç DUPLICATE ROWS ANALYSIS - ITBI 2024
   üìä Total records: 15,242
   ‚úÖ Unique records: 15,026
   ‚ö†Ô∏è Duplicate records: 216 (1.4%)

üîç DUPLICATE ROWS ANALYSIS - ITBI 2025
   üìä Total records: 7,206
   ‚úÖ Unique records: 7,043
   ‚ö†Ô∏è Duplicate records: 163 (2.3%)



In [18]:
# UNIQUE IDENTIFIER ANALYSIS FOR ITBI DATASETS
# =============================================
# This analysis evaluates whether existing columns can serve as unique identifiers
# or if we need to generate new ID columns for proper data management and analysis.
# Unique identifiers are essential for data integrity and relationship management.
#
# For each yearly dataset the cell records: the row count, the number of fully
# duplicated rows, the single column with the highest share of distinct values,
# and the share of distinct values of a bairro|valor_avaliacao|data_transacao key.

print("üîç UNIQUE IDENTIFIER ANALYSIS - ITBI DATASETS")
print("=" * 50)

# year -> dict with keys 'records', 'duplicates', 'best_col', 'best_unique', 'composite_unique'
analysis_summary = {}

# Analyze each dataset for uniqueness patterns
for year, df in datasets_dict.items():
    total_records = len(df)
    unique_records = len(df.drop_duplicates())
    complete_duplicates = total_records - unique_records
    
    # Find column with highest uniqueness
    best_uniqueness = 0
    best_column = None
    
    for col in df.columns:
        # Distinct values as a percentage of rows; nunique() ignores NaN by default,
        # so columns with missing values score lower.
        # NOTE(review): divides by total_records - an empty dataset would raise ZeroDivisionError.
        uniqueness = (df[col].nunique() / total_records) * 100
        if uniqueness > best_uniqueness:
            best_uniqueness = uniqueness
            best_column = col
    
    # Test composite key: location + value + date
    key_cols = ['bairro', 'valor_avaliacao', 'data_transacao']
    if all(col in df.columns for col in key_cols):
        # Build one 'a|b|c' string per row and count how many distinct keys result
        composite_key = df[key_cols].astype(str).agg('|'.join, axis=1)
        composite_uniqueness = (composite_key.nunique() / total_records) * 100
    else:
        # Key columns missing: treat the composite key as unusable
        composite_uniqueness = 0
    
    analysis_summary[year] = {
        'records': total_records,
        'duplicates': complete_duplicates,
        'best_col': best_column,
        'best_unique': best_uniqueness,
        'composite_unique': composite_uniqueness
    }

# Display results
print(f"\nüìä RESULTS SUMMARY:")
print("-" * 25)

for year, data in analysis_summary.items():
    print(f"   üìÖ {year}: {data['records']:,} records, {data['duplicates']} duplicates")
    print(f"      ‚Ä¢ Best column: {data['best_col']} ({data['best_unique']:.1f}% uniqueness)")
    print(f"      ‚Ä¢ Composite key: {data['composite_unique']:.1f}% uniqueness")

# Final recommendation
print(f"\nüéØ RECOMMENDATION:")
print("=" * 20)

# NOTE(review): max() takes the most favourable year. If an identifier has to hold in
# every dataset, the least favourable year (min) would be the stricter test - confirm intent.
max_single = max(data['best_unique'] for data in analysis_summary.values())
max_composite = max(data['composite_unique'] for data in analysis_summary.values())
total_duplicates = sum(data['duplicates'] for data in analysis_summary.values())

# NOTE(review): the thresholds are inclusive (>=) while the messages below say ">".
if max_single >= 95 or max_composite >= 98:
    print("‚ÑπÔ∏è  EXISTING COLUMNS ARE SUFFICIENT")
    print("   Natural identifiers provide adequate uniqueness")
else:
    print("‚úÖ CREATE NEW ID COLUMN REQUIRED")
    print(f"   ‚Ä¢ Best single column: {max_single:.1f}% uniqueness (need >95%)")
    print(f"   ‚Ä¢ Best composite key: {max_composite:.1f}% uniqueness (need >98%)")
    if total_duplicates > 0:
        print(f"   ‚Ä¢ {total_duplicates} duplicate records found")
    print(f"\n   üí° Solution: Generate sequential IDs (ITBI_YYYY_NNNNNN)")

print(f"\n‚úÖ Analysis completed for {len(analysis_summary)} datasets")

üîç UNIQUE IDENTIFIER ANALYSIS - ITBI DATASETS

üìä RESULTS SUMMARY:
-------------------------
   üìÖ 2023: 12,669 records, 205 duplicates
      ‚Ä¢ Best column: complemento (76.6% uniqueness)
      ‚Ä¢ Composite key: 94.2% uniqueness
   üìÖ 2024: 15,242 records, 216 duplicates
      ‚Ä¢ Best column: complemento (73.9% uniqueness)
      ‚Ä¢ Composite key: 89.6% uniqueness
   üìÖ 2025: 7,206 records, 163 duplicates
      ‚Ä¢ Best column: complemento (77.3% uniqueness)
      ‚Ä¢ Composite key: 84.5% uniqueness

üéØ RECOMMENDATION:
‚úÖ CREATE NEW ID COLUMN REQUIRED
   ‚Ä¢ Best single column: 77.3% uniqueness (need >95%)
   ‚Ä¢ Best composite key: 94.2% uniqueness (need >98%)
   ‚Ä¢ 584 duplicate records found

   üí° Solution: Generate sequential IDs (ITBI_YYYY_NNNNNN)

‚úÖ Analysis completed for 3 datasets


In [19]:
# UNIQUE IDENTIFIER GENERATION: CREATING SEQUENTIAL IDS FOR ALL DATASETS
# ======================================================================
# After analyzing the datasets for natural unique identifiers and determining that none provide
# sufficient uniqueness (>95% for single columns or >98% for composite keys), we now generate
# sequential unique IDs for each dataset. This ensures every property transaction record has
# a guaranteed unique identifier following the format ITBI_YYYY_NNNNNN (e.g., ITBI_2023_000001).
# These IDs are essential for data integrity, relationship management, and preventing duplicate
# records during analysis and potential database operations.
for year, df in datasets_dict.items():
    datasets_dict[year] = generate_unique_itbi_identifiers(df, year)



‚úÖ Unique ID generation process completed for 2023
‚úÖ Unique ID generation process completed for 2024
‚úÖ Unique ID generation process completed for 2025


In [20]:
# FLOOR INFORMATION EXTRACTION AND NORMALIZATION
# ===============================================
# Extract floor count information from property type descriptions (e.g., "Apartamento > 4 Pavimentos")
# and normalize the data by creating a separate total_pavimentos column while cleaning the tipo_construcao
# column to contain only the property type without floor references. This separation allows for better
# data analysis and filtering capabilities for multi-story properties. Records without floor information
# are marked as "nao informado" for clear data classification.

for year, df in datasets_dict.items():
    datasets_dict[year] = extract_and_normalize_floor_information(df)



‚úÖ Extracted floor information from 11,221 records
   üìä Floor categories found:
      ‚Ä¢ mais de 4: 9,510 records
      ‚Ä¢ ate 4: 1,711 records
   ‚ÑπÔ∏è  Records without floor info: 1,448 ('nao informado')
   üßπ Cleaned 'tipo_construcao' column by removing floor references
‚úÖ Extracted floor information from 13,713 records
   üìä Floor categories found:
      ‚Ä¢ mais de 4: 11,896 records
      ‚Ä¢ ate 4: 1,817 records
   ‚ÑπÔ∏è  Records without floor info: 1,529 ('nao informado')
   üßπ Cleaned 'tipo_construcao' column by removing floor references
‚úÖ Extracted floor information from 6,561 records
   üìä Floor categories found:
      ‚Ä¢ mais de 4: 5,861 records
      ‚Ä¢ ate 4: 700 records
   ‚ÑπÔ∏è  Records without floor info: 645 ('nao informado')
   üßπ Cleaned 'tipo_construcao' column by removing floor references


In [21]:
# DETAILED FLOOR ANALYSIS - CLASSIFICATION CHECK
# ================================================================
# Check that the records originally labelled "<= 4 Pavimentos" were stored and
# classified as "ate 4" in the total_pavimentos column, and that no floor
# reference is left behind in tipo_construcao.

print("üè¢ AN√ÅLISE DETALHADA DOS PAVIMENTOS NOS DATASETS")
print("=" * 55)

for year, df in datasets_dict.items():
    print(f"\nüìÖ Dataset {year}:")
    print("-" * 25)
    
    # Count every floor category
    pavimentos_count = df['total_pavimentos'].value_counts()
    total_records = len(df)
    
    print(f"   üìä Total de registros: {total_records:,}")
    print(f"   üìã Distribui√ß√£o de pavimentos:")
    
    for categoria, count in pavimentos_count.items():
        percentage = (count / total_records) * 100
        print(f"      ‚Ä¢ {categoria}: {count:,} registros ({percentage:.1f}%)")
    
    # Check whether tipo_construcao still contains floor references
    print(f"\n   üîç Verificando se ainda h√° refer√™ncias a pavimentos em tipo_construcao:")
    
    # Look for floor patterns that were not removed
    remaining_patterns = []
    # NOTE(review): 'Pavimento' is a substring of 'Pavimentos', so a leftover value
    # would be reported under both patterns.
    patterns_to_check = ['Pavimentos', 'pavimentos', '‚â§', '<=', '>', 'Pavimento']
    
    for pattern in patterns_to_check:
        # str.contains interprets the pattern as a regular expression by default;
        # na=False makes missing values count as "no match"
        mask = df['tipo_construcao'].str.contains(pattern, na=False)
        if mask.any():
            remaining_count = mask.sum()
            remaining_patterns.append(f"{pattern}: {remaining_count} registros")
    
    if remaining_patterns:
        print(f"      ‚ö†Ô∏è  Padr√µes ainda encontrados:")
        for pattern_info in remaining_patterns:
            print(f"         - {pattern_info}")
    else:
        print(f"      ‚úÖ Nenhum padr√£o de pavimento encontrado em tipo_construcao")
    
    # Show up to three tipo_construcao examples for each floor category
    # (the 'nao informado' placeholder category is skipped)
    print(f"\n   üìã Exemplos de cada categoria:")
    for categoria in pavimentos_count.index:
        if categoria != 'nao informado':
            examples = df[df['total_pavimentos'] == categoria]['tipo_construcao'].head(3).tolist()
            print(f"      ‚Ä¢ {categoria}:")
            for i, example in enumerate(examples, 1):
                print(f"         {i}. {example}")

print(f"\n‚úÖ An√°lise completa dos pavimentos finalizada!")

üè¢ AN√ÅLISE DETALHADA DOS PAVIMENTOS NOS DATASETS

üìÖ Dataset 2023:
-------------------------
   üìä Total de registros: 12,669
   üìã Distribui√ß√£o de pavimentos:
      ‚Ä¢ mais de 4: 9,510 registros (75.1%)
      ‚Ä¢ ate 4: 1,711 registros (13.5%)
      ‚Ä¢ nao informado: 1,448 registros (11.4%)

   üîç Verificando se ainda h√° refer√™ncias a pavimentos em tipo_construcao:
      ‚úÖ Nenhum padr√£o de pavimento encontrado em tipo_construcao

   üìã Exemplos de cada categoria:
      ‚Ä¢ mais de 4:
         1. Apartamento
         2. Apartamento
         3. Apartamento
      ‚Ä¢ ate 4:
         1. Apartamento
         2. Apartamento
         3. Apartamento

üìÖ Dataset 2024:
-------------------------
   üìä Total de registros: 15,242
   üìã Distribui√ß√£o de pavimentos:
      ‚Ä¢ mais de 4: 11,896 registros (78.0%)
      ‚Ä¢ ate 4: 1,817 registros (11.9%)
      ‚Ä¢ nao informado: 1,529 registros (10.0%)

   üîç Verificando se ainda h√° refer√™ncias a pavimentos em tipo_cons

In [22]:
# üè† PROPERTY AND CONSTRUCTION TYPES ANALYSIS - RECIFE ITBI DATASETS
# ==================================================================
# Using the analyze_property_and_construction_types function from our ETL library

analysis_results = analyze_property_and_construction_types(datasets_dict)

üè† PROPERTY AND CONSTRUCTION TYPES ANALYSIS

üìä SUMMARY STATISTICS:
-------------------------
   ‚Ä¢ Total unique property types: 19
   ‚Ä¢ Total unique construction types: 13

üè¢ PROPERTY TYPES FOUND:
------------------------------
    1. Apartamento
    2. Casa
    3. Centro Comercial/Servi√ßos
    4. Edifica√ß√£o Especial
    5. Galp√£o
    6. Galp√£o Fechado
    7. Garagem Comercial
    8. Garagem Residencial
    9. Hospital
   10. Hotel
   11. Industria
   12. Institui√ß√£o Educacional
   13. Institui√ß√£o Financeira
   14. Loja
   15. Mocambo
   16. Posto de Abastecimento
   17. Sala
   18. Templo religioso
   19. Terreno em cond residencial

üèóÔ∏è  CONSTRUCTION TYPES FOUND:
-----------------------------------
    1. Apartamento
    2. Casa
    3. Edifica√ß√£o Especial
    4. Edifica√ß√£o Garagem
    5. Edifica√ß√£o Industrial
    6. Galp√£o
    7. Hotel
    8. Institui√ß√£o Financeira
    9. Institui√ß√£o Hospitalar
   10. Loja
   11. Mocambo
   12. Posto de Combust√≠vel

In [23]:

# MEMORY OPTIMIZATION: OBJECT TO CATEGORY CONVERSION
# Apply convert_to_category (defined earlier in the notebook) to every yearly dataset and
# store the returned frame back in datasets_dict; the helper also receives the year key.
# NOTE(review): if the yearly frames are later combined with pd.concat, categorical columns
# whose category sets differ between years fall back to object dtype - confirm the
# memory saving survives consolidation.
for year , df in datasets_dict.items():
    datasets_dict[year] = convert_to_category(df,year)

üîÑ INITIALIZING TYPE CONVERSION - OBJECT TO CATEGORY
üíæ Memory usage before conversion: 9.38 MB
‚è≥ Processing 5 target columns...

üìä MEMORY OPTIMIZATION RESULTS:
--------------------------------
   ‚úÖ Successfully converted 5 columns to category type
   üíæ Memory after conversion: 5.85 MB
   üí∞ Memory saved: 3.53 MB (37.6% reduction)
üîÑ INITIALIZING TYPE CONVERSION - OBJECT TO CATEGORY
üíæ Memory usage before conversion: 11.27 MB
‚è≥ Processing 5 target columns...

üìä MEMORY OPTIMIZATION RESULTS:
--------------------------------
   ‚úÖ Successfully converted 5 columns to category type
   üíæ Memory after conversion: 7.03 MB
   üí∞ Memory saved: 4.24 MB (37.6% reduction)
üîÑ INITIALIZING TYPE CONVERSION - OBJECT TO CATEGORY
üíæ Memory usage before conversion: 5.33 MB
‚è≥ Processing 5 target columns...

üìä MEMORY OPTIMIZATION RESULTS:
--------------------------------
   ‚úÖ Successfully converted 5 columns to category type
   üíæ Memory after conversion: 3.33 MB


In [43]:
# DATETIME CONVERSION: CONVERTING DATA_TRANSACAO TO TIMESTAMP FORMAT
# ===================================================================
# Converting 'data_transacao' from object (string) to datetime format to enable temporal analysis,
# date filtering, chronological sorting, and time-based aggregations for property transaction trends.
# NOTE(review): the helper name 'covnert_to_timestamp' is misspelled ("convert"); it must be
# renamed together with its definition elsewhere in the notebook, so it is left as-is here.

for year, df in datasets_dict.items():
    datasets_dict[year] = covnert_to_timestamp(df)


# Preview the converted 2025 dataset
datasets_dict['2025'].head(20)

‚úÖ Column 'data_transacao' successfully converted to datetime
‚úÖ Column 'data_transacao' successfully converted to datetime
‚úÖ Column 'data_transacao' successfully converted to datetime


Unnamed: 0,id,logradouro,numero,complemento,valor_avaliacao,bairro,ano_construcao,area_terreno,area_construida,fracao_ideal,...,data_transacao,estado_conservacao,tipo_imovel,valores_financiados_sfh,cod_logradouro,latitude,longitude,ano,year,total_pavimentos
0,ITBI_2025_000001,rua caio pereira,375,apto 803 edf luar do rosarinho,505000.0,Encruzilhada,2007,798.91,132.01,0.02698,...,2025-01-08,Bom,Apartamento,0.0,13269,-8.034996,-34.896187,2025,2025,mais de 4
1,ITBI_2025_000002,rua caio pereira,375,apto 302 edf luar do rosarinho,398109.72,Encruzilhada,2007,798.91,118.64,0.02518,...,2025-05-12,Bom,Apartamento,0.0,13269,-8.034996,-34.896187,2025,2025,mais de 4
2,ITBI_2025_000003,rua caio pereira,800,apto 1201 edf sainte juliana,790000.0,Encruzilhada,2017,1295.39,145.68,0.01586,...,2025-04-14,Bom,Apartamento,0.0,13269,,,2025,2025,mais de 4
3,ITBI_2025_000004,rua caio pereira,800,apto 1501 edf sainte juliana,780000.0,Encruzilhada,2017,1295.39,145.68,0.01586,...,2025-01-08,Bom,Apartamento,0.0,13269,,,2025,2025,mais de 4
4,ITBI_2025_000005,rua caio pereira,800,apto 1602 edf sainte juliana,840000.0,Encruzilhada,2017,1295.39,145.8,0.01589,...,2025-01-14,Bom,Apartamento,565600.32,13269,,,2025,2025,mais de 4
5,ITBI_2025_000006,rua caio pereira,334,apto 202 edf essenza rosarinho,1000000.0,Encruzilhada,2011,1737.63,183.6,0.01923,...,2025-01-14,Bom,Apartamento,0.0,13269,-8.035095,-34.896937,2025,2025,mais de 4
6,ITBI_2025_000007,rua doutor jose maria,578,apto 0102 edf praia de ceres,595739.42,Encruzilhada,2002,861.0,184.77,0.03333,...,2025-01-08,Bom,Apartamento,0.0,36196,-8.035868,-34.89625,2025,2025,mais de 4
7,ITBI_2025_000008,rua doutor jose maria,658,apto 1701 edf casa rosada,600000.0,Encruzilhada,2010,3090.5,244.73,0.01401,...,2025-04-09,Bom,Apartamento,0.0,36196,-8.035419,-34.897024,2025,2025,mais de 4
8,ITBI_2025_000009,rua andre reboucas,106,apto 302 edf bellagio residence,230000.0,Rosarinho,2015,610.36,59.74,0.02165,...,2025-04-16,Bom,Apartamento,0.0,4928,-8.032817,-34.897594,2025,2025,mais de 4
9,ITBI_2025_000010,rua engenheiro sampaio,68,apto 404 splendid rosarinho,300000.0,Rosarinho,2014,2202.75,59.93,0.00727,...,2025-05-23,Bom,Apartamento,0.0,53627,-8.033695,-34.897214,2025,2025,mais de 4


In [44]:
# CONSOLIDATING DATASETS INTO A SINGLE DATABASE
# =============================================
# Stack the transformed ITBI datasets (2023, 2024, 2025) stored in `datasets_dict`
# into one DataFrame (`consolidated_df`) so later cells can run cross-year
# comparisons and reports on a single table.
#
# The schema consistency lines below are computed from the yearly frames rather
# than printed unconditionally: pd.concat silently fills mismatched columns with
# NaN, so a mismatch must be visible here.

print("üîó CONSOLIDATING DATASETS INTO SINGLE DATABASE")
print("=" * 50)

# Years to consolidate, in the order they are stacked.
years_to_consolidate = ['2023', '2024', '2025']

# Fail early, with a readable message, if a year was never loaded/transformed.
missing_years = [year_key for year_key in years_to_consolidate if year_key not in datasets_dict]
if missing_years:
    raise KeyError(f"Transformed dataset(s) missing from datasets_dict: {missing_years}")

yearly_frames = [datasets_dict[year_key] for year_key in years_to_consolidate]

# Compare every yearly frame against the first one BEFORE concatenating.
reference_frame = yearly_frames[0]
columns_consistent = all(
    set(frame.columns) == set(reference_frame.columns) for frame in yearly_frames[1:]
)
dtypes_consistent = all(
    frame.dtypes.to_dict() == reference_frame.dtypes.to_dict() for frame in yearly_frames[1:]
)

# Concatenate all datasets into a single DataFrame
print("üìä Combining all datasets...")
consolidated_df = pd.concat(yearly_frames, ignore_index=True)

print(f"‚úÖ Consolidated dataset created with {len(consolidated_df):,} records!")

# Verify year distribution
print("\nüìà Distribution by year:")
year_distribution = consolidated_df['year'].value_counts().sort_index()
for year, count in year_distribution.items():
    percentage = (count / len(consolidated_df)) * 100
    print(f"  {year}: {count:,} records ({percentage:.1f}%)")

# Check data consistency across years (results computed above, not assumed)
consistent_mark = "‚úÖ"
inconsistent_mark = "‚ö†Ô∏è  mismatch between yearly datasets"
print("\nüîç Data consistency check:")
print(f"  Total columns: {len(consolidated_df.columns)}")
print(f"  Column names consistency: {consistent_mark if columns_consistent else inconsistent_mark}")
print(f"  Data types consistency: {consistent_mark if dtypes_consistent else inconsistent_mark}")

# Verify unique IDs remain unique after consolidation
total_ids = len(consolidated_df)
unique_ids = consolidated_df['id'].nunique()
print(f"  ID uniqueness: {unique_ids:,} unique IDs out of {total_ids:,} records")

if unique_ids == total_ids:
    print(f"  ‚úÖ All IDs remain unique after consolidation")
else:
    print(f"  ‚ö†Ô∏è  Warning: {total_ids - unique_ids} duplicate IDs found")

# Summary statistics
print(f"\nüìä Consolidated dataset summary:")
print(f"  Total records: {len(consolidated_df):,}")
print(f"  Total columns: {len(consolidated_df.columns)}")
print(f"  Memory usage: {consolidated_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  Date range: {consolidated_df['data_transacao'].min().strftime('%Y-%m-%d')} to {consolidated_df['data_transacao'].max().strftime('%Y-%m-%d')}")

# Show top neighborhoods
print(f"\nüèòÔ∏è  Top 5 neighborhoods by transaction volume:")
top_neighborhoods = consolidated_df['bairro'].value_counts().head()
for neighborhood, count in top_neighborhoods.items():
    percentage = (count / len(consolidated_df)) * 100
    print(f"  {neighborhood}: {count:,} transactions ({percentage:.1f}%)")

# Show top property types
print(f"\nüè¢ Top 5 property types by transaction volume:")
top_property_types = consolidated_df['tipo_imovel'].value_counts().head()
for prop_type, count in top_property_types.items():
    percentage = (count / len(consolidated_df)) * 100
    print(f"  {prop_type}: {count:,} transactions ({percentage:.1f}%)")

print("\n‚úÖ Dataset consolidation completed successfully!")

üîó CONSOLIDATING DATASETS INTO SINGLE DATABASE
üìä Combining all datasets...
‚úÖ Consolidated dataset created with 35,117 records!

üìà Distribution by year:
  2023: 12,669 records (36.1%)
  2024: 15,242 records (43.4%)
  2025: 7,206 records (20.5%)

üîç Data consistency check:
  Total columns: 23
  Column names consistency: ‚úÖ
  Data types consistency: ‚úÖ
  ID uniqueness: 35,117 unique IDs out of 35,117 records
  ‚úÖ All IDs remain unique after consolidation

üìä Consolidated dataset summary:
  Total records: 35,117
  Total columns: 23
  Memory usage: 18.41 MB
  Date range: 2023-01-02 to 2025-06-04

üèòÔ∏è  Top 5 neighborhoods by transaction volume:
  Boa Viagem: 9,098 transactions (25.9%)
  Varzea: 1,935 transactions (5.5%)
  Imbiribeira: 1,618 transactions (4.6%)
  Pina: 1,607 transactions (4.6%)
  Casa Amarela: 1,365 transactions (3.9%)

üè¢ Top 5 property types by transaction volume:
  Apartamento: 28,142 transactions (80.1%)
  Casa: 2,908 transactions (8.3%)
  Sala: 2,5

In [46]:
# FINAL DATA VALIDATION AND STATISTICAL SUMMARY
# =============================================
# Run quick sanity checks on `consolidated_df` (negative appraisal values,
# non-positive constructed areas, transaction dates in the future) and print
# summary statistics of the consolidated ITBI dataset.

print("üîç FINAL VALIDATION & STATISTICS")
print("=" * 35)

# Quick validation checks
total_records = len(consolidated_df)
negative_values = (consolidated_df['valor_avaliacao'] < 0).sum()
zero_areas = (consolidated_df['area_construida'] <= 0).sum()

# "Future" is measured against the moment the cell runs, not a hard-coded year,
# so the check stays correct when new yearly files are added. The timestamp takes
# the column's timezone (None for naive datetimes) so the comparison is valid.
current_time = pd.Timestamp.now(tz=consolidated_df['data_transacao'].dt.tz)
future_dates = (consolidated_df['data_transacao'] > current_time).sum()

print("‚úÖ Data Quality Check:")
print(f"   ‚Ä¢ Total records: {total_records:,}")
print(f"   ‚Ä¢ Negative values: {negative_values} found")
print(f"   ‚Ä¢ Invalid areas: {zero_areas} found")
print(f"   ‚Ä¢ Future dates: {future_dates} found")

# Statistical summary and business insights
print(f"\nüìä STATISTICAL SUMMARY:")
print("-" * 25)
print(f"   üí∞ Property Values:")
print(f"      ‚Ä¢ Average: R$ {consolidated_df['valor_avaliacao'].mean():,.2f}")
print(f"      ‚Ä¢ Median: R$ {consolidated_df['valor_avaliacao'].median():,.2f}")
print(f"      ‚Ä¢ Range: R$ {consolidated_df['valor_avaliacao'].min():,.2f} - R$ {consolidated_df['valor_avaliacao'].max():,.2f}")

# SFH financing analysis (share of records with a financed amount above zero)
sfh_with_value = (consolidated_df['valores_financiados_sfh'] > 0).sum()
# Guard the ratio so an empty frame reports 0.0% instead of NaN.
sfh_percentage = (sfh_with_value / total_records) * 100 if total_records else 0.0

print(f"\n   üèóÔ∏è  Property Areas:")
print(f"      ‚Ä¢ Avg constructed area: {consolidated_df['area_construida'].mean():.2f} m¬≤")
print(f"      ‚Ä¢ Avg land area: {consolidated_df['area_terreno'].mean():.2f} m¬≤")

print(f"\n   üè¶ SFH Financing: {sfh_with_value:,} properties ({sfh_percentage:.1f}%)")

print(f"\n   üèòÔ∏è  Data Diversity:")
print(f"      ‚Ä¢ Unique neighborhoods: {consolidated_df['bairro'].nunique()}")
print(f"      ‚Ä¢ Property types: {consolidated_df['tipo_imovel'].nunique()}")
print(f"      ‚Ä¢ Construction types: {consolidated_df['tipo_construcao'].nunique()}")

# ETL Pipeline summary
print(f"\nüéØ ETL PIPELINE COMPLETED SUCCESSFULLY!")
print(f"   üìä {total_records:,} records ready for analysis")

# Only claim the data is validated when the checks above found nothing.
issues_found = int(negative_values) + int(zero_areas) + int(future_dates)
if issues_found == 0:
    print(f"   ‚úÖ All transformations applied and validated")
else:
    print(f"   ‚ö†Ô∏è  Warning: {issues_found} record-level issue(s) flagged by the data quality check")

print("\nüìä Dataset is ready for analysis and reporting")

üîç FINAL VALIDATION & STATISTICS
‚úÖ Data Quality Check:
   ‚Ä¢ Total records: 35,117
   ‚Ä¢ Negative values: 0 found
   ‚Ä¢ Invalid areas: 0 found
   ‚Ä¢ Future dates: 0 found

üìä STATISTICAL SUMMARY:
-------------------------
   üí∞ Property Values:
      ‚Ä¢ Average: R$ 668,034.77
      ‚Ä¢ Median: R$ 360,000.00
      ‚Ä¢ Range: R$ 0.00 - R$ 162,735,000.00

   üèóÔ∏è  Property Areas:
      ‚Ä¢ Avg constructed area: 159.04 m¬≤
      ‚Ä¢ Avg land area: 8577.00 m¬≤

   üè¶ SFH Financing: 11,148 properties (31.7%)

   üèòÔ∏è  Data Diversity:
      ‚Ä¢ Unique neighborhoods: 98
      ‚Ä¢ Property types: 19
      ‚Ä¢ Construction types: 13

üéØ ETL PIPELINE COMPLETED SUCCESSFULLY!
   üìä 35,117 records ready for analysis
   ‚úÖ All transformations applied and validated

üìä Dataset is ready for analysis and reporting


In [47]:
# DATASET PREVIEW - CONSOLIDATED ITBI DATABASE
# ============================================
# Display the first five rows of the consolidated frame so the result of the
# preceding transformation steps can be inspected by eye.

consolidated_df.head(n=5)

Unnamed: 0,id,logradouro,numero,complemento,valor_avaliacao,bairro,ano_construcao,area_terreno,area_construida,fracao_ideal,...,data_transacao,estado_conservacao,tipo_imovel,valores_financiados_sfh,cod_logradouro,latitude,longitude,ano,year,total_pavimentos
0,ITBI_2023_000001,av norte miguel arraes de alencar,3071,,1068562.63,Encruzilhada,1997,438.0,511.0,1.0,...,2023-12-21,Regular,Galp√£o,0.0,46540,-8.034273,-34.896337,2023,2023,nao informado
1,ITBI_2023_000002,av norte miguel arraes de alencar,3029,,1500000.0,Encruzilhada,1957,779.33,582.44,1.0,...,2023-11-17,Regular,Casa,0.0,46540,-8.034435,-34.896335,2023,2023,nao informado
2,ITBI_2023_000003,rua belmiro corr√™a,133,apto 0001,110000.0,Encruzilhada,1970,562.05,121.0,0.27191,...,2023-09-26,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023,ate 4
3,ITBI_2023_000004,rua belmiro corr√™a,133,apto 0001,110000.0,Encruzilhada,1970,562.05,121.0,0.27191,...,2023-09-22,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023,ate 4
4,ITBI_2023_000005,rua belmiro corr√™a,133,apto 0002,110000.0,Encruzilhada,1970,562.05,81.0,0.18202,...,2023-09-22,Bom,Apartamento,0.0,10715,-8.035013,-34.895903,2023,2023,ate 4


# 🗄️ DATABASE PREPARATION - POSTGRESQL INSERTION

## Preparing Consolidated Dataset for Database Storage
Next, we prepare the transformed and validated ITBI dataset for insertion into a PostgreSQL database. This covers setting up the database connection and preparing the data.

In [None]:
# üîß DATABASE LIBRARIES AND CONFIGURATION SETUP
# =============================================
# Import required libraries for PostgreSQL connection and data insertion

import json
import psycopg2
import sqlalchemy
from psycopg2 import sql
from sqlalchemy import create_engine

# Announce the database setup stage.
print("üóÑÔ∏è DATABASE CONNECTION SETUP - POSTGRESQL")
print("=" * 45)

# Placeholder connection settings. They are written to disk only when no
# configuration file is present, and must be edited by the user afterwards.
config_template = dict(
    host="localhost",
    port=5432,
    database="itbi_database",
    user="your_username",
    password="your_password",
    schema="analytics",
)

# Location of the JSON configuration file, relative to the working directory.
import os
config_file_path = "config.json"

if os.path.exists(config_file_path):
    # An existing file is left untouched.
    print(f"   ‚úÖ Configuration file found: {config_file_path}")
else:
    # No file yet: write the template so the user has something to fill in.
    print("üìù Creating database configuration template...")
    with open(config_file_path, 'w') as config_file:
        config_file.write(json.dumps(config_template, indent=4))
    print(f"   ‚úÖ Template created: {config_file_path}")
    print("   ‚ö†Ô∏è  Please update the configuration with your database credentials")

# Recap of the libraries this stage relies on.
for summary_line in (
    "\nüîß Database libraries imported successfully",
    "   ‚Ä¢ psycopg2: PostgreSQL adapter",
    "   ‚Ä¢ sqlalchemy: Database toolkit",
    "   ‚Ä¢ json: Configuration management",
):
    print(summary_line)

In [None]:
# üîó DATABASE CONNECTION TEST
# ===========================
# Test connection to PostgreSQL database

# Connection smoke test: load config.json, open (and always close) a psycopg2
# connection, and build the SQLAlchemy `engine` used for insertion.
# Failures are reported on stdout instead of being raised.
try:
    # Local import: percent-encoding for the credentials embedded in the engine URL.
    from urllib.parse import quote

    print("üîó TESTING DATABASE CONNECTION")
    print("=" * 35)

    # Load database configuration
    with open('config.json', 'r') as config_file:
        config = json.load(config_file)

    print(f"üì° Connecting to PostgreSQL...")
    print(f"   ‚Ä¢ Host: {config['host']}")
    print(f"   ‚Ä¢ Port: {config['port']}")
    print(f"   ‚Ä¢ Database: {config['database']}")
    print(f"   ‚Ä¢ Schema: {config.get('schema', 'public')}")

    # Create psycopg2 connection (this is the actual connectivity test)
    connection = psycopg2.connect(
        host=config['host'],
        port=config['port'],
        database=config['database'],
        user=config['user'],
        password=config['password']
    )

    try:
        # Create SQLAlchemy engine.
        # User and password are percent-encoded so characters such as '@', ':'
        # or '/' in the credentials cannot corrupt the connection URL.
        url_user = quote(str(config["user"]), safe="")
        url_password = quote(str(config["password"]), safe="")
        engine = create_engine(
            f'postgresql+psycopg2://{url_user}:{url_password}@{config["host"]}:{config["port"]}/{config["database"]}'
        )

        print(f"   ‚úÖ Connection established successfully!")
        print(f"\nüìä Dataset ready for insertion:")
        print(f"   ‚Ä¢ Records: {len(consolidated_df):,}")
        print(f"   ‚Ä¢ Columns: {len(consolidated_df.columns)}")
        print(f"   ‚Ä¢ Memory: {consolidated_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    finally:
        # Close test connection even if engine creation or reporting fails,
        # so the test never leaks an open database session.
        connection.close()

except FileNotFoundError:
    print("‚ùå Configuration file not found!")
    print("   Please ensure 'config.json' exists with database credentials")
except Exception as db_error:
    print(f"‚ùå Database connection failed: {type(db_error).__name__}")
    print(f"   Details: {str(db_error)}")
    print("   Please check your database configuration and credentials")