In [1]:
# Importing libraries required to clean, standardize, and prepare the dataset for further analysis.

import numpy as np
import pandas as pd
import zipfile
import os
from datetime import datetime

import time
# Wall-clock timestamp (seconds since the epoch) taken when the notebook starts running.
start_time = time.time()

In [2]:
# Define the directory path where datasets will be stored
dataset_directory = "datasets"

# Create the directory if it doesn't exist, avoiding errors if it already exists
os.makedirs(dataset_directory, exist_ok=True)

# Dataset URLs as (year, url) pairs, listed in chronological order
dataset_sources = [
    ("2023", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/d0c08a6f-4c27-423c-9219-8d13403816f4/download/itbi_2023.csv"),
    ("2024", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/a36d548b-d705-496a-ac47-4ec36f068474/download/itbi_2024.csv"),
    ("2025", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/5b582147-3935-459a-bbf7-ee623c22c97b/download/itbi_2025.csv")
]

# Columns checked at load time; a missing one produces a warning, not a failure
REQUIRED_ITBI_COLUMNS = ['bairro', 'tipo_imovel', 'valor_avaliacao', 'data_transacao']


def _load_itbi_year(load_year, data_url, required_columns):
    """Download one yearly ITBI CSV and tag every row with its year.

    Args:
        load_year: Year label of the file (a string that int() can parse).
        data_url: Location of the ';'-separated, UTF-8 encoded CSV.
        required_columns: Column names whose absence is reported as a warning.

    Returns:
        The loaded DataFrame with an extra integer 'year' column.

    Raises:
        ValueError: If the file parses but contains no rows.
        Exception: Whatever pd.read_csv raises (network, parsing, encoding).
    """
    print("   ⏳ Downloading file...")
    yearly_frame = pd.read_csv(data_url, sep=';', encoding='utf-8')

    if yearly_frame.empty:
        raise ValueError("Dataset loaded is empty")

    missing_columns = [col for col in required_columns if col not in yearly_frame.columns]
    if missing_columns:
        print(f"   ⚠️  Warning: Missing columns: {missing_columns}")

    yearly_frame['year'] = int(load_year)
    return yearly_frame


def _extract_itbi_datasets(sources, required_columns):
    """Load every source, print progress plus a summary, and return the loaded frames.

    All bookkeeping lives in local variables, so nothing has to be deleted
    afterwards and a failed download cannot trigger a NameError during cleanup.

    Args:
        sources: Iterable of (year, url) pairs.
        required_columns: Column names expected in every file.

    Returns:
        Dict mapping year -> DataFrame holding only the years that loaded.
    """
    sources = list(sources)

    print("🏠 LOADING ITBI DATASETS - RECIFE")
    print("=" * 40)

    loaded = {}
    failures = {}  # year -> exception class name, used by the per-year status lines

    for load_year, data_url in sources:
        print(f"\n📅 Loading ITBI data {load_year}...")
        print(f"   🔗 URL: {data_url[:80]}...")

        try:
            yearly_frame = _load_itbi_year(load_year, data_url, required_columns)
        except Exception as load_error:
            # Deliberately best-effort: one unavailable file must not stop the others.
            print(f"   ❌ Error loading data for {load_year}: {type(load_error).__name__}")
            print(f"      Details: {str(load_error)}")
            failures[load_year] = type(load_error).__name__
            continue

        loaded[load_year] = yearly_frame

        print(f"   ✅ Success: {len(yearly_frame):,} records, {len(yearly_frame.columns)} columns")
        print("   📊 Data sample:")
        if 'bairro' in yearly_frame.columns:
            print(f"      First neighborhoods: {yearly_frame['bairro'].head(3).tolist()}")
        else:
            print(f"      First 3 rows of first column: {yearly_frame.iloc[:3, 0].tolist()}")

    years_loaded = list(loaded)

    print("\n🔍 VERIFication")
    print("-" * 20)
    print(f"   • Total datasets loaded: {len(loaded)}")
    print(f"   • Years included: {years_loaded}")
    print(f"   • Expected datasets: {len(sources)}")
    print()

    column_counts = {len(frame.columns) for frame in loaded.values()}

    print("📊 FINAL DATASET SUMMARY")
    print("=" * 30)
    print(f"   • Total records: {sum(len(frame) for frame in loaded.values()):,}")
    print(f"   • Total columns: {max(column_counts, default=0)}")
    if len(column_counts) > 1:
        # The single figure above is only meaningful when every year has the same layout.
        print(f"   ⚠️  Warning: Column counts differ between years: {sorted(column_counts)}")
    print(f"   • Years included: {years_loaded}")

    # Report the real outcome of each year instead of assuming success.
    for load_year, _ in sources:
        if load_year in loaded:
            print(f"   • {load_year}: Dataset loaded successfully")
        else:
            print(f"   • {load_year}: Dataset NOT loaded ({failures.get(load_year, 'unknown error')})")

    if loaded:
        # Preview the most recent year that loaded, limited to the columns it actually has.
        latest_year = years_loaded[-1]
        sample_columns = [col for col in required_columns if col in loaded[latest_year].columns]
        print("\n📋 Sample data (first 3 rows):")
        print(loaded[latest_year][sample_columns].head(3))

    return loaded


datasets_dict = _extract_itbi_datasets(dataset_sources, REQUIRED_ITBI_COLUMNS)

print(f'\n✅ Directory "{dataset_directory}" is ready for use.')
if len(datasets_dict) == len(dataset_sources):
    print("✅ ETL Extract phase completed successfully!")
else:
    print(f"❌ ETL Extract phase incomplete: loaded {len(datasets_dict)} of {len(dataset_sources)} datasets.")



🏠 LOADING ITBI DATASETS - RECIFE

📅 Loading ITBI data 2023...
   🔗 URL: http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resou...
   ⏳ Downloading file...
   ✅ Success: 12,669 records, 23 columns
   📊 Data sample:
      First neighborhoods: ['Encruzilhada', 'Encruzilhada', 'Encruzilhada']

📅 Loading ITBI data 2024...
   🔗 URL: http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resou...
   ⏳ Downloading file...
   ✅ Success: 15,242 records, 23 columns
   📊 Data sample:
      First neighborhoods: ['Encruzilhada', 'Encruzilhada', 'Encruzilhada']

📅 Loading ITBI data 2025...
   🔗 URL: http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resou...
   ⏳ Downloading file...
   ✅ Success: 7,206 records, 23 columns
   📊 Data sample:
      First neighborhoods: ['Encruzilhada', 'Encruzilhada', 'Encruzilhada']

🔍 VERIFication
--------------------
   • Total datasets loaded: 3
   • Years included: ['2023', '2024', '2025']
   • E

In [3]:
# Save dataframes as CSV files and create ZIP archive

def _write_csv_files(datasets, directory):
    """Write each DataFrame to '<directory>/itbi_<year>.csv'.

    Args:
        datasets: Dict mapping year -> DataFrame.
        directory: Folder that receives the CSV files.

    Returns:
        (saved_paths, all_saved): paths written successfully, and whether
        every dataset was saved without error.
    """
    saved_paths = []
    all_saved = True
    for dataset_year, dataset_df in datasets.items():
        csv_filename = f"itbi_{dataset_year}.csv"
        csv_filepath = os.path.join(directory, csv_filename)
        try:
            dataset_df.to_csv(csv_filepath, sep=';', encoding='utf-8', index=False)
        except Exception as save_error:
            # Keep going so every failure is listed, and say why it failed.
            print(f"   ❌ Failed to save: {csv_filename}")
            print(f"      Details: {type(save_error).__name__}: {save_error}")
            all_saved = False
        else:
            saved_paths.append(csv_filepath)
    return saved_paths, all_saved


def _write_zip_archive(csv_paths, directory, zip_filename):
    """Bundle the CSV files into one deflate-compressed ZIP and print a short report.

    Args:
        csv_paths: Paths of the files to add; each is stored under its base name.
        directory: Folder that receives the archive.
        zip_filename: File name of the archive.
    """
    zip_filepath = os.path.join(directory, zip_filename)
    try:
        with zipfile.ZipFile(zip_filepath, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for source_file in csv_paths:
                zip_file.write(source_file, os.path.basename(source_file))

        if not os.path.exists(zip_filepath):
            print("   ❌ Error: ZIP file was not created")
            return

        file_size_mb = os.path.getsize(zip_filepath) / (1024 * 1024)
        with zipfile.ZipFile(zip_filepath, 'r') as zip_reader:
            files_in_zip = len(zip_reader.namelist())
    except Exception as zip_error:
        print(f"   ❌ Error creating ZIP: {str(zip_error)}")
        return

    print("\n✅ ZIP ARCHIVE CREATED SUCCESSFULLY!")
    print(f"   📦 Filename: {zip_filename}")
    print(f"   📁 Size: {file_size_mb:.2f} MB")
    print(f"   🗃️  Files in ZIP: {files_in_zip}")
    print(f"   📂 Location: {zip_filepath}")


def _save_and_archive_datasets(datasets, directory):
    """Save every dataset as CSV, then zip the CSVs if all of them were written.

    Works on local variables only, so an empty ``datasets`` dict or a run in
    which every save fails no longer ends in a NameError from cleanup code.

    Args:
        datasets: Dict mapping year -> DataFrame.
        directory: Folder that receives the CSV files and the ZIP archive.
    """
    print("💾 SAVING DATASETS TO FILES AND CREATING ZIP ARCHIVE")
    print("=" * 55)

    # The folder may have been removed since it was first created.
    os.makedirs(directory, exist_ok=True)

    saved_paths, all_saved = _write_csv_files(datasets, directory)

    # Success lines are printed after the loop, one per file actually written.
    for file_path in saved_paths:
        print(f"   ✅ Saved: {os.path.basename(file_path)}")

    if saved_paths and all_saved:
        _write_zip_archive(saved_paths, directory, "itbi_datasets_recife.zip")
    else:
        print("\n❌ Cannot create ZIP: No CSV files or save errors occurred")


_save_and_archive_datasets(datasets_dict, dataset_directory)




💾 SAVING DATASETS TO FILES AND CREATING ZIP ARCHIVE
   ✅ Saved: itbi_2023.csv
   ✅ Saved: itbi_2024.csv
   ✅ Saved: itbi_2025.csv

✅ ZIP ARCHIVE CREATED SUCCESSFULLY!
   📦 Filename: itbi_datasets_recife.zip
   📁 Size: 0.91 MB
   🗃️  Files in ZIP: 3
   📂 Location: datasets\itbi_datasets_recife.zip


In [4]:
# Inspect the column names of the 2023 dataset to review the naming structure.
# The columns follow consistent conventions: snake_case, descriptive Portuguese names,
# no special characters, and the underscore as the only separator. This keeps them
# compatible with databases and easy to read and maintain across systems.
# The exception is 'sfh': the acronym gives no context to readers without domain knowledge.
# It will be renamed to 'valores_financiados_sfh' to state explicitly that it holds
# financed values.
# NOTE(review): the 'year' column added while loading has an English name and sits next to
# an existing 'ano' column - confirm whether both are needed.
datasets_dict['2023'].columns


Index(['logradouro', 'numero', 'complemento', 'valor_avaliacao', 'bairro',
       'cidade', 'uf', 'ano_construcao', 'area_terreno', 'area_construida',
       'fracao_ideal', 'padrao_acabamento', 'tipo_construcao', 'tipo_ocupacao',
       'data_transacao', 'estado_conservacao', 'tipo_imovel', 'sfh',
       'cod_logradouro', 'latitude', 'longitude', 'ano', 'year'],
      dtype='object')

In [5]:
# Rename the ambiguous 'sfh' column to 'valores_financiados_sfh' in every yearly dataset.
# DataFrame.rename returns a new frame, so each result is stored back under its year.
for year in datasets_dict:
    datasets_dict[year] = datasets_dict[year].rename(columns={'sfh': 'valores_financiados_sfh'})
    

In [6]:
# Missing-value report: for each yearly dataset, list the columns that contain nulls
# and count how many datasets are affected overall.
print("🩺 Data Health Check - Missing Values Diagnostic & Investigation")
print("=" * 65)

missing_datasets = 0
for year, df in datasets_dict.items():
    print(f"\n📅 Dataset {year}:")
    print("-" * 20)

    # Null count per column, reduced to the columns that have at least one null
    nulls_per_column = df.isna().sum()
    affected_columns = nulls_per_column[nulls_per_column > 0]

    if affected_columns.empty:
        print("   ✅ No missing values found - Dataset is complete!")
        continue

    missing_datasets += 1
    print(f"  🔍 Found {len(affected_columns)} columns with missing values:")
    for column_name, null_count in affected_columns.items():
        print(f"      • {column_name}: {null_count:,} nulls ")

print("\n📋 Final diagnosis:")
print(f'There is a total of {missing_datasets} datasets with missing values out of {len(datasets_dict)} total datasets.')

# NEXT STEP: DATA CLEANING AND NULL VALUES TREATMENT
# Now that we've identified null values in some datasets, we need to perform cleaning
# and removal of these missing values to prevent issues during subsequent analysis.
# Null values can cause errors in statistical calculations, visualizations, and data modeling.
# Proper treatment of these values is essential for ETL pipeline integrity and reliability.


🩺 Data Health Check - Missing Values Diagnostic & Investigation

📅 Dataset 2023:
--------------------
  🔍 Found 3 columns with missing values:
      • complemento: 1,320 nulls 
      • latitude: 3,402 nulls 
      • longitude: 3,402 nulls 

📅 Dataset 2024:
--------------------
  🔍 Found 3 columns with missing values:
      • complemento: 1,443 nulls 
      • latitude: 5,619 nulls 
      • longitude: 5,619 nulls 

📅 Dataset 2025:
--------------------
  🔍 Found 3 columns with missing values:
      • complemento: 576 nulls 
      • latitude: 2,623 nulls 
      • longitude: 2,623 nulls 

📋 Final diagnosis:
There is a total of 3 datasets with missing values out of 3 total datasets.


In [7]:
# COLUMN OPTIMIZATION: REMOVING REDUNDANT GEOGRAPHIC COLUMNS
# The 'cidade' and 'uf' columns are dropped because, per the analysis above, they hold a
# single value each (Recife and PE). The study covers only ITBI data from Recife, so they
# add no variation, and removing them simplifies the datasets without losing information.
# Each dataset's structure is printed with DataFrame.info() after the drop.

for year, df in datasets_dict.items():
    # errors="ignore" keeps this cell re-runnable: on a second execution the columns are
    # already gone and a plain drop would raise KeyError.
    df = df.drop(columns=["cidade", "uf"], errors="ignore")
    df.info()
    datasets_dict[year] = df



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12669 entries, 0 to 12668
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   logradouro               12669 non-null  object 
 1   numero                   12669 non-null  int64  
 2   complemento              11349 non-null  object 
 3   valor_avaliacao          12669 non-null  object 
 4   bairro                   12669 non-null  object 
 5   ano_construcao           12669 non-null  int64  
 6   area_terreno             12669 non-null  object 
 7   area_construida          12669 non-null  object 
 8   fracao_ideal             12669 non-null  object 
 9   padrao_acabamento        12669 non-null  object 
 10  tipo_construcao          12669 non-null  object 
 11  tipo_ocupacao            12669 non-null  object 
 12  data_transacao           12669 non-null  object 
 13  estado_conservacao       12669 non-null  object 
 14  tipo_imovel           

In [8]:
# Print the first 10 values of every column, dataset by dataset, to eyeball formats and dtypes.
for year, df in datasets_dict.items():
    for col_name in df.columns:
        print(df[col_name].head(10))

0    av norte miguel arraes de alencar
1    av norte miguel arraes de alencar
2                   rua belmiro corrêa
3                   rua belmiro corrêa
4                   rua belmiro corrêa
5                   rua belmiro corrêa
6                   rua belmiro corrêa
7                   rua belmiro corrêa
8                   rua belmiro corrêa
9                   rua belmiro corrêa
Name: logradouro, dtype: object
0    3071
1    3029
2     133
3     133
4     133
5     133
6     133
7     133
8     109
9     109
Name: numero, dtype: int64
0          NaN
1          NaN
2    apto 0001
3    apto 0001
4    apto 0002
5    apto 0003
6    apto 0004
7    apto 0005
8          NaN
9          NaN
Name: complemento, dtype: object
0    1068562,63
1    1500000,00
2     110000,00
3     110000,00
4     110000,00
5     110000,00
6     110000,00
7     110000,00
8    4900000,00
9    4900000,00
Name: valor_avaliacao, dtype: object
0    Encruzilhada
1    Encruzilhada
2    Encruzilhada
3    Encruzilhada

In [9]:
# DATA TYPE CONVERSION: VALOR_AVALIACAO TO FLOAT
# We will convert the 'valor_avaliacao' column from object type to float to enable proper
# numerical operations and statistical analysis. Currently stored as object (string), this
# prevents mathematical calculations, aggregations, and numeric comparisons essential for
# financial analysis of property values. Converting to float ensures data integrity and
# enables accurate computation of means, sums, and other statistical measures for ITBI values.



# DECIMAL SEPARATOR STANDARDIZATION FUNCTION
# Converts Brazilian decimal format (comma) to international format (dot) required for float conversion
def standardize_decimal_format(x):
    """Return ``x`` as text with every comma replaced by a dot.

    The value is converted to ``str`` before replacing, so non-string input
    (a float left by an earlier run of the cell, an int, NaN) is accepted
    instead of raising AttributeError.

    Args:
        x: Value to normalise, typically a string such as ``"110000,00"``.

    Returns:
        str: Text form of ``x`` using a dot as decimal separator.
    """
    return str(x).replace(',', '.')
    
# STEP 1: Swap the decimal comma for a dot in every dataset
for df in datasets_dict.values():
    df['valor_avaliacao'] = df['valor_avaliacao'].map(standardize_decimal_format)

# STEP 2: Cast the normalised text to float so the values support arithmetic
for df in datasets_dict.values():
    df['valor_avaliacao'] = df['valor_avaliacao'].astype(float)

In [10]:
# AREA_TERRENO CONVERSION
# Same treatment as 'valor_avaliacao': the values use a decimal comma, so the column is
# rendered as text, the comma is replaced by a dot, and the result is cast to float.
# The frames are modified in place, so the dictionary entries need no reassignment.

for year, df in datasets_dict.items():
    df['area_terreno'] = (
        df['area_terreno']
        .astype(str)
        .str.replace(',', '.')
        .astype(float)
    )


In [11]:
# AREA_CONSTRUIDA CONVERSION
# Render the column as text, replace the decimal comma with a dot, then cast to float.
# The frames are modified in place, so the dictionary entries need no reassignment.
for year, df in datasets_dict.items():
    df['area_construida'] = (
        df['area_construida']
        .astype(str)
        .str.replace(',', '.')
        .astype(float)
    )

In [None]:
# ENCODING CORRECTION: FIXING INCORRECTLY ENCODED CHARACTERS
# Brazilian datasets often contain encoding issues where Portuguese characters (ã, ç, ê, õ, etc.) 
# are incorrectly displayed due to mismatched character encoding during data extraction.
# This commonly occurs when CSV files are saved with Latin1 (ISO-8859-1) encoding but read as UTF-8,
# causing characters like "ção" to appear as "Ã§Ã£o" or similar garbled text.
# We fix this by re-encoding the text: first encode as Latin1 then decode as UTF-8 to restore
# the original Portuguese characters for proper data analysis and visualization.

def fix_encoding_issues(text_value):
    """Undo UTF-8 text that was mistakenly decoded as Latin-1.

    Non-string values are returned untouched. A string is re-encoded as
    Latin-1 and decoded as UTF-8; if either step fails, the original
    string is returned unchanged.

    Args:
        text_value: Any cell value; only ``str`` instances are processed.

    Returns:
        The repaired string, or ``text_value`` itself when no repair applies.
    """
    if isinstance(text_value, str):
        try:
            raw_bytes = text_value.encode('latin1')
            repaired = raw_bytes.decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not representable in Latin-1, or not valid UTF-8: leave the text alone
            pass
        else:
            return repaired
    return text_value

# Run the encoding repair over every object-dtype column of every dataset.
# select_dtypes returns a separate frame, so its labels can be iterated while df is updated.
for year, df in datasets_dict.items():
    for col in df.select_dtypes(include=['object']):
        df[col] = df[col].apply(fix_encoding_issues)



0        Médio
1        Médio
2      Simples
3      Simples
4      Simples
5      Simples
6      Simples
7      Simples
8        Médio
9        Médio
10    Superior
11    Superior
12    Superior
13       Médio
14    Superior
15    Superior
16    Superior
17    Superior
18    Superior
19    Superior
Name: padrao_acabamento, dtype: object
