In [1]:
# Import the libraries required to clean, standardize, and prepare the dataset for further analysis.

import numpy as np
import pandas as pd
import zipfile
import os
from datetime import datetime

import time
# Wall-clock timestamp taken when this cell runs
# (presumably the baseline for an elapsed-time report later in the notebook -- confirm).
start_time = time.time()

In [None]:
# Define the directory path where datasets will be stored
dataset_directory = "datasets"

# Create the directory if it doesn't exist, avoiding errors if it already exists
os.makedirs(dataset_directory, exist_ok=True)

# MEMORY CLEANUP - drop only the results left behind by a previous run of this cell
import gc

# Names that this cell (re)creates further down. Only these are removed before
# reloading, so a re-run starts from a clean slate.
STALE_RESULT_NAMES = ('loaded_dataframes', 'loaded_years', 'current_df', 'final_df', 'df')


def clear_stale_results(namespace, names=STALE_RESULT_NAMES):
    """Remove leftover result variables from ``namespace``.

    Deliberately NOT a sweep over everything in ``globals()``: a sweep would
    also delete values defined by other cells (for example ``start_time`` from
    the imports cell) and leave the notebook in a state that only a kernel
    restart can repair.

    Args:
        namespace: Mutable mapping of variable names to values, typically
            ``globals()``.
        names: Iterable of variable names to remove when present.

    Returns:
        list: The names that were actually present and have been removed,
        in the order given by ``names``.
    """
    removed = [name for name in names if name in namespace]
    for name in removed:
        del namespace[name]
    return removed


clear_stale_results(globals())

# A single call already runs a full collection across all generations
gc.collect()

print("🧹 AGGRESSIVE MEMORY CLEANUP COMPLETED")
print("=" * 50)

# (year, CSV URL) pairs for the ITBI files published on the Recife open-data portal
dataset_info = [
    ("2023", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/d0c08a6f-4c27-423c-9219-8d13403816f4/download/itbi_2023.csv"),
    ("2024", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/a36d548b-d705-496a-ac47-4ec36f068474/download/itbi_2024.csv"),
    ("2025", "http://dados.recife.pe.gov.br/dataset/28e3e25e-a9a7-4a9f-90a8-bb02d09cbc18/resource/5b582147-3935-459a-bbf7-ee623c22c97b/download/itbi_2025.csv"),
]

print("🏠 LOADING ITBI DATASETS FROM RECIFE")
print("=" * 50)

# Parallel lists: loaded_dataframes[i] holds the data for loaded_years[i]
loaded_dataframes, loaded_years = [], []

# Fetch the files one at a time; a failure is reported and the loop moves on
for year_str, csv_url in dataset_info:
    print(f"\n📅 Loading ITBI {year_str} data...")

    try:
        # Read the semicolon-delimited file and tag every row with its source year
        loaded_dataframes.append(
            pd.read_csv(csv_url, sep=';', encoding='utf-8').assign(year=int(year_str))
        )
        loaded_years.append(year_str)

        n_rows, n_cols = loaded_dataframes[-1].shape
        print(f"   ✅ Success: {n_rows:,} records, {n_cols} columns")

    except Exception as exc:
        print(f"   ❌ Error loading {year_str} data: {exc}")

# Verification step
print("\n🔍 VERIFICATION")
print("-" * 20)
print(f"   • Total datasets loaded: {len(loaded_dataframes)}")
print(f"   • Years loaded: {loaded_years}")

# Every configured year must have loaded: combining a partial set would silently
# under-represent the missing year(s) in everything computed from `df` afterwards.
expected_dataset_count = len(dataset_info)
extract_succeeded = expected_dataset_count > 0 and len(loaded_dataframes) == expected_dataset_count

if extract_succeeded:
    print("\n🔗 COMBINING DATASETS")
    print("-" * 30)

    # Combine dataframes
    final_df = pd.concat(loaded_dataframes, ignore_index=True)

    # Clear the list so the per-year frames are no longer referenced from it
    loaded_dataframes.clear()

    print("📊 FINAL DATASET SUMMARY")
    print("=" * 30)
    print(f"   • Total records: {len(final_df):,}")
    print(f"   • Total columns: {len(final_df.columns)}")
    # .tolist() yields plain Python ints, so the list prints as [2023, ...]
    # rather than [np.int64(2023), ...]
    print(f"   • Years included: {sorted(final_df['year'].unique().tolist())}")
    print(f"   • Memory usage: {final_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Records by year
    print("\n📈 Records by year:")
    records_by_year = final_df['year'].value_counts().sort_index()
    for year_num, record_count in records_by_year.items():
        print(f"   • {year_num}: {record_count:,} records")

    print("\n📋 Sample data (first 3 rows):")
    sample_columns = ['year', 'bairro', 'tipo_imovel', 'valor_avaliacao', 'data_transacao']
    # Show only the columns that are actually present, so a change in the source
    # schema does not abort the cell after the data has already been combined.
    available_sample_columns = [col for col in sample_columns if col in final_df.columns]
    print(final_df[available_sample_columns].head(3))

    # Store final dataframe in standard variable name
    df = final_df

else:
    missing_years = [year for year, _url in dataset_info if year not in loaded_years]
    print(f"\n❌ ERROR: Expected exactly {expected_dataset_count} datasets, but loaded {len(loaded_dataframes)}")
    print(f"   Missing years: {missing_years}")
    print("   Cannot proceed with data combination.")

print(f'\n✅ Directory "{dataset_directory}" is ready for use.')

# Report success only when the combined frame was actually produced
if extract_succeeded:
    print("✅ ETL Extract phase completed successfully!")
else:
    print("❌ ETL Extract phase did not complete: `df` was not created.")

🧹 AGGRESSIVE MEMORY CLEANUP STARTING...
🧹 AGGRESSIVE MEMORY CLEANUP COMPLETED
🏠 LOADING ITBI DATASETS FROM RECIFE

📅 Loading ITBI 2023 data...
   ✅ Success: 12,669 records, 23 columns

📅 Loading ITBI 2024 data...
   ✅ Success: 12,669 records, 23 columns

📅 Loading ITBI 2024 data...
   ✅ Success: 15,242 records, 23 columns

📅 Loading ITBI 2025 data...
   ✅ Success: 15,242 records, 23 columns

📅 Loading ITBI 2025 data...
   ✅ Success: 7,206 records, 23 columns

🔍 VERIFICATION
--------------------
   • Total datasets loaded: 3
   • Years loaded: ['2023', '2024', '2025']

🔗 COMBINING DATASETS
------------------------------
📊 FINAL DATASET SUMMARY
   • Total records: 35,117
   • Total columns: 23
   • Years included: [np.int64(2023), np.int64(2024), np.int64(2025)]
   • Memory usage: 33.72 MB

📈 Records by year:
   • 2023: 12,669 records
   • 2024: 15,242 records
   • 2025: 7,206 records

📋 Sample data (first 3 rows):
   year        bairro  tipo_imovel valor_avaliacao data_transacao
0  2023