In [8]:
# Put the project root folder on sys.path (at most once) so the local `src`
# package can be imported, then build the Spark session used by the notebook.
import sys
if sys.path.count("/home/user") == 0:
    sys.path.append("/home/user")
from src.spark_session import create_spark_session
# `spark` is whatever create_spark_session() returns; later cells use it as the
# entry point for reading data and creating DataFrames.
spark = create_spark_session()

# Supporting imports: MinIO client, timestamps for the partition paths, filesystem helpers
from minio import Minio
from datetime import datetime
import os

In [3]:
# Read the current month's geospatial partition from the silver bucket.
# The timestamp is captured once so that the month and the year in the path
# always come from the same instant; two separate datetime.now() calls can
# disagree when the cell runs across a month/year boundary.
read_ts = datetime.now()
path = f"s3a://silver/geo_spatial_infos/{read_ts.month}_{read_ts.year}"
df = spark.read.format('parquet').load(path)
# Only the distinct (latitude, longitude) pairs are collected to the driver as
# a pandas DataFrame.
df_geospatial = df.select('latitude','longitude').distinct().toPandas()

In [4]:
pip install numpy==1.26.4

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scipy 1.9.3 requires numpy<1.26.0,>=1.18.5, but you have numpy 1.26.4 which is incompatible.
numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.


In [5]:
!pip install geopandas

Collecting geopandas
  Downloading geopandas-1.1.1-py3-none-any.whl (338 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m338.4/338.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pandas>=2.0.0
  Downloading pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting shapely>=2.0.0
  Downloading shapely-2.1.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyogrio>=0.7.2
  Downloading pyogrio-0.11.1-cp310-cp310-manylinux_2_28_x86_64.whl (27.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hColl

In [23]:
import tempfile
import requests
import zipfile
import geopandas as gpd
from shapely.geometry import Point
import json
from pyspark.sql import functions as F

In [20]:

# Configure the MinIO client.
# Credentials are read from the environment when available; the fallbacks are
# the values this notebook previously hardcoded, so existing setups keep working.
# NOTE(review): presumably local-development credentials -- confirm, and drop
# the fallbacks once MINIO_ACCESS_KEY / MINIO_SECRET_KEY are set everywhere.
client = Minio(
    "minio:9000",
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minio"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minio123"),
    secure=False
)

# Location of the archive inside MinIO
bucket = "raw"
minio_path = "geospatial/tl_2025_us_county.zip"

# Work inside a temporary directory; it is removed when the `with` block exits,
# so the shapefile has to be read before leaving it.
with tempfile.TemporaryDirectory() as tmp_dir:
    zip_path = os.path.join(tmp_dir, "tl_2025_us_county.zip")

    # Download the ZIP from MinIO
    client.fget_object(
        bucket,
        minio_path,
        zip_path
    )
    print(f"✅ Arquivo baixado do MinIO para: {zip_path}")

    # Extract the archive contents next to the ZIP
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(tmp_dir)
        print(f"📂 Arquivos extraídos em: {tmp_dir}")

    # Locate the extracted .shp. The match ignores the extension's case and the
    # candidates are sorted, so the file picked does not depend on the
    # arbitrary order returned by os.listdir.
    shp_file = sorted(f for f in os.listdir(tmp_dir) if f.lower().endswith(".shp"))
    if not shp_file:
        raise FileNotFoundError("Nenhum arquivo .shp encontrado no ZIP extraído.")

    shp_path = os.path.join(tmp_dir, shp_file[0])
    print(f"Shapefile localizado: {shp_path}")

    # Read the shapefile with geopandas
    gdf = gpd.read_file(shp_path)
    print("✅ GeoDataFrame carregado com sucesso!")



✅ Arquivo baixado do MinIO para: /tmp/tmpl4524e23/tl_2025_us_county.zip
📂 Arquivos extraídos em: /tmp/tmpl4524e23
Shapefile localizado: /tmp/tmpl4524e23/tl_2025_us_county.shp
✅ GeoDataFrame carregado com sucesso!


In [29]:
# Build point geometries from the coordinate columns (x = longitude,
# y = latitude). points_from_xy is geopandas' vectorized constructor and
# replaces the per-row Point(...) comprehension.
geometry = gpd.points_from_xy(df_geospatial['longitude'], df_geospatial['latitude'])

# Create the GeoDataFrame tagged as WGS84 (EPSG:4326).
# NOTE(review): assumes the stored coordinates are WGS84 lon/lat -- confirm.
gdf_points = gpd.GeoDataFrame(df_geospatial.copy(), geometry=geometry, crs="EPSG:4326")

# Reproject the points to the CRS of the shapefile GeoDataFrame (gdf)
gdf_points = gdf_points.to_crs(gdf.crs)

# Spatial join: attach the attributes of the polygon each point falls within.
# how="left" keeps every point; points outside all polygons get missing values.
gdf_joined = gpd.sjoin(gdf_points, gdf, how="left", predicate="within")

# Drop the geometry column to go back to a plain tabular frame
df_enriched = gdf_joined.drop(columns='geometry')

# Keep only the columns wanted in the final result
df_final = df_enriched[['latitude', 'longitude', 'COUNTYFP', 'NAME', 'NAMELSAD']]


# Serialize to JSON records (a list of dicts). double_precision is raised from
# pandas' default of 10 decimal places to the maximum of 15 so that latitude /
# longitude are not silently rounded in the enriched table, which would make
# them differ from the coordinates in the source dataset.
json_records = df_final.to_json(orient='records', double_precision=15)

# Parse the JSON string back into a list of Python dictionaries
list_of_dicts = json.loads(json_records)

# Create the Spark DataFrame from the records and rename the county columns
df_spark_enriched = spark.createDataFrame(list_of_dicts).select(
    'latitude', 'longitude',
    F.col('COUNTYFP').alias('county_code'),
    F.col('NAME').alias('county_name'),
    F.col('NAMELSAD').alias('county_full_name')
)



In [30]:
# Where the enriched dataset is stored
bucket = 'silver'
project_name = 'geo_spatial_full'

# Capture the timestamp once so the month and the year in the path come from
# the same instant (two datetime.now() calls can straddle a month/year change).
save_ts = datetime.now()
save_path = f"s3a://{bucket}/{project_name}/{save_ts.month}_{save_ts.year}"

# Write as Delta; mode("overwrite") replaces whatever already exists at save_path
(
    df_spark_enriched.write
    .mode("overwrite")
    .format('delta')
    .save(save_path)
)

In [31]:
!pip install --upgrade numpy

Collecting numpy
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scipy 1.9.3 requires numpy<1.26.0,>=1.18.5, but you have numpy 2.2.6 which is incompatible.
numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 2.2.6 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.2.6
