In [None]:
from pyspark.sql import SparkSession
import geopandas as gpd
import os
from datetime import datetime, timedelta

# Build (or reuse) a Spark session named "SedonaApp". The builder pulls in the
# Sedona Python adapter package and switches serialization to Kryo with
# Sedona's Kryo registrator.
spark = (
    SparkSession.builder
    .appName("SedonaApp")
    .config("spark.jars.packages", "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator")
    .getOrCreate()
)

# Set the "sedona.global.charset" system property to utf8.
spark.sparkContext.setSystemProperty("sedona.global.charset", "utf8")

In [None]:
from sedona.register import SedonaRegistrator

# Register Sedona with the active Spark session.
# NOTE(review): presumably this is what makes the st_* functions callable from
# spark.sql(...) -- confirm against the docs of the installed Sedona version.
SedonaRegistrator.registerAll(spark)

In [None]:
from sedona.utils.adapter import Adapter
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.core.SpatialRDD import SpatialRDD

# Load the shapefile into a SpatialRDD.
# NOTE(review): "path_to_shapefile" looks like a placeholder -- point it at the
# real shapefile location before running.
spatial_rdd = ShapefileReader.readToGeometryRDD(
    spark.sparkContext,
    "path_to_shapefile",
)

# Convert the SpatialRDD to a DataFrame and expose it to SQL as "spatial_table".
spatial_df = Adapter.toDf(spatial_rdd, spark)
spatial_df.createOrReplaceTempView("spatial_table")

# Select the rows whose geometry contains the point (1.5, 1.5).
result = spark.sql(
    "SELECT * FROM spatial_table WHERE st_contains(geometry, st_point(1.5, 1.5))"
)

# Display the matching rows.
result.show()

In [None]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
import os

# Get the active Spark session, or create one named "ConcatCSVFiles".
spark = SparkSession.builder.appName("ConcatCSVFiles").getOrCreate()

# Folder where the monthly CSV files are located
folder_path = "data"

# Inclusive range of months (YYYYMM) to process
start_month = datetime.strptime("202204", "%Y%m")
end_month = datetime.strptime("202303", "%Y%m")

# Accumulator for the concatenated result. It stays None until the first month
# whose two files are both present has been loaded.
consumption_malaga_df = None

# Walk the range one month at a time. current_month is always the 1st of a month.
current_month = start_month
while current_month <= end_month:
    # Both files of a month share the same YYYYMM prefix.
    file_suffix = current_month.strftime("%Y%m")
    file_consumos = os.path.join(folder_path, f"{file_suffix}_SIPS2_CONSUMOS_ELECTRICIDAD_peninsular.csv")
    file_ps = os.path.join(folder_path, f"{file_suffix}_SIPS2_PS_ELECTRICIDAD_peninsular.csv")

    # NOTE(review): os.path.exists checks the local filesystem; this assumes
    # Spark resolves `folder_path` against the same filesystem -- confirm.
    if os.path.exists(file_consumos) and os.path.exists(file_ps):
        # Read the CSV files into Spark DataFrames (header row, inferred types)
        df_c = spark.read \
            .option("header", "true") \
            .option("inferSchema", "true") \
            .csv(file_consumos)

        df_ps = spark.read \
            .option("header", "true") \
            .option("inferSchema", "true") \
            .csv(file_ps)

        # Drop from df_ps every column that df_c also has, except the join key
        # 'cups', so the joined frame has no duplicated column names.
        shared_columns = [
            name for name in df_c.columns
            if name in df_ps.columns and name != 'cups'
        ]
        if shared_columns:
            df_ps = df_ps.drop(*shared_columns)

        # Join the reduced df_ps with df_c on the 'cups' column
        df = df_ps.join(df_c, on='cups', how='inner')

        # Keep only the rows where 'codigoProvinciaPS' is equal to 29
        df_malaga = df.filter(df.codigoProvinciaPS == 29)

        # Append this month's rows to the accumulator. unionByName matches
        # columns by name, so a month whose CSV columns are ordered differently
        # is not silently misaligned (plain union matches by position).
        if consumption_malaga_df is None:
            consumption_malaga_df = df_malaga
        else:
            consumption_malaga_df = consumption_malaga_df.unionByName(df_malaga)

    else:
        print(f"The files for {file_suffix} were not found in the folder '{folder_path}'.")

    # Move to the next month: from day 1, adding 31 days always lands inside
    # the following month; replace(day=1) then snaps back to its first day.
    current_month = (current_month + timedelta(days=31)).replace(day=1)

# consumption_malaga_df now holds the concatenated data, or is still None if no
# month had both files. Compare against None explicitly instead of relying on
# the truthiness of a DataFrame.
if consumption_malaga_df is not None:
    consumption_malaga_df.show()
else:
    print("No data to process.")

In [None]:
# Stop the Spark session once all processing in the notebook is finished.
spark.stop()