In [0]:
pip install azure-storage-file

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
pip install azure-storage-blob


Python interpreter will be restarted.
Python interpreter will be restarted.


# Loading Data

### Mounting Blob Storage

In [0]:
storage_account_name = "bartercapstone"
storage_account_key = "2N1aMMEBpbc35P9M6I3aLhYFwN3qOYqrg5+ofg0GVZ+uCnKR8FCgTfQVsE+8hAw3X07r2YM/L00w+AStBNTQjw=="

# Define the container and mount point for electricity consumption data
container_name = "electricityconsumption"
mount_point = "/mnt/electricityconsumption"

# Get all mount points
mount_points = [mnt.mountPoint for mnt in dbutils.fs.mounts()]

# Mount Azure Blob Storage container to Databricks FS (if not already mounted)
if mount_point not in mount_points:
    try:
        dbutils.fs.mount(
            source=f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net",
            mount_point=mount_point,
            extra_configs={f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_account_key}
        )
    except Exception as e:
        print(f"Unable to mount. {e}")


### Loading the Catastro Data

In [0]:
# Initialize the two dataframes
consumption_df = None
cadastral_df = None

# List all files in the cadastral directory
cadastral_directory = mount_point + "/cadastral"
cadastral_files = dbutils.fs.ls(cadastral_directory)

for file in cadastral_files:
    print(f"Processing file in cadastral directory: {file.name}")

    # Load data with Spark
    df = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').option('delimiter', ';').load(file.path)

    # Concatenate the dataframes
    if cadastral_df is None:
        cadastral_df = df
    else:
        cadastral_df = cadastral_df.union(df)

# display(cadastral_df.limit(100))

Processing file in cadastral directory: cadastral_malaga.csv


## Loading the Consumption Data

In [0]:
# List all files in the electricity consumption directory
consumption_directory = mount_point + "/electricityconsumption"
consumption_files = dbutils.fs.ls(consumption_directory)

# for file in electricity_consumption_files:
print(f"Processing file in consumption directory: {consumption_files[0].name}")

# Load data with Spark
df = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').load(consumption_files[0].path)

# Concatenate the dataframes
if consumption_df is None:
    consumption_df = df
else:
    consumption_df = consumption_df.union(df)

# display(consumption_df.limit(100))


Processing file in consumption directory: 202203_ELECTRICIDAD_MALAGA.csv


# Cleaning Data

### Cleaning Catastro Data

In [0]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import count, avg, sum, mode, first, col

columns_to_drop = ["Codigo provincia (PROV)", "Codigo municipio (MUN)", "consulta", "Superficie suelo (SUPF)", "Superficie construida metros cuadrados (SUP)", "Escalera (ESC)", "Planta (PLA)", "Puerta (PUE)", "Superficie catastral (SEC)", "UEC analisis"]

cadastral_df = cadastral_df.drop(*columns_to_drop)
# display(cadastral_df.limit(100))

In [0]:
from pyspark.sql.functions import col, count, max, first, mode

# this will essentially drop duplicates as above but will take the record with the max of CPA and the rest of the values from that record. This assumes that the size is the same for all as it is one catastro.
cadastral_df = cadastral_df.groupBy("Referencia Catastral (RC)").agg(
    max(col("Coeficiente de participacion (CPA)")), count(col("Uso (USO)")), 
    first(col("Localizacion de la finca (LFI)")).alias("Localizacion de la finca (LFI)"),
    first(col("Superficie construida (SUCF)")).alias("Superficie construida (SUCF)"),
    mode(col("Tipo de Bien (TIP)")).alias("Tipo de Bien (TIP)"),
    mode(col("Uso (USO)")).alias("Uso (USO)")
)

# modify the addresses, so that they can be better understand by API
cadastral_df = cadastral_df.withColumn('Localizacion de la finca (LFI)', regexp_replace(cadastral_df['Localizacion de la finca (LFI)'], 'CL ', 'Calle '))
cadastral_df = cadastral_df.withColumn('Localizacion de la finca (LFI)', regexp_replace(cadastral_df['Localizacion de la finca (LFI)'], 'AV ', 'Avenida '))
cadastral_df = cadastral_df.withColumn('Localizacion de la finca (LFI)', regexp_replace(cadastral_df['Localizacion de la finca (LFI)'], 'PZ ', 'Plaza '))
cadastral_df = cadastral_df.withColumn('Localizacion de la finca (LFI)', regexp_replace(cadastral_df['Localizacion de la finca (LFI)'], 'PJ ', 'Pasaje '))
cadastral_df = cadastral_df.withColumn('Localizacion de la finca (LFI)', regexp_replace(cadastral_df['Localizacion de la finca (LFI)'], 'CM ', 'Camino '))
cadastral_df = cadastral_df.withColumn('Localizacion de la finca (LFI)', regexp_replace(cadastral_df['Localizacion de la finca (LFI)'], 'Calle ALCALDE GUILLERMO REINA ', 'Calle del ALCALDE GUILLERMO REIN '))
cadastral_df = cadastral_df.withColumn('Localizacion de la finca (LFI)', regexp_replace(cadastral_df['Localizacion de la finca (LFI)'], 'Calle ANIBAL ', 'Calle ANIBA '))

# display(cadastral_df.limit(100))

### Cleaning Consumption Data

In [0]:
from pyspark.sql.functions import col,sum

# "fechaInicioMesConsumo", "fechaFinMesConsumo",
columns_to_keep = ["cups", "nombreEmpresaDistribuidora", "codigoPostalPS",  "consumoEnergiaActivaEnWhP1", "consumoEnergiaActivaEnWhP2", "consumoEnergiaActivaEnWhP3", "consumoEnergiaActivaEnWhP4", "consumoEnergiaActivaEnWhP5", "consumoEnergiaActivaEnWhP6", "codigoTipoSuministro", "CNAE", "informacionImpagos", "codigoTarifaATR", "aplicacionBonoSocial", "tipoPerfilConsumo"]
consumption_df = consumption_df.select(columns_to_keep)

# summing the energy consumption
energy_columns = ["consumoEnergiaActivaEnWhP1", "consumoEnergiaActivaEnWhP2", "consumoEnergiaActivaEnWhP3", "consumoEnergiaActivaEnWhP4", "consumoEnergiaActivaEnWhP5", "consumoEnergiaActivaEnWhP6"]

consumption_df = consumption_df.fillna(0, subset=energy_columns)
consumption_df = consumption_df.withColumn('totalConsumoEnergiaActivaEnWh', col('consumoEnergiaActivaEnWhP1') + col('consumoEnergiaActivaEnWhP2') + col('consumoEnergiaActivaEnWhP3') + col('consumoEnergiaActivaEnWhP4') + col('consumoEnergiaActivaEnWhP5') + col('consumoEnergiaActivaEnWhP6'))

# average the consumption per CUPS
consumption_df = consumption_df.groupBy("cups").agg(first("codigoPostalPS").alias("codigoPostalPS"), first("nombreEmpresaDistribuidora").alias("nombreEmpresaDistribuidora"), avg("totalConsumoEnergiaActivaEnWh").alias("avg_month_consumption"), first("codigoTipoSuministro"), first("CNAE"), first("informacionImpagos"), first("codigoTarifaATR"), first("aplicacionBonoSocial"), first("tipoPerfilConsumo"))


In [0]:
display(consumption_df.limit(10))

cups,codigoPostalPS,nombreEmpresaDistribuidora,avg_month_consumption,first(codigoTipoSuministro),first(CNAE),first(informacionImpagos),first(codigoTarifaATR),first(aplicacionBonoSocial),first(tipoPerfilConsumo)
ES0031101457515001AK0F,29004,"ENDESA DISTRIBUCION ELECTRICA, S.L.",117346414.63414636,UV,4639,0,20,0,
ES0031101457519001QL0F,29010,"ENDESA DISTRIBUCION ELECTRICA, S.L.",159560575.0,UV,8411,0,20,0,
ES0031101457527001LS0F,29004,"ENDESA DISTRIBUCION ELECTRICA, S.L.",124006918.91891892,UV,5510,0,20,0,
ES0031101457529001ZZ0F,29007,"ENDESA DISTRIBUCION ELECTRICA, S.L.",211634100.0,UV,8610,0,20,0,
ES0031101457531001DJ0F,29009,"ENDESA DISTRIBUCION ELECTRICA, S.L.",35558425.0,UV,5813,0,20,0,
ES0031101457535001EB0F,29016,"ENDESA DISTRIBUCION ELECTRICA, S.L.",41616100.0,UV,8411,0,20,0,
ES0031101457549001XG0F,29014,"ENDESA DISTRIBUCION ELECTRICA, S.L.",8755125.0,UV,3600,0,20,0,
ES0031101457552001WZ0F,29004,"ENDESA DISTRIBUCION ELECTRICA, S.L.",15631023.80952381,UV,3600,0,20,0,
ES0031101457554001CJ0F,29006,"ENDESA DISTRIBUCION ELECTRICA, S.L.",27418894.736842107,UV,1414,0,20,0,
ES0031101457555001HR0F,29006,"ENDESA DISTRIBUCION ELECTRICA, S.L.",43114918.91891892,UV,4711,0,20,0,


# Feature Engineering

### Adding postcode to each catastro

In [0]:
from pyspark.sql.functions import udf
import requests

# Define a UDF to retrieve postal codes using the Google Maps Geocoding API
api_key = ""
def get_postal_code(address):
    url = f"https://maps.googleapis.com/maps/api/geocode/json?address={address}&key={api_key}"
    response = requests.get(url)
    data = response.json()
    if data["results"]:
        result = data["results"][0]
        for component in result["address_components"]:
            if "postal_code" in component["types"]:
                return component["long_name"]
    return None

get_postal_code_udf = udf(get_postal_code)
cadastral_df = cadastral_df.withColumn("codigoPostalPS", get_postal_code_udf(cadastral_df["Localizacion de la finca (LFI)"]))


In [0]:
clean_cadastral_df = cadastral_df.filter(col("codigoPostalPS").isNotNull())


#We drop the postcodes because if we want to fix this it will be done manually. 
#the reason some addresses return null is because there are some typos for example, they are missing an accent so the google maps api doesn't recognize them. 

In [0]:
display(clean_cadastral_df.limit(10))

Referencia Catastral (RC),max(Coeficiente de participacion (CPA)),count(Uso (USO)),Localizacion de la finca (LFI),Superficie construida (SUCF),Tipo de Bien (TIP),Uso (USO),codigoPostalPS
000400100UF66B,100,5,Camino PRADOS LOS 47 MALAGA (M�LAGA),8.887,RU,Agrario,29006
000700500UF66B,100,1,CR AZUCARERA INTELHORCE 13 MALAGA (M�LAGA),898.0,RU,Industrial agr.,29004
0017109UF7601N,100,5,Avenida VELAZQUEZ 107 MALAGA (M�LAGA),1.648,UR,Comercial,29004
0021501UF7602S,100,3,Avenida VIRGEN DE BELEN 12 MALAGA (M�LAGA),1.503,UR,Religioso,29004
0021502UF7602S,100,4,Avenida VIRGEN DE BELEN 10 MALAGA (M�LAGA),1.223,UR,Cultural,29004
0021801UF7602S,100,8,Avenida VIRGEN DE BELEN 11 MALAGA (M�LAGA),5.838,UR,Cultural,29004
0021803UF7602S,344,52,Calle CHICO DEL MATADERO 10 MALAGA (M�LAGA),2.465,UR,Residencial,29004
0021805UF7602S,100,1,Calle CHICO DEL MATADERO 12 MALAGA (M�LAGA),38.0,UR,"Almac�n,Estac.",29004
0022101UF7602S,5021,134,Avenida BONAIRE 12 MALAGA (M�LAGA),5.538,UR,Residencial,29004
0022201UF7602S,161,63,Avenida BONAIRE 7 MALAGA (M�LAGA),4.857,UR,Residencial,29004


# Creating KPIs

### Filtering postcodes

In [0]:
target_postcodes = clean_cadastral_df.select("codigoPostalPS").distinct()
filtered_consumption_df = consumption_df.join(target_postcodes, consumption_df["codigoPostalPS"] == target_postcodes["codigoPostalPS"], "inner").drop(target_postcodes.codigoPostalPS)


### Creating KPIs

In [0]:
from pyspark.sql.functions import countDistinct

# Calculate the amount of catastros per postcode
catastro_count_per_postcode = clean_cadastral_df.groupBy("codigoPostalPS").agg(countDistinct("Referencia Catastral (RC)").alias("catastro_count"))

# Calculate the amount of catastros per postcode
cups_count_per_postcode = filtered_consumption_df.groupBy("codigoPostalPS").agg(countDistinct("cups").alias("cups_count"))

#here we sum all the values of count(*) which represents the number of households/offices per CUPS
households_count_per_postcode = clean_cadastral_df.groupBy("codigoPostalPS").agg(sum("count(Uso (USO))").alias("number_of_households"))

# Calculate the average consumption per postcode
avg_consumption_per_postcode = filtered_consumption_df.groupBy("codigoPostalPS").agg(avg("avg_month_consumption").alias("avg_consumption"))

# Calculate the average size per postcode
avg_size_per_postcode = clean_cadastral_df.groupBy("codigoPostalPS").agg(avg("Superficie construida (SUCF)").alias("avg_superficie"))

# average particpation
avg_participation_per_postcode = clean_cadastral_df.groupBy("codigoPostalPS").agg(avg("max(Coeficiente de participacion (CPA))").alias("avg_participation"))

# average first(informacionImpagos)
avg_impagos_per_postcode = filtered_consumption_df.groupBy("codigoPostalPS").agg(avg("first(informacionImpagos)").alias("avg_info_impagos"))

# average first(aplicacionBonoSocial)
avg_bono_social_per_postcode = filtered_consumption_df.groupBy("codigoPostalPS").agg(avg("first(aplicacionBonoSocial)").alias("avg_bono_social"))

# Modify column names for the clean_cadastral_df DataFrame
catastro_type_per_postcode = clean_cadastral_df.groupBy("codigoPostalPS") \
    .pivot("Uso (USO)") \
    .agg(count("*").alias("count")) \
    .fillna(0)

# Modify column names for the filtered_consumption_df DataFrame
tariffs_per_postcode = filtered_consumption_df.groupBy("codigoPostalPS") \
    .pivot("first(codigoTarifaATR)") \
    .agg(count("*").alias("count")) \
    .fillna(0)

tipoPerfilConsumo_per_postcode = filtered_consumption_df.groupBy("codigoPostalPS") \
    .pivot("first(tipoPerfilConsumo)") \
    .agg(count("*").alias("count")) \
    .fillna(0)

avg_consumption_per_CNAE_and_Postcode = filtered_consumption_df.groupBy('codigoPostalPS','first(CNAE)').agg(avg('avg_month_consumption').alias('avg_consumption_by_CNAE_and_Postcode'))

### Further KPIs. Could include in future.###

# # rate of payment defaults for different types of economic activities per postcode.
# avg_impagos_per_CNAE_and_Postcode = filtered_consumption_df.groupBy("codigoPostalPS","first(CNAE)").agg(avg("first(informacionImpagos)").alias("avg_info_impagos_CNAE"))

# #Analyze how energy consumption differs across different types of economic activities and per postcode
# avg_consumption_per_CNAE_and_Postcode = filtered_consumption_df.groupBy('codigoPostalPS','first(CNAE)').agg(avg('avg_month_consumption').alias('avg_consumption_by_CNAE_and_Postcode'))

# #see how often the bono social is being applied per CNAE and postcode
# avg_bono_social_per_CNAE_and_Postcode= filtered_consumption_df.groupBy('codigoPostalPS','first(CNAE)').agg(avg('first(aplicacionBonoSocial)').alias('bono_application_rate_per_CNAE_and_Postcode'))

# #See which tarrifs occurs the most frequent per CNAE and zipcode
# most_recurring_ATRtarifa_per_RCNAE_and_Postcode= filtered_consumption_df.groupBy('codigoPostalPS','first(CNAE)').agg(mode("first(codigoTarifaATR)").alias('Recurring_codigoTarifaATR_per_CNAE_and_Postcode'))

# #See which Type of Consumption Profile occurs the most frequent per CNAE and zipcode
# most_recurring_TipoPerfilConsumo_per_RCNAE_and_Postcode= filtered_consumption_df.groupBy('codigoPostalPS','first(CNAE)').agg(mode("first(tipoPerfilConsumo)").alias('Recurring_tipoPerfilConsumo_per_CNAE_and_Postcode'))

# #See the occurernce of a payment default if a bonosocial was provided per zipcode
#count_informacionImpagos_given_aplicacionBonoSocial_and_Postcode= filtered_consumption_df.groupBy('codigoPostalPS','first(aplicacionBonoSocial)').agg(count("first(informacionImpagos)").alias('count_defaultpayment_given_aplicacionBonoSocial_and_Postcode'))

# #See the number of a payment default per CNAE and zipcode
#count_informacionImpagos_per_CNAE_and_Postcode= filtered_consumption_df.groupBy('codigoPostalPS','first(CNAE)').agg(count("first(informacionImpagos)").alias('count_defaultpayment_per_CNAE_and_Postcode'))


# Add a preface to the column names
prefixed_columns = ["codigoPostalPS"] + [f"catastro_{col_name}" for col_name in catastro_type_per_postcode.columns[1:]]
catastro_type_per_postcode = catastro_type_per_postcode.toDF(*prefixed_columns)

prefixed_columns = ["codigoPostalPS"] + [f"tariffs_{col_name}" for col_name in tariffs_per_postcode.columns[1:]]
tariffs_per_postcode = tariffs_per_postcode.toDF(*prefixed_columns)

prefixed_columns = ["codigoPostalPS"] + [f"tipoPerfilConsumo_{col_name}" for col_name in tipoPerfilConsumo_per_postcode.columns[1:]]
tipoPerfilConsumo_per_postcode = tipoPerfilConsumo_per_postcode.toDF(*prefixed_columns)



In [0]:
# Apply the function to the 'CNAE' column in your DataFrame
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

group_dict = {
    'A.- AGRICULTURA, GANADERÍA, SILVICULTURA Y PESCA': range(111, 323),
    'B.- INDUSTRIAS EXTRACTIVAS': range(510, 991),
    'C.- INDUSTRIA MANUFACTURERA': range(1011, 3321),
    'D.- SUMINISTRO DE ENERGÍA ELÉCTRICA, GAS, VAPOR Y AIRE ACONDICIONADO': range(3512, 3531),
    'E.- SUMINISTRO DE AGUA, ACTIVIDADES DE SANEAMIENTO, GESTIÓN DE RESIDUOS Y DESCONTAMINACIÓN': range(3600, 3901),
    'F.- CONSTRUCCIÓN': range(4110, 4400),
    'G.- COMERCIO AL POR MAYOR Y AL POR MENOR; REPARACIÓN DE VEHÍCULOS DE MOTOR Y MOTOCICLETAS': range(4511, 4800),
    'H.- TRANSPORTE Y ALMACENAMIENTO': range (4910,5321),
    'I.- HOSTELERÍA': range(5510,5631),
    'J.- INFORMACIÓN Y COMUNICACIONES': range(5811,6400),
    'K.- ACTIVIDADES FINANCIERAS Y DE SEGUROS':range(6411,6631),
    'L.- ACTIVIDADES INMOBILIARIAS': range(6810,6833),
    'M.- ACTIVIDADES PROFESIONALES, CIENTÍFICAS Y TÉCNICAS': range(6910,7501),
    'N.- ACTIVIDADES ADMINISTRATIVAS Y SERVICIOS AUXLIARES': range(7711,8300),
    'O.- ADMINISTRACIÓN PÚBLICA Y DEFENSA; SEGURIDAD SOCIAL OBLIGATORIA': range(8411,8431),
    'P.- EDUCACIÓN': range(8510,8561),
    'Q.- ACTIVIDADES SANITARIAS Y DE SERVICIOS SOCIALES': range(8610,8900),
    'R.- ACTIVIDADES ARTÍSTICAS, RECREATIVAS Y DE ENTRENIMIENTO': range (9001,9330),
    'S.- OTROS SERVICIOS': range(9411,9610),
    'T.- ACTIVIDADES DE LOS HOGARES COMO EMPLEADORES DE PERSONAL DOMÉSTICO': range(9700,9821),
    'U.- ACTIVIDADES DE ORGANIZACIONES Y ORGANISMOS EXTRATERRITORIALES': range(9900,10000),

}

# Function to map CNAE codes to their groups
def map_cnae_to_group(cnae_code):
    for group, code_range in group_dict.items():
        if cnae_code in code_range:
            return group
    return 'Unknown Group'

# Define the UDF (User-Defined Function) to map CNAE values to groups
map_cnae_to_group_udf = udf(map_cnae_to_group, StringType())

# Add the 'Group_CNAE' column using the UDF
filtered_consumption_df = filtered_consumption_df.withColumn('Group_CNAE', map_cnae_to_group_udf(filtered_consumption_df['first(CNAE)']))

cnae_per_postcode = filtered_consumption_df.groupBy("codigoPostalPS") \
    .pivot("Group_CNAE") \
    .agg(count("*").alias("count")) \
    .fillna(0)

prefixed_columns = ["codigoPostalPS"] + [f"group_cnae_{col_name}" for col_name in cnae_per_postcode.columns[1:]]
cnae_per_postcode = cnae_per_postcode.toDF(*prefixed_columns)

In [0]:
# Join the statistics together
combined_stats = (
    catastro_count_per_postcode
    .join(cups_count_per_postcode, "codigoPostalPS")
    .join(households_count_per_postcode, "codigoPostalPS")
    .join(avg_consumption_per_postcode, "codigoPostalPS")
    .join(avg_size_per_postcode, "codigoPostalPS")
    .join(avg_participation_per_postcode, "codigoPostalPS")
    .join(avg_impagos_per_postcode, "codigoPostalPS")
    .join(avg_bono_social_per_postcode, "codigoPostalPS")
    .join(catastro_type_per_postcode, "codigoPostalPS")
    .join(tariffs_per_postcode, "codigoPostalPS")
    .join(tipoPerfilConsumo_per_postcode, "codigoPostalPS")    
    .join(cnae_per_postcode, "codigoPostalPS")
)
display(combined_stats.limit(10))

codigoPostalPS,catastro_count,cups_count,number_of_households,avg_consumption,avg_superficie,avg_participation,avg_info_impagos,avg_bono_social,catastro_Agrario,"catastro_Almac�n,Estac.",catastro_Comercial,catastro_Cultural,catastro_Deportivo,catastro_Edif. Singular,catastro_Espect�culos,catastro_Industrial,catastro_Industrial agr.,"catastro_Ocio,Hosteler�a",catastro_Oficinas,catastro_Religioso,catastro_Residencial,"catastro_Sanidad,Benefic",catastro_Suelo sin edif.,tariffs_1,tariffs_3,tariffs_4,tariffs_5,tariffs_6,tariffs_7,tariffs_8,tariffs_11,tariffs_12,tariffs_18,tariffs_19,tariffs_20,tipoPerfilConsumo_null,tipoPerfilConsumo_P2.0TD,tipoPerfilConsumo_P3.0TD,"group_cnae_A.- AGRICULTURA, GANADERÍA, SILVICULTURA Y PESCA",group_cnae_B.- INDUSTRIAS EXTRACTIVAS,group_cnae_C.- INDUSTRIA MANUFACTURERA,"group_cnae_D.- SUMINISTRO DE ENERGÍA ELÉCTRICA, GAS, VAPOR Y AIRE ACONDICIONADO","group_cnae_E.- SUMINISTRO DE AGUA, ACTIVIDADES DE SANEAMIENTO, GESTIÓN DE RESIDUOS Y DESCONTAMINACIÓN",group_cnae_F.- CONSTRUCCIÓN,group_cnae_G.- COMERCIO AL POR MAYOR Y AL POR MENOR; REPARACIÓN DE VEHÍCULOS DE MOTOR Y MOTOCICLETAS,group_cnae_H.- TRANSPORTE Y ALMACENAMIENTO,group_cnae_I.- HOSTELERÍA,group_cnae_J.- INFORMACIÓN Y COMUNICACIONES,group_cnae_K.- ACTIVIDADES FINANCIERAS Y DE SEGUROS,group_cnae_L.- ACTIVIDADES INMOBILIARIAS,"group_cnae_M.- ACTIVIDADES PROFESIONALES, CIENTÍFICAS Y TÉCNICAS",group_cnae_N.- ACTIVIDADES ADMINISTRATIVAS Y SERVICIOS AUXLIARES,group_cnae_O.- ADMINISTRACIÓN PÚBLICA Y DEFENSA; SEGURIDAD SOCIAL OBLIGATORIA,group_cnae_P.- EDUCACIÓN,group_cnae_Q.- ACTIVIDADES SANITARIAS Y DE SERVICIOS SOCIALES,"group_cnae_R.- ACTIVIDADES ARTÍSTICAS, RECREATIVAS Y DE ENTRENIMIENTO",group_cnae_S.- OTROS SERVICIOS,group_cnae_T.- ACTIVIDADES DE LOS HOGARES COMO EMPLEADORES DE PERSONAL DOMÉSTICO,group_cnae_U.- ACTIVIDADES DE ORGANIZACIONES Y ORGANISMOS EXTRATERRITORIALES,group_cnae_Unknown Group
29006,779,29480,6259,552319.1765375816,282.14158292512235,98.49885057471263,0.0196065128900949,0.0601085481682496,1,9,26,1,8,1,5,269,0,8,15,4,428,2,0,3204,216,1941,95,60,3,0,7,3,23020,872,59,429,28093,958,8,1,305,7,45,110,1203,1747,304,189,67,46,100,170,185,63,48,65,234,24536,1,46
29190,3,9267,417,385881.1437178201,6.8693333333333335,,0.0140282723643034,0.051796697960505,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,697,21,489,6,9,1,0,2,0,7897,136,9,83,9045,139,23,0,43,1,47,89,155,219,64,16,10,19,10,21,74,20,14,18,53,8356,0,15
29004,258,32413,21665,646302.8073648231,197.39161487964984,92.11811023622047,0.0139141702403356,0.0447659889550489,0,9,6,6,2,1,1,37,1,5,3,1,184,3,0,2899,220,1650,75,62,0,0,9,6,26264,1145,83,534,30736,1143,14,3,206,12,44,112,1322,1319,422,197,62,67,159,202,196,59,48,75,215,27643,1,35
29003,127,17403,13761,260690.8636416219,66.43285714285715,90.6734693877551,0.0222375452508188,0.0806182842038728,0,2,6,8,0,0,0,10,0,1,0,0,97,0,0,2088,43,1006,15,12,0,0,0,1,14076,155,7,164,17065,174,4,0,41,0,18,21,486,398,165,44,15,14,26,29,61,31,22,36,84,15888,0,20
29010,5,40488,181,421776.1209052699,73.5,100.0,0.0128186129223473,0.0513485477178423,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,2853,129,2089,50,34,4,0,5,0,34597,692,35,319,39439,730,8,1,138,1,58,93,847,632,375,155,56,41,89,141,200,95,68,49,209,37160,0,72


In [0]:
# # Specify the output path and file name for the JSON file
output_path = "/mnt/electricityconsumption/kpis/postcode_kpi_data.json"

# Write the DataFrame to JSON format
combined_stats.write.format("json") \
    .mode("overwrite") \
    .save(output_path)

In [0]:
# Unmount the directory
#dbutils.fs.unmount("/mnt/electricityconsumption")