In [0]:
# Databricks notebook source
# DBTITLE 1,Variable Definition
# Storage location of the source data and where it is mounted in DBFS.
MOUNTPOINT = "/mnt/source/"
STORAGE_ACCOUNT_NAME = "releasenotessa"
CONTAINER_NAME = "mapcontent-pbf"
# ABFS URI of the container inside the storage account.
SOURCE = "abfss://{}@{}.dfs.core.windows.net/".format(CONTAINER_NAME, STORAGE_ACCOUNT_NAME)

# The account is mounted under MOUNTPOINT using its own name as the folder.
mountPath = f"{MOUNTPOINT}{STORAGE_ACCOUNT_NAME}"
 
# COMMAND ----------
 
# DBTITLE 1,Unmount Storage Account
# Unmount only when this exact mount point is currently registered. The previous substring
# test could match sibling mounts sharing the prefix (issuing repeated unmount calls), and the
# success message was printed even when nothing had been unmounted.
if any(mnt.mountPoint.rstrip("/") == mountPath.rstrip("/") for mnt in dbutils.fs.mounts()):
    dbutils.fs.unmount(mountPath)
    print("Unmounted pbfs container of releasenotessa from " + MOUNTPOINT)
else:
    print(f"Nothing mounted at {mountPath}; skipping unmount.")
 
# COMMAND ----------
 
# DBTITLE 1,Mount Storage Account to Databricks Workspace
# The service-principal secret is read from a Databricks secret scope instead of being embedded
# in the notebook. The value that used to be committed here must be treated as leaked and rotated.
# TODO: point these two names at the secret scope / key provisioned for this service principal.
SECRET_SCOPE = "releasenotessa"
CLIENT_SECRET_KEY = "sp-client-secret"

# OAuth (client-credentials) settings handed to the ABFS driver when mounting.
configs = {"fs.azure.account.auth.type": "OAuth",
          "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
          "fs.azure.account.oauth2.client.id": "5f84bad8-717f-433f-b34c-19844c2e969d",
          "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope=SECRET_SCOPE, key=CLIENT_SECRET_KEY),
          "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/374f8026-7b54-4a3a-b87d-328fa26ec10d/oauth2/token"
          }

try:
    # Mount only when the mount point is NOT registered yet. The earlier check was inverted
    # (it mounted when a match was found) and reported success from the branch that did nothing.
    already_mounted = any(
        mnt.mountPoint.rstrip("/") == mountPath.rstrip("/") for mnt in dbutils.fs.mounts()
    )
    if already_mounted:
        print(f"Directory {mountPath} is already mounted.")
    else:
        dbutils.fs.mount(
            source = SOURCE,
            mount_point = mountPath,
            extra_configs = configs)
        print(f"Directory {mountPath} successfully mounted.")
except Exception as e:
    # Kept as a reported (not re-raised) failure, matching the notebook's original behaviour.
    print(f"Error mounting Directory at {mountPath}. Error : {e}")
 
# COMMAND ----------
 
# DBTITLE 1,List the mounted directory
# List the entries directly under the mounted path.
dbutils.fs.ls(mountPath)


###### 3g | delta
###### planet_osm_point -> nodes
###### planet_osm_polygon -> arealgeoms
###### planet_osm_line -> lineargeoms
###### planet_osm_roads -> lineargeoms
######
######

In [0]:
%fs 
ls /mnt/source/releasenotessa/Orbis_Ventura/WRL/

path,name,size,modificationTime
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/22222.000/,22222.000/,0,1700657357000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23350.000/,23350.000/,0,1693832953000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23370.000/,23370.000/,0,1694772403000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23370.001/,23370.001/,0,1694910444000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23370.002/,23370.002/,0,1695806000000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23370.003/,23370.003/,0,1695686324000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23380.000/,23380.000/,0,1695282061000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23390.000/,23390.000/,0,1695985404000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23400.000/,23400.000/,0,1696514982000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/23410.000/,23410.000/,0,1697109161000


In [0]:
%fs 
ls /mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/

path,name,size,modificationTime
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/arealgeoms/,arealgeoms/,0,1707490802000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/country_boundary_indexed/,country_boundary_indexed/,0,1707490827000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/lineargeoms/,lineargeoms/,0,1707490756000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/nodes/,nodes/,0,1707488311000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/relations/,relations/,0,1707488979000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/relsgeoms/,relsgeoms/,0,1707490578000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/ways/,ways/,0,1707488727000
dbfs:/mnt/source/releasenotessa/Orbis_Ventura/WRL/24060.000/prototype/AUT/output/data/waysgeoms/,waysgeoms/,0,1707490068000



# Part 1 
## Adams Address Ranges SQL


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### 1. Sample Table Temp

##### sample as(SELECT 

#####           osm_id as aa8_osm_id ,
 #####          "name" as index_searched_query,
 #####          ST_SetSRID(way, 4326) as coordinates

##### FROM "ade_wrl_23490_000_eur_aut".planet_osm_polygon
##### where boundary= 'administrative' and admin_level = '8')

In [0]:
# Run parameters: country code and Orbis Ventura release used to build the data paths below.
country = "AUT"
OrbisVenturaRelease = "24060.000"

In [0]:
# Areal geometries (Delta), registered for SQL under the name planet_osm_polygon.
planet_osm_polygon = (
    spark.read
    .option("inferSchema", 'true')
    .format("delta")
    .load(f"/mnt/source/releasenotessa/Orbis_Ventura/WRL/{OrbisVenturaRelease}/prototype/{country}/output/data/arealgeoms")
)

planet_osm_polygon.createOrReplaceTempView("planet_osm_polygon")

In [0]:
# Linear geometries (Delta), registered for SQL under the name planet_osm_line.
planet_osm_line = (
    spark.read
    .option("inferSchema", 'true')
    .format("delta")
    .load(f"/mnt/source/releasenotessa/Orbis_Ventura/WRL/{OrbisVenturaRelease}/prototype/{country}/output/data/lineargeoms")
)

planet_osm_line.createOrReplaceTempView("planet_osm_line")

In [0]:
# Nodes (Delta), registered for SQL under the name planet_osm_point.
planet_osm_point = (
    spark.read
    .option("inferSchema", 'true')
    .format("delta")
    .load(f"/mnt/source/releasenotessa/Orbis_Ventura/WRL/{OrbisVenturaRelease}/prototype/{country}/output/data/nodes")
)

planet_osm_point.createOrReplaceTempView("planet_osm_point")

In [0]:
# Way geometries (Delta), registered for SQL under the name planet_osm_ways.
planet_osm_ways = (
    spark.read
    .option("inferSchema", "true")
    .format("delta")
    .load(f"/mnt/source/releasenotessa/Orbis_Ventura/WRL/{OrbisVenturaRelease}/prototype/{country}/output/data/waysgeoms")
)

planet_osm_ways.createOrReplaceTempView("planet_osm_ways")

In [0]:
# Show the columns of the areal-geometry table that the SQL views below select from.
planet_osm_polygon.printSchema()

In [0]:
%sql
-- Administrative boundaries at admin level 8: id, name tag and geometry.
CREATE OR REPLACE TEMPORARY VIEW sample AS
SELECT
    id           AS aa8_osm_id,
    tags['name'] AS index_searched_query,
    geometry     AS coordinates
FROM planet_osm_polygon
WHERE boundary = 'administrative'
  AND admin_level = '8'



### Table 2 : Tags Table Temp

##### tags as (

                        distinct skeys(tags) keys
                      
##### from "ade_wrl_23490_000_eur_aut".planet_osm_polygon pop
##### where admin_level  in ('4', '8'))

In [0]:
%sql
-- Distinct tag keys found on polygons with admin level 4 or 8.
CREATE OR REPLACE TEMPORARY VIEW tags AS
SELECT DISTINCT
    explode(map_keys(tags)) AS keys
FROM planet_osm_polygon
WHERE admin_level = '4'
   OR admin_level = '8'

In [0]:
%sql
-- Lines that carry an addr:interpolation tag; the tag value is exposed as its own column
-- in front of all original line columns.
CREATE OR REPLACE TEMPORARY VIEW hnr_way AS
SELECT
    tags["addr:interpolation"] AS interpolation,
    planet_osm_line.*
FROM planet_osm_line
WHERE tags["addr:interpolation"] IS NOT NULL

In [0]:
%sql
-- Tag keys that look like name / alternative-name keys, excluding pronunciation keys.
CREATE OR REPLACE TEMPORARY VIEW name_tags AS
SELECT *
FROM tags
WHERE (keys LIKE '%name:%' OR keys LIKE '%alt%name')
  AND keys NOT LIKE '%pronunciation%'

In [0]:
%sql
-- Distinct tag keys found on points that have an addr:housenumber tag
-- (either a non-null value or merely the key present in the map).
CREATE OR REPLACE TEMPORARY VIEW hsn_tags AS
SELECT DISTINCT
    explode(map_keys(tags)) AS keys
FROM planet_osm_point
WHERE tags["addr:housenumber"] IS NOT NULL
   OR array_contains(map_keys(tags), 'addr:housenumber')


In [0]:
%sql
-- Keep only the keys whose name contains addr:housenumber.
CREATE OR REPLACE TEMPORARY VIEW hsn_keys AS
SELECT *
FROM hsn_tags
WHERE keys LIKE '%addr:housenumber%'

In [0]:
# Show the column names and types of the hnr_way view.
spark.sql("DESCRIBE hnr_way").show()

In [0]:
# Show the column names and types of planet_osm_ways (its id and nodes columns are joined on next).
spark.sql("DESCRIBE planet_osm_ways").show()

In [0]:
%sql
-- One row per (interpolation line, node on that way): the way's node array is exploded and
-- joined back to the interpolation lines by id.
CREATE OR REPLACE TEMPORARY VIEW address_ranges AS
SELECT
    hnr_way.id                         AS osm_id,
    hnr_way.geometry                   AS way,
    hnr_way.tags["addr:interpolation"] AS interpolation,
    hnr_way.tags,
    hnr_way.tags['addr:street']        AS road_name_way,
    hnr_way.tags['addr:interpolation'] AS interpolation_tag,
    -- keep only digits and ';' from addr:intermediate, then turn ';' into ','
    regexp_replace(
        regexp_replace(hnr_way.tags['addr:intermediate'], '[^0-9;]', ''),
        ';', ','
    )                                  AS intermediate_one,
    hnr_way.name,
    nodes
FROM hnr_way
JOIN (
    SELECT id, explode(nodes) AS nodes
    FROM planet_osm_ways
) ways
    ON ways.id = hnr_way.id



###### CREATE OR REPLACE TEMPORARY VIEW hsn AS 
###### select
###### pop.tags as tags_hsn
###### ,   array_remove(array_append(pop.tags -> array((select keys from hsn_keys )), pop."addr:housenumber"]), null) as ######range_hsn
######, pop.tags["addr:housenumber"] as range_hsn
###### , address_ranges.*
###### from address_ranges
###### left join planet_osm_point pop
###### on pop.id = address_ranges.nodes
###### where pop.tags is not null and pop.tags['layer_id'] = '15633'

In [0]:
# Pull the three SQL views into DataFrames for the DataFrame-API steps that follow.
address_ranges_API, planet_osm_point_API, hsn_keys_API = (
    spark.sql(statement)
    for statement in (
        "select * from address_ranges",
        "select * from planet_osm_point",
        "select * from hsn_keys",
    )
)

In [0]:
# Keep only the node id and its tag map, with the map exposed under the name tags_hsn.
planet_osm_point_API = planet_osm_point_API.select("id", col("tags").alias("tags_hsn"))


In [0]:
# Expose the license_zone tag as a column. Taking the item directly yields the same values as
# "when the item is not null use it, otherwise null".
planet_osm_point_API = planet_osm_point_API.withColumn(
    "license_zone",
    col("tags_hsn").getItem("license_zone"),
)

In [0]:
# Promote the license_zone and layer_id tag values to top-level columns.
planet_osm_point_API = (
    planet_osm_point_API
    .withColumn("license_zone", col("tags_hsn.license_zone"))
    .withColumn("layer_id", col("tags_hsn.layer_id"))
)

In [0]:
# Attach node tags to every address-range node, then keep only nodes that have tags and
# belong to layer 15633 (the filter on the right-hand side drops unmatched left rows).
addressRnjesJoinPointLayer15633 = (
    address_ranges_API
    .join(
        planet_osm_point_API.alias("pop"),
        on=col("pop.id") == col("address_ranges.nodes"),
        how="left",
    )
    .where(col("pop.tags_hsn").isNotNull() & (col("pop.tags_hsn.layer_id") == "15633"))
)

In [0]:
# Collect the house-number tag keys to the driver and keep those in the addr: namespace.
hsn_keys_Temp_List = hsn_keys_API.select("keys").collect()
hsn_keys = [row[0] for row in hsn_keys_Temp_List if row[0].startswith('addr:')]
print(hsn_keys)


In [0]:
def get_values_from_keys(x: dict, keys: list) -> list:
    """Return the values of ``x`` for ``keys``, in key order, skipping missing or None values."""
    found = []
    for key in keys:
        value = x.get(key)
        if value is not None:
            found.append(value)
    return found

def get_values_from_hsn_keys(x):
    """Look up the collected house-number keys (``hsn_keys``) in one tag map."""
    return get_values_from_keys(x, keys=hsn_keys)


# Spark UDF wrapper returning the found values as array<string>.
udf_get_values_from_hsn_keys = udf(get_values_from_hsn_keys, ArrayType(StringType()))

In [0]:
# range_hsn: house-number tag values found on the node; range_hsn_length: how many were found.
hsn = (
    addressRnjesJoinPointLayer15633
    .withColumn('range_hsn', udf_get_values_from_hsn_keys('tags_hsn'))
    .withColumn('range_hsn_length', size('range_hsn'))
)


In [0]:
# Drop nodes for which no house-number value was found.
hsn = hsn.filter(col("range_hsn_length") > 0)

In [0]:
# Show the schema of hsn, including the added range_hsn / range_hsn_length columns.
hsn.printSchema()

In [0]:
# Register hsn so the following %sql cells can query it by name.
hsn.createOrReplaceTempView("hsn")


In [0]:
# %sql

# CREATE OR REPLACE TEMPORARY VIEW hsn AS 
# select
# pop.tags as tags_hsn
# ,   array_remove(array_append(pop.tags -> array((select keys from hsn_keys )), pop."addr:housenumber"]), null) as range_hsn
# , pop.tags["addr:housenumber"] as range_hsn
# , address_ranges.*
# from address_ranges
# left join planet_osm_point pop
# on pop.id = address_ranges.nodes
# where pop.tags is not null and pop.tags['layer_id'] = '15633'


In [0]:
%sql
-- Long form of hsn: one row per (way, node), plus one node's tag map repeated on every row
-- of the same way as first_tags_hsn.
-- NOTE(review): the window has no ORDER BY, so this query does not pin which node is "first".
CREATE OR REPLACE TEMPORARY VIEW hsn_long AS
SELECT
    hsn.osm_id,
    hsn.tags AS tags_network,
    hsn.road_name_way,
    hsn.interpolation,
    hsn.interpolation_tag,
    hsn.way,
    hsn.name,
    first_value(tags_hsn) OVER (PARTITION BY osm_id) AS first_tags_hsn,
    range_hsn,
    hsn.nodes,
    hsn.license_zone,
    hsn.layer_id,
    hsn.intermediate_one
FROM hsn


In [0]:
# Read the hsn_long view back as a DataFrame so columns can be cast and added with the API.
hsn_long_df = spark.sql("select * from hsn_long")

In [0]:
# Render both tag maps as strings and stamp every row with the country being processed.
hsn_long_df = (
    hsn_long_df
    .withColumn("tags_network", col("tags_network").cast(StringType()))
    .withColumn("first_tags_hsn", col("first_tags_hsn").cast(StringType()))
    .withColumn("country", lit(country))
)

In [0]:
# Replace the hsn_long view with the DataFrame version (string tag columns plus country).
hsn_long_df.createOrReplaceTempView("hsn_long")

In [0]:
%sql
-- One row per address range (way): smallest and largest range_hsn and the distinct list of
-- all range_hsn values seen on its nodes.
CREATE OR REPLACE TEMPORARY VIEW addressrangesfinal AS
SELECT
    hsn_long.osm_id,
    min(range_hsn)                          AS min_hsn,
    max(range_hsn)                          AS max_hsn,
    hsn_long.road_name_way,
    hsn_long.interpolation,
    hsn_long.interpolation_tag,
    hsn_long.name,
    array_distinct(collect_list(range_hsn)) AS intermediates,
    hsn_long.tags_network,
    hsn_long.first_tags_hsn,
    hsn_long.license_zone,
    hsn_long.country,
    hsn_long.layer_id,
    hsn_long.intermediate_one
FROM hsn_long
GROUP BY
    hsn_long.osm_id,
    hsn_long.road_name_way,
    hsn_long.interpolation,
    hsn_long.interpolation_tag,
    hsn_long.name,
    hsn_long.tags_network,
    hsn_long.first_tags_hsn,
    hsn_long.license_zone,
    hsn_long.country,
    hsn_long.layer_id,
    hsn_long.intermediate_one


In [0]:
# %sql
# select * from addressrangesfinal


# Part 2
## Rajendar's Address Ranges Linked Count SQL

In [0]:
# Relations (Delta), registered for SQL under the name planet_osm_rels.
# The release is taken from OrbisVenturaRelease like every other load in this notebook; it was
# previously pinned to 23490.000, which mixed relations of one release with geometry of another.
planet_osm_rels = (
    spark.read
    .option("inferSchema", 'true')
    .format("delta")
    .load(f"/mnt/source/releasenotessa/Orbis_Ventura/WRL/{OrbisVenturaRelease}/prototype/{country}/output/data/relations")
)

planet_osm_rels.createOrReplaceTempView("planet_osm_rels")

In [0]:
%sql
-- Geocoding road-access relations of layer 15633: side tag plus the id of the first member
-- found for each of the roles access_via / access_from / access_to (NULL if the role is absent).
CREATE OR REPLACE TEMPORARY VIEW ar_rels_ AS
SELECT
    tags['side'] AS ar_side_of_line,
    CASE WHEN array_contains(relations.role, 'access_via')
         THEN element_at(filter(relations, x -> x.role = 'access_via').id, 1)
         ELSE NULL
    END AS rels_ar_rp_id,
    CASE WHEN array_contains(relations.role, 'access_from')
         THEN element_at(filter(relations, x -> x.role = 'access_from').id, 1)
         ELSE NULL
    END AS rels_ar_road_id,
    CASE WHEN array_contains(relations.role, 'access_to')
         THEN element_at(filter(relations, x -> x.role = 'access_to').id, 1)
         ELSE NULL
    END AS rels_ar_point_id
FROM planet_osm_rels
WHERE tags['road_access'] IN ('geocoding', 'geocoding;routing', 'routing;geocoding')
  AND tags['layer_id'] = '15633'

In [0]:
%sql
-- Geocoding road-access relations of layer 23783: side tag plus the id of the first member
-- found for each of the roles access_via / access_from / access_to (NULL if the role is absent).
CREATE OR REPLACE TEMPORARY VIEW apt_rels_ AS
SELECT
    tags['side'] AS apt_side_of_line,
    CASE WHEN array_contains(relations.role, 'access_via')
         THEN element_at(filter(relations, x -> x.role = 'access_via').id, 1)
         ELSE NULL
    END AS rels_apt_rp_id,
    CASE WHEN array_contains(relations.role, 'access_from')
         THEN element_at(filter(relations, x -> x.role = 'access_from').id, 1)
         ELSE NULL
    END AS rels_apt_road_id,
    CASE WHEN array_contains(relations.role, 'access_to')
         THEN element_at(filter(relations, x -> x.role = 'access_to').id, 1)
         ELSE NULL
    END AS rels_apt_point_id
FROM planet_osm_rels
WHERE tags['road_access'] IN ('geocoding', 'geocoding;routing', 'routing;geocoding')
  AND tags['layer_id'] = '23783'

In [0]:
%sql
-- Pair every layer-15633 relation with the layer-23783 relations that reference the same road.
-- NOTE(review): the second WHERE branch is kept verbatim. As written, IS NULL appears to apply
-- to the result of the equality comparison rather than to apt_side_of_line alone; confirm
-- which of the two was intended.
CREATE OR REPLACE TEMPORARY VIEW ar_apt_rels AS
SELECT *
FROM ar_rels_
LEFT JOIN apt_rels_
    ON ar_rels_.rels_ar_road_id = apt_rels_.rels_apt_road_id
WHERE ar_rels_.ar_side_of_line = apt_rels_.apt_side_of_line
   OR ar_rels_.ar_side_of_line = apt_rels_.apt_side_of_line IS NULL

In [0]:
%sql
-- For each point / road / side combination, count the distinct linked rels_apt_rp_id values.
CREATE OR REPLACE TEMPORARY VIEW ar_apt_rels_log AS
SELECT DISTINCT
    rels_ar_point_id,
    rels_ar_road_id,
    rels_apt_road_id,
    ar_side_of_line,
    apt_side_of_line,
    count(DISTINCT rels_apt_rp_id) AS link_apt_count
FROM ar_apt_rels
GROUP BY
    rels_ar_point_id,
    rels_ar_road_id,
    rels_apt_road_id,
    ar_side_of_line,
    apt_side_of_line

In [0]:
%sql
-- Attach the link counts to every (way, node) row via the node id; rows without a match are kept.
CREATE OR REPLACE TEMPORARY VIEW hsn_long_2 AS
SELECT *
FROM hsn_long
LEFT JOIN ar_apt_rels_log
    ON hsn_long.nodes = ar_apt_rels_log.rels_ar_point_id

In [0]:
# Read the hsn_long_2 view (hsn_long left-joined with the link counts) into a DataFrame.
hsn_long_2_df = spark.sql("SELECT * FROM hsn_long_2")

In [0]:
# Show the combined schema produced by the join.
hsn_long_2_df.printSchema()

In [0]:
# Cast both tag columns to string; hsn_long_2 is the fully cast frame used from here on.
hsn_long_2_df = hsn_long_2_df.withColumn(
    "tags_network", col("tags_network").cast(StringType())
)
hsn_long_2 = hsn_long_2_df.withColumn(
    "first_tags_hsn", col("first_tags_hsn").cast(StringType())
)

In [0]:
# Replace the hsn_long_2 view with the DataFrame version so the next %sql cell reads it.
hsn_long_2.createOrReplaceTempView("hsn_long_2")

In [0]:
# Show the schema after the string casts.
hsn_long_2.printSchema()

In [0]:
%sql
-- Per address range and linked road / side combination: smallest and largest range_hsn together
-- with the link count carried over from ar_apt_rels_log.
CREATE OR REPLACE TEMPORARY VIEW linden_apt_count_final AS
SELECT DISTINCT
    hsn_long_2.osm_id,
    hsn_long_2.tags_network,
    hsn_long_2.way,
    hsn_long_2.road_name_way,
    hsn_long_2.interpolation,
    hsn_long_2.interpolation_tag,
    hsn_long_2.name,
    hsn_long_2.first_tags_hsn,
    min(range_hsn) AS min_hsn,
    max(range_hsn) AS max_hsn,
    hsn_long_2.rels_ar_road_id,
    hsn_long_2.ar_side_of_line,
    hsn_long_2.rels_apt_road_id,
    hsn_long_2.apt_side_of_line,
    hsn_long_2.link_apt_count
FROM hsn_long_2
GROUP BY
    hsn_long_2.osm_id,
    hsn_long_2.tags_network,
    hsn_long_2.way,
    hsn_long_2.road_name_way,
    hsn_long_2.interpolation,
    hsn_long_2.interpolation_tag,
    hsn_long_2.name,
    hsn_long_2.first_tags_hsn,
    hsn_long_2.rels_ar_road_id,
    hsn_long_2.ar_side_of_line,
    hsn_long_2.rels_apt_road_id,
    hsn_long_2.apt_side_of_line,
    hsn_long_2.link_apt_count


In [0]:
# Materialise the two final SQL views as DataFrames for the join below.
addressrangesfinal, linded_apt_count_final = (
    spark.sql(statement)
    for statement in (
        "select * from addressrangesfinal",
        "select * from linden_apt_count_final",
    )
)

In [0]:
# Drop the columns that also exist in addressrangesfinal and rename the key, so the join
# below produces no ambiguous column names.
linded_apt_count_final = (
    linded_apt_count_final
    .drop("tags_network","road_name_way","interpolation","interpolation_tag","name","first_tags_hsn","min_hsn","max_hsn")
    .withColumnRenamed("osm_id","linked_osm_id")
)

In [0]:
# Show the remaining columns after the drop / rename.
linded_apt_count_final.printSchema()


In [0]:
# Left-join the address ranges with their linked-APT counts on the way id; ranges without
# any link information are kept.
addressRangesLinkedAptCount = addressrangesfinal.join(
    linded_apt_count_final,
    on=addressrangesfinal['osm_id'] == linded_apt_count_final['linked_osm_id'],
    how='left',
)



# Part 3
## Address Ranges Count and Linked APT Process

In [0]:

# Define a UDF to generate the array for "even" interpolation
def generate_array_even(min_hsn, max_hsn):
    """Return the even house numbers between ``min_hsn`` and ``max_hsn`` (inclusive).

    Parameters:
        min_hsn: lower bound; anything ``int()`` accepts, or None.
        max_hsn: upper bound; anything ``int()`` accepts, or None.

    Returns:
        list[int]: the even numbers in the range, or [] when either bound is None or
        the range contains no even number.
    """
    if min_hsn is None or max_hsn is None:
        return []

    start, stop = int(min_hsn), int(max_hsn)
    # Stepping by 2 from an odd start would yield only odd numbers, so move the start
    # up to the first even number inside the range.
    if start % 2 != 0:
        start += 1
    return list(range(start, stop + 1, 2))



# Define a UDF to generate the array for "odd" interpolation
def generate_array_odd(min_hsn, max_hsn):
    """Return the odd house numbers between ``min_hsn`` and ``max_hsn`` (inclusive).

    Parameters:
        min_hsn: lower bound; anything ``int()`` accepts, or None.
        max_hsn: upper bound; anything ``int()`` accepts, or None.

    Returns:
        list[int]: the odd numbers in the range, or [] when either bound is None or
        the range contains no odd number.
    """
    if min_hsn is None or max_hsn is None:
        return []

    start, stop = int(min_hsn), int(max_hsn)
    # Stepping by 2 from an even start would yield only even numbers, so move the start
    # up to the first odd number inside the range.
    if start % 2 == 0:
        start += 1
    return list(range(start, stop + 1, 2))


# Define a UDF to generate the array for "numeric_mixed" interpolation
def generate_array_numeric_mixed(min_hsn, max_hsn):
    """Return every integer from ``min_hsn`` to ``max_hsn`` inclusive; [] if either is None."""
    if min_hsn is None or max_hsn is None:
        return []
    first = int(min_hsn)
    last = int(max_hsn)
    return [number for number in range(first, last + 1)]
    

# # Define a UDF to convert the string to an array of integers
# def string_to_array(string):
#     if string:
#         return [int(x.strip()) for x in string.split(',')]
#     else:
#         return []

def string_to_array(string):
    """Parse a comma-separated string into a list of ints.

    Blank items become 0; a None or empty input gives [].
    """
    if not string:
        return []

    numbers = []
    for piece in string.split(','):
        token = piece.strip()
        numbers.append(int(token) if token else 0)
    return numbers


# Function to update the array based on min_hsn and max_hsn
def update_array(min_hsn, max_hsn, hnr_array):
    """Return a new list: ``hnr_array`` followed by the range end points.

    When both end points are equal only one of them is appended.
    """
    endpoints = [min_hsn] if min_hsn == max_hsn else [min_hsn, max_hsn]
    return hnr_array + endpoints

def swapMin_hsnMax_hsnAlphabate(min_hsn_alpha, max_hsn_alpha):
    """
    Order two alphanumeric house numbers by their trailing letter.

    The pair is swapped only when both values consist of an all-digit prefix followed by a
    single trailing letter and the letter of ``min_hsn_alpha`` sorts after the letter of
    ``max_hsn_alpha``. The numeric prefixes are not compared.

    Parameters:
    - min_hsn_alpha (str | None): candidate lower house number, e.g. "12a".
    - max_hsn_alpha (str | None): candidate upper house number, e.g. "12d".

    Returns:
    tuple: ``(min_hsn_alpha, max_hsn_alpha)``, swapped if required. None or empty inputs and
    values of any other shape are returned unchanged.

    Example:
    ```python
    min_code, max_code = swapMin_hsnMax_hsnAlphabate("12c", "12a")
    print(min_code, max_code)  # Output: 12a 12c
    ```
    """
    # None and "" are passed through untouched; indexing [-1] on "" would raise IndexError.
    if not min_hsn_alpha or not max_hsn_alpha:
        return min_hsn_alpha, max_hsn_alpha

    min_hsn_i, min_hsn_a = min_hsn_alpha[:-1], min_hsn_alpha[-1]
    max_hsn_i, max_hsn_a = max_hsn_alpha[:-1], max_hsn_alpha[-1]

    digits_then_letter = (
        min_hsn_i.isdigit() and max_hsn_i.isdigit()
        and min_hsn_a.isalpha() and max_hsn_a.isalpha()
    )
    if digits_then_letter and ord(min_hsn_a) > ord(max_hsn_a):
        return max_hsn_alpha, min_hsn_alpha
    return min_hsn_alpha, max_hsn_alpha
    

def generate_array_alphabetic(min_hsn_alphaN, max_hsn_alphaN):
    """
    Expand an alphabetic house-number range such as "12a".."12d".

    Parameters:
        min_hsn_alphaN (str): lower house number (prefix + trailing letter).
        max_hsn_alphaN (str): upper house number (prefix + trailing letter).

    Returns:
        list | None:
            - None when either input is None, not a string, empty, or does not end in a letter.
            - When both prefixes are equal: the prefix combined with every character from the
              lower letter to the upper letter inclusive (empty if the lower letter sorts
              after the upper one).
            - When the prefixes differ: just ``[min_hsn_alphaN, max_hsn_alphaN]``.
    """
    # No per-row debug printing here: this runs as a Spark UDF, once for every row.
    both_nonempty_strings = (
        isinstance(min_hsn_alphaN, str) and isinstance(max_hsn_alphaN, str)
        and min_hsn_alphaN and max_hsn_alphaN
    )
    if not both_nonempty_strings:
        return None
    if not (min_hsn_alphaN[-1].isalpha() and max_hsn_alphaN[-1].isalpha()):
        return None

    # Split each value into its prefix and its trailing letter.
    min_alpha, min_digits = min_hsn_alphaN[-1], min_hsn_alphaN[:-1]
    max_alpha, max_digits = max_hsn_alphaN[-1], max_hsn_alphaN[:-1]

    if min_digits != max_digits:
        # Different prefixes cannot be interpolated letter by letter.
        return [min_hsn_alphaN, max_hsn_alphaN]

    return [f"{min_digits}{chr(code)}" for code in range(ord(min_alpha), ord(max_alpha) + 1)]


In [0]:
# Show the schema of the joined address-range / link-count frame before processing it.
addressRangesLinkedAptCount.printSchema()

In [0]:
# min_hsn / max_hsn are arrays here; replace each with its first element.
df = (
    addressRangesLinkedAptCount
    .withColumn('min_hsn', col('min_hsn').getItem(0))
    .withColumn('max_hsn', col('max_hsn').getItem(0))
)

In [0]:

# Lower-case license_zone and country so they can be compared with each other.
df = (
    df
    .withColumn("license_zone", lower(col("license_zone")))
    .withColumn("country", lower(col("country")))
)

In [0]:
# Keep only rows whose license_zone equals the country being processed
# (both columns were lower-cased in the previous cell).
df = df.filter(col("country") == col("license_zone"))

In [0]:
# For alphabetic interpolation keep the untouched min/max values in separate columns
# (null for every other interpolation type).
df = (
    df
    .withColumn("min_hsn_alpha", when(col("interpolation") == "alphabetic", col("min_hsn")).otherwise(None))
    .withColumn("max_hsn_alpha", when(col("interpolation") == "alphabetic", col("max_hsn")).otherwise(None))
)

In [0]:
# Show the schema including the new min_hsn_alpha / max_hsn_alpha columns.
df.printSchema()

In [0]:
# Strip every non-digit character from max_hsn and min_hsn.
df = (
    df
    .withColumn("max_hsn", regexp_replace(col("max_hsn"), "[^0-9]", ""))
    .withColumn("min_hsn", regexp_replace(col("min_hsn"), "[^0-9]", ""))
)

In [0]:
from pyspark.sql.types import *

In [0]:
# Turn the digit-only strings into integers.
df = (
    df
    .withColumn("min_hsn", col("min_hsn").cast(IntegerType()))
    .withColumn("max_hsn", col("max_hsn").cast(IntegerType()))
)

In [0]:
# Swap values if min_hsn > max_hsn
# The smaller bound is captured in "temp" FIRST, while both columns still hold their original
# values. Previously "temp" held the larger value and min_hsn was recomputed after max_hsn had
# already been overwritten, so a reversed pair such as (10, 4) ended up as (10, 10).
df = df.withColumn("temp", when(col("min_hsn") > col("max_hsn"), col("max_hsn")).otherwise(col("min_hsn")))

# min_hsn is still the original here, so this picks the larger of the two.
df = df.withColumn("max_hsn", when(col("min_hsn") > col("max_hsn"), col("min_hsn")).otherwise(col("max_hsn")))

# Rows where either bound is null keep their values, as before (the comparison is null).
df = df.withColumn("min_hsn", col("temp"))

# Drop the temporary column
df = df.drop("temp")

In [0]:
# UDF for ordering the alphabetic min/max pair; it returns both values as one struct.
_swap_result_type = StructType([
    StructField("min_hsn_alpha", StringType(), True),
    StructField("max_hsn_alpha", StringType(), True),
])
swap_udf = udf(swapMin_hsnMax_hsnAlphabate, _swap_result_type)

# Apply the UDF, then unpack the struct into min_hsn_alphaN / max_hsn_alphaN.
df = (
    df
    .withColumn("result", swap_udf("min_hsn_alpha", "max_hsn_alpha"))
    .withColumn("min_hsn_alphaN", col("result.min_hsn_alpha"))
    .withColumn("max_hsn_alphaN", col("result.max_hsn_alpha"))
)

In [0]:
# UDF expanding an alphabetic range into array<string>.
generate_array_alphabetic_udf = udf(generate_array_alphabetic, ArrayType(StringType()))

_alphabetic_values = generate_array_alphabetic_udf(col("min_hsn_alphaN"), col("max_hsn_alphaN"))
_plain_alphabetic = (col("interpolation") == "alphabetic") & (col("intermediate_one").isNull())

# hnr_array_result: the expansion for every row.
# hnr_array_alphabetic: the expansion only for alphabetic rows without an intermediate list
# (null otherwise).
df_temp = (
    df
    .withColumn("hnr_array_result", _alphabetic_values)
    .withColumn("hnr_array_alphabetic", when(_plain_alphabetic, _alphabetic_values))
)

In [0]:
# Spark UDFs for the numeric interpolation types; each returns array<int>.
# "irregular" reuses the numeric_mixed generator.
_int_array_type = ArrayType(IntegerType())
generate_array_even_udf = udf(generate_array_even, _int_array_type)
generate_array_odd_udf = udf(generate_array_odd, _int_array_type)
generate_array_numeric_mixed_udf = udf(generate_array_numeric_mixed, _int_array_type)
generate_array_irregular_udf = udf(generate_array_numeric_mixed, _int_array_type)

In [0]:
# # Register the UDF String to Array 
# string_to_array_udf = udf(string_to_array, ArrayType(IntegerType()))

# # Register the UDF intermediate_one_array
# update_array_udf = udf(update_array, ArrayType(IntegerType()))


# # Apply the UDF to create a new column with the converted array
# df_temp = df_temp.withColumn("intermediate_one_array", string_to_array_udf("intermediate_one"))

# # df = df.withColumn("intermediate_one_array", regexp_replace(col("intermediate_one_array"), "[^0-9]", ""))

# # Apply the UDF to update the array
# df_temp = df_temp.withColumn("hnr_array_intermediate", update_array_udf(col("min_hsn"), col("max_hsn"), col("intermediate_one_array")))

# # Create new columns based on conditions "hnr_array_even"
# df_temp = df_temp.withColumn(
#     "hnr_array_even",
#     when((col("interpolation") == "even") & (col("intermediate_one").isNull()), 
#         generate_array_even_udf(col("min_hsn"), col("max_hsn")))
# )


# # Create new columns based on conditions "hnr_array_odd"
# df_temp = df_temp.withColumn(
#     "hnr_array_odd",
#     when((col("interpolation") == "odd") & (col("intermediate_one").isNull()), 
#         generate_array_odd_udf(col("min_hsn"), col("max_hsn")))
# )

# # Create new columns based on conditions "hnr_array_numeric_mixed"
# df_temp = df_temp.withColumn(
#     "hnr_array_numeric_mixed",
#     when((col("interpolation") == "numeric_mixed") & (col("intermediate_one").isNull()), 
#         generate_array_numeric_mixed_udf(col("min_hsn"), col("max_hsn")))
# )

# # Create new columns based on conditions "generate_array_irregular_udf"
# df_temp = df_temp.withColumn(
#     "hnr_array_irregular",
#     when((col("interpolation") == "irregular") & (col("intermediate_one").isNull()), 
#         generate_array_irregular_udf(col("min_hsn"), col("max_hsn")))
# )

# # Combine the new columns into a single "hnr_array" column using coalesce
# df_temp = df_temp.withColumn(
#     "hnr_array",
#     coalesce(col("hnr_array_even"), col("hnr_array_odd"), col("hnr_array_numeric_mixed"),col("hnr_array_irregular")
#             ,col("hnr_array_alphabetic"),col("hnr_array_intermediate"))
# )

# count Array House Number count 

# df_temp = df_temp.withColumn("hnr_array_count", size(col("hnr_array")).cast("int"))

# # Replace null values with 0 in the "hnr_array_count" column
# df_temp = df_temp.fillna(0, subset=["hnr_array_count"])

# # Select and keep only the specified columns
# addressRangesGroupBy = df_temp.select("osm_id","country","interpolation", "hnr_array_count","hnr_array", "license_zone", "layer_id","link_apt_count", "ar_side_of_line")
# # fill blank records with 0 'Zero'
# addressRangesGroupBy = addressRangesGroupBy.fillna(0, subset=['link_apt_count'])


###### new logic by Jona

In [0]:
string_to_array_udf = udf(string_to_array, ArrayType(IntegerType()))
update_array_udf = udf(update_array, ArrayType(IntegerType()))

# Parse the intermediate list and append the range end points to it.
df_temp = (
    df_temp
    .withColumn("intermediate_one_array", string_to_array_udf("intermediate_one"))
    .withColumn("hnr_array_intermediate", update_array_udf(col("min_hsn"), col("max_hsn"), col("intermediate_one_array")))
)

# Pick the generator by interpolation type when no intermediate list exists; every other row
# falls back to hnr_array_intermediate.
# NOTE(review): hnr_array_alphabetic computed in an earlier cell is not consulted here, so
# alphabetic rows take the fallback — confirm this is intended.
_interpolation = col("interpolation")
_no_intermediate = col("intermediate_one").isNull()
_low, _high = col("min_hsn"), col("max_hsn")

_hnr_array = (
    when((_interpolation == "even") & _no_intermediate, generate_array_even_udf(_low, _high))
    .when((_interpolation == "odd") & _no_intermediate, generate_array_odd_udf(_low, _high))
    .when((_interpolation == "numeric_mixed") & _no_intermediate, generate_array_numeric_mixed_udf(_low, _high))
    .when((_interpolation == "irregular") & _no_intermediate, generate_array_irregular_udf(_low, _high))
    .otherwise(col("hnr_array_intermediate"))
)
df_temp = df_temp.withColumn("hnr_array", _hnr_array)

# Number of house numbers per range; nulls in both counters become 0.
df_temp = df_temp.withColumn("hnr_array_count", size(col("hnr_array")).cast("int"))
df_temp = df_temp.fillna(0, subset=["hnr_array_count", "link_apt_count"])

In [0]:
# Remove exact duplicate rows
addressRangesGroupBy = df_temp.dropDuplicates()

# Select and keep only the specified columns.
# This must select from the de-duplicated frame: selecting from df_temp again (as before)
# silently discarded the dropDuplicates() result.
addressRangesGroupBy = addressRangesGroupBy.select("osm_id","country","interpolation", "hnr_array_count","hnr_array", "license_zone", "layer_id","link_apt_count", "ar_side_of_line")
# Add "Ranges_count": a constant 1 per row, summed later to count ranges
addressRangesGroupBy = addressRangesGroupBy.withColumn("Ranges_count", lit(1))
# fill blank records with 0 'Zero'
# addressRangesGroupBy = addressRangesGroupBy.fillna(0, subset=['link_apt_count'])

# addressRangesGroupBy = addressRangesGroupBy.filter(col("ar_side_of_line").isNotNull())

In [0]:
# Collapse rows that agree on every key column (including the side of line) and add up
# their link_apt_count.
_range_keys = [
    "osm_id", "country", "interpolation", "hnr_array_count", "hnr_array",
    "license_zone", "layer_id", "ar_side_of_line", "Ranges_count",
]
addressRangesGroupBy = (
    addressRangesGroupBy
    .groupBy(*_range_keys)
    .agg(sum("link_apt_count").alias("link_apt_count"))
)

In [0]:
# addressRangesGroupBy = addressRangesGroupBy.repartition(30)

In [0]:
# from pyspark.sql.window import *
# # Define the window specification with partitioning by "osm_id" and ordering by "link_apt_count" in descending order
# windowSpec = Window.partitionBy("osm_id").orderBy(col("link_apt_count").desc())

# addressRangesGroupBy = addressRangesGroupBy.withColumn("rank_number", dense_rank().over(windowSpec))
# addressRangesGroupBy = addressRangesGroupBy.filter(col("rank_number")== 1)

In [0]:

# # Group by columns and aggregate "hnr_array_count" colun
# result = addressRangesGeoupBy.groupby("country", "license_zone", "layer_id", "interpolation") \
#     .agg(sum("hnr_array_count").alias("sum_hnr_array_count"),
#          sum(when(col("link_apt_count") == 0, col("hnr_array_count")).otherwise(0)).alias("link_apt_count = 0"),
#          sum(when(col("link_apt_count") == 1, col("hnr_array_count")).otherwise(0)).alias("link_apt_count = 1"),
#          sum(when(col("link_apt_count") > 1, col("hnr_array_count")).otherwise(0)).alias("link_apt_count > 1")
#         )

In [0]:
def _hnr_count_where(condition):
    """Sum hnr_array_count over the rows matching ``condition`` (other rows contribute 0)."""
    return sum(when(condition, col("hnr_array_count")).otherwise(0))


_links = col("link_apt_count")
_size = col("hnr_array_count")

# Totals per country / license zone / layer / interpolation type: overall house-number count,
# number of ranges, and the house-number count split by link count and by range-size bucket.
result = (
    addressRangesGroupBy
    .groupby("country", "license_zone", "layer_id", "interpolation")
    .agg(
        sum("hnr_array_count").alias("sum_hnr_array_count"),
        sum("Ranges_count").alias("AddreesRangesCount"),
        _hnr_count_where(_links == 0).alias("link_apt_count_eq_0"),
        _hnr_count_where(_links == 1).alias("link_apt_count_eq_1"),
        _hnr_count_where(_links > 1).alias("link_apt_count_gt_1"),
        _hnr_count_where(_size.between(0, 500)).alias("hnr_ranges_0_500"),
        _hnr_count_where(_size.between(501, 1000)).alias("hnr_ranges_501_1000"),
        _hnr_count_where(_size.between(1001, 2000)).alias("hnr_ranges_1001_2000"),
        _hnr_count_where(_size.between(2001, 3000)).alias("hnr_ranges_2001_3000"),
        _hnr_count_where(_size.between(3001, 5000)).alias("hnr_ranges_3001_5000"),
        _hnr_count_where(_size > 5000).alias("hnr_ranges_grt_5000"),
    )
)


In [0]:
# Render the aggregated statistics table.
display(result)