In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import pyspark.sql.functions as F
from itertools import combinations
from pyspark.sql import DataFrame

# Keep <pre> output from wrapping so the wide tables printed by
# DataFrame.show() stay aligned in the notebook front end.
# `IPython.display` is the public display API; `display` is imported explicitly
# so this cell does not rely on the name being injected by the kernel.
from IPython.display import HTML, display
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
spark

# Read the data

In [4]:
# Load the global option-set metadata export: the first row is the header and
# column types are inferred from the data.
df_metadata = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./data/globaloptionsetmetadata.csv")
)
# Preview only the rows that belong to the "opportunity" entity.
df_metadata.where(df_metadata["EntityName"] == "opportunity").show()

+-----------------+------+---------------+--------------------------+-------------------+-------------------+-----------+-------------------+--------------------+-------------------+--------------------+--------------------+
|    OptionSetName|Option|IsUserLocalized|LocalizedLabelLanguageCode|     LocalizedLabel|GlobalOptionSetName| EntityName|   RecordCreateDats|   RecordCreateJobID|   RecordUpdateDats|   RecordUpdateJobID|IsUserLocalizedLabel|
+-----------------+------+---------------+--------------------------+-------------------+-------------------+-----------+-------------------+--------------------+-------------------+--------------------+--------------------+
|     budgetstatus|     0|           NULL|                      1033|No Committed Budget|       budgetstatus|opportunity|2024-11-01 09:48:11|GlobalOptionsetMe...|2024-11-01 09:48:11|GlobalOptionsetMe...|               false|
|     budgetstatus|     1|           NULL|                      1033|            May Buy|       budg

In [12]:
# Load the opportunity export. multiLine is enabled so a single record may span
# several physical lines; the first row is the header and types are inferred.
df_opportunity = (
    spark.read
    .option("sep", ",")
    .option("multiLine", True)
    .option("header", True)
    .option("inferSchema", True)
    .csv("./data/opportunity.csv")
)
# Print the first rows without truncating cell contents.
df_opportunity.show(truncate=False)

+------------------------------------+--------------------------+--------------------------+---------+----------+------------+--------------------------------------+-----------------------------------------+------------------------------------------+---------------------+------+-------------------+-----------------+--------------+---------------------+----------------------+----------------+-------------------+--------------+-----------------------+------------------+-------------------+---------------------+------------------+----------------+------------------+------------------------+--------------------------+--------------+-----------------------+------------------+-------------------+----------------------+----------------+--------------------+------------------+---------------+-----------------+---------------+-------------------------+---------------+----------+--------------------------------------+--------------------+---------------+---------------------------+--------------

In [34]:
def join_with_metadata_alias(
    df: DataFrame,
    df_metadata_table: DataFrame,
    option: str,
    optionsetname: str,
    label_alias: str
) -> DataFrame:
    """
    Left-join `df` with the option-set metadata and append the matching label.

    Every row of `df` is kept. Metadata rows are matched where `Option` equals
    `df[option]`, `OptionSetName` equals `optionsetname` and `EntityName` equals
    "opportunity"; rows without a match get a NULL label.

    Parameters:
    - df: Input DataFrame containing opportunity data.
    - df_metadata_table: Metadata DataFrame; must provide the columns
      `Option`, `OptionSetName`, `EntityName` and `LocalizedLabel`.
    - option: Name of the column in `df` that holds the option value to look up.
    - optionsetname: The filter value for the OptionSetName column of the metadata.
    - label_alias: Name of the label column added to the result. If `df` already
      has a column with this name it is replaced, so the call can be repeated
      on its own output.

    Returns:
    - DataFrame: All columns of `df` (minus a pre-existing `label_alias`
      column) followed by the metadata `LocalizedLabel` renamed to `label_alias`.

    NOTE(review): the join does not filter on `LocalizedLabelLanguageCode`; if the
    metadata holds more than one row per option (e.g. several languages), the
    input rows would be duplicated - confirm against the data.
    """
    # Columns carried over from `df`. A pre-existing label column is excluded
    # here as well as dropped below; selecting it from the aliased frame after
    # the drop would fail because the column no longer exists there.
    kept_columns = [name for name in df.columns if name != label_alias]

    # Alias both sides so their columns can be referenced unambiguously.
    df_alias = df.alias("df_alias")
    df_meta_alias = df_metadata_table.alias(f"meta_{label_alias}")

    # Drop a pre-existing column with the same name to prevent ambiguity.
    if label_alias in df.columns:
        df_alias = df_alias.drop(label_alias)

    # The option-set and entity filters are part of the join condition (not a
    # later filter) so that unmatched rows of `df` survive the left join.
    cond = [
        df_meta_alias["Option"] == df_alias[option],
        df_meta_alias["OptionSetName"] == optionsetname,
        df_meta_alias["EntityName"] == "opportunity"
    ]

    # Perform the join and keep the input columns plus the renamed label.
    df_joined = (
        df_alias
        .join(df_meta_alias, cond, "left")
        .select(
            *[df_alias[name] for name in kept_columns],
            df_meta_alias["LocalizedLabel"].alias(label_alias))
    )

    return df_joined


In [None]:
# Attach the opportunity-category label to every opportunity row.
# NOTE(review): the option-set name is spelled "dow_opporotunitycategory" while
# the column is "dow_opportunitycategory" - confirm this matches the
# OptionSetName value in the metadata; a mismatch makes every label NULL.
df_result = join_with_metadata_alias(
    df=df_opportunity,
    df_metadata_table=df_metadata,
    option="dow_opportunitycategory",
    optionsetname="dow_opporotunitycategory",
    label_alias="opportunitycategory_label",
)
df_result.show()

In [None]:
# Add the channel-source label on top of the previous result, then inspect the
# rows whose opportunity category is 2.
df_result = join_with_metadata_alias(
    df=df_result,
    df_metadata_table=df_metadata,
    option="dow_channelsource",
    optionsetname="dow_channelsource",
    label_alias="dow_channelsource_label",
)
df_result.filter(F.col("dow_opportunitycategory") == 2).show()

# Approach 2: Separate functions for retrieving the metadata (and renaming the LocalizedLabel column) and for joining the metadata with the Opportunity DataFrame

First call `get_metadata_for_optionset`, passing the 'optionsetname', to retrieve the metadata for the given option set. <br>Then call `join_with_metadata`, passing the Opportunity DataFrame, the metadata DataFrame, and the option-set name as a string.

In [5]:
def get_metadata_for_optionset(
    optionSetName: str,
    entity_name: str = "opportunity",
    df_metadata_table: "DataFrame | None" = None,
) -> DataFrame:
    """
    Filters and processes metadata for a specific entity and OptionSetName.

    The metadata is reduced to the rows whose "EntityName" equals `entity_name`
    and whose "OptionSetName" equals `optionSetName`. The "LocalizedLabel"
    column is then renamed to "<optionSetName>_label" so it stays unique when
    the result is joined with other DataFrames.

    Parameters:
    - optionSetName (str): The OptionSetName to filter the metadata for.
    - entity_name (str): The EntityName to filter the metadata for.
      Defaults to "opportunity".
    - df_metadata_table (DataFrame | None): Metadata DataFrame to filter. When
      omitted, the notebook-level `df_metadata` is used, so that variable must
      already be defined.

    Returns:
    - DataFrame: The filtered metadata rows with "LocalizedLabel" renamed to
      "<optionSetName>_label"; all other columns are unchanged.
    """
    # Fall back to the notebook-level metadata frame when none is passed in.
    source = df_metadata if df_metadata_table is None else df_metadata_table

    # Keep only the rows of the requested entity and option set.
    df_meta_subset = source.filter(
        (source["EntityName"] == entity_name) & (source["OptionSetName"] == optionSetName)
    )

    # Rename the label column to "<optionSetName>_label" to make it unique
    # and prevent conflicts during joins with other DataFrames.
    return df_meta_subset.withColumnRenamed("LocalizedLabel", f"{optionSetName}_label")


In [8]:
# Metadata rows for the "budgetstatus" option set, with the label column
# renamed to "budgetstatus_label".
df_budgetstatus = get_metadata_for_optionset(optionSetName="budgetstatus")
df_budgetstatus.show()

+-------------+------+---------------+--------------------------+-------------------+-------------------+-----------+-------------------+--------------------+-------------------+--------------------+--------------------+
|OptionSetName|Option|IsUserLocalized|LocalizedLabelLanguageCode| budgetstatus_label|GlobalOptionSetName| EntityName|   RecordCreateDats|   RecordCreateJobID|   RecordUpdateDats|   RecordUpdateJobID|IsUserLocalizedLabel|
+-------------+------+---------------+--------------------------+-------------------+-------------------+-----------+-------------------+--------------------+-------------------+--------------------+--------------------+
| budgetstatus|     0|           NULL|                      1033|No Committed Budget|       budgetstatus|opportunity|2024-11-01 09:48:11|GlobalOptionsetMe...|2024-11-01 09:48:11|GlobalOptionsetMe...|               false|
| budgetstatus|     1|           NULL|                      1033|            May Buy|       budgetstatus|opportunity

In [27]:
def join_with_metadata(
    df: DataFrame,
    df_metadata_table: DataFrame,
    optionsetname: str,
) -> DataFrame:
    """
    Left-join `df` with an option-set metadata frame and append its label column.

    `optionsetname` is used three ways: as the name of the column in `df` that
    holds the option value, as the filter value for the metadata's
    `OptionSetName`, and as the prefix of the label column
    "<optionsetname>_label" taken from the metadata.

    Parameters:
    - df: Input DataFrame containing opportunity data; must have a column
      named `optionsetname`.
    - df_metadata_table: Metadata DataFrame providing the columns `Option`,
      `OptionSetName`, `EntityName` and "<optionsetname>_label".
    - optionsetname: The option-set name (see above).

    Returns:
    - DataFrame: All columns of `df` followed by "<optionsetname>_label".
      Rows without a metadata match get a NULL label. If `df` already has a
      "<optionsetname>_label" column it is replaced, so the result never
      carries two columns with that name.
    """
    label_column = optionsetname + "_label"

    # A label column left over from an earlier join would end up duplicated in
    # the output; drop it so the call can be repeated on its own result.
    if label_column in df.columns:
        df = df.drop(label_column)

    # The option-set and entity filters are part of the join condition (not a
    # later filter) so that unmatched rows of `df` survive the left join.
    cond = [
        df_metadata_table["Option"] == df[optionsetname],
        df_metadata_table["OptionSetName"] == optionsetname,
        df_metadata_table["EntityName"] == "opportunity"
    ]

    # Perform the join and keep the input columns plus the label.
    df_joined = (
        df
        .join(df_metadata_table, cond, "left")
        .select(
            *[df[name] for name in df.columns],
            df_metadata_table[label_column]
        )
    )

    return df_joined


In [28]:
# Start from the raw opportunity frame and attach the budget-status label.
df_joined = join_with_metadata(
    df=df_opportunity,
    df_metadata_table=df_budgetstatus,
    optionsetname="budgetstatus",
)
df_joined.show()

+--------------------+--------------------+--------------------+---------+----------+------------+--------------------------------------+-----------------------------------------+------------------------------------------+---------------------+------+-------------------+-----------------+--------------+---------------------+----------------------+----------------+-------------------+--------------+-----------------------+------------------+-------------------+---------------------+------------------+----------------+------------------+------------------------+--------------------------+--------------+-----------------------+------------------+-------------------+----------------------+----------------+--------------------+------------------+---------------+-----------------+---------------+-------------------------+---------------+----------+--------------------------------------+--------------------+---------------+---------------------------+------------------+----------+----------+-

In [29]:
# Metadata rows for the "dow_channelsource" option set, with the label column
# renamed to "dow_channelsource_label".
df_channelsource = get_metadata_for_optionset(optionSetName="dow_channelsource")
df_channelsource.show()

+-----------------+------+---------------+--------------------------+-----------------------+-------------------+-----------+-------------------+--------------------+-------------------+--------------------+--------------------+
|    OptionSetName|Option|IsUserLocalized|LocalizedLabelLanguageCode|dow_channelsource_label|GlobalOptionSetName| EntityName|   RecordCreateDats|   RecordCreateJobID|   RecordUpdateDats|   RecordUpdateJobID|IsUserLocalizedLabel|
+-----------------+------+---------------+--------------------------+-----------------------+-------------------+-----------+-------------------+--------------------+-------------------+--------------------+--------------------+
|dow_channelsource|    20|           NULL|                      1033|                  Baidu|  dow_channelsource|opportunity|2024-11-01 09:48:11|GlobalOptionsetMe...|2024-11-01 09:48:11|GlobalOptionsetMe...|               false|
|dow_channelsource|    21|           NULL|                      1033|               

In [30]:
# Add the channel-source label on top of the frame that already carries the
# budget-status label.
df_joined = join_with_metadata(
    df=df_joined,
    df_metadata_table=df_channelsource,
    optionsetname="dow_channelsource",
)
df_joined.show()

+--------------------+--------------------+--------------------+---------+----------+------------+--------------------------------------+-----------------------------------------+------------------------------------------+---------------------+------+-------------------+-----------------+--------------+---------------------+----------------------+----------------+-------------------+--------------+-----------------------+------------------+-------------------+---------------------+------------------+----------------+------------------+------------------------+--------------------------+--------------+-----------------------+------------------+-------------------+----------------------+----------------+--------------------+------------------+---------------+-----------------+---------------+-------------------------+---------------+----------+--------------------------------------+--------------------+---------------+---------------------------+------------------+----------+----------+-

In [31]:
# Show only the rows for which a budget-status label was found.
df_joined.where(F.col("budgetstatus_label").isNotNull()).show()

+--------------------+-------------+--------------+---------+----------+------------+--------------------------------------+-----------------------------------------+------------------------------------------+---------------------+------+-------------------+-----------------+--------------+---------------------+----------------------+----------------+--------------------+--------------+-----------------------+------------------+-------------------+---------------------+------------------+----------------+------------------+------------------------+--------------------------+--------------+-----------------------+------------------+-------------------+----------------------+----------------+--------------------+------------------+---------------+--------------------+---------------+-------------------------+---------------+--------------+--------------------------------------+--------------------+---------------+---------------------------+--------------------+--------------------+-----