In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import pyspark.sql.functions as F
from itertools import combinations
from pyspark.sql import DataFrame

# Stop the notebook from wrapping <pre> output, so the wide tables printed by
# DataFrame.show() keep one row per line. HTML and display are taken from the
# public IPython.display module instead of the internal IPython.core.display.
from IPython.display import HTML, display

display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
spark

# Read the data

In [10]:
# Load the global option-set metadata: the first CSV row supplies the column
# names and Spark infers the column types.
df_globaloptionset = spark.read.csv(
    "./data/globaloptionsetmetadata.csv",
    header=True,
    inferSchema=True,
)

# Quick look at the rows that belong to a single option set.
df_globaloptionset.where(F.col("OptionSetName") == "dow_previouscategory").show()

+--------------------+------+---------------+--------------------------+--------------+--------------------+-----------+-------------------+--------------------+-------------------+--------------------+--------------------+
|       OptionSetName|Option|IsUserLocalized|LocalizedLabelLanguageCode|LocalizedLabel| GlobalOptionSetName| EntityName|   RecordCreateDats|   RecordCreateJobID|   RecordUpdateDats|   RecordUpdateJobID|IsUserLocalizedLabel|
+--------------------+------+---------------+--------------------------+--------------+--------------------+-----------+-------------------+--------------------+-------------------+--------------------+--------------------+
|dow_previouscategory|     1|           NULL|                      1033|   Competitive|dow_opportunityca...|opportunity|2024-11-01 09:48:11|GlobalOptionsetMe...|2024-11-01 09:48:11|GlobalOptionsetMe...|               false|
|dow_previouscategory|     2|           NULL|                      1033|        Growth|dow_opportunityca

In [3]:
# Load the option-set metadata: the first CSV row supplies the column names
# and Spark infers the column types.
df_optionset = spark.read.csv(
    "./data/optionsetmetadata.csv",
    header=True,
    inferSchema=True,
)
# Optional check, left disabled:
# df_optionset.filter(df_optionset["EntityName"] == "opportunity").show()

In [19]:
# Load the opportunity records. multiLine=True is passed so that a single
# record may span several physical lines of the CSV file.
df_opportunity = spark.read.csv(
    "./data/opportunity.csv",
    sep=",",
    multiLine=True,
    header=True,
    inferSchema=True,
)
# Optional checks, left disabled:
# df_opportunity.show(truncate=False)
# df_opportunity.printSchema()
# df_opportunity.select(F.col("dow_initiative")).show()

# Single function for filtering a metadata table and joining it with the specified entity DataFrame

The metadata table to join with is passed in through the `df_metadata` parameter.

When calling the function for the first time, pass the DataFrame that holds the entity's data. You can overwrite that <br>
DataFrame by assigning the function's return value back to it in your call, and then keep passing the same DataFrame to any <br>
subsequent calls, so that each call adds another label column to it.


In [5]:
def join_with_metadata(
    df: DataFrame,
    df_metadata: DataFrame,
    entity: str,
    optionsetname: str,
) -> DataFrame:
    """
    Add a "<optionsetname>_label" column to df by looking up option labels in df_metadata.

    The metadata is first narrowed to the rows whose EntityName equals `entity` and
    whose OptionSetName equals `optionsetname`. Those rows are then left-joined to
    df on metadata "Option" == df[optionsetname], so rows of df without a matching
    option keep a null label.

    Parameters:
    - df: DataFrame holding the entity's data. It must contain a column named
      `optionsetname`, which holds the option values to look up.
    - df_metadata: Metadata DataFrame with the columns EntityName, OptionSetName,
      Option and LocalizedLabel.
    - entity: Value to match in the EntityName column of df_metadata.
    - optionsetname: Value to match in the OptionSetName column of df_metadata;
      also the name of the lookup column in df.

    Returns:
    - DataFrame: All columns of df plus "<optionsetname>_label". If df already has
      that label column, df is returned unchanged and a message is printed.

    NOTE(review): if df_metadata holds more than one row per option for the same
    entity and option set (for example one per LocalizedLabelLanguageCode), the
    left join yields one output row per match - confirm the metadata is unique.
    """
    label_col = f"{optionsetname}_label"

    # Calling the function again for the same option set must not add a second,
    # identically named label column.
    if label_col in df.columns:
        print("label column already exists")
        return df

    # Narrow the metadata to the requested entity and option set. The entity was
    # previously hard-coded to "opportunity" here, which made every other entity
    # come back with null labels only.
    # LocalizedLabel is renamed to "<optionsetname>_label" so that the labels of
    # several option sets can live side by side in the same DataFrame.
    df_metadata_filtered = (
        df_metadata
        .filter(
            (df_metadata["EntityName"] == entity)
            & (df_metadata["OptionSetName"] == optionsetname)
        )
        .withColumnRenamed("LocalizedLabel", label_col)
    )

    # Entity and option set are already fixed by the filter above, so the join
    # only has to match the option value. optionsetname doubles as the name of
    # the lookup column in the entity's DataFrame.
    cond = [df_metadata_filtered["Option"] == df[optionsetname]]

    # Left join keeps every row of df; only the label column is taken from the
    # metadata side.
    return (
        df
        .join(df_metadata_filtered, cond, "left")
        .select(df["*"], df_metadata_filtered[label_col])
    )


## The following 3 cells are for testing, using 2 different metadata sets

In [6]:
# First test join: starts from the raw opportunity data and looks up the
# "budgetstatus" labels in the global option-set metadata.
df_joined = join_with_metadata(
    df=df_opportunity,
    df_metadata=df_globaloptionset,
    entity="opportunity",
    optionsetname="budgetstatus",
)
# df_joined.show()

In [7]:
# Second test join: continues from the previous result and looks up the
# "dow_channelsource" labels in the global option-set metadata.
df_joined = join_with_metadata(
    df=df_joined,
    df_metadata=df_globaloptionset,
    entity="opportunity",
    optionsetname="dow_channelsource",
)
# df_joined.show()

In [8]:
# Third test join: same DataFrame, but the "dow_status" labels come from the
# other metadata table (df_optionset).
df_joined = join_with_metadata(
    df=df_joined,
    df_metadata=df_optionset,
    entity="opportunity",
    optionsetname="dow_status",
)

# Only print the rows that actually carry a dow_status value.
df_joined.where(df_joined["dow_status"].isNotNull()).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------------------------+-----------------------------------------+------------------------------------------+---------------------+----------+--------------------+--------------------+--------------------+---------------------+----------------------+--------------------+-------------------+--------------+-----------------------+--------------------+-------------------+---------------------+--------------------+----------------+--------------------+------------------------+--------------------------+--------------------+-----------------------+------------------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+-------------------------+--------------------+----------+--------------------------------------+--------------------+-------

In [20]:
# Opportunity columns whose option values are looked up in the global
# option-set metadata.
join_cols = [
    "dow_opportunitycategory",
    "dow_previouscategory",
    "dow_source",
    "dow_valuechainlocation",
    "dow_sustainability",
    "dow_sustainabilitysubcategory",
]

# One join per column; df_opportunity is reassigned on every pass so the label
# columns accumulate on the same DataFrame.
for col in join_cols:
    df_opportunity = join_with_metadata(
        df=df_opportunity,
        df_metadata=df_globaloptionset,
        entity="opportunity",
        optionsetname=col,
    )

df_opportunity.show()

+--------------------+--------------------+--------------------+---------+----------+------------+--------------------------------------+-----------------------------------------+------------------------------------------+---------------------+------+-------------------+-----------------+--------------+---------------------+----------------------+----------------+-------------------+--------------+-----------------------+------------------+-------------------+---------------------+------------------+----------------+------------------+------------------------+--------------------------+--------------+-----------------------+------------------+-------------------+----------------------+----------------+--------------------+------------------+---------------+-----------------+---------------+-------------------------+---------------+----------+--------------------------------------+--------------------+---------------+---------------------------+------------------+----------+----------+-