### Distinct Country name in Dropdown

In [0]:
# Read the users CSV (comma separated, first line is the header, column
# types inferred by Spark) and show the resulting DataFrame.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .csv("/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/user_dataset/users_002.csv")
)
display(df)

In [0]:
from pyspark.sql.functions import col

def get_unique_country_names(user_df):
    """Return the choices for the country dropdown.

    Args:
        user_df: Spark DataFrame that has a ``country`` column.

    Returns:
        list: ``"SELECT"`` followed by the distinct, non-null ``country``
        values in ascending order.
    """
    # Collect the Rows directly from the DataFrame rather than through
    # ``.rdd``: the values are the same, and it avoids the RDD API, which is
    # not available on every Databricks compute type (NOTE(review): e.g.
    # Unity Catalog standard/shared access mode - confirm for this cluster).
    distinct_rows = user_df.select("country").distinct().collect()

    # distinct() gives no ordering guarantee, so sort for a stable dropdown;
    # null countries are skipped because they are not a usable choice.
    country_names = sorted(
        row["country"] for row in distinct_rows if row["country"] is not None
    )
    return ["SELECT", *country_names]

# Preview the list that will be offered in the country dropdown.
get_unique_country_names(user_df=df)

### Populate Country Name

In [0]:
# Country dropdown: widget name, default value, then the list of choices.
dbutils.widgets.dropdown(
    "List of Countries",
    "SELECT",
    get_unique_country_names(df),
)

### Populate State Dropdown based on Country Selected

In [0]:
def get_region_names_for(Country, user_df):
    """Return the choices for the region dropdown of one country.

    Args:
        Country: Value to match against the ``country`` column.
        user_df: Spark DataFrame that has ``country`` and ``region`` columns.

    Returns:
        list: ``"SELECT"`` followed by the distinct, non-null ``region``
        values of the matching rows in ascending order. When no row matches
        (for example while the country widget is still on ``"SELECT"``) the
        list is just ``["SELECT"]``.
    """
    # Collect the Rows directly from the DataFrame rather than through
    # ``.rdd``: the values are the same, and it avoids the RDD API, which is
    # not available on every Databricks compute type (NOTE(review): e.g.
    # Unity Catalog standard/shared access mode - confirm for this cluster).
    distinct_rows = (
        user_df.filter(user_df["country"] == Country)
        .select("region")
        .distinct()
        .collect()
    )

    # distinct() gives no ordering guarantee, so sort for a stable dropdown;
    # null regions are skipped because they are not a usable choice.
    region_names = sorted(
        row["region"] for row in distinct_rows if row["region"] is not None
    )
    return ["SELECT", *region_names]

# Spot-check the helper with a fixed country.
get_region_names_for(Country="India", user_df=df)

In [0]:
# Read the country currently chosen in the widget and list its regions.
selected_country = dbutils.widgets.get("List of Countries")
regions = get_region_names_for(Country=selected_country, user_df=df)
print(
    f"Selected Country= {selected_country}, Regions= {regions}"
)

 
 
 

In [0]:
# Region dropdown: widget name, default value, then the regions of the
# country that was read from the country widget.
dbutils.widgets.dropdown(
    "region",
    "SELECT",
    get_region_names_for(selected_country, df),
)

## Analysis

In [0]:
from pyspark.sql.functions import count
# Refresh the two widget selections, then show the number of rows per country.
selected_country = dbutils.widgets.get("List of Countries")
selected_region = dbutils.widgets.get("region")
display(
    df.groupBy("country").agg(count("*"))
)

Databricks visualization. Run in Databricks to view.

In [0]:
def get_city_names_for(regions, user_df):
    """Return the choices for the city dropdown of one region.

    Args:
        regions: A single region name to match against the ``region``
            column (rows are matched on region only, not on country).
        user_df: Spark DataFrame that has ``region`` and ``city`` columns.

    Returns:
        list: ``"SELECT"`` followed by the distinct, non-null ``city``
        values of the matching rows in ascending order. When no row matches
        the list is just ``["SELECT"]``.
    """
    # Collect the Rows directly from the DataFrame rather than through
    # ``.rdd``: the values are the same, and it avoids the RDD API, which is
    # not available on every Databricks compute type (NOTE(review): e.g.
    # Unity Catalog standard/shared access mode - confirm for this cluster).
    distinct_rows = (
        user_df.filter(user_df["region"] == regions)
        .select("city")
        .distinct()
        .collect()
    )

    # distinct() gives no ordering guarantee, so sort for a stable dropdown;
    # null cities are skipped because they are not a usable choice.
    city_names = sorted(
        row["city"] for row in distinct_rows if row["city"] is not None
    )
    return ["SELECT", *city_names]

# Spot-check the helper with a fixed region.
get_city_names_for(regions="Lakshadweep", user_df=df)

In [0]:
# Read the region currently chosen in the widget and list its cities.
selected_region = dbutils.widgets.get("region")
allcity = get_city_names_for(regions=selected_region, user_df=df)
print(
    f"Selected Country= {selected_country}, Regions= {regions}, City= {allcity}"
)


In [0]:
# City dropdown: widget name, default value, then the cities of the region
# that was read from the region widget.
dbutils.widgets.dropdown(
    "cities",
    "SELECT",
    get_city_names_for(selected_region, df),
)

#### What percentage of users are male and female in the selected city?

In [0]:
# Read the current value of all three dropdown widgets in one go.
selected_country, selected_region, selected_city = (
    dbutils.widgets.get("List of Countries"),
    dbutils.widgets.get("region"),
    dbutils.widgets.get("cities"),
)


In [0]:
from pyspark.sql.functions import col, count
def calculate_percentage_for(country, region, city):
    """Return the share of each gender among the users of one city.

    Reads the notebook-level ``df`` and keeps the rows whose ``country``,
    ``region`` and ``city`` columns equal the given values.

    Args:
        country: Value to match against the ``country`` column.
        region: Value to match against the ``region`` column.
        city: Value to match against the ``city`` column.

    Returns:
        DataFrame with one row per ``gender`` and the columns ``gender``,
        ``count`` and ``percentage`` (``count`` as a percentage of all
        matching rows). When no row matches, the DataFrame is empty; the
        function always returns a DataFrame so the result can be passed to
        ``display`` without a type check.
    """
    filtered_records = df.filter(
        (col("country") == country) & (col("region") == region) & (col("city") == city)
    )
    total_count = filtered_records.count()

    gender_counts = filtered_records.groupBy("gender").agg(
        count("*").alias("count")
    )
    # No zero guard: the division is a Spark column expression evaluated per
    # row, and when ``total_count`` is 0 there are no rows to evaluate, so
    # the result is simply an empty DataFrame with the same columns.
    return gender_counts.withColumn(
        "percentage", (col("count") / total_count) * 100
    )

# Evaluate the gender split for the current widget selection.
calculate_percentage_for(
    country=selected_country,
    region=selected_region,
    city=selected_city,
)
    

In [0]:
# Render the gender split for the current widget selection.
display(
    calculate_percentage_for(
        country=selected_country,
        region=selected_region,
        city=selected_city,
    )
)

Databricks visualization. Run in Databricks to view.

### Trainer Code

In [0]:
from pyspark.sql.functions import col, count, round
 
def calculate_percentage_for(country, region, city=None):
    """Return the share of each gender among the users of a region or city.

    Reads the notebook-level ``df``. This definition replaces the earlier
    three-argument function of the same name, so ``city`` is accepted as an
    optional third argument to keep those earlier calls working.

    Args:
        country: Value to match against the ``country`` column.
        region: Value to match against the ``region`` column.
        city: Optional value to match against the ``city`` column; when
            ``None`` (the default) all cities of the region are included.

    Returns:
        DataFrame with one row per ``gender`` and the columns ``gender``,
        ``count`` and ``percentage`` (rounded to 2 decimals). It is empty
        when no row matches.
    """
    selection = (col("country") == country) & (col("region") == region)
    if city is not None:
        selection = selection & (col("city") == city)

    # The loaded DataFrame is named ``df`` in this notebook; ``user_df`` only
    # exists as a parameter name of the helper functions, so using it here
    # raised a NameError.
    filtered_records = df.filter(selection)
    total_records = filtered_records.count()

    # ``round`` is the pyspark.sql.functions version imported in this cell,
    # not the Python builtin.
    result = (filtered_records.groupBy("gender")
              .agg(count("*").alias("count"))
              .withColumn("percentage", round((col("count") / total_records) * 100, 2))
             )
    return result
 
# The country dropdown in this notebook is registered as "List of Countries";
# no widget named "countries" is created here, so reading that name fails.
result = calculate_percentage_for(
    dbutils.widgets.get("List of Countries"),
    dbutils.widgets.get("region"),
)
display(result.select("gender", "percentage"))

## Topics to learn 
  1. Learn about JOIN Using Dataframe
  2. FreeSQLDatabase -- Connect and read data 
