### Mounting the container into Databricks

In [0]:
# Install required packages
%pip install azure-identity azure-keyvault-secrets

Collecting azure-identity
  Obtaining dependency information for azure-identity from https://files.pythonhosted.org/packages/f0/d5/3995ed12f941f4a41a273d9b1709282e825ef87ed8eab3833038fee54d59/azure_identity-1.19.0-py3-none-any.whl.metadata
  Using cached azure_identity-1.19.0-py3-none-any.whl.metadata (80 kB)
Collecting azure-keyvault-secrets
  Obtaining dependency information for azure-keyvault-secrets from https://files.pythonhosted.org/packages/bf/ad/e5dd4c09ed80196b1b35f107502b12e32d06eb2d965adf4673df0d5cf85e/azure_keyvault_secrets-4.9.0-py3-none-any.whl.metadata
  Using cached azure_keyvault_secrets-4.9.0-py3-none-any.whl.metadata (29 kB)
Collecting azure-core>=1.31.0 (from azure-identity)
  Obtaining dependency information for azure-core>=1.31.0 from https://files.pythonhosted.org/packages/39/83/325bf5e02504dbd8b4faa98197a44cdf8a325ef259b48326a2b6f17f8383/azure_core-1.32.0-py3-none-any.whl.metadata
  Using cached azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting msal

In [0]:
# Restart the Python interpreter so that newly installed packages are loaded
%restart_python

In [0]:
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

# Key Vault configuration
key_vault_url = "https://Olist-Key.vault.azure.net/"
credential = DefaultAzureCredential()
client = SecretClient(vault_url=key_vault_url, credential=credential)

# Retrieve the service-principal credentials from Key Vault
# (never hard-code these values in the notebook)
client_id = client.get_secret("olist-client-id").value
client_secret = client.get_secret("olist-client-secret").value
tenant_id = client.get_secret("olist-tenant-id").value

# Unmount the existing mount point only if it exists.
# An unconditional unmount fails when nothing is mounted there, which would
# break this cell on a fresh workspace.
mount_point = "/mnt/olist-store-data"
if any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

# OAuth (client-credentials) configuration for the ABFS driver
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
}

# Mount the storage container (kept as the last expression so the cell
# displays the value returned by dbutils.fs.mount)
dbutils.fs.mount(
    source="abfss://olist-store-data@olistbrdata.dfs.core.windows.net",
    mount_point=mount_point,
    extra_configs=configs
)

/mnt/olist-store-data has been unmounted.


True

### Check mounting of the storage-account container

In [0]:
# Check that the mount succeeded by listing the contents of the mount root
dbutils.fs.ls("/mnt/olist-store-data")

[FileInfo(path='dbfs:/mnt/olist-store-data/raw-data/', name='raw-data/', size=0, modificationTime=1735461319000),
 FileInfo(path='dbfs:/mnt/olist-store-data/ready-data/', name='ready-data/', size=0, modificationTime=1735792345000),
 FileInfo(path='dbfs:/mnt/olist-store-data/test-upload/', name='test-upload/', size=0, modificationTime=1736860622000),
 FileInfo(path='dbfs:/mnt/olist-store-data/transformed-data/', name='transformed-data/', size=0, modificationTime=1735461344000)]

### Read sellers dataset from raw-data folder

In [0]:
# Read the raw sellers CSV: the first row is the header and column types are inferred
sellers = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/mnt/olist-store-data/raw-data/olist_sellers_dataset.csv")
)

In [0]:
# Plain-text preview of the first 10 rows
sellers.show(10)
# Column names and the types produced by schema inference
sellers.printSchema()
# The same 10-row preview through the notebook's display() renderer
display(sellers.limit(10))

+--------------------+----------------------+-----------------+------------+
|           seller_id|seller_zip_code_prefix|      seller_city|seller_state|
+--------------------+----------------------+-----------------+------------+
|3442f8959a84dea7e...|                 13023|         campinas|          SP|
|d1b65fc7debc3361e...|                 13844|       mogi guacu|          SP|
|ce3ad9de960102d06...|                 20031|   rio de janeiro|          RJ|
|c0f3eea2e14555b6f...|                  4195|        sao paulo|          SP|
|51a04a8a6bdcb23de...|                 12914|braganca paulista|          SP|
|c240c4061717ac180...|                 20920|   rio de janeiro|          RJ|
|e49c26c3edfa46d22...|                 55325|           brejao|          PE|
|1b938a7ec6ac5061a...|                 16304|        penapolis|          SP|
|768a86e36ad6aae3d...|                  1529|        sao paulo|          SP|
|ccc4bbb5f32a6ab2b...|                 80310|         curitiba|          PR|

seller_id,seller_zip_code_prefix,seller_city,seller_state
3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
c240c4061717ac1806ae6ee72be3533b,20920,rio de janeiro,RJ
e49c26c3edfa46d227d5121a6b6e4d37,55325,brejao,PE
1b938a7ec6ac5061a66a3766e0e75f90,16304,penapolis,SP
768a86e36ad6aae3d03ee3c6433d61df,1529,sao paulo,SP
ccc4bbb5f32a6ab2b7066a4130f114e3,80310,curitiba,PR


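### Cast seller_zip_code_prefix back to a string<br>
When Databricks reads the source file with schema inference, `seller_zip_code_prefix` is automatically turned into an integer, which drops leading zeros. Cast the column to a string and left-pad it with "0" to five characters to restore them.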

In [0]:
from pyspark.sql.functions import col, lpad

# Schema inference parsed the zip prefix as an integer; convert it back to
# text and left-pad with zeros to a fixed width of five characters.
sellers = sellers.withColumn(
    "seller_zip_code_prefix",
    lpad(sellers["seller_zip_code_prefix"].cast("string"), 5, "0"),
)

sellers.show(n=10)

+--------------------+----------------------+-----------------+------------+
|           seller_id|seller_zip_code_prefix|      seller_city|seller_state|
+--------------------+----------------------+-----------------+------------+
|3442f8959a84dea7e...|                 13023|         campinas|          SP|
|d1b65fc7debc3361e...|                 13844|       mogi guacu|          SP|
|ce3ad9de960102d06...|                 20031|   rio de janeiro|          RJ|
|c0f3eea2e14555b6f...|                 04195|        sao paulo|          SP|
|51a04a8a6bdcb23de...|                 12914|braganca paulista|          SP|
|c240c4061717ac180...|                 20920|   rio de janeiro|          RJ|
|e49c26c3edfa46d22...|                 55325|           brejao|          PE|
|1b938a7ec6ac5061a...|                 16304|        penapolis|          SP|
|768a86e36ad6aae3d...|                 01529|        sao paulo|          SP|
|ccc4bbb5f32a6ab2b...|                 80310|         curitiba|          PR|

### Data cleaning for sellers dataset
- **Step 1: Data Loading and Initial Analysis**<br>
The script begins by loading a CSV file containing seller data into a PySpark DataFrame. It then performs an initial analysis, providing information about:
  - Total number of records
  - Number of columns
  - Unique cities and states
  - Missing values analysis

- **Step 2: Data Cleaning Process**<br>
The cleaning process involves several steps:
  - Basic String Cleaning: Trims whitespace from string columns.
  - Enhanced City Name Cleaning:
    - Sets city names that contain digits or "@" to null, and strips any remaining non-letter characters
    - Standardizes city names by keeping only the part before a "/", "," or "-" delimiter
    - Performs character standardization (e.g., replacing "são" with "sao")
  - State Name Standardization: Converts state names to uppercase.
  - Metrics Calculation:
    - Calculates sellers per state and city:<br>
      a. **Seller Density**<br>
      Categorizes cities based on the number of sellers:
      ```
      - High: >= 100 sellers
      - Medium: >= 20 sellers
      - Low: < 20 sellers
      ```
      b. **City Size**<br>
      Classifies cities based on the number of sellers:
      ```
      - Major City: >= 500 sellers
      - Large City: >= 100 sellers
      - Medium City: >= 50 sellers
      - Small City: < 50 sellers
      ```
    - Computes state market share percentages:<br>
    a. **Sellers per State**<br>The "sellers_in_state" column shows the number of sellers in each state. Some key observations:<br>
      ```
      - The state with the highest number of sellers has 1,849 sellers.
      - Several states have very few sellers, with some having only 1 or 2.
      - There's a wide range in the number of sellers across states, indicating a highly concentrated market in certain areas.
      ```
    b. **State Market Share**<br>The "state_market_share" column represents the percentage of total sellers in each state. Notable points:<br>
      ```
      - The highest market share is 59.74%, corresponding to the state with 1,849 sellers.
      - Many states have very small market shares, below 1%.
      - The market shares directly correlate with the number of sellers in each state.
      ```
  - Geographic Classification:<br>
    - **Metropolitan Area**<br>Classifies sellers based on their location:
      ```
      - Sao Paulo Metro
      - Rio Metro
      - BH Metro
      - Curitiba Metro
      - Porto Alegre Metro
      - Other
      ```
    - **Seller Region**<br>This classification categorizes sellers into broader geographical regions based on their state. Here's how the regions are defined:
      ```
      - Southeast: SP, RJ, MG, ES
      - South: PR, RS, SC
      - Central-West: MT, MS, GO, DF
      - Northeast: BA, PE, CE, PB, MA, RN, AL, PI, SE
      - North: PA, AM, RO, AP, AC, RR, TO
      - Unknown: Any other state
      ```
  - Market Analysis:
    - **Seller Density**<br>This classification categorizes cities based on the concentration of sellers, providing insights into the competitiveness and market saturation of different locations:
      ```
      - High: Cities with 100 or more sellers (>= 100 sellers)
      - Medium: Cities with 20 to 99 sellers (>= 20 sellers)
      - Low: Cities with fewer than 20 sellers (< 20 sellers)
      ```
    - **Market Proximity**<br>Categorizes sellers based on their proximity to core markets:
      ```
      - Core Market: Located in major metropolitan areas
      - Near Market: In SP, RJ, or MG states but outside metropolitan areas
      - Remote Market: All other locations
      ```
    - **Business Potential**<br>Assesses areas based on their growth potential:
      ```
      - High Growth: Core Market with High seller density
      - Medium Growth: Near Market with High or Medium seller density
      - Stable: All other combinations
      ```

- **Step 3: Data Analysis and Reporting**<br>
The script generates various analytical reports:
  - State-level analysis
  - City size distribution
  - Metropolitan area market share
  - Top 10 cities with market analysis
  - Market proximity distribution
  - Business potential distribution

In [0]:
# This script processes the sellers dataset to clean, standardize, and enrich the data
# with additional metrics and classifications for business intelligence purposes.

# Import libraries
from pyspark.sql.functions import (
    col, sum, count, when, upper, initcap, 
    length, regexp_replace, trim, round, avg,
    split, expr, desc, lower
)
from pyspark.sql.types import StringType
import pyspark.sql.functions as F
from functools import reduce

def clean_sellers_dataset(spark, geolocation=None):
    """
    Clean, standardize and enrich the Olist sellers dataset.

    The function performs the following major operations:
    1. Loads the raw sellers CSV and prints initial statistics
       (row/column counts, distinct cities/states, missing values, duplicate ids)
    2. Cleans and standardizes the city, state and zip-prefix columns
    3. Adds geographic classifications (metropolitan area, region)
    4. Adds market metrics (sellers per state/city, market share, city size,
       seller density, market proximity, business potential)
    5. Prints a series of analytical reports and a sample of the result

    Args:
        spark: SparkSession object used to read the raw CSV
        geolocation: Optional geolocation data. Currently unused by this
            function; kept so existing callers are unaffected.

    Returns:
        DataFrame: Cleaned and enriched sellers dataset

    Raises:
        Exception: Any error raised during processing is printed and re-raised.
    """
    try:
        # Step 1: Data Loading and Initial Analysis
        print("Loading sellers dataset...")
        sellers = spark.read.format("csv").option("header","true").option("inferSchema","true")\
            .load("/mnt/olist-store-data/raw-data/olist_sellers_dataset.csv")

        # Start the cleaning pipeline from the raw frame. This is just a second
        # reference: every transformation below returns a new DataFrame, so
        # `sellers` itself is never modified.
        cleaned_sellers = sellers

        # Calculate initial dataset statistics for later comparison
        initial_count = sellers.count()
        initial_cities = sellers.select("seller_city").distinct().count()
        initial_states = sellers.select("seller_state").distinct().count()

        # Display initial dataset metrics
        print("\nInitial dataset information:")
        print(f"Number of records: {initial_count:,}")
        print(f"Number of columns: {len(sellers.columns)}")
        print(f"Number of unique cities: {initial_cities:,}")
        print(f"Number of unique states: {initial_states:,}")

        # Analyze missing values across all columns
        # (`sum` here is pyspark.sql.functions.sum, imported at the top of the cell)
        print("\nMissing values analysis:")
        missing_values = sellers.select([
            sum(col(c).isNull().cast("int")).alias(c) for c in sellers.columns
        ])

        # Collect the single-row result once instead of re-running the
        # aggregation for every column.
        missing_row = missing_values.collect()[0]

        # Display missing value statistics for each column
        for column in sellers.columns:
            missing_count = missing_row[column]
            missing_percentage = (missing_count / initial_count) * 100
            print(f"{column}: {missing_count:,} missing values ({missing_percentage:.2f}%)")

        print("\nMissing values count:")
        missing_values.show()

        # Check for duplicate seller IDs
        print("\nChecking for duplicate records...")
        seller_id_duplicates = sellers.groupBy("seller_id").count().filter(col("count") > 1)
        seller_id_duplicate_count = seller_id_duplicates.count()
        seller_id_duplicate_percentage = (seller_id_duplicate_count / initial_count) * 100
        print(f"Number of duplicate seller_ids: {seller_id_duplicate_count:,} ({seller_id_duplicate_percentage:.2f}%)")

        # Step 2: Data Cleaning Process
        print("\nStarting data cleaning process...")

        # Basic string cleaning: trim whitespace from the string columns
        string_columns = ["seller_id", "seller_city", "seller_state"]
        for column in string_columns:
            cleaned_sellers = cleaned_sellers.withColumn(
                column,
                trim(col(column))
            )

        # Enhanced city name cleaning. Rules are evaluated in order and the
        # first match wins:
        #   - names containing digits or "@" become null
        #   - names containing "/", "," or "-" keep only the text before it
        cleaned_sellers = cleaned_sellers.withColumn(
            "seller_city",
            when(col("seller_city").rlike("\\d+"), None)
            .when(col("seller_city").rlike("@"), None)
            .when(col("seller_city").rlike("/"), split(col("seller_city"), "/").getItem(0))
            .when(col("seller_city").rlike(","), split(col("seller_city"), ",").getItem(0))
            .when(col("seller_city").rlike("-"), split(col("seller_city"), "-").getItem(0))
            .otherwise(col("seller_city"))
        )

        # Standardize character encoding and formatting.
        # The (pattern, replacement) rules are applied in this exact order to
        # the lower-cased, trimmed city name.
        city_rules = [
            ("são", "sao"),          # transliterate the common "são" prefix
            ("d['´`]", "d"),         # drop apostrophe variants after "d"
            ("\\s+", " "),           # collapse runs of whitespace
            ("[^a-z ]", ""),         # remove everything except a-z and space
                                     # (other accented letters are removed, not transliterated)
            ("^sp$", "sao paulo"),   # expand the bare abbreviation "sp"
        ]
        cleaned_sellers = cleaned_sellers.withColumn(
            "seller_city",
            reduce(
                lambda city, rule: regexp_replace(city, rule[0], rule[1]),
                city_rules,
                lower(trim(col("seller_city")))
            )
        )

        # Convert city names to proper case
        cleaned_sellers = cleaned_sellers.withColumn(
            "seller_city",
            initcap(col("seller_city"))
        )

        # Standardize state names to uppercase
        cleaned_sellers = cleaned_sellers.withColumn(
            "seller_state",
            upper(trim(col("seller_state")))
        )

        # Convert zip_code into a zero-padded, five-character string.
        # F.lpad is used because `lpad` is not among the names imported by
        # this cell; F is.
        cleaned_sellers = cleaned_sellers.withColumn(
            "seller_zip_code_prefix",
            F.lpad(col("seller_zip_code_prefix").cast("string"), 5, "0")
        )

        # Metrics Calculation
        # Calculate sellers per state
        sellers_per_state = cleaned_sellers.groupBy("seller_state").count()
        cleaned_sellers = cleaned_sellers.join(
            sellers_per_state.withColumnRenamed("count", "sellers_in_state"),
            "seller_state"
        )

        # Compute state market share percentages for each state
        # (relative to the initial record count)
        cleaned_sellers = cleaned_sellers.withColumn(
            "state_market_share",
            round(col("sellers_in_state") / initial_count * 100, 2)
        )

        # Classify metropolitan areas based on major city clusters
        cleaned_sellers = cleaned_sellers.withColumn(
            "metropolitan_area",
            when(col("seller_city").isin("Sao Paulo", "Guarulhos", "Santo Andre", "Osasco", "Barueri"), "Sao Paulo Metro")
            .when(col("seller_city").isin("Rio De Janeiro", "Niteroi", "Nova Iguacu", "Duque De Caxias"), "Rio Metro")
            .when(col("seller_city").isin("Belo Horizonte", "Contagem", "Betim"), "BH Metro")
            .when(col("seller_city").isin("Curitiba", "Sao Jose Dos Pinhais", "Colombo"), "Curitiba Metro")
            .when(col("seller_city").isin("Porto Alegre", "Canoas", "Novo Hamburgo"), "Porto Alegre Metro")
            .otherwise("Other")
        )

        # Classify sellers by geographic region
        cleaned_sellers = cleaned_sellers.withColumn(
            "seller_region",
            when(col("seller_state").isin("SP", "RJ", "MG", "ES"), "Southeast")
            .when(col("seller_state").isin("PR", "RS", "SC"), "South")
            .when(col("seller_state").isin("MT", "MS", "GO", "DF"), "Central-West")
            .when(col("seller_state").isin("BA", "PE", "CE", "PB", "MA", "RN", "AL", "PI", "SE"), "Northeast")
            .when(col("seller_state").isin("PA", "AM", "RO", "AP", "AC", "RR", "TO"), "North")
            .otherwise("Unknown")
        )

        # Calculate city-level metrics
        # NOTE(review): this is an inner join on seller_city, so rows whose
        # city was set to null by the cleaning rules above find no match and
        # drop out of the result, while sellers_in_state (computed earlier)
        # still counts them - confirm this is intended.
        sellers_per_city = cleaned_sellers.groupBy("seller_city").count()
        cleaned_sellers = cleaned_sellers.join(
            sellers_per_city.withColumnRenamed("count", "sellers_in_city"),
            "seller_city"
        )

        # Classify cities by size based on seller count
        cleaned_sellers = cleaned_sellers.withColumn(
            "city_size",
            when(col("sellers_in_city") >= 500, "Major City")
            .when(col("sellers_in_city") >= 100, "Large City")
            .when(col("sellers_in_city") >= 50, "Medium City")
            .otherwise("Small City")
        )

        # Classify cities by seller density (concentration of sellers)
        cleaned_sellers = cleaned_sellers.withColumn(
            "seller_density",
            when(col("sellers_in_city") >= 100, "High")
            .when(col("sellers_in_city") >= 20, "Medium")
            .otherwise("Low")
        )

        # Classify locations by market proximity
        cleaned_sellers = cleaned_sellers.withColumn(
            "market_proximity",
            when(col("metropolitan_area") != "Other", "Core Market")
            .when(
                (col("seller_state").isin("SP", "RJ", "MG")) &
                (col("metropolitan_area") == "Other"),
                "Near Market"
            )
            .otherwise("Remote Market")
        )

        # Classify areas by business growth potential
        cleaned_sellers = cleaned_sellers.withColumn(
            "business_potential",
            when(
                (col("market_proximity") == "Core Market") &
                (col("seller_density") == "High"),
                "High Growth"
            )
            .when(
                (col("market_proximity") == "Near Market") &
                (col("seller_density").isin("High", "Medium")),
                "Medium Growth"
            )
            .otherwise("Stable")
        )

        # Calculate final dataset metrics
        final_count = cleaned_sellers.count()
        final_cities = cleaned_sellers.select("seller_city").distinct().count()

        # Step 3: Data Analysis and Reporting
        print("\nState-level Analysis:")
        cleaned_sellers.groupBy("seller_state")\
            .agg(
                count("*").alias("sellers"),
                round(count("*") / initial_count * 100, 2).alias("market_share_pct")
            )\
            .orderBy(desc("sellers"))\
            .show()

        print("\nCity Size Distribution:")
        cleaned_sellers.groupBy("city_size").count()\
            .withColumn("percentage", round(col("count") / final_count * 100, 2))\
            .orderBy("count", ascending=False).show()

        print("\nMetropolitan Area Market Share:")
        cleaned_sellers.groupBy("metropolitan_area")\
            .agg(
                count("*").alias("sellers"),
                round(count("*") / final_count * 100, 2).alias("market_share_pct"),
                round(avg("sellers_in_city"), 2).alias("avg_sellers_per_city")
            )\
            .orderBy(desc("sellers"))\
            .show()

        print("\nTop 10 Cities with Market Analysis:")
        cleaned_sellers.groupBy("seller_city", "seller_state", "metropolitan_area", "city_size")\
            .agg(
                count("*").alias("sellers"),
                round(count("*") / final_count * 100, 2).alias("market_share_pct")
            )\
            .orderBy(desc("sellers"))\
            .show(10)

        print("\nFinal Cleaning Summary:")
        print(f"Original record count: {initial_count:,}")
        print(f"Cleaned record count: {final_count:,}")
        print(f"Records affected: {abs(initial_count - final_count):,}")
        print(f"Data retention rate: {(final_count / initial_count) * 100:.2f}%")

        print("\nCity Consolidation:")
        print(f"Original cities: {initial_cities:,}")
        print(f"Final unique cities: {final_cities:,}")
        print(f"Cities consolidated: {initial_cities - final_cities:,}")
        print(f"Consolidation rate: {((initial_cities - final_cities) / initial_cities) * 100:.2f}%")

        print("\nCleaned Dataset Schema:")
        cleaned_sellers.printSchema()

        # Print market proximity analysis
        print("\nMarket Proximity Distribution:")
        cleaned_sellers.groupBy("market_proximity")\
            .agg(
                count("*").alias("sellers"),
                round(count("*") / final_count * 100, 2).alias("percentage"),
                round(avg("sellers_in_city"), 2).alias("avg_sellers_per_city")
            )\
            .orderBy(desc("sellers"))\
            .show()

        print("\nMarket Proximity by Region:")
        cleaned_sellers.groupBy("market_proximity", "seller_region")\
            .count()\
            .withColumn("percentage", round(col("count") / final_count * 100, 2))\
            .orderBy(desc("count"))\
            .show()

        print("\nBusiness Potential Distribution:")
        cleaned_sellers.groupBy("business_potential")\
            .count()\
            .withColumn("percentage", round(col("count") / final_count * 100, 2))\
            .orderBy(desc("count"))\
            .show()

        print("\nBusiness Potential by Region:")
        cleaned_sellers.groupBy("business_potential", "seller_region")\
            .count()\
            .withColumn("percentage", round(col("count") / final_count * 100, 2))\
            .orderBy(desc("count"))\
            .show()

        print("\nSample of Cleaned Data with New Metrics:")
        cleaned_sellers.select(
            "seller_id", "seller_city", "seller_state",
            "metropolitan_area", "market_proximity", "city_size",
            "seller_density", "state_market_share", "business_potential"
        ).show(5)

        # Display final result
        display(cleaned_sellers.limit(10))

        print("\nCleaning process completed.")
        return cleaned_sellers

    except Exception as e:
        # Log the failure for the notebook reader, then let it propagate
        print(f"\nError in data cleaning process: {str(e)}")
        raise

# Execute the cleaning process
if __name__ == "__main__":
    try:
        cleaned_sellers = clean_sellers_dataset(spark)
    except Exception as e:
        print(f"Failed to clean sellers dataset: {str(e)}")
        # Re-raise so the notebook stops here: the following cells depend on
        # `cleaned_sellers` and must not run against a missing or stale value.
        raise

Loading sellers dataset...

Initial dataset information:
Number of records: 3,095
Number of columns: 4
Number of unique cities: 611
Number of unique states: 23

Missing values analysis:
seller_id: 0 missing values (0.00%)
seller_zip_code_prefix: 0 missing values (0.00%)
seller_city: 0 missing values (0.00%)
seller_state: 0 missing values (0.00%)

Missing values count:
+---------+----------------------+-----------+------------+
|seller_id|seller_zip_code_prefix|seller_city|seller_state|
+---------+----------------------+-----------+------------+
|        0|                     0|          0|           0|
+---------+----------------------+-----------+------------+


Checking for duplicate records...
Number of duplicate seller_ids: 0 (0.00%)

Starting data cleaning process...

State-level Analysis:
+------------+-------+----------------+
|seller_state|sellers|market_share_pct|
+------------+-------+----------------+
|          SP|   1849|           59.74|
|          PR|    348|           

seller_city,seller_state,seller_id,seller_zip_code_prefix,sellers_in_state,state_market_share,metropolitan_area,seller_region,sellers_in_city,city_size,seller_density,market_proximity,business_potential
Campinas,SP,3442f8959a84dea7ee197c632cb2df15,13023,1849,59.74,Other,Southeast,41,Small City,Medium,Near Market,Medium Growth
Mogi Guacu,SP,d1b65fc7debc3361ea86b5f14c68d2e2,13844,1849,59.74,Other,Southeast,8,Small City,Low,Near Market,Stable
Rio De Janeiro,RJ,ce3ad9de960102d0677a81f5d0bb7b2d,20031,171,5.53,Rio Metro,Southeast,98,Medium City,Medium,Core Market,Stable
Sao Paulo,SP,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,1849,59.74,Sao Paulo Metro,Southeast,705,Major City,High,Core Market,High Growth
Braganca Paulista,SP,51a04a8a6bdcb23deccc82b0b80742cf,12914,1849,59.74,Other,Southeast,5,Small City,Low,Near Market,Stable
Rio De Janeiro,RJ,c240c4061717ac1806ae6ee72be3533b,20920,171,5.53,Rio Metro,Southeast,98,Medium City,Medium,Core Market,Stable
Brejao,PE,e49c26c3edfa46d227d5121a6b6e4d37,55325,9,0.29,Other,Northeast,1,Small City,Low,Remote Market,Stable
Penapolis,SP,1b938a7ec6ac5061a66a3766e0e75f90,16304,1849,59.74,Other,Southeast,5,Small City,Low,Near Market,Stable
Sao Paulo,SP,768a86e36ad6aae3d03ee3c6433d61df,1529,1849,59.74,Sao Paulo Metro,Southeast,705,Major City,High,Core Market,High Growth
Curitiba,PR,ccc4bbb5f32a6ab2b7066a4130f114e3,80310,349,11.28,Curitiba Metro,South,127,Large City,High,Core Market,High Growth



Cleaning process completed.


### Review the cleaned_sellers_withCalculations dataset

In [0]:
# Verify final schema of the enriched DataFrame returned by clean_sellers_dataset
print("\nFinal Schema:")
cleaned_sellers.printSchema()

# Show sample of final dataset (truncate=False prints full cell values)
print("\nSample of final cleaned dataset:")
cleaned_sellers.show(5, truncate=False)

# Print final record count
print(f"\nTotal records in cleaned dataset: {cleaned_sellers.count():,}")


Final Schema:
root
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: string (nullable = true)
 |-- sellers_in_state: long (nullable = false)
 |-- state_market_share: double (nullable = true)
 |-- metropolitan_area: string (nullable = false)
 |-- seller_region: string (nullable = false)
 |-- sellers_in_city: long (nullable = false)
 |-- city_size: string (nullable = false)
 |-- seller_density: string (nullable = false)
 |-- market_proximity: string (nullable = false)
 |-- business_potential: string (nullable = false)


Sample of final cleaned dataset:
+-----------+------------+--------------------------------+----------------------+----------------+------------------+-----------------+-------------+---------------+----------+--------------+----------------+------------------+
|seller_city|seller_state|seller_id                       |seller_zip_code_prefix|sellers_in_state|state_m

### Save the cleaned_sellers_withCalculations dataset as a Parquet file

In [0]:
# Define the output path for the cleaned sellers dataset with calculated metrics
output_path = "/mnt/olist-store-data/transformed-data/olist_sellers_cleaned_dataset_withCalculations.parquet"
temp_path = "/mnt/olist-store-data/transformed-data/temp_parquet_output"

try:
    # Start from an empty temporary directory
    dbutils.fs.rm(temp_path, recurse=True)

    # Write to the temporary directory first, as a single partition so that
    # Spark produces exactly one part file
    (cleaned_sellers
     .repartition(1)  # Force to a single partition
     .write
     .mode("overwrite")
     .parquet(temp_path))

    # Find the Parquet part file in the temp directory
    temp_files = dbutils.fs.ls(temp_path)
    parquet_files = [f.path for f in temp_files if f.path.endswith(".parquet")]
    if not parquet_files:
        raise FileNotFoundError(f"No Parquet part file found in {temp_path}")
    parquet_file = parquet_files[0]

    # Replace the previous output only now that the new file exists, so a
    # failed write does not destroy the last good copy
    dbutils.fs.rm(output_path, recurse=True)
    dbutils.fs.mv(parquet_file, output_path)

    # Clean up temp directory
    dbutils.fs.rm(temp_path, recurse=True)

    # Verify the saved Parquet file by reading it back
    verified_df = spark.read.parquet(output_path)
    print("\nVerification of saved Parquet file:")
    print(f"Number of rows in saved Parquet file: {verified_df.count():,}")
    print("\nSample of saved data:")
    display(verified_df.limit(5))

    # Verify it's a single file
    if len(dbutils.fs.ls(output_path)) == 1:
        print("\nSuccessfully saved as a single Parquet file.")
    else:
        print("\nWarning: Multiple files were created.")

except Exception as e:
    print(f"Error saving dataset: {str(e)}")
    # Clean up temp directory in case of failure
    dbutils.fs.rm(temp_path, recurse=True)
    raise
finally:
    # Release any cached data held for this DataFrame
    # (nothing in this notebook caches it, so this is expected to be harmless)
    cleaned_sellers.unpersist()


Verification of saved Parquet file:
Number of rows in saved Parquet file: 3,093

Sample of saved data:


seller_city,seller_state,seller_id,seller_zip_code_prefix,sellers_in_state,state_market_share,metropolitan_area,seller_region,sellers_in_city,city_size,seller_density,market_proximity,business_potential
Palhoca,SC,f8201cab383e484733266d1906e2fdfa,88137,190,6.14,Other,South,8,Small City,Low,Remote Market,Stable
Ji Parana,RO,a5259c149128e82c9d6d46e0c1c812bb,76900,2,0.06,Other,North,1,Small City,Low,Remote Market,Stable
Teresina,PI,47efca563408aae19bb7206c2d969ea9,64033,1,0.03,Other,Northeast,1,Small City,Low,Remote Market,Stable
Manaus,AM,327b89b872c14d1c0be7235ef4871685,69005,1,0.03,Other,North,1,Small City,Low,Remote Market,Stable
Goiania,GO,39f776d2974049026ff531fc42ef2a3e,74835,40,1.29,Other,Central-West,23,Small City,Medium,Remote Market,Stable



Successfully saved as a single Parquet file.


### Review the cleaned_sellers dataset with selected columns

In [0]:
from pyspark.sql.functions import col

def preview_sellers(
    spark,
    source_path="/mnt/olist-store-data/transformed-data/olist_sellers_cleaned_dataset_withCalculations.parquet",
):
    """
    Load the cleaned sellers dataset and preview its four core columns.

    Prints the schema, displays a five-row sample and prints the record count
    of the selected columns.

    Args:
        spark: SparkSession object used to read the Parquet data
        source_path: Location of the cleaned sellers Parquet data. Defaults to
            the file containing the calculated metrics.

    Returns:
        DataFrame: seller_id, seller_zip_code_prefix, seller_city and
        seller_state from the cleaned dataset
    """
    # Load the existing cleaned sellers dataset
    cleaned_sellers = spark.read.parquet(source_path)

    # Create DataFrame with selected columns
    cleaned_selected_sellers = cleaned_sellers.select(
        "seller_id",
        "seller_zip_code_prefix",
        "seller_city",
        "seller_state",
    )

    # Verify final schema
    print("\nFinal Schema:")
    cleaned_selected_sellers.printSchema()

    # Show sample of final dataset
    print("\nSample of final cleaned dataset:")
    display(cleaned_selected_sellers.limit(5))

    # Print final record count
    print(f"\nTotal records in cleaned dataset: {cleaned_selected_sellers.count():,}")

    return cleaned_selected_sellers

# Call the function with the spark session and keep the four-column result
# in `selected_sellers`
selected_sellers = preview_sellers(spark)


Final Schema:
root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: string (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)


Sample of final cleaned dataset:


seller_id,seller_zip_code_prefix,seller_city,seller_state
f8201cab383e484733266d1906e2fdfa,88137,Palhoca,SC
a5259c149128e82c9d6d46e0c1c812bb,76900,Ji Parana,RO
47efca563408aae19bb7206c2d969ea9,64033,Teresina,PI
327b89b872c14d1c0be7235ef4871685,69005,Manaus,AM
39f776d2974049026ff531fc42ef2a3e,74835,Goiania,GO



Total records in cleaned dataset: 3,093


### Save the cleaned_sellers dataset with selected columns as a Parquet file

In [0]:
# Define the output path for the cleaned sellers dataset (selected columns only)
output_path = "/mnt/olist-store-data/transformed-data/olist_sellers_cleaned_dataset_v2.0.parquet"
temp_path = "/mnt/olist-store-data/transformed-data/temp_sellers_parquet"

try:
    # Start from an empty temporary directory
    dbutils.fs.rm(temp_path, recurse=True)

    # Write to the temporary directory first, as a single partition so that
    # Spark produces exactly one part file
    (selected_sellers
     .repartition(1)  # Force to a single partition
     .write
     .mode("overwrite")
     .parquet(temp_path))

    # Find the Parquet part file in the temp directory
    temp_files = dbutils.fs.ls(temp_path)
    parquet_files = [f.path for f in temp_files if f.path.endswith(".parquet")]
    if not parquet_files:
        raise FileNotFoundError(f"No Parquet part file found in {temp_path}")
    parquet_file = parquet_files[0]

    # Replace the previous output only now that the new file exists, so a
    # failed write does not destroy the last good copy
    dbutils.fs.rm(output_path, recurse=True)
    dbutils.fs.mv(parquet_file, output_path)

    # Clean up temp directory
    dbutils.fs.rm(temp_path, recurse=True)

    # Verify the saved Parquet file by reading it back
    verified_df = spark.read.parquet(output_path)
    print("\nVerification of saved Parquet file:")
    print(f"Number of rows in saved Parquet file: {verified_df.count():,}")
    print("\nSample of saved data:")
    display(verified_df.limit(5))

    # Verify it's a single file
    if len(dbutils.fs.ls(output_path)) == 1:
        print("\nSuccessfully saved as a single Parquet file.")
    else:
        print("\nWarning: Multiple files were created.")

except Exception as e:
    print(f"Error saving dataset: {str(e)}")
    # Clean up temp directory in case of failure
    dbutils.fs.rm(temp_path, recurse=True)
    raise
finally:
    # Release any cached data held for this DataFrame
    # (nothing in this notebook caches it, so this is expected to be harmless)
    selected_sellers.unpersist()


Verification of saved Parquet file:
Number of rows in saved Parquet file: 3,093

Sample of saved data:


seller_id,seller_zip_code_prefix,seller_city,seller_state
f8201cab383e484733266d1906e2fdfa,88137,Palhoca,SC
a5259c149128e82c9d6d46e0c1c812bb,76900,Ji Parana,RO
47efca563408aae19bb7206c2d969ea9,64033,Teresina,PI
327b89b872c14d1c0be7235ef4871685,69005,Manaus,AM
39f776d2974049026ff531fc42ef2a3e,74835,Goiania,GO



Successfully saved as a single Parquet file.
