### Mounting the container into Databricks

In [0]:
# Install required packages
%pip install azure-identity azure-keyvault-secrets

Collecting azure-identity
  Obtaining dependency information for azure-identity from https://files.pythonhosted.org/packages/f0/d5/3995ed12f941f4a41a273d9b1709282e825ef87ed8eab3833038fee54d59/azure_identity-1.19.0-py3-none-any.whl.metadata
  Using cached azure_identity-1.19.0-py3-none-any.whl.metadata (80 kB)
Collecting azure-keyvault-secrets
  Obtaining dependency information for azure-keyvault-secrets from https://files.pythonhosted.org/packages/bf/ad/e5dd4c09ed80196b1b35f107502b12e32d06eb2d965adf4673df0d5cf85e/azure_keyvault_secrets-4.9.0-py3-none-any.whl.metadata
  Using cached azure_keyvault_secrets-4.9.0-py3-none-any.whl.metadata (29 kB)
Collecting azure-core>=1.31.0 (from azure-identity)
  Obtaining dependency information for azure-core>=1.31.0 from https://files.pythonhosted.org/packages/39/83/325bf5e02504dbd8b4faa98197a44cdf8a325ef259b48326a2b6f17f8383/azure_core-1.32.0-py3-none-any.whl.metadata
  Using cached azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting msal

In [0]:
# Restart the Python interpreter so the packages installed by the previous
# cell are loaded before the import cell below runs.
%restart_python

In [0]:
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

# Key Vault configuration
key_vault_url = "https://Olist-Key.vault.azure.net/"
credential = DefaultAzureCredential()
client = SecretClient(vault_url=key_vault_url, credential=credential)

# Retrieve the service-principal credentials from Key Vault
client_id = client.get_secret("olist-client-id").value
client_secret = client.get_secret("olist-client-secret").value
tenant_id = client.get_secret("olist-tenant-id").value

mount_point = "/mnt/olist-store-data"

# Unmount only when the mount point is currently mounted. An unconditional
# unmount fails on a cluster where the path has never been mounted, which
# would stop the notebook before it reaches the mount call.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

# OAuth client-credentials settings passed to the ABFS driver for this mount
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
}

# Mount the storage container (kept as the last expression so the cell
# displays the result of the mount call)
dbutils.fs.mount(
    source="abfss://olist-store-data@olistbrdata.dfs.core.windows.net",
    mount_point=mount_point,
    extra_configs=configs
)

/mnt/olist-store-data has been unmounted.


True

### Check mounting of the storage-account container

In [0]:
# List the top-level entries under the mount point; a successful listing
# confirms that the container is mounted and readable.
dbutils.fs.ls("/mnt/olist-store-data")

[FileInfo(path='dbfs:/mnt/olist-store-data/raw-data/', name='raw-data/', size=0, modificationTime=1735461319000),
 FileInfo(path='dbfs:/mnt/olist-store-data/ready-data/', name='ready-data/', size=0, modificationTime=1735792345000),
 FileInfo(path='dbfs:/mnt/olist-store-data/test-upload/', name='test-upload/', size=0, modificationTime=1736860622000),
 FileInfo(path='dbfs:/mnt/olist-store-data/transformed-data/', name='transformed-data/', size=0, modificationTime=1735461344000)]

### Read geolocation dataset from raw-data folder

In [0]:

# Load the raw geolocation CSV: first row is the header, column types are inferred
geolocation = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/mnt/olist-store-data/raw-data/olist_geolocation_dataset.csv")
)

In [0]:
# Inspect the inferred schema, then preview the rows as text and as a table
geolocation.printSchema()
geolocation.show(n=10)
display(geolocation)

root
 |-- geolocation_zip_code_prefix: integer (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)

+---------------------------+-------------------+------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|   geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+------------------+----------------+-----------------+
|                       1037| -23.54562128115268|-46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535|-46.64482029837157|       sao paulo|               SP|
|                       1046| -23.54612896641469|-46.64295148361138|       sao paulo|               SP|
|                       1041|  -23.5443921648681|-46.63949930627844|       sao paulo|               SP|
|              

geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1037,-23.54562128115268,-46.63929204800168,sao paulo,SP
1046,-23.54608112703553,-46.64482029837157,sao paulo,SP
1046,-23.54612896641469,-46.64295148361138,sao paulo,SP
1041,-23.5443921648681,-46.63949930627844,sao paulo,SP
1035,-23.541577961711493,-46.64160722329613,sao paulo,SP
1012,-23.547762303364262,-46.63536053788448,são paulo,SP
1047,-23.54627311241268,-46.64122516971552,sao paulo,SP
1013,-23.546923208436723,-46.6342636964915,sao paulo,SP
1029,-23.543769055769133,-46.63427784085132,sao paulo,SP
1011,-23.547639550320632,-46.63603162315495,sao paulo,SP


### Cast `geolocation_zip_code_prefix` back to a string<br>
Schema inference reads the zip code prefix as an integer, which drops its leading zero. Cast the column to a string and left-pad it with "0" to five characters.

In [0]:
from pyspark.sql.functions import col, lpad

# Turn the integer zip prefix into text and left-pad it to five characters
zip_prefix_text = col("geolocation_zip_code_prefix").cast("string")
geolocation = geolocation.withColumn(
    "geolocation_zip_code_prefix",
    lpad(zip_prefix_text, 5, "0"),
)

geolocation.show(n=10)

+---------------------------+-------------------+------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|   geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+------------------+----------------+-----------------+
|                      01037| -23.54562128115268|-46.63929204800168|       sao paulo|               SP|
|                      01046|-23.546081127035535|-46.64482029837157|       sao paulo|               SP|
|                      01046| -23.54612896641469|-46.64295148361138|       sao paulo|               SP|
|                      01041|  -23.5443921648681|-46.63949930627844|       sao paulo|               SP|
|                      01035|-23.541577961711493|-46.64160722329613|       sao paulo|               SP|
|                      01012|-23.547762303364266|-46.63536053788448|       são paulo|               SP|
|                      01047|-23.546273112412678|-46.64122516971

### Data cleaning for geolocation dataset<br>
- **Step 1: Basic cleaning**<br>
A user-defined function (UDF) called replace_char is created to perform initial cleaning:
  - Converts city names to lowercase and strips spaces.
  - Removes state abbreviations (e.g., '-sp')
  - Handles the special case of 'sp' (São Paulo)
  - Replaces accented characters with their non-accented equivalents
  - Handles specific patterns like "d'oeste" and hyphenated names

- **Step 2: City corrections**<br>
A dictionary city_corrections is defined with common city name corrections. This is used to standardize frequently occurring city names.

- **Step 3: Applying Cleaning and Corrections**<br>
  - The replace_char UDF is applied to create a new column geolocation_city_cleaned
  - City corrections are applied using a series of when conditions

- **Step 4: Standardization Using Mode**<br>
For remaining inconsistencies, the code uses the mode (most frequent city name) for each zip code prefix to further standardize city names.

- **Step 5: Proper Title Case**<br>
A UDF proper_title_case is applied to convert city names to proper title case, keeping certain words (like 'de', 'da', 'do') in lowercase when not at the start of the name.

- **Step 6: Data Quality Checks and Statistics**<br>
The code then performs various data quality checks and generates statistics:
  - Compares the number of unique cities before and after cleaning
  - Shows a sample of city name changes
  - Identifies problematic cities (those with non-alphabetic characters or very short names)
  - Calculates and displays data quality metrics

- **Step 7: Display Results**<br>
Finally, the code displays:
  - Top 10 cities by frequency
  - State distribution
  - A sample of 25 rows from the final cleaned DataFrame 

In [0]:
# This script cleans and standardizes city names in a geolocation dataset.
# It performs several steps including basic cleaning, applying corrections,
# standardizing using mode, and proper case formatting.

# Import libraries
from pyspark.sql.functions import col, lower, regexp_replace, udf, mode, when, length
from pyspark.sql.types import StringType, DecimalType
from pyspark.sql import Window

# Step 1: Basic cleaning
# Keep a single copy of rows that are identical across every column
geolocation = geolocation.distinct()

# Clean city names
@udf(StringType())
def replace_char(city_name):
    """
    Normalize a raw city name into a lowercase, accent-free form.

    Steps, in order:
      1. Lowercase and trim the value.
      2. Drop a trailing '-sp' state suffix (only at the end of the name, so
         a '-sp' sequence inside a name is left alone).
      3. Expand the bare abbreviation 'sp' to 'sao paulo'.
      4. Strip diacritics from every character.
      5. Turn "d'" into "d ", hyphens into spaces, and remove any other
         apostrophes.
      6. Collapse runs of whitespace into single spaces.

    Args:
        city_name: Raw city name, or None.

    Returns:
        The normalized name, or None when the input is None.
    """
    # Local import keeps the function self-contained; the module is stdlib.
    import unicodedata

    if city_name is None:
        return None

    # Convert to lowercase and strip surrounding spaces
    city_name = city_name.lower().strip()

    # Remove the state abbreviation only when it is a suffix, then trim any
    # space that was sitting between the city and the suffix
    state_suffix = '-sp'
    if city_name.endswith(state_suffix):
        city_name = city_name[:-len(state_suffix)].rstrip()

    # Handle special abbreviation
    if city_name == 'sp':
        return 'sao paulo'

    # Decompose accented characters into base letter + combining mark and
    # drop the marks; this covers every accented letter, not a fixed list
    decomposed = unicodedata.normalize('NFD', city_name)
    city_name = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Handle specific patterns
    city_name = city_name.replace("d'", "d ")  # Handle d'oeste, d'alianca patterns
    city_name = city_name.replace("-", " ")    # Handle hyphenated names
    city_name = city_name.replace("'", "")     # Remove any remaining apostrophes

    # The replacements above can leave double spaces; collapse them so that
    # spelling variants converge on the same string
    return ' '.join(city_name.split())

# Step 2: Define comprehensive city corrections
# Maps a normalized (lowercase, accent-free) city name to the canonical
# spelling to use for it. Keys are compared against the lowercased cleaned
# city column in Step 3, and are also used in Step 4 as the list of names
# that must not be overwritten by the per-zip-code mode.
city_corrections = {
    'sao paulo': 'Sao Paulo',
    'rio de janeiro': 'Rio de Janeiro',
    'belo horizonte': 'Belo Horizonte',
    'brasilia': 'Brasilia',
    'curitiba': 'Curitiba',
    'fortaleza': 'Fortaleza',
    'salvador': 'Salvador',
    'porto alegre': 'Porto Alegre',
    'guarulhos': 'Guarulhos',
    'campinas': 'Campinas',
    'sao bernardo do campo': 'Sao Bernardo do Campo',
    'santo andre': 'Santo Andre',
    'osasco': 'Osasco',
    'jundiai': 'Jundiai',
    'sao caetano do sul': 'Sao Caetano do Sul',
    'mogi das cruzes': 'Mogi das Cruzes',
    # The only entry that renames rather than re-cases: bare 'embu' is
    # mapped to the same value as 'embu das artes' below
    'embu': 'Embu das Artes',
    'taboao da serra': 'Taboao da Serra',
    'itapecerica da serra': 'Itapecerica da Serra',
    'santana de parnaiba': 'Santana de Parnaiba',
    'goiania': 'Goiania',
    'nova iguacu': 'Nova Iguacu',
    'ribeirao preto': 'Ribeirao Preto',
    'ribeirao das neves': 'Ribeirao das Neves',
    'ribeirao pires': 'Ribeirao Pires',
    'niteroi': 'Niteroi',
    'sao joao de meriti': 'Sao Joao de Meriti',
    'sao jose dos campos': 'Sao Jose dos Campos',
    'sao jose do rio preto': 'Sao Jose do Rio Preto',
    'feira de santana': 'Feira de Santana',
    'varzea grande': 'Varzea Grande',
    'sao vicente': 'Sao Vicente',
    'jaboatao dos guararapes': 'Jaboatao dos Guararapes',
    'aparecida de goiania': 'Aparecida de Goiania',
    'vitoria da conquista': 'Vitoria da Conquista',
    'barueri': 'Barueri',
    'cotia': 'Cotia',
    'carapicuiba': 'Carapicuiba',
    'diadema': 'Diadema',
    'suzano': 'Suzano',
    'embu das artes': 'Embu das Artes'
}

# Step 3: Applying Cleaning and Corrections
# Apply initial cleaning
print("Starting city name cleaning...")
geolocation = geolocation.withColumn(
    "geolocation_city_cleaned",
    replace_char(col("geolocation_city"))
)

# Apply city corrections.
# All corrections are folded into ONE CASE expression and applied with a
# single withColumn call. Calling withColumn once per dictionary entry adds
# a projection per call and produces a very large query plan.
city_key = lower(col("geolocation_city_cleaned"))
correction_expr = None
for original, corrected in city_corrections.items():
    matches_original = city_key == original
    if correction_expr is None:
        correction_expr = when(matches_original, corrected)
    else:
        correction_expr = correction_expr.when(matches_original, corrected)

# With an empty corrections dictionary there is nothing to apply
if correction_expr is not None:
    geolocation = geolocation.withColumn(
        "geolocation_city_cleaned",
        # Names without a correction keep their cleaned value
        correction_expr.otherwise(col("geolocation_city_cleaned"))
    )

# Step 4: Standardize Using Mode (most frequent city name)
# For every zip code prefix, compute the most frequent cleaned city name
window_spec = Window.partitionBy("geolocation_zip_code_prefix")
mode_city = mode("geolocation_city_cleaned").over(window_spec)

# A row takes the mode only when all three conditions hold
cleaned_city = col("geolocation_city_cleaned")
differs_from_mode = cleaned_city != mode_city
longer_than_two_chars = length(cleaned_city) > 2
not_in_corrections = ~lower(cleaned_city).isin(list(city_corrections))

geolocation = geolocation.withColumn(
    "geolocation_city_final",
    when(differs_from_mode & longer_than_two_chars & not_in_corrections, mode_city)
    .otherwise(cleaned_city),
)

# Step 5: Proper Title Case transformation
@udf(StringType())
def proper_title_case(x):
    """
    Title-case a city name while keeping connecting words lowercase.

    Each whitespace-separated word is capitalized, except the connectors
    'de', 'da', 'do', 'das', 'dos' and 'e', which stay lowercase unless they
    are the first word. Returns None for None or an empty string.
    """
    if not x:
        return None

    # Connectors that stay lowercase anywhere but the first position
    connectors = {'de', 'da', 'do', 'das', 'dos', 'e'}

    return ' '.join(
        token.lower() if position > 0 and token.lower() in connectors
        else token.capitalize()
        for position, token in enumerate(x.split())
    )

# Re-case the standardized city names in place
final_city = proper_title_case(col("geolocation_city_final"))
geolocation = geolocation.withColumn("geolocation_city_final", final_city)

# Step 6: Data Quality Checks and Statistics
# Display a sample of the cleaned data for visual inspection
print("\nSample of cleaned data:")
geolocation.select(
    "geolocation_zip_code_prefix",
    "geolocation_city",
    "geolocation_city_final"
).show(25, truncate=False)

# Calculate and display cleaning statistics.
# Each count below triggers a Spark job, so every figure is computed once
# and reused rather than recomputed where it is printed.
total_records = geolocation.count()
unique_before = geolocation.select("geolocation_city").distinct().count()
unique_after = geolocation.select("geolocation_city_final").distinct().count()

print("\nCleaning Statistics:")
print(f"Total records processed: {total_records:,}")
print("Unique cities before cleaning:", unique_before)
print("Unique cities after cleaning:", unique_after)

# Show city changes (25 rows).
# The comparison is exact, so pairs that differ only by letter case or
# accents are listed as changes too.
print("\nSample of city name changes:")
comparison = geolocation.select("geolocation_city", "geolocation_city_final").distinct()
comparison.filter(
    col("geolocation_city") != col("geolocation_city_final")
).show(25, truncate=False)

# Identify problematic cities: any character other than ASCII letters and
# spaces, or a name shorter than three characters
problematic_cities = geolocation.filter(
    (col("geolocation_city_final").rlike("[^a-zA-Z ]")) |
    (length(col("geolocation_city_final")) < 3)
).select("geolocation_city_final").distinct()

problem_count = problematic_cities.count()
total_distinct = unique_after  # same quantity as the "after cleaning" count

# Guard the ratio so an empty dataset reports 0% instead of raising
# ZeroDivisionError
problem_percentage = problem_count / total_distinct * 100 if total_distinct else 0.0

print(f"\nData Quality Metrics:")
print(f"Problematic cities: {problem_count} out of {total_distinct}")
print(f"Percentage: {problem_percentage:.2f}%")

# Share of all records represented by a group's row count, to two decimals
share_of_total = (col("count") / total_records * 100).cast(DecimalType(10,2))

# Show city frequency distribution with percentages (10 rows)
print("\nTop 10 cities by frequency:")
city_freq = geolocation.groupBy("geolocation_city_final") \
    .count() \
    .withColumn("percentage", share_of_total) \
    .orderBy(col("count").desc())

city_freq.show(10, truncate=False)

# Show state distribution
print("\nState distribution:")
geolocation.groupBy("geolocation_state") \
    .count() \
    .withColumn("percentage", share_of_total) \
    .orderBy(col("count").desc()) \
    .show(truncate=False)

# Clean up temporary columns
geolocation = geolocation.drop("geolocation_city_cleaned")

# Display 25 rows of the final cleaned dataframe
display(geolocation.limit(25))

print("\nCleaning process completed.")

Starting city name cleaning...

Sample of cleaned data:
+---------------------------+----------------+----------------------+
|geolocation_zip_code_prefix|geolocation_city|geolocation_city_final|
+---------------------------+----------------+----------------------+
|01005                      |são paulo       |Sao Paulo             |
|01005                      |são paulo       |Sao Paulo             |
|01005                      |sao paulo       |Sao Paulo             |
|01005                      |sao paulo       |Sao Paulo             |
|01005                      |sao paulo       |Sao Paulo             |
|01005                      |sao paulo       |Sao Paulo             |
|01005                      |sao paulo       |Sao Paulo             |
|01005                      |sao paulo       |Sao Paulo             |
|01005                      |sao paulo       |Sao Paulo             |
|01005                      |sao paulo       |Sao Paulo             |
|01005                      |sao p

geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,geolocation_city_final
1005,-23.549980033585307,-46.63476783166945,são paulo,SP,Sao Paulo
1005,-23.549780031197237,-46.63535898865553,são paulo,SP,Sao Paulo
1005,-23.54968972289983,-46.63671905572496,sao paulo,SP,Sao Paulo
1005,-23.548933414651763,-46.63811234831787,sao paulo,SP,Sao Paulo
1005,-23.549819091869107,-46.63560588995324,sao paulo,SP,Sao Paulo
1005,-23.549780031197237,-46.63535898865553,sao paulo,SP,Sao Paulo
1005,-23.54976291539134,-46.63610028661863,sao paulo,SP,Sao Paulo
1005,-23.54875839078997,-46.6384109454671,sao paulo,SP,Sao Paulo
1005,-23.548779492642403,-46.63830268942238,sao paulo,SP,Sao Paulo
1005,-23.54977002637195,-46.63583891548848,sao paulo,SP,Sao Paulo



Cleaning process completed.


### Review the cleaned_geolocation dataset

In [0]:
# Columns kept in the final dataset; the raw city column is left out in
# favour of the standardized one
final_columns = [
    "geolocation_zip_code_prefix",
    "geolocation_lat",
    "geolocation_lng",
    "geolocation_state",
    "geolocation_city_final",
]
cleaned_geolocation = geolocation.select(*final_columns)

# Schema check
print("Final Schema:")
cleaned_geolocation.printSchema()

# Text preview of the first rows
print("\nSample of final cleaned dataset:")
cleaned_geolocation.show(n=5, truncate=False)

# Row count, followed by the table view
final_row_count = cleaned_geolocation.count()
print(f"\nTotal records in cleaned dataset: {final_row_count:,}")
display(cleaned_geolocation)


Final Schema:
root
 |-- geolocation_zip_code_prefix: string (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_state: string (nullable = true)
 |-- geolocation_city_final: string (nullable = true)


Sample of final cleaned dataset:
+---------------------------+-------------------+------------------+-----------------+----------------------+
|geolocation_zip_code_prefix|geolocation_lat    |geolocation_lng   |geolocation_state|geolocation_city_final|
+---------------------------+-------------------+------------------+-----------------+----------------------+
|01005                      |-23.549980033585307|-46.63476783166945|SP               |Sao Paulo             |
|01005                      |-23.549780031197233|-46.63535898865553|SP               |Sao Paulo             |
|01005                      |-23.54968972289983 |-46.63671905572496|SP               |Sao Paulo             |
|01005                      |-

geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city_final
1005,-23.549980033585307,-46.63476783166945,SP,Sao Paulo
1005,-23.549780031197237,-46.63535898865553,SP,Sao Paulo
1005,-23.54968972289983,-46.63671905572496,SP,Sao Paulo
1005,-23.548933414651763,-46.63811234831787,SP,Sao Paulo
1005,-23.549819091869107,-46.63560588995324,SP,Sao Paulo
1005,-23.549780031197237,-46.63535898865553,SP,Sao Paulo
1005,-23.54976291539134,-46.63610028661863,SP,Sao Paulo
1005,-23.54875839078997,-46.6384109454671,SP,Sao Paulo
1005,-23.548779492642403,-46.63830268942238,SP,Sao Paulo
1005,-23.54977002637195,-46.63583891548848,SP,Sao Paulo


### Save the cleaned_geolocation to a parquet file with selected columns

In [0]:
# Define the output path
output_path = "/mnt/olist-store-data/transformed-data/olist_geolocation_cleaned_dataset_v2.0.parquet"
temp_path = "/mnt/olist-store-data/transformed-data/temp_parquet_output"

try:
    # Start from an empty staging directory. The existing output is left in
    # place for now so that a failed write does not destroy the previous
    # version of the dataset.
    dbutils.fs.rm(temp_path, recurse=True)

    # Save as a single Parquet file using temporary directory
    (cleaned_geolocation
     .repartition(1)  # Force to a single partition
     .write
     .mode("overwrite")
     .parquet(temp_path))

    # Find the Parquet part file in the temp directory (Spark also writes
    # marker files there, which are filtered out by extension)
    part_files = [f.path for f in dbutils.fs.ls(temp_path) if f.path.endswith(".parquet")]
    if len(part_files) != 1:
        raise RuntimeError(
            f"Expected exactly one Parquet part file in {temp_path}, found {len(part_files)}"
        )

    # The new file exists, so the previous output can now be replaced
    dbutils.fs.rm(output_path, recurse=True)
    dbutils.fs.mv(part_files[0], output_path)

    # Clean up temp directory
    dbutils.fs.rm(temp_path, recurse=True)

    # Verify the saved Parquet file
    verified_df = spark.read.parquet(output_path)
    print("\nVerification of saved Parquet file:")
    print(f"Number of rows in saved Parquet file: {verified_df.count():,}")
    print("\nSample of saved data:")
    display(verified_df.limit(5))

    # Verify it's a single file
    if len(dbutils.fs.ls(output_path)) == 1:
        print("\nSuccessfully saved as a single Parquet file.")
    else:
        print("\nWarning: Multiple files were created.")

except Exception as e:
    print(f"Error saving dataset: {str(e)}")
    # Clean up temp directory in case of failure
    dbutils.fs.rm(temp_path, recurse=True)
    raise
finally:
    # Release any cache held for this DataFrame. No visible cell persists it,
    # so this has an effect only if it was cached elsewhere.
    cleaned_geolocation.unpersist()


Verification of saved Parquet file:
Number of rows in saved Parquet file: 738,332

Sample of saved data:


geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city_final
1005,-23.549980033585307,-46.63476783166945,SP,Sao Paulo
1005,-23.549780031197237,-46.63535898865553,SP,Sao Paulo
1005,-23.54968972289983,-46.63671905572496,SP,Sao Paulo
1005,-23.548933414651763,-46.63811234831787,SP,Sao Paulo
1005,-23.549819091869107,-46.63560588995324,SP,Sao Paulo



Successfully saved as a single Parquet file.
