In [0]:
!pip install openpyxl

In [0]:
# Restart the notebook's Python process; presumably so that the openpyxl
# package installed in the previous cell is picked up by later imports.
# NOTE(review): variables defined before this point will not survive the restart.
dbutils.library.restartPython()

In [0]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Destination of the ingested dataset: <catalog>.<schema>.<table>
catalog_name = "field_demos"
schema_name = "ml_ops"
table_name = "online_retail"

# Ensure the target catalog and schema exist and make them the current ones.
# The statements are order-dependent: the schema is created inside the
# catalog selected by the preceding USE CATALOG.
for statement in (
    f"CREATE CATALOG IF NOT EXISTS {catalog_name}",
    f"USE CATALOG {catalog_name}",
    f"CREATE SCHEMA IF NOT EXISTS {schema_name}",
    f"USE {schema_name}",
):
    spark.sql(statement)

# Source: the public "Online Retail" workbook hosted by the UCI ML repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"

# Download the workbook and parse it into a pandas DataFrame
df_pandas = pd.read_excel(io=url)

# Show the dtypes pandas inferred, before any explicit casting
print(df_pandas.dtypes)

# Explicit type casting.
#
# Series.astype(str) turns missing values into the literal string 'nan', so a
# fillna() chained after it never fires and dropna() never sees those rows.
# The string columns below therefore decide what to do with missing values
# from the ORIGINAL column's notna() mask, not from the already-cast strings.

# InvoiceNo: cast to str but keep missing entries as NaN so that the dropna
# further down can actually remove rows without an invoice number.
df_pandas['InvoiceNo'] = (
    df_pandas['InvoiceNo'].astype(str).where(df_pandas['InvoiceNo'].notna())
)
# StockCode / Description: cast to str and substitute a placeholder for
# values that were missing in the source data.
df_pandas['StockCode'] = (
    df_pandas['StockCode'].astype(str).where(df_pandas['StockCode'].notna(), 'UNKNOWN')
)
df_pandas['Description'] = (
    df_pandas['Description'].astype(str).where(df_pandas['Description'].notna(), 'N/A')
)
# Numeric columns: unparseable values become NaN (errors='coerce')
df_pandas['Quantity'] = pd.to_numeric(df_pandas['Quantity'], errors='coerce')
# InvoiceDate is stored as a 'YYYY-MM-DD' string; unparseable dates become
# missing and are dropped below.
df_pandas['InvoiceDate'] = pd.to_datetime(df_pandas['InvoiceDate'], errors='coerce').dt.strftime('%Y-%m-%d')
df_pandas['UnitPrice'] = pd.to_numeric(df_pandas['UnitPrice'], errors='coerce')
# Nullable integer dtype so rows without a customer keep a missing CustomerID
df_pandas['CustomerID'] = pd.to_numeric(df_pandas['CustomerID'], errors='coerce').astype('Int64')
# NOTE(review): a missing Country would become the string 'nan' here; left
# unchanged because no placeholder or drop rule is defined for this column.
df_pandas['Country'] = df_pandas['Country'].astype(str)

# Drop rows that lack any of the required fields after type conversion
df_pandas = df_pandas.dropna(subset=['InvoiceNo', 'StockCode', 'InvoiceDate', 'Quantity', 'UnitPrice'])

# Check data types again after casting
print(df_pandas.dtypes)

# Hand the cleaned pandas frame over to Spark
df_spark = spark.createDataFrame(df_pandas)

# Persist it as a Delta table, overwriting whatever the table held before
(
    df_spark.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{catalog_name}.{schema_name}.{table_name}")
)

# Preview the first rows of the freshly written table
display(spark.sql(f"SELECT * FROM {catalog_name}.{schema_name}.{table_name} LIMIT 10"))