## ETL Process

In [3]:
# Import Libraries
import pandas as pd
import sqlite3
from datetime import datetime


### Extraction

In [2]:
# Define file path again
data_path = r"C:\Users\Admin\OneDrive - United States International University (USIU)\Documents\USIU_A\US2025\DSA2040A\Final Exam\DSA_2040_Practical_Exam_Ambachow_550\Data_Warehousing\data\online_retail.csv"

# Step 1: Extract - Read CSV
df = pd.read_csv(data_path)

#### Columns/Variables

In [5]:
df.columns

Index(['Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID',
       'Country'],
      dtype='object')

### Wrangle function for Transformation process

In [4]:

def etl_online_retail(
    csv_path: str,
    db_path: str,
    filter_date: str = '2024-08-12'  # Default cutoff date for filtering sales last year
):
    """
    ETL process for Online Retail data.
    
    Args:
        csv_path (str): File path to the input CSV data.
        db_path (str): SQLite database file path to load data.
        filter_date (str): Filter sales from this date onward (YYYY-MM-DD).
        
    Returns:
        None, but saves cleaned/transformed data into SQLite DB.
    """
    print("Starting ETL process...")
    
    # Transform
    print("Transforming data...")
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')
    
    # Remove rows with invalid Quantity or UnitPrice
    original_count = df.shape[0]
    df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
    filtered_count = df.shape[0]
    print(f"Removed {original_count - filtered_count} rows with invalid Quantity or UnitPrice")
    
    # Calculate TotalSales
    df['TotalSales'] = df['Quantity'] * df['UnitPrice']
    
    # Filter for sales from filter_date onward
    cutoff_date = pd.Timestamp(filter_date)
    df_filtered = df[df['InvoiceDate'] >= cutoff_date]
    print(f"Filtered data to {df_filtered.shape[0]} rows from {filter_date} onwards")
    
    # Customer Dimension
    print("Creating Customer Dimension...")
    customer_dim = df_filtered.groupby('CustomerID').agg(
        TotalPurchases=('TotalSales', 'sum'),
        Country=('Country', 'first')
    ).reset_index()
    
    # Time Dimension
    print("Creating Time Dimension...")
    time_dim = df_filtered[['InvoiceDate']].drop_duplicates().copy()
    time_dim['Date'] = time_dim['InvoiceDate'].dt.date
    time_dim['Year'] = time_dim['InvoiceDate'].dt.year
    time_dim['Quarter'] = time_dim['InvoiceDate'].dt.quarter
    time_dim['Month'] = time_dim['InvoiceDate'].dt.month
    
    # Sales Fact Table
    print("Preparing Sales Fact Table...")
    sales_fact = df_filtered.merge(time_dim, on='InvoiceDate', how='left')
    sales_fact_table = sales_fact[[
        'InvoiceNo', 'CustomerID', 'StockCode', 'Quantity', 'UnitPrice', 'TotalSales',
        'Date', 'Year', 'Quarter', 'Month'
    ]]
    
    

### Load

In [6]:
# Load
print(f"Loading data into database at: {db_path}")
conn = sqlite3.connect(db_path)
    
customer_dim.to_sql('CustomerDim', conn, if_exists='replace', index=False)
time_dim.to_sql('TimeDim', conn, if_exists='replace', index=False)
sales_fact_table.to_sql('SalesFact', conn, if_exists='replace', index=False)
    
conn.close()
print("ETL process completed successfully!")



NameError: name 'db_path' is not defined

### Example usage

In [None]:


csv_path = r"C:\Users\Admin\OneDrive - United States International University (USIU)\Documents\USIU_A\US2025\DSA2040A\Final Exam\DSA_2040_Practical_Exam_Ambachow_550\Data_Warehousing\data\online_retail.csv"
db_path = r"C:\Users\Admin\OneDrive - United States International University (USIU)\Documents\USIU_A\US2025\DSA2040A\Final Exam\DSA_2040_Practical_Exam_Ambachow_550\Data_Warehousing\data\retail_dw.db"

etl_online_retail(csv_path, db_path)