### ETL 

In [None]:
# Import Libraries
import pandas as pd
import sqlite3
from datetime import datetime

In [None]:
# Location of the raw retail CSV export (machine-specific absolute path)
data_path = r"C:\Users\Admin\OneDrive - United States International University (USIU)\Documents\USIU_A\US2025\DSA2040A\Final Exam\DSA_2040_Practical_Exam_Ambachow_550\Data_Warehousing\data\online_retail.csv"

# Step 1: Extract - read the raw CSV into a DataFrame
df = pd.read_csv(data_path)

# Step 2: Data cleaning & transformation

# Convert InvoiceDate to datetime. errors='coerce' turns unparseable values
# into NaT; those rows are excluded by the cutoff filter below, because a
# NaT >= Timestamp comparison evaluates to False.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')

# Keep only rows with a positive quantity and a positive unit price.
# .copy() makes the filtered frame independent of the original, so adding the
# TotalSales column below does not write into a slice (SettingWithCopyWarning).
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)].copy()

# Calculate TotalSales (line-item revenue)
df['TotalSales'] = df['Quantity'] * df['UnitPrice']

# Filter sales for last year (assuming current date August 12, 2025)
cutoff_date = pd.Timestamp('2024-08-12')
# Independent copy: later steps derive the dimension and fact tables from it.
df_last_year = df[df['InvoiceDate'] >= cutoff_date].copy()

# Step 3: Customer summary dimension - one row per CustomerID carrying the
# summed TotalSales and the first Country value seen for that customer.
per_customer = df_last_year.groupby('CustomerID')
customer_summary = per_customer.agg(
    TotalPurchases=('TotalSales', 'sum'),
    Country=('Country', 'first'),
)
customer_summary = customer_summary.reset_index()

# Step 4: Time dimension - one row per distinct InvoiceDate value, with the
# calendar parts derived from it. assign() returns a new frame, so the result
# is independent of df_last_year.
time_dim = (
    df_last_year[['InvoiceDate']]
    .drop_duplicates()
    .assign(
        Date=lambda t: t['InvoiceDate'].dt.date,
        Year=lambda t: t['InvoiceDate'].dt.year,
        Quarter=lambda t: t['InvoiceDate'].dt.quarter,
        Month=lambda t: t['InvoiceDate'].dt.month,
    )
)

# Step 5: Load into SQLite database
db_path = r"C:\Users\Admin\OneDrive - United States International University (USIU)\Documents\USIU_A\US2025\DSA2040A\Final Exam\DSA_2040_Practical_Exam_Ambachow_550\Data_Warehousing\data\retail_dw.db"
conn = sqlite3.connect(db_path)

# try/finally guarantees the connection is closed even when a load step
# raises; otherwise the database file handle would stay open.
try:
    # Load CustomerDim (replaces any existing table of the same name)
    customer_summary.to_sql('CustomerDim', conn, if_exists='replace', index=False)

    # Load TimeDim
    time_dim.to_sql('TimeDim', conn, if_exists='replace', index=False)

    # Load SalesFact (fact table)
    # Join df_last_year with time_dim to pick up Date/Year/Quarter/Month.
    # validate='many_to_one' makes the merge fail loudly if time_dim ever
    # holds duplicate InvoiceDate keys, which would silently multiply fact rows.
    sales_fact = df_last_year.merge(
        time_dim,
        on='InvoiceDate',
        how='left',
        validate='many_to_one',
    )

    # Select fact table columns
    # NOTE(review): rows with a missing CustomerID are kept here but have no
    # matching CustomerDim row (groupby drops NaN keys by default) - confirm
    # whether that is intended.
    fact_columns = [
        'InvoiceNo', 'CustomerID', 'StockCode', 'Quantity', 'UnitPrice',
        'TotalSales', 'Date', 'Year', 'Quarter', 'Month',
    ]
    sales_fact_table = sales_fact[fact_columns]

    # Save fact table
    sales_fact_table.to_sql('SalesFact', conn, if_exists='replace', index=False)
finally:
    conn.close()

# Reached only when every load step succeeded
print("ETL process completed. Data loaded into SQLite database.")
