## ETL Process

In [3]:
# Import Libraries
import pandas as pd
import sqlite3
from datetime import datetime


### Extraction

In [11]:
def extract_data(data_path):
    """
    Extraction step: read the CSV file at ``data_path``.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv`` with
        default options.
    """
    return pd.read_csv(data_path)

In [14]:
# Absolute location of the raw CSV file.
# NOTE(review): this is a hard-coded, machine-specific Windows path; the cell
# will not run on another machine without editing it.
data_path = r"C:\Users\Admin\OneDrive - United States International University (USIU)\Documents\USIU_A\US2025\DSA2040A\Final Exam\DSA_2040_Practical_Exam_Ambachow_550\Data_Warehousing\data\online_retail.csv"
# Read the CSV into a DataFrame through the extraction helper.
df = extract_data(data_path)
# Last expression of the cell, so the notebook renders the first rows.
df.head()

Unnamed: 0,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


#### Columns/Variables

In [7]:
# Column labels of the extracted frame.
print(df.columns)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()

Index(['Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID',
       'Country'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Description  540455 non-null  object 
 1   Quantity     541909 non-null  int64  
 2   InvoiceDate  541909 non-null  object 
 3   UnitPrice    541909 non-null  float64
 4   CustomerID   406829 non-null  float64
 5   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 24.8+ MB
None


### Wrangle Function for the Transformation Process

In [16]:
import pandas as pd
import numpy as np

def transform_data(df):
    """
    Clean and enrich retail sales data using column-specific rules.

    Each rule is applied only when its column is present, so frames that
    lack some columns are accepted. The input frame is never modified; a
    new DataFrame is returned.

    Rules
    -----
    * Description : missing -> "Unknown Product"; stripped and lower-cased.
    * Country     : missing -> "Unknown Country"; stripped and title-cased.
    * Quantity    : rows with missing or non-positive values are dropped;
                    cast to int.
    * InvoiceDate : parsed to datetime; rows that fail to parse are dropped;
                    adds InvoiceYear, InvoiceMonth, InvoiceDay and
                    InvoiceQuarter.
    * UnitPrice   : rows with missing or non-positive values are dropped;
                    cast to float.
    * CustomerID  : converted to stripped text; missing -> "Unknown Customer".
                    Whole-number float IDs are written without the trailing
                    ".0" (17850.0 -> "17850").

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sales records.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``. Surviving rows keep their original index
        labels.
    """
    # Work on a private copy: the column assignments below would otherwise
    # write straight into the caller's frame.
    df = df.copy()

    # ------------------------------
    # 1. Object (string) Columns
    # ------------------------------
    if 'Description' in df.columns:
        df['Description'] = (
            df['Description'].fillna("Unknown Product").str.strip().str.lower()
        )

    if 'Country' in df.columns:
        df['Country'] = (
            df['Country'].fillna("Unknown Country").str.strip().str.title()
        )

    # ------------------------------
    # 2. Quantity (Integer)
    # ------------------------------
    if 'Quantity' in df.columns:
        df = df[df['Quantity'].notna()]      # remove missing quantity
        # .copy() detaches the filtered rows so the assignment below does
        # not trigger SettingWithCopyWarning.
        df = df[df['Quantity'] > 0].copy()   # remove invalid (<= 0) quantities
        df['Quantity'] = df['Quantity'].astype(int)

    # ------------------------------
    # 3. InvoiceDate (DateTime)
    # ------------------------------
    if 'InvoiceDate' in df.columns:
        # errors='coerce' turns unparseable values into NaT instead of raising
        df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')
        df = df[df['InvoiceDate'].notna()].copy()  # remove invalid dates

        # Extract useful date parts
        stamp = df['InvoiceDate'].dt
        df['InvoiceYear'] = stamp.year
        df['InvoiceMonth'] = stamp.month
        df['InvoiceDay'] = stamp.day
        df['InvoiceQuarter'] = stamp.quarter

    # ------------------------------
    # 4. UnitPrice (Float)
    # ------------------------------
    if 'UnitPrice' in df.columns:
        df = df[df['UnitPrice'].notna()]      # remove missing prices
        df = df[df['UnitPrice'] > 0].copy()   # remove invalid prices
        df['UnitPrice'] = df['UnitPrice'].astype(float)

    # ------------------------------
    # 5. CustomerID (Integer/String)
    # ------------------------------
    if 'CustomerID' in df.columns:
        ids = df['CustomerID']
        # A numeric ID column containing gaps is read as float, which would
        # stringify as "17850.0". When every present value is a whole number,
        # go through the nullable integer dtype so the text becomes "17850".
        if pd.api.types.is_float_dtype(ids) and (ids.dropna() % 1 == 0).all():
            ids = ids.astype('Int64')
        missing = ids.isna()
        as_text = ids.astype(str).str.strip()
        df['CustomerID'] = as_text.mask(missing, "Unknown Customer")

    return df


#### Usage of wrangle function

In [17]:
# Apply the cleaning rules to the extracted frame; df_clean is the frame the
# load step later writes to SQLite.
df_clean = transform_data(df)

### Load


In [18]:
def load_data_to_sqlite(df, db_path, table_name='SalesFact', if_exists='replace'):
    """
    Load step: write a DataFrame into a table of a SQLite database file.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to store. The DataFrame index is not written.
    db_path : str or path-like
        SQLite database file; sqlite3 creates it when it does not exist.
    table_name : str, default 'SalesFact'
        Destination table.
    if_exists : {'replace', 'append', 'fail'}, default 'replace'
        Passed to ``DataFrame.to_sql``; decides what happens when the table
        is already present.

    Raises
    ------
    Exception
        Whatever ``sqlite3.connect`` or ``DataFrame.to_sql`` raises is
        propagated; the connection is closed first.
    """
    conn = sqlite3.connect(db_path)
    try:
        df.to_sql(table_name, conn, if_exists=if_exists, index=False)
        conn.commit()
    finally:
        # Release the database file even when the write fails; a sqlite3
        # connection is not closed by garbage-free code paths on its own.
        conn.close()
    print(f"Loaded {len(df)} rows into {db_path}")

In [19]:
# Absolute location of the SQLite database file that receives the cleaned data.
# NOTE(review): this is a hard-coded, machine-specific Windows path; the cell
# will not run on another machine without editing it.
db_path = r'C:\Users\Admin\OneDrive - United States International University (USIU)\Documents\USIU_A\US2025\DSA2040A\Final Exam\DSA_2040_Practical_Exam_Ambachow_550\Data_Warehousing\design\retail_dw.db'

# Write the cleaned frame to the database through the load helper.
load_data_to_sqlite(df_clean, db_path=db_path)


Loaded 530104 rows into C:\Users\Admin\OneDrive - United States International University (USIU)\Documents\USIU_A\US2025\DSA2040A\Final Exam\DSA_2040_Practical_Exam_Ambachow_550\Data_Warehousing\design\retail_dw.db
