## Import Packages

In [37]:
import math
import re
from dataclasses import dataclass
from datetime import datetime

import pandas as pd


## Step 1: Load raw CSV and display first 3 rows

In [38]:
# Load the raw sales export, then show its column names and a three-row preview.
df = pd.read_csv("data/Sales Records.csv")
print("Table:", list(df.columns))
print(df.head(n=3))


Table: ['date', 'customer_id', 'product_id', 'price', 'quantity', 'coupon_code', 'shipping_city']
         date customer_id product_id    price  quantity coupon_code  \
0  2023-10-30    CUST0086       P001  1253.52         1    FREESHIP   
1  2024-05-24    CUST0075       P020   253.25         2         NaN   
2  2023-10-06    CUST0079       P007    58.91         2         NaN   

  shipping_city  
0  Jacksonville  
1      New York  
2        Denver  


## Step 2: Pick the Right Container

We considered three options for storing each row: dictionaries, namedtuples, and classes.  
A class (here, a dataclass) is the best fit because it lets us attach logic to each transaction — cleaning values, transforming them, or calculating totals — while keeping the code tidy and reusable.


## Step 3: Define the Transaction class

In [39]:


@dataclass
class Transaction:
    """One row of the sales CSV, with helpers to sanitize it and compute its value.

    The fields mirror the CSV columns. Field values are stored as given, so
    ``price`` may hold a non-numeric value and ``coupon_code`` may hold a
    float NaN (a missing cell) until :meth:`clean` has been called.
    """

    date: str
    customer_id: str
    product_id: str
    price: float
    quantity: int
    coupon_code: str
    shipping_city: str

    def clean(self):
        """Normalize ``price`` and ``coupon_code`` in place.

        ``price`` is coerced to ``float``. A value that cannot be converted,
        is negative, or is not finite (NaN or infinity) is replaced by ``0.0``.

        ``coupon_code`` is upper-cased when it is a string; any other value
        is replaced by the string ``"nan"``.
        """
        try:
            price = float(self.price)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures mean "bad price"; anything else should surface.
            price = 0.0
        # float() accepts strings such as "Nan" or "inf", and NaN compares False
        # against everything, so a plain `< 0` test would let those through.
        if not math.isfinite(price) or price < 0:
            price = 0.0
        self.price = price
        self.coupon_code = self.coupon_code.upper() if isinstance(self.coupon_code, str) else "nan"

    def total(self):
        """Return ``price * quantity``, or ``0`` when ``price`` is falsy.

        No validation happens here; call :meth:`clean` first so that ``price``
        is guaranteed to be a number.
        """
        return self.price * self.quantity if self.price else 0


## Step 4: Load transaction data into the dataclass


In [40]:
def load_transactions(path: str) -> list[Transaction]:
    """Read the CSV at ``path`` and return one ``Transaction`` per row.

    Each row is expanded into keyword arguments, so the CSV column names must
    match the ``Transaction`` field names exactly. Values are passed through
    as parsed by pandas; no cleaning is done here.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A list of ``Transaction`` objects in file order.
    """
    sales_df = pd.read_csv(path)
    # to_dict("records") produces one {column: value} dict per row in a single
    # pass and keeps each column's own dtype, whereas iterrows() builds a Series
    # per row and may upcast values to a common dtype.
    return [Transaction(**record) for record in sales_df.to_dict(orient="records")]

## Step 5: Quick Profiling

In [41]:
def profile_transactions(transactions: list[Transaction]) -> None:
    """Print min/mean/max of the numeric prices and the number of distinct cities.

    Only prices that are ``int`` or ``float`` and not NaN are included in the
    statistics. When no such price exists, ``n/a`` is printed for each
    statistic instead of raising.

    Args:
        transactions: Transactions to summarize; not modified.
    """
    numeric_prices = (
        float(t.price) for t in transactions if isinstance(t.price, (int, float))
    )
    # A single NaN would make the mean NaN and make min()/max() order-dependent.
    valid_prices_list = [price for price in numeric_prices if not math.isnan(price)]
    unique_cities = len({t.shipping_city for t in transactions})

    print("Quick Profiling: ")
    if valid_prices_list:
        print(f"Min: {min(valid_prices_list)}")
        print(f"Mean: {sum(valid_prices_list)/len(valid_prices_list):.2f}")
        print(f"Max: {max(valid_prices_list)}")
    else:
        # min()/max() raise ValueError on an empty list and the mean would divide by zero.
        print("Min: n/a")
        print("Mean: n/a")
        print("Max: n/a")
    print(f"Unique Shipping Cities: {unique_cities}\n")


## Step 6: Inject three dirty values for the demo

In [42]:
def inject_dirty_val(transactions: list) -> None:
    """Overwrite three prices with invalid values so the cleaning step has work to do.

    Does nothing when fewer than three transactions are supplied. Rows 0 and 1
    receive the first two values; the third goes to row 12, or to the last row
    when the list has fewer than 13 items.

    Args:
        transactions: Objects with a writable ``price`` attribute; mutated in place.
    """
    if len(transactions) < 3:
        return
    # Clamp the third target so lists with 3-12 rows do not raise IndexError
    # after rows 0 and 1 have already been modified.
    third_index = min(12, len(transactions) - 1)
    transactions[0].price = "Nan"  # string that float() parses as NaN
    transactions[1].price = -150  # negative amount
    transactions[third_index].price = "aefada"  # not numeric at all

## Step 7: Clean the transactions

In [43]:
def _count_invalid_prices(transactions: list[Transaction]) -> int:
    """Count transactions whose price is not a finite, non-negative float."""
    return sum(
        1
        for t in transactions
        if not (isinstance(t.price, float) and math.isfinite(t.price) and t.price >= 0)
    )


def clean_all_data(transactions: list[Transaction]) -> None:
    """Clean every transaction in place and print invalid-price counts.

    A price counts as invalid unless it is a ``float`` that is finite and
    non-negative. Checking the value as well as the type means the "After"
    figure reflects what ``clean()`` actually produced.

    Args:
        transactions: Transactions to clean; each one's ``clean()`` is called.
    """
    before = _count_invalid_prices(transactions)

    for t in transactions:
        t.clean()

    after = _count_invalid_prices(transactions)

    print("Cleaning results:")
    print(f"Before: {before} invalid prices")
    print(f"After: {after} invalid prices")

In [44]:
def main(path: str = "data/Sales Records.csv") -> None:
    """Run the demo pipeline: load, profile, inject dirty values, then clean.

    Args:
        path: CSV file to load. Defaults to the sales export used in this notebook.
    """
    transactions = load_transactions(path)
    profile_transactions(transactions)
    inject_dirty_val(transactions)
    clean_all_data(transactions)


In [45]:
# Run the full pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

Quick Profiling: 
Min: 19.06
Mean: 268.40
Max: 1257.09
Unique Shipping Cities: 20

Cleaning results:
Before: 3 invalid prices
After: 0 invalid prices
