In [58]:
import pickle
import pandas as pd
from datetime import datetime
import re
import time

class CustomerDataExtractor:
    """Load nested customer/order/item records and flatten them per item.

    ``load_data`` reads a pickled collection of customer records plus a text
    file of VIP ids (one integer per line). ``transform`` walks
    customers -> orders -> items and returns a ``pandas.DataFrame`` with one
    row per kept item.
    """

    # Numeric category codes -> display names; unknown codes map to 'Misc'.
    CATEGORY_MAP = {
        1: 'Electronics',
        2: 'Apparel',
        3: 'Books',
        4: 'Home Goods'
    }

    # Compiled once: used to recover ids embedded in a product name such as
    # "Item 12 for Order 34".
    _ITEM_ID_RE = re.compile(r"Item\s+(\d+)")
    _ORDER_ID_RE = re.compile(r"Order\s+(\d+)")

    # Column layout returned when no item survives filtering.
    # NOTE(review): 'total_order_value_percentage' is listed here but is never
    # produced for a non-empty result; confirm which schema is intended.
    _EMPTY_COLUMNS = [
        'customer_id', 'customer_name', 'registration_date', 'is_vip',
        'order_id', 'order_date', 'product_id', 'product_name', 'category',
        'unit_price', 'item_quantity', 'total_item_price',
        'total_order_value_percentage'
    ]

    def __init__(self, orders_file='customer_orders.pkl', vip_file='vip_customers.txt') -> None:
        """Remember the input paths; nothing is read until ``load_data``."""
        self.orders_file = orders_file
        self.vip_file = vip_file
        self.customers = []
        self.vip_ids = set()

    @staticmethod
    def _clean_price(price) -> float:
        """Convert a raw price to ``float``, returning 0.0 when it cannot be parsed.

        For strings, every character other than digits and '.' is removed
        first, so currency symbols and thousands separators are dropped
        (and so is a leading minus sign).
        """
        if isinstance(price, str):
            price = re.sub(r'[^0-9.]', '', price)
        try:
            return float(price)
        except (TypeError, ValueError, OverflowError):
            # Best-effort: None, empty/garbled strings, or oversized ints.
            return 0.0

    @staticmethod
    def _is_blank(value) -> bool:
        """Return True for ``None``, a NaN float, or an empty/whitespace string."""
        if value is None:
            return True
        if isinstance(value, float):
            return bool(pd.isna(value))
        if isinstance(value, str):
            return not value.strip()
        return False

    @staticmethod
    def _extract_id(pattern, text):
        """Return the first captured digit run of ``pattern`` in ``text``.

        Returns ``None`` when ``text`` is not a string or does not match,
        instead of raising.
        """
        if not isinstance(text, str):
            return None
        match = pattern.search(text)
        return match.group(1) if match else None

    def load_data(self) -> None:
        """Read the pickled customer records and the VIP id list from disk.

        Lines of the VIP file that are blank or not an integer are ignored.
        """
        # SECURITY: pickle.load can execute arbitrary code while unpickling.
        # Only point ``orders_file`` at files from a trusted source.
        with open(self.orders_file, 'rb') as f:
            self.customers = pickle.load(f)

        with open(self.vip_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    self.vip_ids.add(int(line))
                except ValueError:
                    continue

    def transform(self) -> pd.DataFrame:
        """Flatten ``self.customers`` into one DataFrame row per order item.

        Skipped records:
          * customers with no ``id`` or no parseable ``registration_date``;
          * orders with no parseable ``order_date``;
          * items whose ``quantity`` cannot be converted to ``int`` or is 0.

        Missing ``item_id`` / ``order_id`` values are recovered from the
        product name ("Item <n>" / "Order <n>") when it contains them; a blank
        product name is replaced by "Item <id> for Order <id>" when both ids
        are known.

        Returns:
            A DataFrame sorted by customer_id, order_id, product_id, or an
            empty DataFrame with the predefined columns when no row is kept.
        """
        rows = []
        for cust in self.customers:
            cust_id = cust.get('id')
            if cust_id is None:
                continue

            # The key may be present with a None/NaN value, so do not rely on
            # the .get default alone before stripping.
            raw_name = cust.get('name')
            if isinstance(raw_name, str):
                cust_name = raw_name.strip()
            elif self._is_blank(raw_name):
                cust_name = ''
            else:
                cust_name = str(raw_name).strip()

            reg_date_raw = cust.get('registration_date')
            if not reg_date_raw:
                continue
            reg_date = pd.to_datetime(reg_date_raw, errors='coerce')
            if pd.isna(reg_date):
                continue

            is_vip = cust_id in self.vip_ids

            for order in cust.get('orders') or []:
                order_id = order.get('order_id')

                raw_odate = order.get('order_date')
                if not raw_odate:
                    continue
                order_date = pd.to_datetime(raw_odate, errors='coerce')
                if pd.isna(order_date):
                    continue

                for item in order.get('items') or []:
                    product_id = item.get('item_id')
                    product_name = item.get('product_name', "")

                    # Fill missing ids from the product name when possible;
                    # a name without the pattern leaves the id as None.
                    if product_id is None:
                        product_id = self._extract_id(self._ITEM_ID_RE, product_name)
                    if order_id is None:
                        # Kept for the remaining items of this order as well.
                        order_id = self._extract_id(self._ORDER_ID_RE, product_name)

                    # Synthesize a name only when both ids are available, so
                    # the result never reads "Order None".
                    if (
                        product_id is not None
                        and order_id is not None
                        and self._is_blank(product_name)
                    ):
                        product_name = f"Item {product_id} for Order {order_id}"

                    category = self.CATEGORY_MAP.get(item.get('category'), 'Misc')
                    unit_price = self._clean_price(item.get('price', '0.0'))

                    try:
                        qty = int(item.get('quantity'))
                    except (ValueError, TypeError, OverflowError):
                        continue
                    if qty == 0:
                        continue

                    rows.append({
                        'customer_id': cust_id,
                        'customer_name': cust_name,
                        'registration_date': reg_date,
                        'is_vip': is_vip,
                        'order_id': order_id,
                        'order_date': order_date,
                        'product_id': product_id,
                        'product_name': product_name,
                        'category': category,
                        'unit_price': unit_price,
                        'item_quantity': qty,
                        'total_item_price': unit_price * qty
                    })

        if not rows:
            return pd.DataFrame(columns=list(self._EMPTY_COLUMNS))

        df = pd.DataFrame(rows)
        # NOTE(review): ids recovered from product names are strings; if the
        # source ids are ints the sort keys would be of mixed type — confirm.
        df = df.sort_values(['customer_id', 'order_id', 'product_id']).reset_index(drop=True)
        return df


if __name__ == "__main__":
    # Script entry point: load the inputs, flatten them, write the CSV and
    # report the row count plus the wall-clock time of the whole run.
    start = time.time()

    extractor = CustomerDataExtractor()
    extractor.load_data()
    df_final = extractor.transform()

    # Written to the current working directory, without the index column.
    out = "customer_orders_flat.csv"
    df_final.to_csv(out, index=False)
    row_count = len(df_final)
    print(f"✓ Saved {row_count:,} rows → {out}")

    end = time.time()
    elapsed = end - start
    print(f"Time: {elapsed}")


✓ Saved 182 rows → customer_orders_flat.csv
Time: 0.13871383666992188


0
0
0
0
0
0
0
0
0
0
0
0
