# Feature Engineering

This notebook processes transaction data to create customer-level features for LTV prediction.

In [7]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Add the src directory to Python path
repo_root = Path().resolve()
sys.path.append(str(repo_root))

from src.google_lifetime_value.utils.logger import setup_logger
from src.google_lifetime_value.preprocess.preprocess_transactions import process
from src.google_lifetime_value.utils.config_utils import get_active_companies, get_config

# Initialize logger
logger = setup_logger('feature_engineering')

# Define companies
COMPANYS = get_active_companies()

print(f"Will process these companies: {COMPANYS}")

2025-09-18 19:44:17,797 - INFO - Logging initialized. Log file: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/logs/feature_engineering_2025.09.18_19:44:17.log


Will process these companies: [10000, 101200010, 101410010, 101600010, 102100020, 102700020, 102840020, 103000030, 103338333, 103400030, 103600030, 103700030, 103800030, 104300040, 104400040, 104470040, 104900040, 105100050, 105150050, 107800070]


## 1. Process Companies to Generate Customer Features

In [4]:
# Process one company at a time with detailed output
results = {}

for company in COMPANYS:
    logger.info(f"Processing company {company}...")
    try:
        # Process the company and get customer-level data
        customer_df = process(company)
        results[company] = {
            "num_customers": len(customer_df),
            "avg_calibration_value": customer_df['calibration_value'].mean(),
            "avg_holdout_value": customer_df['holdout_value'].mean(),
        }
        print(f"✅ Completed company {company} - {len(customer_df)} customers")
    except Exception as e:
        logger.error(f"Error processing company {company}: {str(e)}")
        print(f"❌ Failed company {company}: {str(e)}")

# Show summary of results
summary_df = pd.DataFrame.from_dict(results, orient='index')
display(summary_df)

2025-09-18 19:26:37,620 - INFO - Processing company 10000...
2025-09-18 19:26:37,621 - INFO - Processing company 10000
2025-09-18 19:26:37,623 - INFO - Loading existing filtered data for company 10000 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_10000.csv
2025-09-18 19:26:40,032 - INFO - Loaded 7964915 transactions for company 10000
2025-09-18 19:26:52,320 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_10000.csv
2025-09-18 19:26:52,349 - INFO - Processing company 101200010...
2025-09-18 19:26:52,349 - INFO - Processing company 101200010
2025-09-18 19:26:52,349 - INFO - Loading existing filtered data for company 101200010 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_101200010.csv


✅ Completed company 10000 - 234385 customers


2025-09-18 19:26:53,476 - INFO - Loaded 3585166 transactions for company 101200010
2025-09-18 19:27:09,317 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_101200010.csv
2025-09-18 19:27:09,339 - INFO - Processing company 101410010...
2025-09-18 19:27:09,340 - INFO - Processing company 101410010
2025-09-18 19:27:09,340 - INFO - Loading existing filtered data for company 101410010 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_101410010.csv


✅ Completed company 101200010 - 230029 customers


2025-09-18 19:27:09,951 - INFO - Loaded 1980175 transactions for company 101410010
2025-09-18 19:27:20,657 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_101410010.csv
2025-09-18 19:27:20,668 - INFO - Processing company 101600010...
2025-09-18 19:27:20,668 - INFO - Processing company 101600010
2025-09-18 19:27:20,669 - INFO - Loading existing filtered data for company 101600010 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_101600010.csv


✅ Completed company 101410010 - 226295 customers


2025-09-18 19:27:22,358 - INFO - Loaded 5310050 transactions for company 101600010
2025-09-18 19:27:43,598 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_101600010.csv
2025-09-18 19:27:43,622 - INFO - Processing company 102100020...
2025-09-18 19:27:43,623 - INFO - Processing company 102100020
2025-09-18 19:27:43,624 - INFO - Loading existing filtered data for company 102100020 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_102100020.csv


✅ Completed company 101600010 - 278178 customers


2025-09-18 19:27:45,173 - INFO - Loaded 5063295 transactions for company 102100020
2025-09-18 19:28:05,076 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_102100020.csv
2025-09-18 19:28:05,111 - INFO - Processing company 102700020...
2025-09-18 19:28:05,111 - INFO - Processing company 102700020
2025-09-18 19:28:05,112 - INFO - Loading existing filtered data for company 102700020 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_102700020.csv


✅ Completed company 102100020 - 277575 customers


2025-09-18 19:28:05,628 - INFO - Loaded 1644653 transactions for company 102700020
2025-09-18 19:28:16,379 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_102700020.csv
2025-09-18 19:28:16,391 - INFO - Processing company 102840020...
2025-09-18 19:28:16,392 - INFO - Processing company 102840020
2025-09-18 19:28:16,392 - INFO - Loading existing filtered data for company 102840020 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_102840020.csv


✅ Completed company 102700020 - 230393 customers


2025-09-18 19:28:18,846 - INFO - Loaded 6053407 transactions for company 102840020
2025-09-18 19:28:44,375 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_102840020.csv
2025-09-18 19:28:44,420 - INFO - Processing company 103000030...
2025-09-18 19:28:44,421 - INFO - Processing company 103000030
2025-09-18 19:28:44,421 - INFO - Loading existing filtered data for company 103000030 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_103000030.csv


✅ Completed company 102840020 - 278596 customers


2025-09-18 19:28:45,096 - INFO - Loaded 2105498 transactions for company 103000030
2025-09-18 19:28:56,538 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_103000030.csv
2025-09-18 19:28:56,552 - INFO - Processing company 103338333...
2025-09-18 19:28:56,552 - INFO - Processing company 103338333
2025-09-18 19:28:56,553 - INFO - Loading existing filtered data for company 103338333 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_103338333.csv


✅ Completed company 103000030 - 235548 customers


2025-09-18 19:28:57,954 - INFO - Loaded 4282458 transactions for company 103338333
2025-09-18 19:29:16,408 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_103338333.csv
2025-09-18 19:29:16,446 - INFO - Processing company 103400030...
2025-09-18 19:29:16,447 - INFO - Processing company 103400030
2025-09-18 19:29:16,447 - INFO - Loading existing filtered data for company 103400030 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_103400030.csv


✅ Completed company 103338333 - 264431 customers


2025-09-18 19:29:17,358 - INFO - Loaded 2953185 transactions for company 103400030
2025-09-18 19:29:31,872 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_103400030.csv
2025-09-18 19:29:31,888 - INFO - Processing company 103600030...
2025-09-18 19:29:31,888 - INFO - Processing company 103600030
2025-09-18 19:29:31,889 - INFO - Loading existing filtered data for company 103600030 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_103600030.csv


✅ Completed company 103400030 - 264887 customers


2025-09-18 19:29:32,431 - INFO - Loaded 1807676 transactions for company 103600030
2025-09-18 19:29:42,595 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_103600030.csv
2025-09-18 19:29:42,606 - INFO - Processing company 103700030...
2025-09-18 19:29:42,606 - INFO - Processing company 103700030
2025-09-18 19:29:42,607 - INFO - Loading existing filtered data for company 103700030 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_103700030.csv


✅ Completed company 103600030 - 225182 customers


2025-09-18 19:29:45,012 - INFO - Loaded 5643063 transactions for company 103700030
2025-09-18 19:30:08,553 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_103700030.csv
2025-09-18 19:30:08,593 - INFO - Processing company 103800030...
2025-09-18 19:30:08,594 - INFO - Processing company 103800030
2025-09-18 19:30:08,595 - INFO - Loading existing filtered data for company 103800030 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_103800030.csv


✅ Completed company 103700030 - 292772 customers


2025-09-18 19:30:10,114 - INFO - Loaded 4510769 transactions for company 103800030
2025-09-18 19:30:28,625 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_103800030.csv
2025-09-18 19:30:28,658 - INFO - Processing company 104300040...
2025-09-18 19:30:28,658 - INFO - Processing company 104300040
2025-09-18 19:30:28,659 - INFO - Loading existing filtered data for company 104300040 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_104300040.csv


✅ Completed company 103800030 - 260580 customers


2025-09-18 19:30:29,600 - INFO - Loaded 2938314 transactions for company 104300040
2025-09-18 19:30:44,076 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_104300040.csv
2025-09-18 19:30:44,093 - INFO - Processing company 104400040...
2025-09-18 19:30:44,094 - INFO - Processing company 104400040
2025-09-18 19:30:44,095 - INFO - Loading existing filtered data for company 104400040 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_104400040.csv


✅ Completed company 104300040 - 256696 customers


2025-09-18 19:30:45,317 - INFO - Loaded 4133713 transactions for company 104400040
2025-09-18 19:31:02,973 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_104400040.csv
2025-09-18 19:31:02,992 - INFO - Processing company 104470040...
2025-09-18 19:31:02,992 - INFO - Processing company 104470040
2025-09-18 19:31:02,993 - INFO - Loading existing filtered data for company 104470040 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_104470040.csv


✅ Completed company 104400040 - 271857 customers


2025-09-18 19:31:03,757 - INFO - Loaded 2506871 transactions for company 104470040
2025-09-18 19:31:15,882 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_104470040.csv
2025-09-18 19:31:15,895 - INFO - Processing company 104900040...
2025-09-18 19:31:15,896 - INFO - Processing company 104900040
2025-09-18 19:31:15,896 - INFO - Loading existing filtered data for company 104900040 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_104900040.csv


✅ Completed company 104470040 - 227856 customers


2025-09-18 19:31:17,142 - INFO - Loaded 4043374 transactions for company 104900040
2025-09-18 19:31:35,424 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_104900040.csv
2025-09-18 19:31:35,453 - INFO - Processing company 105100050...
2025-09-18 19:31:35,454 - INFO - Processing company 105100050
2025-09-18 19:31:35,454 - INFO - Loading existing filtered data for company 105100050 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_105100050.csv


✅ Completed company 104900040 - 249285 customers


2025-09-18 19:31:36,648 - INFO - Loaded 3689389 transactions for company 105100050
2025-09-18 19:31:53,483 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_105100050.csv
2025-09-18 19:31:53,503 - INFO - Processing company 105150050...
2025-09-18 19:31:53,504 - INFO - Processing company 105150050
2025-09-18 19:31:53,504 - INFO - Loading existing filtered data for company 105150050 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_105150050.csv


✅ Completed company 105100050 - 265019 customers


2025-09-18 19:31:54,119 - INFO - Loaded 1733059 transactions for company 105150050
2025-09-18 19:32:04,763 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_105150050.csv
2025-09-18 19:32:04,774 - INFO - Processing company 107800070...
2025-09-18 19:32:04,775 - INFO - Processing company 107800070
2025-09-18 19:32:04,775 - INFO - Loading existing filtered data for company 107800070 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_107800070.csv


✅ Completed company 105150050 - 242684 customers


2025-09-18 19:32:05,558 - INFO - Loaded 2538320 transactions for company 107800070
2025-09-18 19:32:18,253 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_107800070.csv


✅ Completed company 107800070 - 228759 customers


Unnamed: 0,num_customers,avg_calibration_value,avg_holdout_value
10000,234385,4.122605,77.659704
101200010,230029,5.637354,71.168574
101410010,226295,4.124539,28.117041
101600010,278178,5.200163,61.677928
102100020,277575,4.607076,63.307058
102700020,230393,3.201437,17.02816
102840020,278596,5.049738,78.297621
103000030,235548,4.096042,27.015212
103338333,264431,3.77239,55.72083
103400030,264887,4.135925,30.993279


## 2. Explore Generated Customer Features

In [8]:
# Load the generated customer data for exploration
processed_dir = repo_root / 'data' / 'processed' / 'customers'
customer_files = list(processed_dir.glob('*.csv'))

print(f"Found {len(customer_files)} customer files:")
for file in customer_files:
    file_size = os.path.getsize(file) / (1024*1024)  # Convert to MB
    print(f"  - {file.name}: {file_size:.2f} MB")

# Load first customer file for exploration
if customer_files:
    cust_file = customer_files[0]
    cust_df = pd.read_csv(cust_file)
    
    print("\nCustomer data overview:")
    display(cust_df.head())
    
    print(f"\nShape: {cust_df.shape}")
    print(f"Numeric columns: {cust_df.select_dtypes(include=['number']).columns.tolist()}")
    print(f"Categorical columns: {cust_df.select_dtypes(include=['category', 'object']).columns.tolist()}")
    
    # Basic statistics
    print("\nNumeric feature statistics:")
    display(cust_df.describe())

Found 20 customer files:
  - customer_level_data_company_103400030.csv: 14.66 MB
  - customer_level_data_company_104400040.csv: 15.24 MB
  - customer_level_data_company_103338333.csv: 15.56 MB
  - customer_level_data_company_10000.csv: 12.80 MB
  - customer_level_data_company_101200010.csv: 13.13 MB
  - customer_level_data_company_107800070.csv: 12.90 MB
  - customer_level_data_company_105150050.csv: 13.66 MB
  - customer_level_data_company_104300040.csv: 14.59 MB
  - customer_level_data_company_101410010.csv: 12.53 MB
  - customer_level_data_company_104900040.csv: 14.25 MB
  - customer_level_data_company_103800030.csv: 14.65 MB
  - customer_level_data_company_104470040.csv: 13.08 MB
  - customer_level_data_company_102100020.csv: 16.02 MB
  - customer_level_data_company_105100050.csv: 15.15 MB
  - customer_level_data_company_103700030.csv: 17.18 MB
  - customer_level_data_company_103600030.csv: 12.75 MB
  - customer_level_data_company_102700020.csv: 12.74 MB
  - customer_level_data_com

Unnamed: 0,id,calibration_value,log_calibration_value,holdout_value,chain,dept,category,brand,productmeasure,return_count,label
0,86246,1.89,0.636577,160.17,205,4,416,8247,OZ,0,160.17
1,86252,0.99,-0.01005,132.41,205,4,416,15113,OZ,0,132.41
2,12262064,0.24,-1.427116,12.15,95,4,416,12022,OZ,0,12.15
3,12277270,0.99,-0.01005,0.99,95,4,416,9739,OZ,0,0.99
4,12332190,3.0,1.098612,49.06,95,4,418,15113,OZ,1,49.06



Shape: (264887, 11)
Numeric columns: ['id', 'calibration_value', 'log_calibration_value', 'holdout_value', 'chain', 'dept', 'category', 'brand', 'return_count', 'label']
Categorical columns: ['productmeasure']

Numeric feature statistics:


Unnamed: 0,id,calibration_value,log_calibration_value,holdout_value,chain,dept,category,brand,return_count,label
count,264887.0,264887.0,264887.0,264887.0,264887.0,264887.0,264887.0,264887.0,264887.0,264887.0
mean,2043034000.0,4.135925,1.10481,30.993279,153.833691,4.846976,500.041255,13879.026238,0.055005,30.993279
std,1596525000.0,5.38076,0.774244,1150.053516,153.798927,3.161908,314.73029,11485.529135,0.64358,1150.053526
min,86246.0,0.01,-4.60517,0.0,2.0,3.0,302.0,0.0,0.0,0.0
25%,541168200.0,1.94,0.662688,3.5,31.0,4.0,416.0,8247.0,0.0,3.5
50%,1882143000.0,2.99,1.095273,12.96,96.0,4.0,418.0,9739.0,0.0,12.96
75%,3611128000.0,4.99,1.607436,31.49,211.0,4.0,418.0,15113.0,0.0,31.49
max,4847629000.0,1340.82,7.201036,487792.24,526.0,97.0,9753.0,57369.0,259.0,487792.25
