# Feature Engineering

This notebook processes transaction data to create customer-level features for LTV prediction.

In [6]:
import sys
import os
import pandas as pd
import numpy as np
import pathlib

# Add the src directory to Python path
repo_root = pathlib.Path('').resolve()
sys.path.append(str(repo_root))

from src.google_lifetime_value.utils.logger import setup_logger
from src.google_lifetime_value.preprocess.preprocess_transactions import process

# Initialize logger
logger = setup_logger('feature_engineering')

2025-09-18 00:21:36,193 - INFO - Logging initialized. Log file: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/logs/feature_engineering_2025.09.18_00:21:36.log


## 1. Process Companies to Generate Customer Features

In [8]:
# Define companies

COMPANYS = [
    10000,
    ##101200010, 101410010, 101600010, 102100020, 102700020,
    ##102840020, 103000030, 103338333, 103400030, 103600030,
    ##103700030, 103800030, 104300040, 104400040, 104470040,
    ##104900040, 105100050, 105150050, 107800070
]

In [9]:
# Process one company at a time with detailed output
results = {}

for company in COMPANYS:
    logger.info(f"Processing company {company}...")
    try:
        # Process the company and get customer-level data
        customer_df = process(company)
        results[company] = {
            "num_customers": len(customer_df),
            "avg_calibration_value": customer_df['calibration_value'].mean(),
            "avg_holdout_value": customer_df['holdout_value'].mean(),
        }
        print(f"✅ Completed company {company} - {len(customer_df)} customers")
    except Exception as e:
        logger.error(f"Error processing company {company}: {str(e)}")
        print(f"❌ Failed company {company}: {str(e)}")

# Show summary of results
summary_df = pd.DataFrame.from_dict(results, orient='index')
display(summary_df)

2025-09-18 00:22:02,817 - INFO - Processing company 10000...
2025-09-18 00:22:02,819 - INFO - Processing company 10000
2025-09-18 00:22:02,821 - INFO - Loading existing filtered data for company 10000 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_10000.csv
2025-09-18 00:22:05,582 - INFO - Loaded 7964915 transactions for company 10000
2025-09-18 00:22:17,538 - INFO - Customer data saved to: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_10000.csv


✅ Completed company 10000 - 234385 customers


Unnamed: 0,num_customers,avg_calibration_value,avg_holdout_value
10000,234385,4.122605,77.659704


## 2. Explore Generated Customer Features

In [16]:
customer_files[3]

PosixPath('/Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/customers/customer_level_data_company_10000.csv')

In [None]:
# Load the generated customer data for exploration
processed_dir = repo_root / 'data' / 'processed' / 'customers'
customer_files = list(processed_dir.glob('*.csv'))

print(f"Found {len(customer_files)} customer files:")
for file in customer_files:
    file_size = os.path.getsize(file) / (1024*1024)  # Convert to MB
    print(f"  - {file.name}: {file_size:.2f} MB")

# Load first customer file for exploration
if customer_files:
    cust_file = customer_files[0]
    cust_df = pd.read_csv(cust_file)
    
    print("\nCustomer data overview:")
    display(cust_df.head())
    
    print(f"\nShape: {cust_df.shape}")
    print(f"Numeric columns: {cust_df.select_dtypes(include=['number']).columns.tolist()}")
    print(f"Categorical columns: {cust_df.select_dtypes(include=['category', 'object']).columns.tolist()}")
    
    # Basic statistics
    print("\nNumeric feature statistics:")
    display(cust_df.describe())

Found 20 customer files:
  - customer_level_data_company_103400030.csv: 14.16 MB
  - customer_level_data_company_104400040.csv: 14.72 MB
  - customer_level_data_company_103338333.csv: 15.06 MB
  - customer_level_data_company_10000.csv: 12.80 MB
  - customer_level_data_company_101200010.csv: 12.69 MB
  - customer_level_data_company_107800070.csv: 12.47 MB
  - customer_level_data_company_105150050.csv: 13.20 MB
  - customer_level_data_company_104300040.csv: 14.10 MB
  - customer_level_data_company_101410010.csv: 12.10 MB
  - customer_level_data_company_104900040.csv: 13.77 MB
  - customer_level_data_company_103800030.csv: 14.15 MB
  - customer_level_data_company_104470040.csv: 12.65 MB
  - customer_level_data_company_102100020.csv: 15.49 MB
  - customer_level_data_company_105100050.csv: 14.64 MB
  - customer_level_data_company_103700030.csv: 16.63 MB
  - customer_level_data_company_103600030.csv: 12.32 MB
  - customer_level_data_company_102700020.csv: 12.31 MB
  - customer_level_data_com

Unnamed: 0,id,calibration_value,log_calibration_value,holdout_value,chain,dept,category,brand,productmeasure,return_count,label
0,86246,0.69,-0.371064,322.73,205,97,9753,0,CT,30,322.73
1,86252,4.69,1.545433,310.04,205,0,0,0,UNKNOWN,30,310.04
2,12262064,0.99,-0.01005,11.73,95,97,9753,0,CT,0,11.73
3,12277270,1.99,0.688135,139.27,95,0,0,0,UNKNOWN,0,139.27
4,12332190,1.0,0.0,11.72,95,97,9753,0,CT,0,11.72



Shape: (234385, 11)
Numeric columns: ['id', 'calibration_value', 'log_calibration_value', 'holdout_value', 'chain', 'dept', 'category', 'brand', 'return_count', 'label']
Categorical columns: ['productmeasure']

Numeric feature statistics:


Unnamed: 0,id,calibration_value,log_calibration_value,holdout_value,chain,dept,category,brand,return_count,label
count,234385.0,234385.0,234385.0,234385.0,234385.0,234385.0,234385.0,234385.0,234385.0,234385.0
mean,1866191000.0,4.122605,0.787066,77.6597,110.16743,62.74838,6307.735559,2672.26564,7.576487,77.6597
std,1584831000.0,39.810956,1.054849,7187.212,99.050663,46.432976,4667.582806,7837.689164,20.099103,7187.212
min,86246.0,0.01,-4.60517,0.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,497959700.0,1.0,0.0,2.72,21.0,0.0,0.0,0.0,0.0,2.72
50%,764373200.0,2.08,0.732368,11.12,89.0,97.0,9753.0,0.0,1.0,11.12
75%,3369041000.0,4.34,1.467874,28.72,153.0,97.0,9753.0,0.0,6.0,28.72
max,4847629000.0,18191.34,9.808701,2062412.0,526.0,99.0,9908.0,33170.0,486.0,2062412.0
