# Data Preprocessing

This notebook assumes the Kaggle dataset is downloaded and performs initial preprocessing.

In [26]:
import sys
import os
import subprocess
import pandas as pd
import numpy as np
import pathlib

# Add the src directory to Python path
repo_root = pathlib.Path('').resolve()
sys.path.append(str(repo_root))

from google_lifetime_value.utils.logger import setup_logger
from google_lifetime_value.preprocess.preprocess_transactions import load_data

# Initialize logger
logger = setup_logger('data_preprocessing')

2025-09-18 00:20:03,683 - INFO - Logging initialized. Log file: /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/logs/data_preprocessing_2025.09.18_00:20:03.log


In [27]:
# Define companies

COMPANYS = [
    10000,
    ##101200010, 101410010, 101600010, 102100020, 102700020,
    ##102840020, 103000030, 103338333, 103400030, 103600030,
    ##103700030, 103800030, 104300040, 104400040, 104470040,
    ##104900040, 105100050, 105150050, 107800070
]

## 1. Extract and Process Data by Company

In [28]:
# Process transactions for each company in parallel
from multiprocessing.dummy import Pool as ThreadPool

# Function to process and save data for a company
def extract_company_data(company):
    try:
        df = load_data(company)
        logger.info(f"Successfully processed company {company} with {len(df)} records")
        return company, len(df)
    except Exception as e:
        logger.error(f"Error processing company {company}: {str(e)}")
        return company, 0

# Process in parallel
with ThreadPool() as pool:
    results = pool.map(extract_company_data, COMPANYS)

# Display results
for company, count in results:
    print(f"Company {company}: {count} records processed")

2025-09-18 00:20:06,565 - INFO - Loading existing filtered data for company 10000 from /Users/batuhansaritas/Desktop/repositories/google-lifetime-value/notebooks/data/processed/transactions/transactions_company_10000.csv
2025-09-18 00:20:08,986 - INFO - Loaded 7964915 transactions for company 10000
2025-09-18 00:20:08,987 - INFO - Successfully processed company 10000 with 7964915 records


Company 10000: 7964915 records processed


## 2. Verify Extracted Data

In [29]:
# Check the extracted files
processed_dir = repo_root / 'data' / 'processed' / 'customers'
extracted_files = list(processed_dir.glob('*.csv'))

print(f"Found {len(extracted_files)} extracted company files:")
for file in extracted_files:
    file_size = os.path.getsize(file) / (1024*1024)  # Convert to MB
    print(f"  - {file.name}: {file_size:.2f} MB")

# Sample one file to check structure
if extracted_files:
    sample_file = extracted_files[0]
    sample_df = pd.read_csv(sample_file)
    print("\nSample data structure:")
    display(sample_df.head())
    print(f"Columns: {sample_df.columns.tolist()}")
    print(f"Shape: {sample_df.shape}")

Found 20 extracted company files:
  - customer_level_data_company_103400030.csv: 14.16 MB
  - customer_level_data_company_104400040.csv: 14.72 MB
  - customer_level_data_company_103338333.csv: 15.06 MB
  - customer_level_data_company_10000.csv: 12.80 MB
  - customer_level_data_company_101200010.csv: 12.69 MB
  - customer_level_data_company_107800070.csv: 12.47 MB
  - customer_level_data_company_105150050.csv: 13.20 MB
  - customer_level_data_company_104300040.csv: 14.10 MB
  - customer_level_data_company_101410010.csv: 12.10 MB
  - customer_level_data_company_104900040.csv: 13.77 MB
  - customer_level_data_company_103800030.csv: 14.15 MB
  - customer_level_data_company_104470040.csv: 12.65 MB
  - customer_level_data_company_102100020.csv: 15.49 MB
  - customer_level_data_company_105100050.csv: 14.64 MB
  - customer_level_data_company_103700030.csv: 16.63 MB
  - customer_level_data_company_103600030.csv: 12.32 MB
  - customer_level_data_company_102700020.csv: 12.31 MB
  - customer_level

Unnamed: 0,id,calibration_value,chain,dept,category,brand,productmeasure,holdout_value,log_calibration_value,label
0,86246,1.89,205,4,416,8247,OZ,160.17,0.636577,160.17
1,86252,0.99,205,4,416,15113,OZ,132.41,-0.01005,132.41
2,12262064,0.24,95,4,416,12022,OZ,12.15,-1.427116,12.15
3,12277270,0.99,95,4,416,9739,OZ,0.99,-0.01005,0.99
4,12332190,3.0,95,4,418,15113,OZ,49.06,1.098612,49.06


Columns: ['id', 'calibration_value', 'chain', 'dept', 'category', 'brand', 'productmeasure', 'holdout_value', 'log_calibration_value', 'label']
Shape: (264887, 10)
