In [1]:
import pandas as pd
import numpy as np
import time

# 1. Setup
TRAIN_FILE = "train_data.csv"
LABEL_FILE = "train_labels.csv"
OUTPUT_FILE = "customer_features_enhanced.csv"

def process_advanced_features():
    start_time = time.time()
    print("ðŸš€ Starting Professional Feature Engineering Pipeline...")
    
    # Define Column Groups
    cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 
                'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    
    chunksize = 200000 
    chunk_list = []

    print(f"Reading {TRAIN_FILE} in chunks...")
    
    with pd.read_csv(TRAIN_FILE, chunksize=chunksize) as reader:
        for i, chunk in enumerate(reader):
            print(f"  Processing Chunk {i+1}...", end="\r")

            # âœ” FIX: Sort so 'last' means truly LAST chronologically
            chunk = chunk.sort_values(["customer_ID", "S_2"])

            # Identify numeric columns
            num_cols = [c for c in chunk.columns if c not in cat_cols + ['customer_ID', 'S_2']]
            
            # Aggregation logic
            chunk_agg = chunk.groupby('customer_ID').agg({
                **{c: ['mean', 'max', 'last'] for c in num_cols},
                **{c: ['last', 'nunique'] for c in cat_cols}
            })
            
            # Flatten columns
            chunk_agg.columns = ['_'.join(x) for x in chunk_agg.columns]
            
            # Statement count
            chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
            
            chunk_agg.reset_index(inplace=True)
            chunk_list.append(chunk_agg)

            # Safety test option:
            # if i == 1: break
    
    print("\nConcatenating chunks...")
    full_df = pd.concat(chunk_list, axis=0)
    
    print("Final Groupby to merge split customers...")
    final_features = full_df.groupby('customer_ID').last().reset_index()

    print("Merging with Labels...")
    labels = pd.read_csv(LABEL_FILE)
    final_dataset = final_features.merge(labels, on='customer_ID', how='inner')
    
    print(f"Saving to {OUTPUT_FILE}...")
    final_dataset.to_csv(OUTPUT_FILE, index=False)
    
    elapsed = (time.time() - start_time) / 60
    print(f"ðŸŽ‰ SUCCESS! Pipeline finished in {elapsed:.1f} minutes.")
    print(f"Final Data Shape: {final_dataset.shape}")

if __name__ == "__main__":
    process_advanced_features()

ðŸš€ Starting Professional Feature Engineering Pipeline...
Reading train_data.csv in chunks...
  Processing Chunk 1...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 2...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 3...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 4...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 5...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 6...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 7...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 8...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 9...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 10...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 11...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 12...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 13...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 14...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 15...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 16...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 17...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 18...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 19...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 20...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 21...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 22...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 23...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 24...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 25...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 26...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 27...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)


  Processing Chunk 28...

  chunk_agg['statement_count'] = chunk.groupby('customer_ID')['S_2'].count()
  chunk_agg.reset_index(inplace=True)



Concatenating chunks...
Final Groupby to merge split customers...
Merging with Labels...
Saving to customer_features_enhanced.csv...
ðŸŽ‰ SUCCESS! Pipeline finished in 3.0 minutes.
Final Data Shape: (458913, 556)


In [2]:
df = pd.read_csv("customer_features_enhanced.csv")
df.head()

Unnamed: 0,customer_ID,P_2_mean,P_2_max,P_2_last,D_39_mean,D_39_max,D_39_last,B_1_mean,B_1_max,B_1_last,...,D_63_last,D_63_nunique,D_64_last,D_64_nunique,D_66_last,D_66_nunique,D_68_last,D_68_nunique,statement_count,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933824,0.960384,0.934745,0.010704,0.091505,0.009119,0.012007,0.021655,0.009382,...,CR,1,O,1,,0,6.0,1,13,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.89982,0.929122,0.880519,0.215205,0.567403,0.178126,0.025654,0.109644,0.034684,...,CO,1,O,1,,0,6.0,1,13,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.878454,0.904482,0.880875,0.004181,0.009704,0.009704,0.004386,0.009997,0.004284,...,CO,1,R,1,,0,6.0,1,13,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.598969,0.623392,0.621776,0.048862,0.268476,0.001083,0.059876,0.279991,0.012564,...,CO,1,O,1,,0,3.0,3,13,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.891679,0.940382,0.8719,0.004644,0.00868,0.005573,0.005941,0.009806,0.007679,...,CO,1,O,1,1.0,1,6.0,1,13,0
