In [1]:
import pandas as pd
import os

# Define filenames
TRAIN_FILE = 'train_data.csv'
LABEL_FILE = 'train_labels.csv'

print("--- Step 1: Checking File Existence ---")
# Collect every missing file so the error names exactly what needs fixing.
missing_files = [path for path in (TRAIN_FILE, LABEL_FILE) if not os.path.exists(path)]
if missing_files:
    print("ERROR: One or more files are missing. Please check your folder.")
    print(f"Current Working Directory: {os.getcwd()}")
    # Raise instead of calling exit(): inside a Jupyter kernel exit() requests a
    # kernel shutdown rather than simply aborting this cell, and it loses the
    # reason for the failure. An exception stops the cell and keeps the kernel.
    raise FileNotFoundError(f"Missing required file(s): {', '.join(missing_files)}")
print("SUCCESS: Both files found!")

print("\n--- Step 2: Peeking at the Data ---")
try:
    # Read only the first 5 rows to ensure it works
    # This avoids crashing your memory
    df_preview = pd.read_csv(TRAIN_FILE, nrows=5)
    print("SUCCESS: Successfully read the first 5 rows of train_data.csv")

    print("\nHere are the columns in your data:")
    print(df_preview.columns.tolist()[:10]) # Print first 10 columns only

    print("\nHere is the first row of data:")
    print(df_preview.iloc[0])

except Exception as e:
    print(f"CRITICAL ERROR: Could not read the CSV file. Reason: {e}")
    # Re-raise so the cell fails visibly; otherwise the "READY" banner below
    # would be printed even though the preview did not succeed.
    raise

print("\n--- READY FOR NEXT STEP ---")

--- Step 1: Checking File Existence ---
SUCCESS: Both files found!

--- Step 2: Peeking at the Data ---
SUCCESS: Successfully read the first 5 rows of train_data.csv

Here are the columns in your data:
['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3']

Here is the first row of data:
customer_ID    0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...
S_2                                                   2017-03-09
P_2                                                     0.938469
D_39                                                    0.001733
B_1                                                     0.008724
                                     ...                        
D_141                                                   0.003818
D_142                                                        NaN
D_143                                                   0.000569
D_144                                                    0.00061
D_145                              

In [2]:
import pandas as pd
import numpy as np
import time

# 1. Setup Paths
TRAIN_PATH = 'train_data.csv'
LABEL_PATH = 'train_labels.csv'
OUTPUT_PATH = 'processed_customer_summary.csv'

def process_data(train_path=TRAIN_PATH, label_path=LABEL_PATH, output_path=OUTPUT_PATH,
                 chunk_size=100000, max_chunks=5):
    """Aggregate the statement-level training CSV to one row per customer.

    The training file is streamed in chunks. For every chunk the per-customer
    column sums and non-null counts are collected; the final per-customer mean
    is ``total sum / total count``. This yields the true mean even when a
    customer's rows are split across chunks (averaging per-chunk means would
    weight each chunk equally regardless of how many rows it contributed).

    Args:
        train_path: CSV with a ``customer_ID`` column plus feature columns.
        label_path: CSV with ``customer_ID`` and the target label column(s).
        output_path: Destination CSV for the merged per-customer summary.
        chunk_size: Number of rows read per chunk.
        max_chunks: Testing brake - stop after this many chunks. The default
            of 5 keeps the original quick-test behaviour; pass ``None`` to
            process the FULL file.

    Returns:
        The merged DataFrame that was written to ``output_path``.

    Raises:
        ValueError: If no chunk was processed (empty file or ``max_chunks=0``).
    """
    start_time = time.time()
    print("--- Starting Big Data Pipeline ---")

    # 2. Load Labels (Targets)
    # We need these to know who actually defaulted
    print("Loading labels...")
    labels = pd.read_csv(label_path)
    print(f"Labels loaded: {len(labels)} rows")

    # 3. Initialize Chunk Processing
    partial_sums = []
    partial_counts = []

    print(f"Processing {train_path} in chunks of {chunk_size}...")

    # 4. The Loop
    # We use 'numeric_only=True' to automatically skip the date column (S_2) and IDs
    with pd.read_csv(train_path, chunksize=chunk_size) as reader:
        for i, chunk in enumerate(reader):

            # --- SAFETY BRAKE FOR TESTING ---
            # Controlled by `max_chunks`; pass max_chunks=None to run the FULL file.
            if max_chunks is not None and i >= max_chunks:
                print(f"⚠️ TESTING MODE: Stopping after {max_chunks} chunks to save time.")
                break
            # --------------------------------

            # Keep sums and non-null counts (not means) so that partial results
            # from different chunks can be combined exactly later on.
            grouped = chunk.groupby('customer_ID')
            chunk_sums = grouped.sum(numeric_only=True)
            # count() covers every column; restrict it to the numeric ones summed above.
            chunk_counts = grouped.count()[chunk_sums.columns]

            partial_sums.append(chunk_sums)
            partial_counts.append(chunk_counts)
            print(f"Chunk {i+1} processed")

    if not partial_sums:
        raise ValueError(f"No data chunks were processed from {train_path}")

    # 5. Combine and Finalize
    print("Concatenating all chunks...")
    all_sums = pd.concat(partial_sums, axis=0)
    all_counts = pd.concat(partial_counts, axis=0)

    # Second Aggregation:
    # Because a customer's data might be split across two different chunks,
    # we add up the partial sums/counts per customer and divide once, which
    # gives exactly 1 row per customer holding the true mean of every column.
    print("Final aggregation to ensure unique customers...")
    total_sums = all_sums.groupby(level=0).sum()
    total_counts = all_counts.groupby(level=0).sum()
    # Columns with no observed value for a customer become 0/0 -> NaN,
    # matching what mean() reports for an all-NaN group.
    train_df = (total_sums / total_counts).reset_index()

    print(f"Data shape after aggregation: {train_df.shape}")

    # 6. Merge with Labels
    print("Merging with Target Labels...")
    final_data = train_df.merge(labels, on='customer_ID', how='inner')

    # 7. Save to CSV
    print(f"Saving to {output_path}...")
    final_data.to_csv(output_path, index=False)

    elapsed = time.time() - start_time
    print(f"--- SUCCESS! Pipeline finished in {elapsed:.2f} seconds ---")
    print(f"File saved: {output_path}")
    print(f"Final Row Count: {len(final_data)}")

    return final_data

# Run the pipeline only when this code executes as the main program
# (not when it is imported as a module).
if __name__ == "__main__":
    process_data()

--- Starting Big Data Pipeline ---
Loading labels...
Labels loaded: 458913 rows
Processing train_data.csv in chunks of 100000...
Chunk 1 processed
Chunk 2 processed
Chunk 3 processed
Chunk 4 processed
Chunk 5 processed
⚠️ TESTING MODE: Stopping after 5 chunks to save time.
Concatenating all chunks...
Final aggregation to ensure unique customers...
Data shape after aggregation: (41444, 187)
Merging with Target Labels...
Saving to processed_customer_summary.csv...
--- SUCCESS! Pipeline finished in 9.76 seconds ---
File saved: processed_customer_summary.csv
Final Row Count: 41444


In [3]:
# Reload the summary CSV that process_data() wrote to disk and preview its
# first rows as a quick sanity check of the saved output.
df = pd.read_csv("processed_customer_summary.csv")
df.head()

Unnamed: 0,customer_ID,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933824,0.010704,0.012007,1.005086,0.004509,0.113215,0.005021,0.006456,,...,,,0.003664,0.005343,0.005178,,0.005066,0.005283,0.005814,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.89982,0.215205,0.025654,0.991083,0.006246,0.120578,0.004993,0.005663,,...,,,0.004906,0.006271,0.006007,,0.004824,0.004218,0.004902,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.878454,0.004181,0.004386,0.815677,0.006621,,0.006842,0.005493,,...,,,0.006006,0.004675,0.003607,,0.004288,0.005113,0.0045,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.598969,0.048862,0.059876,0.955264,0.005665,0.24775,0.00549,0.006423,,...,,,0.005775,0.005777,0.004181,,0.006742,0.004768,0.005236,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.891679,0.004644,0.005941,0.814543,0.00418,0.173102,0.005352,0.005088,,...,,,0.003853,0.004818,0.004818,,0.004852,0.00438,0.004219,0
