# Transaction Data Processing

This notebook processes transaction data with optimized memory management and handles data quality issues.

In [1]:
import pandas as pd
import numpy as np
import os
import sys
from glob import glob
import gc
from datetime import datetime
import warnings
import psutil  # For memory monitoring
from tqdm import tqdm  # For progress bars (install with pip install tqdm if needed)
import time

In [3]:
def get_memory_usage():
    """Return the current memory usage in MB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024)

def log_memory(message):
    """Log memory usage with a custom message"""
    print(f"{message}: {get_memory_usage():.2f} MB")

## 1. Data Cleaning Functions

Define functions for cleaning and feature engineering before we use them

In [4]:
def clean_chunk(df):
    """Clean a chunk of transaction data with optimized operations"""
    # Make a copy only if needed (can improve performance but may cause SettingWithCopyWarning)
    df = df.copy()
    
    # Handle string columns efficiently by processing only non-null values
    str_cols = ['merchant_name', 'merchant_category', 'currency', 'location_country', 'location_city', 'device_type', 'status']
    for col in str_cols:
        if col in df.columns:
            if df[col].dtype == 'string' or df[col].dtype == 'object':
                # Only process non-null values to save time
                mask = df[col].notna()
                if mask.any():
                    df.loc[mask, col] = df.loc[mask, col].str.strip().str.lower()
            
            # Handle categorical columns properly before filling with 'unknown'
            if pd.api.types.is_categorical_dtype(df[col]):
                # Add 'unknown' to the category list if it's not already there
                if 'unknown' not in df[col].cat.categories:
                    df[col] = df[col].cat.add_categories(['unknown'])
            
            # Fill missing values
            df[col] = df[col].fillna('unknown')
    
    # Handle missing numeric values more efficiently
    if 'amount' in df.columns:
        df['missing_amount'] = df['amount'].isna()
        df['amount'] = df['amount'].fillna(0)
    
    # Handle missing boolean values
    if 'is_online' in df.columns:
        df['is_online'] = df['is_online'].fillna(False)
    
    # Handle outliers in amount using vectorized operations
    if 'amount' in df.columns:
        # Use robust statistics to avoid influence of extreme outliers
        q1 = df['amount'].quantile(0.25)
        q3 = df['amount'].quantile(0.75)
        iqr = q3 - q1
        upper_limit = q3 + 1.5 * iqr
        df['is_amount_outlier'] = df['amount'] > upper_limit
    
    # Drop duplicates efficiently using subset
    df.drop_duplicates(subset='transaction_id', keep='first', inplace=True)
    
    return df

## 2. Feature Engineering Functions

In [5]:
def add_derived_features(df):
    """Add derived features to the dataframe with minimal memory impact"""
    # Ensure timestamp is a datetime object
    if 'timestamp' in df.columns:
        # Make sure timestamp is datetime type
        if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        
        # Extract time-based features efficiently
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['is_weekend'] = df['day_of_week'].isin([5, 6])
        df['hour'] = df['timestamp'].dt.hour
        
        # Time period categorization
        conditions = [
            (df['hour'] < 6),
            (df['hour'] >= 6) & (df['hour'] < 12),
            (df['hour'] >= 12) & (df['hour'] < 18),
            (df['hour'] >= 18)
        ]
        choices = ['night', 'morning', 'afternoon', 'evening']
        df['time_period'] = np.select(conditions, choices, default='unknown')
        
        # Calculate days since transaction efficiently
        current_date = pd.Timestamp(datetime.now().date())
        df['days_since_transaction'] = (current_date - df['timestamp'].dt.floor('D')).dt.days
    
    # Flag potential anomalies using vectorized operations
    if 'amount' in df.columns and 'is_online' in df.columns:
        df['high_value_online'] = (df['amount'] > 1000) & df['is_online']
    
    return df

## 3. Memory-Efficient Processing Function

In [6]:
def process_transactions(file_list, chunksize=100000, output_dir="processed_chunks", max_files=None):
    """
    Process transaction files in chunks with improved memory management and proper deduplication
    """

    # Import required modules
    import os
    import gc
    import pandas as pd
    import numpy as np
    import time
    from tqdm import tqdm
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Limit number of files if specified
    if max_files is not None and max_files < len(file_list):
        print(f"Limiting to {max_files} out of {len(file_list)} files")
        file_list = file_list[:max_files]
    
    # Track processing statistics
    total_rows_processed = 0
    total_chunks_processed = 0
    start_time = time.time()
    output_files = []
    
    # IMPROVED APPROACH: Create a single set of transaction IDs that we'll update as we process each file
    # This way we'll only keep track of transactions we've already processed
    seen_transaction_ids = set()
    
    # Process each file and save to parquet
    print("\n--- Processing and saving files ---")
    
    for i, file_path in enumerate(file_list):
        file_start_time = time.time()
        print(f"\nProcessing file {i+1}/{len(file_list)}: {os.path.basename(os.path.dirname(file_path))}")
        log_memory(f"Memory before processing file {i+1}")
        
        try:
            # Get file size for progress reporting
            file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
            print(f"File size: {file_size:.2f} MB")
            
            # Calculate expected chunks
            with open(file_path, 'r') as f:
                approx_rows = sum(1 for _ in f) - 1  # Subtract header
            print(f"Approximate rows: {approx_rows:,}")
            expected_chunks = approx_rows // chunksize + (1 if approx_rows % chunksize else 0)
            
            # Create output file path for this file
            output_file = os.path.join(output_dir, f"processed_file_{i+1}.parquet")
            
            # Process current file in chunks
            chunk_reader = pd.read_csv(
                file_path, 
                dtype=dtypes,
                parse_dates=['timestamp'],
                chunksize=chunksize
            )
            
            # Initialize a buffer for this file's processed chunks
            processed_chunks = []
            file_rows_processed = 0
            
            # Process each chunk
            for chunk_idx, chunk in enumerate(tqdm(chunk_reader, total=expected_chunks, desc=f"Processing file {i+1}")):
                # Convert timestamp if needed
                if not pd.api.types.is_datetime64_any_dtype(chunk['timestamp']):
                    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], errors='coerce')
                
                # Clean data
                chunk = clean_chunk(chunk)
                
                # IMPROVED: Filter out only transaction IDs we've already seen
                # Get the current chunk's transaction IDs
                current_ids = set(chunk['transaction_id'])
                
                # Find which transaction IDs in this chunk are duplicates (already seen)
                duplicate_ids = current_ids.intersection(seen_transaction_ids)
                
                # Remove duplicates if any found
                if duplicate_ids:
                    chunk = chunk[~chunk['transaction_id'].isin(duplicate_ids)]
                
                # Add the non-duplicate transaction IDs to our seen set
                seen_transaction_ids.update(current_ids - duplicate_ids)
                
                # Skip further processing if chunk is empty after deduplication
                if len(chunk) == 0:
                    continue
                
                # Add derived features
                chunk = add_derived_features(chunk)
                
                # Add to processed chunks
                processed_chunks.append(chunk)
                file_rows_processed += len(chunk)
                
                # Update stats
                total_rows_processed += len(chunk)
                total_chunks_processed += 1
                
                # Check memory usage, if too high, save intermediate results
                current_mem = get_memory_usage()
                if current_mem > 2000 or (chunk_idx > 0 and chunk_idx % 5 == 0):  # Save every 5 chunks or if memory high
                    # Combine processed chunks and save
                    if processed_chunks:
                        combined_df = pd.concat(processed_chunks, ignore_index=True)
                        # If first chunk set, save to new file, otherwise append
                        if not os.path.exists(output_file):
                            combined_df.to_parquet(output_file)
                        else:
                            existing_df = pd.read_parquet(output_file)
                            pd.concat([existing_df, combined_df], ignore_index=True).to_parquet(output_file)
                            del existing_df
                        
                        # Clear memory
                        del combined_df
                        processed_chunks = []
                        gc.collect()
            
            # Save any remaining chunks
            if processed_chunks:
                combined_df = pd.concat(processed_chunks, ignore_index=True)
                # If we've already written to the file, append
                if os.path.exists(output_file):
                    existing_df = pd.read_parquet(output_file)
                    pd.concat([existing_df, combined_df], ignore_index=True).to_parquet(output_file)
                    del existing_df
                else:
                    combined_df.to_parquet(output_file)
                
                # Clear memory
                del combined_df
                processed_chunks = []
                gc.collect()
            
            # Add output file to list if it exists and has data
            if os.path.exists(output_file):
                file_size = os.path.getsize(output_file) / (1024 * 1024)  # Size in MB
                print(f"Created output file: {output_file} ({file_size:.2f} MB)")
                output_files.append(output_file)
                print(f"Rows processed in this file: {file_rows_processed:,}")
            
            file_time = time.time() - file_start_time
            print(f"File {i+1} processed in {file_time:.2f} seconds")
            
            # Periodic garbage collection
            gc.collect()
            
        except Exception as e:
            print(f"Error processing file {file_path}: {str(e)}")
            import traceback
            traceback.print_exc()
    
    # Report overall statistics
    total_time = time.time() - start_time
    rows_per_second = total_rows_processed / total_time if total_time > 0 else 0
    print(f"\nProcessing complete!")
    print(f"Total rows processed: {total_rows_processed:,}")
    print(f"Total chunks processed: {total_chunks_processed:,}")
    print(f"Total output files: {len(output_files)}")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Processing speed: {rows_per_second:.2f} rows per second")
    
    # Return the list of output files
    return output_files

## 4. Configure Data Loading

In [8]:
# Define optimized dtypes
dtypes = {
    'transaction_id': 'string',
    'user_id': 'string',
    'merchant_id': 'string',
    'merchant_name': 'string',
    'merchant_category': 'category',
    'amount': 'float32',
    'currency': 'category',
    'location_country': 'category',
    'location_city': 'category',
    'is_online': 'bool',
    'device_type': 'category',
    'status': 'category'
}

# Find all transaction files
base_dir = 'transaction_data'
all_csv_files = glob(os.path.join(base_dir, '**', 'transactions.csv'), recursive=True)
print(f"Found {len(all_csv_files)} transaction files")

Found 31 transaction files


## 5. Process the Data

In [9]:
SAMPLE_FRACTION = None # Process 5% of the data to build a representative sample
MAX_FILES = None  # Limit to first N files for initial analysis
CHUNK_SIZE = 100000  # Adjust based on available RAM

In [10]:
print("\n=== STARTING DATA PROCESSING ===")
log_memory("Initial memory usage")

# Comment out the sampling parameters for full dataset processing once your code is working
output_files = process_transactions(
    all_csv_files, 
    chunksize=CHUNK_SIZE,
    output_dir="processed_chunks",
    max_files=MAX_FILES  # Remove this for full processing
)

log_memory("Memory usage after data loading")

# Load the processed data for analysis
if output_files:
    print(f"Successfully processed {len(output_files)} files")
    # Load the first file to get a sample for initial exploration
    sample_df = pd.read_parquet(output_files[0])
    print(f"Sample DataFrame shape: {sample_df.shape}")
    print(f"Sample DataFrame memory usage: {sample_df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
else:
    print("No output files were generated")


=== STARTING DATA PROCESSING ===
Initial memory usage: 119.88 MB

--- Processing and saving files ---

Processing file 1/31: 2025-04-07
Memory before processing file 1: 119.88 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 1: 100%|██████████| 18/18 [00:12<00:00,  1.40it/s]


Created output file: processed_chunks\processed_file_1.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 1 processed in 16.94 seconds

Processing file 2/31: 2025-04-08
Memory before processing file 2: 1112.18 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 2: 100%|██████████| 18/18 [00:13<00:00,  1.30it/s]


Created output file: processed_chunks\processed_file_2.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 2 processed in 18.02 seconds

Processing file 3/31: 2025-04-09
Memory before processing file 3: 1328.94 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 3: 100%|██████████| 18/18 [00:20<00:00,  1.12s/it]


Created output file: processed_chunks\processed_file_3.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 3 processed in 25.93 seconds

Processing file 4/31: 2025-04-10
Memory before processing file 4: 1796.72 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 4: 100%|██████████| 18/18 [00:16<00:00,  1.08it/s]


Created output file: processed_chunks\processed_file_4.parquet (81.96 MB)
Rows processed in this file: 1,708,500
File 4 processed in 20.96 seconds

Processing file 5/31: 2025-04-11
Memory before processing file 5: 1929.74 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 5: 100%|██████████| 18/18 [00:33<00:00,  1.87s/it]


Created output file: processed_chunks\processed_file_5.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 5 processed in 34.76 seconds

Processing file 6/31: 2025-04-12
Memory before processing file 6: 2105.76 MB
File size: 245.75 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 6: 100%|██████████| 18/18 [00:47<00:00,  2.67s/it]


Created output file: processed_chunks\processed_file_6.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 6 processed in 49.19 seconds

Processing file 7/31: 2025-04-13
Memory before processing file 7: 2623.96 MB
File size: 245.78 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 7: 100%|██████████| 18/18 [01:01<00:00,  3.44s/it]


Created output file: processed_chunks\processed_file_7.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 7 processed in 63.13 seconds

Processing file 8/31: 2025-04-14
Memory before processing file 8: 2770.77 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 8: 100%|██████████| 18/18 [01:04<00:00,  3.59s/it]


Created output file: processed_chunks\processed_file_8.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 8 processed in 65.93 seconds

Processing file 9/31: 2025-04-15
Memory before processing file 9: 2889.83 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 9: 100%|██████████| 18/18 [01:18<00:00,  4.35s/it]


Created output file: processed_chunks\processed_file_9.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 9 processed in 79.52 seconds

Processing file 10/31: 2025-04-16
Memory before processing file 10: 3069.75 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 10: 100%|██████████| 18/18 [01:12<00:00,  4.05s/it]


Created output file: processed_chunks\processed_file_10.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 10 processed in 74.16 seconds

Processing file 11/31: 2025-04-17
Memory before processing file 11: 3319.56 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 11: 100%|██████████| 18/18 [01:20<00:00,  4.47s/it]


Created output file: processed_chunks\processed_file_11.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 11 processed in 81.65 seconds

Processing file 12/31: 2025-04-18
Memory before processing file 12: 3379.13 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 12: 100%|██████████| 18/18 [01:01<00:00,  3.41s/it]


Created output file: processed_chunks\processed_file_12.parquet (81.98 MB)
Rows processed in this file: 1,708,500
File 12 processed in 62.66 seconds

Processing file 13/31: 2025-04-19
Memory before processing file 13: 3934.44 MB
File size: 245.78 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 13: 100%|██████████| 18/18 [01:08<00:00,  3.80s/it]


Created output file: processed_chunks\processed_file_13.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 13 processed in 69.56 seconds

Processing file 14/31: 2025-04-20
Memory before processing file 14: 4066.86 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 14: 100%|██████████| 18/18 [01:11<00:00,  3.97s/it]


Created output file: processed_chunks\processed_file_14.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 14 processed in 72.61 seconds

Processing file 15/31: 2025-04-21
Memory before processing file 15: 4287.55 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 15: 100%|██████████| 18/18 [01:14<00:00,  4.15s/it]


Created output file: processed_chunks\processed_file_15.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 15 processed in 75.90 seconds

Processing file 16/31: 2025-04-22
Memory before processing file 16: 4381.43 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 16: 100%|██████████| 18/18 [01:15<00:00,  4.19s/it]


Created output file: processed_chunks\processed_file_16.parquet (81.96 MB)
Rows processed in this file: 1,708,500
File 16 processed in 76.54 seconds

Processing file 17/31: 2025-04-23
Memory before processing file 17: 4620.72 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 17: 100%|██████████| 18/18 [01:17<00:00,  4.31s/it]


Created output file: processed_chunks\processed_file_17.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 17 processed in 78.59 seconds

Processing file 18/31: 2025-04-24
Memory before processing file 18: 4893.09 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 18: 100%|██████████| 18/18 [01:18<00:00,  4.37s/it]


Created output file: processed_chunks\processed_file_18.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 18 processed in 79.67 seconds

Processing file 19/31: 2025-04-25
Memory before processing file 19: 4885.09 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 19: 100%|██████████| 18/18 [01:20<00:00,  4.49s/it]


Created output file: processed_chunks\processed_file_19.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 19 processed in 81.78 seconds

Processing file 20/31: 2025-04-26
Memory before processing file 20: 5111.52 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 20: 100%|██████████| 18/18 [01:29<00:00,  4.96s/it]


Created output file: processed_chunks\processed_file_20.parquet (81.96 MB)
Rows processed in this file: 1,708,500
File 20 processed in 90.23 seconds

Processing file 21/31: 2025-04-27
Memory before processing file 21: 5345.29 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 21: 100%|██████████| 18/18 [01:28<00:00,  4.94s/it]


Created output file: processed_chunks\processed_file_21.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 21 processed in 89.85 seconds

Processing file 22/31: 2025-04-28
Memory before processing file 22: 5507.49 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 22: 100%|██████████| 18/18 [01:32<00:00,  5.11s/it]


Created output file: processed_chunks\processed_file_22.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 22 processed in 93.00 seconds

Processing file 23/31: 2025-04-29
Memory before processing file 23: 5564.02 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 23: 100%|██████████| 18/18 [01:32<00:00,  5.14s/it]


Created output file: processed_chunks\processed_file_23.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 23 processed in 93.40 seconds

Processing file 24/31: 2025-04-30
Memory before processing file 24: 5664.48 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 24: 100%|██████████| 18/18 [01:48<00:00,  6.03s/it]


Created output file: processed_chunks\processed_file_24.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 24 processed in 109.39 seconds

Processing file 25/31: 2025-05-01
Memory before processing file 25: 6829.34 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 25: 100%|██████████| 18/18 [01:55<00:00,  6.42s/it]


Created output file: processed_chunks\processed_file_25.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 25 processed in 116.61 seconds

Processing file 26/31: 2025-05-02
Memory before processing file 26: 7020.30 MB
File size: 245.78 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 26: 100%|██████████| 18/18 [01:58<00:00,  6.58s/it]


Created output file: processed_chunks\processed_file_26.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 26 processed in 119.51 seconds

Processing file 27/31: 2025-05-03
Memory before processing file 27: 7199.67 MB
File size: 245.78 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 27: 100%|██████████| 18/18 [03:00<00:00, 10.05s/it]


Created output file: processed_chunks\processed_file_27.parquet (81.98 MB)
Rows processed in this file: 1,708,500
File 27 processed in 181.95 seconds

Processing file 28/31: 2025-05-04
Memory before processing file 28: 7128.91 MB
File size: 245.78 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 28: 100%|██████████| 18/18 [02:21<00:00,  7.86s/it]


Created output file: processed_chunks\processed_file_28.parquet (81.97 MB)
Rows processed in this file: 1,708,500
File 28 processed in 142.66 seconds

Processing file 29/31: 2025-05-05
Memory before processing file 29: 7365.68 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 29: 100%|██████████| 18/18 [02:20<00:00,  7.79s/it]


Created output file: processed_chunks\processed_file_29.parquet (81.96 MB)
Rows processed in this file: 1,708,500
File 29 processed in 141.20 seconds

Processing file 30/31: 2025-05-06
Memory before processing file 30: 7487.72 MB
File size: 245.77 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 30: 100%|██████████| 18/18 [02:47<00:00,  9.31s/it]


Created output file: processed_chunks\processed_file_30.parquet (81.98 MB)
Rows processed in this file: 1,708,500
File 30 processed in 168.75 seconds

Processing file 31/31: 2025-05-07
Memory before processing file 31: 7710.64 MB
File size: 245.76 MB
Approximate rows: 1,708,500


  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
  if pd.api.types.is_categorical_dtype(df[col]):
Processing file 31: 100%|██████████| 18/18 [02:57<00:00,  9.89s/it]


Created output file: processed_chunks\processed_file_31.parquet (81.98 MB)
Rows processed in this file: 1,708,500
File 31 processed in 178.93 seconds

Processing complete!
Total rows processed: 52,963,500
Total chunks processed: 558
Total output files: 31
Total processing time: 2695.49 seconds
Processing speed: 19648.93 rows per second
Memory usage after data loading: 1721.60 MB
Successfully processed 31 files
Sample DataFrame shape: (1708500, 21)
Sample DataFrame memory usage: 658.27 MB


## Data Overview