In [17]:
import pandas as pd
import os
from datetime import datetime

In [24]:
Test_data_inputpath='Data\\TestData\\feature_added_eng_data_limited.csv'
Output_Folder="Data\TestData\TestSet"



# Create output directory if it doesn't exist
os.makedirs(Output_Folder, exist_ok=True)

print(f"Reading data from: {Test_data_inputpath}")
print(f"Output folder: {Output_Folder}")

Reading data from: Data\TestData\feature_added_eng_data_limited.csv
Output folder: Data\TestData\TestSet


In [25]:
# Read the CSV file
try:
    df = pd.read_csv(Test_data_inputpath)
    print(f"Data loaded successfully! Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    
    # Display first few rows to understand the structure
    print("\nFirst 5 rows:")
    print(df.head())
    
    # Check for date columns
    print(f"\nData types:")
    print(df.dtypes)
    
except FileNotFoundError:
    print(f"Error: File not found at {Test_data_inputpath}")
except Exception as e:
    print(f"Error reading file: {e}")

Data loaded successfully! Shape: (59933, 19)
Columns: ['Unnamed: 0', 'elevatorunitId', 'elevatorunitnumber', 'Date', 'slow_door_operations', 'total_door_cycles', 'total_door_operations', 'total_door_reversals', 'door_failure_events', 'hoistway_faults', 'safety_chain_issues', 'levelling_total_errors', 'startup_delays', 'average_run_time', 'total_run_starts', 'door_reversal_rate', 'safety_chain_issues_ratio', 'slow_door_operations_ratio', 'is_slow_door']

First 5 rows:
   Unnamed: 0  elevatorunitId elevatorunitnumber       Date  \
0          36              53              K0132  7/20/2025   
1          37              53              K0132  7/21/2025   
2          38              53              K0132  7/22/2025   
3          39              53              K0132  7/23/2025   
4          57              59              G3297  7/20/2025   

   slow_door_operations  total_door_cycles  total_door_operations  \
0                     0                  0                     34   
1          

In [26]:
# Identify date columns and convert them to datetime
date_columns = ['Date']

# # Look for common date column names
# possible_date_cols = ['Date']

# for col in df.columns:
#     if any(date_word in col.lower() for date_word in ['date', 'time']):
#         date_columns.append(col)
#         print(f"Found potential date column: {col}")

# # If no obvious date columns found, check data types
# if not date_columns:
#     for col in df.columns:
#         if df[col].dtype == 'object':
#             # Try to convert first few values to see if they're dates
#             sample_values = df[col].dropna().head()
#             for val in sample_values:
#                 try:
#                     pd.to_datetime(val)
#                     date_columns.append(col)
#                     print(f"Detected date column by content: {col}")
#                     break
#                 except:
#                     continue
#             if col in date_columns:
#                 break

print(f"\nDate columns found: {date_columns}")


Date columns found: ['Date']


In [28]:
# Process the date column and split data
if date_columns:
    # Use the first date column found
    date_col = date_columns[0]
    print(f"Using date column: {date_col}")
    
    # Convert to datetime
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Extract date (without time) for grouping
    df['split_date'] = df[date_col].dt.date
    
    # Get unique dates
    unique_dates = df['split_date'].unique()
    print(f"\nFound {len(unique_dates)} unique dates")
    print(f"Date range: {min(unique_dates)} to {max(unique_dates)}")
    
    # File size parameters
    MIN_RECORDS_PER_FILE = 400
    MAX_RECORDS_PER_FILE = 600
    
    # Split data by date and save (with record range limit)
    total_files_created = 0
    
    for date in unique_dates:
        # Filter data for this date
        date_data = df[df['split_date'] == date].copy()
        
        # Remove the temporary split_date column
        date_data = date_data.drop('split_date', axis=1)
        
        date_str = date.strftime('%Y-%m-%d')
        total_records_for_date = len(date_data)
        
        print(f"\nProcessing {date_str}: {total_records_for_date} records")
        
        # If records are within acceptable range (250-500), save as single file
        if MIN_RECORDS_PER_FILE <= total_records_for_date <= MAX_RECORDS_PER_FILE:
            filename = f"test_data_{date_str}.csv"
            filepath = os.path.join(Output_Folder, filename)
            date_data.to_csv(filepath, index=False)
            print(f"  📁 Saved {len(date_data)} records to {filename}")
            total_files_created += 1
        
        # If less than 250 records, we'll still save as single file but note it
        elif total_records_for_date < MIN_RECORDS_PER_FILE:
            filename = f"test_data_{date_str}.csv"
            filepath = os.path.join(Output_Folder, filename)
            date_data.to_csv(filepath, index=False)
            print(f"  📁⚠️  Saved {len(date_data)} records to {filename} (below minimum)")
            total_files_created += 1
        
        # If more than 500 records, split into optimal chunks
        else:
            # Calculate optimal number of files to keep each file between 250-500 records
            # Aim for files closer to 400 records for better balance
            target_records_per_file = 500
            num_files_needed = max(1, (total_records_for_date + target_records_per_file - 1) // target_records_per_file)
            
            # Recalculate to ensure no file goes below 250 or above 500
            records_per_file = total_records_for_date // num_files_needed
            
            # If calculated size is below 250, reduce number of files
            while records_per_file < MIN_RECORDS_PER_FILE and num_files_needed > 1:
                num_files_needed -= 1
                records_per_file = total_records_for_date // num_files_needed
            
            print(f"  ⚠️  Date has {total_records_for_date} records, splitting into {num_files_needed} files (~{records_per_file} records each)")
            
            for file_num in range(num_files_needed):
                start_idx = file_num * records_per_file
                
                # For the last file, include all remaining records
                if file_num == num_files_needed - 1:
                    end_idx = total_records_for_date
                else:
                    end_idx = (file_num + 1) * records_per_file
                
                chunk_data = date_data.iloc[start_idx:end_idx].copy()
                chunk_size = len(chunk_data)
                
                # Create filename with part number
                filename = f"test_data_{date_str}_part{file_num + 1:02d}.csv"
                filepath = os.path.join(Output_Folder, filename)
                
                chunk_data.to_csv(filepath, index=False)
                
                # Status indicator based on size
                if MIN_RECORDS_PER_FILE <= chunk_size <= MAX_RECORDS_PER_FILE:
                    status = "✅"
                elif chunk_size < MIN_RECORDS_PER_FILE:
                    status = "⚠️"
                else:
                    status = "❌"
                
                print(f"    📁{status} Part {file_num + 1}: Saved {chunk_size} records to {filename}")
                total_files_created += 1
    
    print(f"\nData splitting completed!")
    print(f"Total files created: {total_files_created}")
    print(f"Target range: {MIN_RECORDS_PER_FILE}-{MAX_RECORDS_PER_FILE} records per file")
    print(f"Files saved in: {Output_Folder}")
    
else:
    print("No date columns found. Please check your data or specify the date column manually.")
    print("Available columns:", df.columns.tolist())

Using date column: Date

Found 10 unique dates
Date range: 2025-07-20 to 2025-07-29

Processing 2025-07-20: 12386 records
  ⚠️  Date has 12386 records, splitting into 25 files (~495 records each)
    📁✅ Part 1: Saved 495 records to test_data_2025-07-20_part01.csv
    📁✅ Part 2: Saved 495 records to test_data_2025-07-20_part02.csv
    📁✅ Part 3: Saved 495 records to test_data_2025-07-20_part03.csv
    📁✅ Part 4: Saved 495 records to test_data_2025-07-20_part04.csv
    📁✅ Part 5: Saved 495 records to test_data_2025-07-20_part05.csv
    📁✅ Part 6: Saved 495 records to test_data_2025-07-20_part06.csv
    📁✅ Part 7: Saved 495 records to test_data_2025-07-20_part07.csv
    📁✅ Part 8: Saved 495 records to test_data_2025-07-20_part08.csv
    📁✅ Part 9: Saved 495 records to test_data_2025-07-20_part09.csv
    📁✅ Part 10: Saved 495 records to test_data_2025-07-20_part10.csv
    📁✅ Part 11: Saved 495 records to test_data_2025-07-20_part11.csv
    📁✅ Part 12: Saved 495 records to test_data_2025-07