In [None]:
# from pathlib import Path
# import pandas as pd

# # Scan all CSV files
# cleaned_path = Path("meta_scan_csvs/cleaned")
# csv_files = list(cleaned_path.rglob("*.csv"))

# # Print the shape of every cleaned CSV
# for csv_file in csv_files:
#     df = pd.read_csv(csv_file)
#     print(f"  Loaded clean {csv_file.name}: {df.shape}")

# The scan above revealed the files with the fewest columns. These were removed
# so that more features are common to all remaining files (and usable by the model):
    # lab_person_4_cleaned.csv: 32 columns
    # kitchen_7_cleaned.csv: 34 columns
    # kitchen_8_cleaned.csv: 35 columns
    # kitchen_person_1_cleaned.csv: 35 columns
    # kitchen_5_cleaned.csv: 38 columns

In [None]:
from pathlib import Path
import pandas as pd

def parse_filename(filepath):
    """
    Parse a cleaned-scan filename into (room_type, scan_type, trial_number).

    The expected stem layout is ``<room>[_<scan>]_<trial>_cleaned``, where
    ``blinds_up`` is the only two-word room type.

    Examples:
    - blinds_1_cleaned.csv -> room: blinds, scan: base, trial: 1
    - blinds_up_1_cleaned.csv -> room: blinds_up, scan: base, trial: 1
    - kitchen_motion_3_cleaned.csv -> room: kitchen, scan: motion, trial: 3

    Args:
        filepath: ``pathlib.Path`` or path string pointing at the CSV file.

    Returns:
        Tuple ``(room_type, scan_type, trial_number)``. When the part after
        the room type does not fit one of the layouts above (too many parts,
        or a trial token that is not an integer), ``scan_type`` is
        ``'unknown'`` and ``trial_number`` is ``-1``.
    """
    filename = Path(filepath).stem.replace('_cleaned', '')

    # Handle blinds_up as a special case (two-word room type). Match on the
    # full token so that e.g. 'blinds_upper_1' is not mistaken for blinds_up.
    if filename == 'blinds_up':
        room_type, remainder = 'blinds_up', ''
    elif filename.startswith('blinds_up_'):
        room_type = 'blinds_up'
        remainder = filename[len('blinds_up_'):]
    else:
        # Everything before the first underscore is the room type
        room_type, _, remainder = filename.partition('_')

    # Now parse the remainder for scan type and trial
    remainder_parts = remainder.split('_')

    if len(remainder_parts) == 1:
        # Base scan (just a number)
        scan_type, trial_token = 'base', remainder_parts[0]
    elif len(remainder_parts) == 2:
        # Scan type + number (e.g., motion_3)
        scan_type, trial_token = remainder_parts
    else:
        return room_type, 'unknown', -1

    try:
        trial_number = int(trial_token)
    except ValueError:
        # Non-numeric (or missing) trial: use the same fallback as any other
        # unrecognised layout instead of aborting the whole inventory scan.
        return room_type, 'unknown', -1

    return room_type, scan_type, trial_number

# Scan all CSV files. The list is sorted so that the inventory row order (and
# therefore any seeded shuffle applied to it later) does not depend on the
# order in which the filesystem happens to return the files.
cleaned_path = Path("meta_scan_csvs/cleaned")
csv_files = sorted(cleaned_path.rglob("*.csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found under {cleaned_path}")

# Scan counts used by the verification step at the end of this cell
ORIGINAL_SCAN_COUNT = 105   # scans recorded originally
EXPECTED_SCAN_COUNT = 100   # scans left after removing those with too few features

# Create metadata inventory: one record per CSV file
metadata = []
for csv_file in csv_files:
    room_type, scan_type, trial_number = parse_filename(csv_file)

    # The file is loaded in full because the row count is part of the inventory
    df = pd.read_csv(csv_file)
    print(f"  Loaded {csv_file.name}: {df.shape}")
    num_rows = len(df)

    metadata.append({
        'filepath': str(csv_file),
        'filename': csv_file.name,
        'room_type': room_type,
        'scan_type': scan_type,
        'trial_number': trial_number,
        'num_rows': num_rows,
        'folder': csv_file.parent.name
    })

# Convert to DataFrame
metadata_df = pd.DataFrame(metadata)

# Display summary
print("=" * 80)
print("DATA INVENTORY SUMMARY")
print("=" * 80)
print(f"\nTotal CSV files: {len(metadata_df)}")
print(f"\nRoom types ({metadata_df['room_type'].nunique()}): {sorted(metadata_df['room_type'].unique())}")
print(f"Scan types ({metadata_df['scan_type'].nunique()}): {sorted(metadata_df['scan_type'].unique())}")

print("\n" + "=" * 80)
print("COUNTS BY ROOM TYPE AND SCAN TYPE")
print("=" * 80)
pivot = metadata_df.groupby(['room_type', 'scan_type']).size().unstack(fill_value=0)
print(pivot)
print("\nTotal scans per room type:")
print(pivot.sum(axis=1))

print("\n" + "=" * 80)
print("ROW COUNTS STATISTICS BY ROOM TYPE")
print("=" * 80)
print(metadata_df.groupby('room_type')['num_rows'].describe().round(2))

print("\n" + "=" * 80)
print("SAMPLE FILES (sorted by room_type, scan_type, trial)")
print("=" * 80)
sample = metadata_df.sort_values(['room_type', 'scan_type', 'trial_number'])
print(sample[['filename', 'room_type', 'scan_type', 'trial_number', 'num_rows']].head(15))

# Verify we have exactly EXPECTED_SCAN_COUNT scans
print("\n" + "=" * 80)
print("VERIFICATION")
print("=" * 80)
print(
    f"Expected: {EXPECTED_SCAN_COUNT} scans "
    f"(was {ORIGINAL_SCAN_COUNT}, removed {ORIGINAL_SCAN_COUNT - EXPECTED_SCAN_COUNT} with insufficient features)"
)
print(f"Actual: {len(metadata_df)} scans")
if len(metadata_df) == EXPECTED_SCAN_COUNT:
    print("✓ Count matches!")
else:
    print("⚠ Count mismatch - please review")

# Save metadata to CSV for reference
metadata_df.to_csv("meta_scan_csvs/data_inventory.csv", index=False)
print("\n✓ Saved complete inventory to: meta_scan_csvs/data_inventory.csv")

In [None]:
from sklearn.model_selection import train_test_split
# Load the inventory we just created. Rows are put into a fixed order first so
# that the seeded shuffles below always pick the same files, regardless of the
# order in which the inventory was written.
metadata_df = (
    pd.read_csv("meta_scan_csvs/data_inventory.csv")
    .sort_values('filepath')
    .reset_index(drop=True)
)
if metadata_df.empty:
    raise ValueError("meta_scan_csvs/data_inventory.csv contains no scans to split")

# Create a stratification column combining room_type and scan_type
metadata_df['strata'] = metadata_df['room_type'] + '_' + metadata_df['scan_type']

print("=" * 80)
print("MANUAL STRATIFIED TRAIN/TEST SPLIT (~80/20)")
print("=" * 80)

# Manually assign splits to ensure good distribution
train_indices = []
test_indices = []

for strata, strata_df in metadata_df.groupby('strata', sort=True):
    n = len(strata_df)

    # Calculate split: roughly 20% for test, at least 1. A stratum with a
    # single file stays in training, otherwise it would be evaluated without
    # the model ever having seen it.
    n_test = max(1, round(n * 0.2)) if n > 1 else 0
    n_train = n - n_test

    print(f"{strata}: {n} total → {n_train} train, {n_test} test")

    # Shuffle and split
    shuffled = strata_df.sample(frac=1, random_state=42)
    test_indices.extend(shuffled.index[:n_test].tolist())
    train_indices.extend(shuffled.index[n_test:].tolist())

# Every file must land in exactly one of the two sets
assert set(train_indices).isdisjoint(test_indices), "train/test overlap"
assert len(train_indices) + len(test_indices) == len(metadata_df), "files lost in split"

train_df = metadata_df.loc[train_indices]
test_df = metadata_df.loc[test_indices]

print(f"\nTotal files: {len(metadata_df)}")
print(f"Training files: {len(train_df)} ({len(train_df)/len(metadata_df)*100:.1f}%)")
print(f"Testing files: {len(test_df)} ({len(test_df)/len(metadata_df)*100:.1f}%)")

print("\n" + "=" * 80)
print("TRAINING SET DISTRIBUTION")
print("=" * 80)
train_dist = train_df.groupby(['room_type', 'scan_type']).size().unstack(fill_value=0)
print(train_dist)
print(f"\nTotal per room type:\n{train_dist.sum(axis=1)}")

print("\n" + "=" * 80)
print("TESTING SET DISTRIBUTION")
print("=" * 80)
test_dist = test_df.groupby(['room_type', 'scan_type']).size().unstack(fill_value=0)
print(test_dist)
print(f"\nTotal per room type:\n{test_dist.sum(axis=1)}")

# Save the splits
train_df.to_csv("meta_scan_csvs/train_metadata.csv", index=False)
test_df.to_csv("meta_scan_csvs/test_metadata.csv", index=False)

print("\n" + "=" * 80)
print("FILES SAVED")
print("=" * 80)
print("✓ meta_scan_csvs/train_metadata.csv")
print("✓ meta_scan_csvs/test_metadata.csv")

# Show some examples
print("\n" + "=" * 80)
print("SAMPLE TRAINING FILES")
print("=" * 80)
print(train_df[['filename', 'room_type', 'scan_type', 'trial_number']].head(10))

print("\n" + "=" * 80)
print("SAMPLE TESTING FILES")
print("=" * 80)
print(test_df[['filename', 'room_type', 'scan_type', 'trial_number']].head(10))

In [None]:
# !pip install sktime --break-system-packages

In [None]:
cleaned_path = Path("meta_scan_csvs/cleaned")

# Sorted so the per-file listing below is printed in a stable order
column_scan_files = sorted(cleaned_path.rglob("*.csv"))
if not column_scan_files:
    raise FileNotFoundError(f"No CSV files found under {cleaned_path}")

print("=" * 80)
print(f"ANALYZING SENSOR COLUMNS ACROSS ALL {len(column_scan_files)} CSVs")
print("=" * 80)

# One set of sensor column names per CSV (the time axis is not a sensor)
all_columns_sets = []

for csv_file in column_scan_files:
    # nrows=0 parses only the header row; the data rows are not needed here
    header_only = pd.read_csv(csv_file, nrows=0)
    cols = [c for c in header_only.columns if c != 'Time (s)']
    all_columns_sets.append(set(cols))
    print(f"{csv_file.name}: {len(cols)} columns")

# Find common columns across ALL CSVs
common_columns = set.intersection(*all_columns_sets)

print("\n" + "=" * 80)
print(f"COMMON COLUMNS ACROSS ALL {len(column_scan_files)} CSVs: {len(common_columns)}")
print("=" * 80)
print(sorted(common_columns))

# Find the range of column counts
all_column_counts = [len(s) for s in all_columns_sets]
print(f"\nColumn count range: {min(all_column_counts)} to {max(all_column_counts)}")

In [None]:
import numpy as np

# Sensor columns present in every cleaned CSV (result of the column analysis
# above). One name per line, in the order used for the feature axis.
COMMON_COLUMNS = [
    # Title-cased metric names
    '% Prims Clipped',
    '% Prims Trivially Rejected',
    '% Stalled on System Memory',
    '% Texture L2 Miss',
    '% Vertex Fetch Stall',
    'ALU / Fragment',
    'ALU / Vertex',
    'Average Polygon Area',
    'Average Vertices / Polygon',
    'Avg Bytes / Fragment',
    'Avg Bytes / Vertex',
    'Avg Preemption Delay',
    'Clocks / Second',
    'EFU / Fragment',
    'EFU / Vertex',
    'Fragment ALU Instructions / Sec (Full)',
    'Fragment ALU Instructions / Sec (Half)',
    'Fragment EFU Instructions / Second',
    'Fragment Instructions / Second',
    'Fragments Shaded / Second',
    'GPU % Bus Busy',
    'GPU Frequency',
    'L1 Texture Cache Miss Per Pixel',
    'Pre-clipped Polygons/Second',
    'Preemptions / second',
    'Read Total (Bytes/sec)',
    'Reused Vertices / Second',
    'SP Memory Read (Bytes/Second)',
    'Texture Memory Read BW (Bytes/Second)',
    'Textures / Fragment',
    'Textures / Vertex',
    'Vertex Instructions / Second',
    'Vertex Memory Read (Bytes/Second)',
    'Vertices Shaded / Second',
    'Write Total (Bytes/sec)',
    # snake_case metric names
    'app_gpu_ms',
    'app_rss_mb',
    'app_uss_mb',
    'app_vss_mb',
    'application_layer_count',
    'application_prediction_milliseconds',
    'available_memory_mb',
    'cpu_frequency_mhz',
    'cpu_level',
    'cpu_util_0',
    'cpu_util_1',
    'cpu_util_2',
    'cpu_util_3',
    'cpu_util_4',
    'cpu_util_5',
    'display_refresh_rate',
    'gpu_frequency_mhz',
    'gpu_level',
    'gpu_util',
    'mem_frequency_mhz',
    'stale_frames_per_second',
    'timewarp_gpu_ms',
]

def create_windows(df, window_size=75):
    """
    Split a time-series dataframe into consecutive, non-overlapping windows.

    Only complete windows are produced: trailing rows that do not fill a
    whole window of ``window_size`` rows are dropped.

    Returns a list of dataframes (independent copies), in row order.
    """
    full_window_count = len(df) // window_size
    window_starts = (k * window_size for k in range(full_window_count))
    return [df.iloc[start:start + window_size].copy() for start in window_starts]

def process_dataset(metadata_df, output_dir, dataset_name, window_size=75):
    """
    Process all CSVs in a dataset (train or test), create windows using ONLY common features.

    Args:
        metadata_df: DataFrame with one row per CSV; the columns read here are
            'filepath', 'filename', 'room_type', 'scan_type' and 'trial_number'.
        output_dir: Directory the pickle is written to (created if missing).
        dataset_name: Label used in the printed report and in the output
            filename ``<dataset_name>_windows_common.pkl``.
        window_size: Number of rows per window (default 75).

    Returns:
        DataFrame with one row per window and the columns
        'original_filename', 'room_type', 'scan_type', 'trial_number',
        'window_id' and 'window_data' (a 2D numpy array, rows x features).
        The same DataFrame is pickled to ``output_dir``.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    window_columns = [
        'original_filename', 'room_type', 'scan_type',
        'trial_number', 'window_id', 'window_data',
    ]
    all_windows_data = []

    print(f"\n{'=' * 80}")
    print(f"PROCESSING {dataset_name.upper()} SET (COMMON FEATURES ONLY)")
    print(f"{'=' * 80}\n")

    for _, row in metadata_df.iterrows():
        csv_path = Path(row['filepath'])

        # Load CSV
        df = pd.read_csv(csv_path)

        # Select ONLY common columns (no Time column), keeping COMMON_COLUMNS order
        available_common = [col for col in COMMON_COLUMNS if col in df.columns]

        if len(available_common) != len(COMMON_COLUMNS):
            # Best effort: the file is still processed, but its windows will
            # have fewer features than the others.
            missing = sorted(set(COMMON_COLUMNS) - set(available_common))
            print(f"WARNING: {row['filename']} missing columns: {missing}")

        df = df[available_common]

        # Create windows
        windows = create_windows(df, window_size=window_size)

        # Report the feature count actually used for this file
        print(f"{row['filename']}: {len(df)} rows → {len(windows)} windows ({len(available_common)} features)")

        # Save each window with metadata
        for window_idx, window_df in enumerate(windows):
            all_windows_data.append({
                'original_filename': row['filename'],
                'room_type': row['room_type'],
                'scan_type': row['scan_type'],
                'trial_number': row['trial_number'],
                'window_id': window_idx,
                'window_data': window_df.to_numpy()  # Store as numpy array
            })

    # Guard the average against an empty metadata frame
    avg_windows = len(all_windows_data) / len(metadata_df) if len(metadata_df) else 0.0

    print(f"\n{'=' * 80}")
    print(f"{dataset_name.upper()} SUMMARY")
    print(f"{'=' * 80}")
    print(f"Total CSVs processed: {len(metadata_df)}")
    print(f"Total windows created: {len(all_windows_data)}")
    print(f"Windows per CSV (avg): {avg_windows:.1f}")
    print(f"Features per window: {len(COMMON_COLUMNS)} (common features only)")

    # Save as pickle for easy loading later. Columns are passed explicitly so
    # the frame has the expected schema even when no window was produced.
    windows_df = pd.DataFrame(all_windows_data, columns=window_columns)
    output_file = output_path / f"{dataset_name}_windows_common.pkl"
    windows_df.to_pickle(output_file)
    print(f"\n✓ Saved to: {output_file}")

    # Show distribution
    print("\nWindows by room type:")
    print(windows_df.groupby('room_type').size())

    return windows_df

# Load train and test metadata
train_metadata = pd.read_csv("meta_scan_csvs/train_metadata.csv")
test_metadata = pd.read_csv("meta_scan_csvs/test_metadata.csv")

# Both sets are written to the same directory
WINDOWED_DATA_DIR = "meta_scan_csvs/windowed_data"

# Process training set
train_windows = process_dataset(train_metadata, WINDOWED_DATA_DIR, "train")

# Process testing set
test_windows = process_dataset(test_metadata, WINDOWED_DATA_DIR, "test")

# Collect the distinct window shapes so the "same shape" statement below is
# checked rather than assumed (a file with missing columns would break it).
window_shapes = {
    window.shape
    for windows_df in (train_windows, test_windows)
    for window in windows_df.get('window_data', [])
}

print(f"\n{'=' * 80}")
print("WINDOWING COMPLETE (COMMON FEATURES)")
print(f"{'=' * 80}")
print(f"Training windows: {len(train_windows)}")
print(f"Testing windows: {len(test_windows)}")
print(f"Features per window: {len(COMMON_COLUMNS)}")
if len(window_shapes) <= 1:
    print("No padding needed - all windows have same shape!")
else:
    print(f"WARNING: windows have differing shapes: {sorted(window_shapes)}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sktime.classification.kernel_based import RocketClassifier
import warnings
warnings.filterwarnings('ignore')

# Section banner for this step
print("=" * 80, "STEP 4: ROCKET CLASSIFIER FOR ROOM TYPE (COMMON FEATURES)", "=" * 80, sep="\n")

# Read back the train/test windows (common features only) written by the
# windowing step
print("\nLoading windowed data...")
train_windows = pd.read_pickle("meta_scan_csvs/windowed_data/train_windows_common.pkl")
test_windows = pd.read_pickle("meta_scan_csvs/windowed_data/test_windows_common.pkl")

# Report how many windows each set contains
print(f"Training windows: {len(train_windows)}", f"Testing windows: {len(test_windows)}", sep="\n")

# Prepare data for ROCKET (no padding needed!)
def prepare_rocket_data(windows_df):
    """
    Convert windowed data to the 3D array layout sktime expects.

    Each entry of ``windows_df['window_data']`` is a 2D array with one row per
    time step and one column per feature. sktime's ``numpy3D`` panel layout is
    (n_samples, n_features, n_timepoints), so the stacked windows are
    transposed; otherwise time steps would be treated as channels.

    Args:
        windows_df: DataFrame with 'window_data' (equally shaped 2D arrays)
            and 'room_type' columns.

    Returns:
        Tuple ``(X, y)``: ``X`` of shape (n_samples, n_features, n_timepoints)
        and ``y`` holding the room type label of each window.
    """
    stacked = np.stack(windows_df['window_data'].values)  # (n_samples, n_timepoints, n_features)
    # Swap the last two axes; made contiguous so downstream code gets a plain array
    X = np.ascontiguousarray(np.transpose(stacked, (0, 2, 1)))
    y = windows_df['room_type'].values  # Room type labels

    return X, y

print("\nPreparing training data...")
X_train, y_train = prepare_rocket_data(train_windows)
print(f"X_train shape: {X_train.shape}")
print(f"  - Samples: {X_train.shape[0]}")
print(f"  - Time points: {X_train.shape[2]}")
print(f"  - Features (common sensors): {X_train.shape[1]}")

print("\nPreparing testing data...")
X_test, y_test = prepare_rocket_data(test_windows)
print(f"X_test shape: {X_test.shape}")

# Check class distribution
print("\n" + "=" * 80)
print("CLASS DISTRIBUTION")
print("=" * 80)
print("\nTraining set:")
train_dist = pd.Series(y_train).value_counts().sort_index()
print(train_dist)
print("\nTesting set:")
test_dist = pd.Series(y_test).value_counts().sort_index()
print(test_dist)

# Train ROCKET classifier
NUM_KERNELS = 10000
RANDOM_STATE = 42

print("\n" + "=" * 80)
print("TRAINING ROCKET CLASSIFIER")
print("=" * 80)
print("\nInitializing ROCKET classifier...")
print(f"  - Number of kernels: {NUM_KERNELS:,}")
print(f"  - Random state: {RANDOM_STATE}")

rocket_classifier = RocketClassifier(
    num_kernels=NUM_KERNELS,
    rocket_transform="minirocket",
    random_state=RANDOM_STATE,
)

print("\nFitting ROCKET classifier (this may take a few minutes)...")
rocket_classifier.fit(X_train, y_train)
print("✓ Training complete!")

# Make predictions
print("\n" + "=" * 80)
print("EVALUATION")
print("=" * 80)

print("\nMaking predictions on training set...")
y_train_pred = rocket_classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

print("Making predictions on test set...")
y_test_pred = rocket_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

print("\n" + "=" * 80)
print("DETAILED CLASSIFICATION REPORT (TEST SET)")
print("=" * 80)
print(classification_report(y_test, y_test_pred, digits=4))

print("\n" + "=" * 80)
print("CONFUSION MATRIX (TEST SET)")
print("=" * 80)
# Labels cover both actual and predicted classes and are passed explicitly, so
# the matrix always has exactly one row/column per entry of room_types.
room_types = sorted(set(y_test.tolist()) | set(np.asarray(y_test_pred).tolist()))
cm = confusion_matrix(y_test, y_test_pred, labels=room_types)
cm_df = pd.DataFrame(cm, index=room_types, columns=room_types)
print(cm_df)
print("\nRows = Actual, Columns = Predicted")

# Calculate per-class accuracy
print("\n" + "=" * 80)
print("PER-CLASS ACCURACY")
print("=" * 80)
for i, room_type in enumerate(room_types):
    support = cm[i, :].sum()
    if support == 0:
        # Class was predicted but never occurs in the test labels
        print(f"{room_type:12s}: n/a (no test samples)")
        continue
    class_accuracy = cm[i, i] / support
    print(f"{room_type:12s}: {class_accuracy:.4f} ({class_accuracy*100:.2f}%)")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE - NO MODEL SAVED")
print("=" * 80)

STEP 4: ROCKET CLASSIFIER FOR ROOM TYPE (COMMON FEATURES)

Loading windowed data...
Training windows: 34597
Testing windows: 5932

Preparing training data...
X_train shape: (34597, 75, 57)
  - Samples: 34597
  - Time points: 75
  - Features (common sensors): 57

Preparing testing data...
X_test shape: (5932, 75, 57)

CLASS DISTRIBUTION

Training set:
blinds        7368
blinds_up     4726
hallway      12907
kitchen       4834
lab           4762
Name: count, dtype: int64

Testing set:
blinds       1194
blinds_up    1402
hallway      1171
kitchen      1030
lab          1135
Name: count, dtype: int64

TRAINING ROCKET CLASSIFIER

Initializing ROCKET classifier...
  - Number of kernels: 10,000
  - Random state: 42

Fitting ROCKET classifier (this may take a few minutes)...
