In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("GRADIENT BOOSTING CLASSIFIER WITH FEATURE ENGINEERING")
print("=" * 80)

# Define common columns
COMMON_COLUMNS = [
    '% Prims Clipped', '% Prims Trivially Rejected', '% Stalled on System Memory', 
    '% Texture L2 Miss', '% Vertex Fetch Stall', 'ALU / Fragment', 'ALU / Vertex', 
    'Average Polygon Area', 'Average Vertices / Polygon', 'Avg Bytes / Fragment', 
    'Avg Bytes / Vertex', 'Avg Preemption Delay', 'Clocks / Second', 'EFU / Fragment', 
    'EFU / Vertex', 'Fragment ALU Instructions / Sec (Full)', 
    'Fragment ALU Instructions / Sec (Half)', 'Fragment EFU Instructions / Second', 
    'Fragment Instructions / Second', 'Fragments Shaded / Second', 'GPU % Bus Busy', 
    'GPU Frequency', 'L1 Texture Cache Miss Per Pixel', 'Pre-clipped Polygons/Second', 
    'Preemptions / second', 'Read Total (Bytes/sec)', 'Reused Vertices / Second', 
    'SP Memory Read (Bytes/Second)', 'Texture Memory Read BW (Bytes/Second)', 
    'Textures / Fragment', 'Textures / Vertex', 'Vertex Instructions / Second', 
    'Vertex Memory Read (Bytes/Second)', 'Vertices Shaded / Second', 
    'Write Total (Bytes/sec)', 'app_gpu_ms', 'app_rss_mb', 'app_uss_mb', 'app_vss_mb', 
    'application_layer_count', 'application_prediction_milliseconds', 'available_memory_mb', 
    'cpu_frequency_mhz', 'cpu_level', 'cpu_util_0', 'cpu_util_1', 'cpu_util_2', 
    'cpu_util_3', 'cpu_util_4', 'cpu_util_5', 'display_refresh_rate', 
    'gpu_frequency_mhz', 'gpu_level', 'gpu_util', 'mem_frequency_mhz', 
    'stale_frames_per_second', 'timewarp_gpu_ms'
]

def create_overlapping_windows(df, window_size=75, stride=25):
    """
    Create overlapping windows with stride.
    stride=25 with window_size=75 gives 66% overlap and ~3x more windows
    """
    windows = []
    for i in range(0, len(df) - window_size + 1, stride):
        window = df.iloc[i:i+window_size].copy()
        windows.append(window)
    return windows

def extract_statistical_features(window_df):
    """
    Extract statistical features with emphasis on features that separate kitchen from blinds.
    """
    features = []
    
    # Top 5 most discriminative features - add MORE statistics for these
    top_features = [
        'ALU / Fragment',
        'EFU / Fragment', 
        'Average Vertices / Polygon',
        '% Stalled on System Memory',
        'Fragment ALU Instructions / Sec (Half)'
    ]
    
    # For top features, extract MANY statistics
    for col in window_df.columns:
        if col in top_features:
            # Add 10 statistics for highly discriminative features
            features.extend([
                window_df[col].mean(),
                window_df[col].std(),
                window_df[col].max(),
                window_df[col].min(),
                window_df[col].median(),
                window_df[col].quantile(0.25),
                window_df[col].quantile(0.75),
                (window_df[col].max() - window_df[col].min()),  # range
                window_df[col].skew(),
                window_df[col].kurtosis()
            ])
        else:
            # For other features, keep original 5 statistics
            features.extend([
                window_df[col].mean(),
                window_df[col].std(),
                window_df[col].max(),
                window_df[col].min(),
                window_df[col].median()
            ])
    
    return np.array(features)

def process_dataset(metadata_df, dataset_name, window_size=75, stride=25):
    """
    Process dataset: load CSVs, create overlapping windows, extract features
    """
    all_features = []
    all_labels = []
    
    print(f"\n{'=' * 80}")
    print(f"PROCESSING {dataset_name.upper()} SET")
    print(f"{'=' * 80}\n")
    
    for idx, row in metadata_df.iterrows():
        csv_path = Path(row['filepath'])
        room_type = row['room_type']
        
        # Load CSV
        df = pd.read_csv(csv_path)
        
        # Select only common columns
        available_common = [col for col in COMMON_COLUMNS if col in df.columns]
        df = df[available_common]
        
        # Create overlapping windows
        windows = create_overlapping_windows(df, window_size, stride)
        
        print(f"{row['filename']}: {len(df)} rows → {len(windows)} windows")
        
        # Extract features from each window
        for window_df in windows:
            features = extract_statistical_features(window_df)
            all_features.append(features)
            all_labels.append(room_type)
    
    X = np.array(all_features)
    y = np.array(all_labels)
    
    print(f"\n{dataset_name.upper()} SUMMARY:")
    print(f"Total windows: {len(X)}")
    print(f"Feature vector size: {X.shape[1]} (57 sensors × 5 stats)")
    
    return X, y

# Load train and test metadata
train_metadata = pd.read_csv("meta_scan_csvs/train_metadata.csv")
test_metadata = pd.read_csv("meta_scan_csvs/test_metadata.csv")

# Process training set
X_train, y_train = process_dataset(
    train_metadata, 
    "train",
    window_size=75,
    stride=25  # 66% overlap = ~3x more windows
)

# Process testing set
X_test, y_test = process_dataset(
    test_metadata, 
    "test",
    window_size=75,
    stride=25
)

# Check class distribution
print("\n" + "=" * 80)
print("CLASS DISTRIBUTION")
print("=" * 80)
print("\nTraining set:")
train_dist = pd.Series(y_train).value_counts().sort_index()
print(train_dist)
print(f"\nTesting set:")
test_dist = pd.Series(y_test).value_counts().sort_index()
print(test_dist)

# Scale features
print("\n" + "=" * 80)
print("FEATURE SCALING")
print("=" * 80)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✓ Features scaled using StandardScaler")

# Train Gradient Boosting Classifier
print("\n" + "=" * 80)
print("TRAINING GRADIENT BOOSTING CLASSIFIER")
print("=" * 80)
print("\nHyperparameters:")
print("  - n_estimators: 200")
print("  - learning_rate: 0.05")
print("  - max_depth: 5")
print("  - random_state: 42")

clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    verbose=1
)

print("\nTraining...")
clf.fit(X_train_scaled, y_train)
print("✓ Training complete!")

# Make predictions
print("\n" + "=" * 80)
print("EVALUATION")
print("=" * 80)

y_train_pred = clf.predict(X_train_scaled)
y_test_pred = clf.predict(X_test_scaled)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

print("\n" + "=" * 80)
print("DETAILED CLASSIFICATION REPORT (TEST SET)")
print("=" * 80)
print(classification_report(y_test, y_test_pred, digits=4))

print("\n" + "=" * 80)
print("CONFUSION MATRIX (TEST SET)")
print("=" * 80)
cm = confusion_matrix(y_test, y_test_pred)
room_types = sorted(np.unique(y_test))
cm_df = pd.DataFrame(cm, index=room_types, columns=room_types)
print(cm_df)
print("\nRows = Actual, Columns = Predicted")

# Per-class accuracy
print("\n" + "=" * 80)
print("PER-CLASS ACCURACY")
print("=" * 80)
for i, room_type in enumerate(room_types):
    class_accuracy = cm[i, i] / cm[i, :].sum()
    print(f"{room_type:12s}: {class_accuracy:.4f} ({class_accuracy*100:.2f}%)")

# Feature importance (top 20)
print("\n" + "=" * 80)
print("TOP 20 MOST IMPORTANT FEATURES")
print("=" * 80)

# Build feature names matching the new extraction function
feature_names = []

# Top 5 discriminative features get 10 stats each
top_features = [
    'ALU / Fragment',
    'EFU / Fragment', 
    'Average Vertices / Polygon',
    '% Stalled on System Memory',
    'Fragment ALU Instructions / Sec (Half)'
]

top_stats = ['mean', 'std', 'max', 'min', 'median', 'q25', 'q75', 'range', 'skew', 'kurtosis']
regular_stats = ['mean', 'std', 'max', 'min', 'median']

for col in COMMON_COLUMNS:
    if col in top_features:
        for stat in top_stats:
            feature_names.append(f"{col}_{stat}")
    else:
        for stat in regular_stats:
            feature_names.append(f"{col}_{stat}")

feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(20).to_string(index=False))

GRADIENT BOOSTING CLASSIFIER WITH FEATURE ENGINEERING

PROCESSING TRAIN SET

blinds_motion_5_cleaned.csv: 327 rows → 11 windows
blinds_motion_3_cleaned.csv: 335 rows → 11 windows
blinds_motion_1_cleaned.csv: 1838 rows → 71 windows
blinds_motion_4_cleaned.csv: 312 rows → 10 windows
blinds_object_5_cleaned.csv: 1827 rows → 71 windows
blinds_object_3_cleaned.csv: 327 rows → 11 windows
blinds_object_1_cleaned.csv: 1821 rows → 70 windows
blinds_object_4_cleaned.csv: 325 rows → 11 windows
blinds_person_5_cleaned.csv: 327 rows → 11 windows
blinds_person_3_cleaned.csv: 326 rows → 11 windows
blinds_person_1_cleaned.csv: 350 rows → 12 windows
blinds_person_4_cleaned.csv: 312 rows → 10 windows
hallway_5_cleaned.csv: 1493 rows → 57 windows
hallway_3_cleaned.csv: 306 rows → 10 windows
hallway_1_cleaned.csv: 300 rows → 10 windows
hallway_4_cleaned.csv: 320 rows → 10 windows
hallway_motion_5_cleaned.csv: 314 rows → 10 windows
hallway_motion_3_cleaned.csv: 313 rows → 10 windows
hallway_motion_1_cleane

ValueError: All arrays must be of the same length

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Define common columns
COMMON_COLUMNS = [
    '% Prims Clipped', '% Prims Trivially Rejected', '% Stalled on System Memory', 
    '% Texture L2 Miss', '% Vertex Fetch Stall', 'ALU / Fragment', 'ALU / Vertex', 
    'Average Polygon Area', 'Average Vertices / Polygon', 'Avg Bytes / Fragment', 
    'Avg Bytes / Vertex', 'Avg Preemption Delay', 'Clocks / Second', 'EFU / Fragment', 
    'EFU / Vertex', 'Fragment ALU Instructions / Sec (Full)', 
    'Fragment ALU Instructions / Sec (Half)', 'Fragment EFU Instructions / Second', 
    'Fragment Instructions / Second', 'Fragments Shaded / Second', 'GPU % Bus Busy', 
    'GPU Frequency', 'L1 Texture Cache Miss Per Pixel', 'Pre-clipped Polygons/Second', 
    'Preemptions / second', 'Read Total (Bytes/sec)', 'Reused Vertices / Second', 
    'SP Memory Read (Bytes/Second)', 'Texture Memory Read BW (Bytes/Second)', 
    'Textures / Fragment', 'Textures / Vertex', 'Vertex Instructions / Second', 
    'Vertex Memory Read (Bytes/Second)', 'Vertices Shaded / Second', 
    'Write Total (Bytes/sec)', 'app_gpu_ms', 'app_rss_mb', 'app_uss_mb', 'app_vss_mb', 
    'application_layer_count', 'application_prediction_milliseconds', 'available_memory_mb', 
    'cpu_frequency_mhz', 'cpu_level', 'cpu_util_0', 'cpu_util_1', 'cpu_util_2', 
    'cpu_util_3', 'cpu_util_4', 'cpu_util_5', 'display_refresh_rate', 
    'gpu_frequency_mhz', 'gpu_level', 'gpu_util', 'mem_frequency_mhz', 
    'stale_frames_per_second', 'timewarp_gpu_ms'
]

# Load all kitchen files
kitchen_path = Path("meta_scan_csvs/cleaned/kitchen")
kitchen_dfs = []
for csv_file in kitchen_path.glob("*.csv"):
    df = pd.read_csv(csv_file)
    available = [col for col in COMMON_COLUMNS if col in df.columns]
    kitchen_dfs.append(df[available])

kitchen_all = pd.concat(kitchen_dfs, ignore_index=True)

# Load all blinds files
blinds_path = Path("meta_scan_csvs/cleaned/blinds")
blinds_dfs = []
for csv_file in blinds_path.glob("*.csv"):
    df = pd.read_csv(csv_file)
    available = [col for col in COMMON_COLUMNS if col in df.columns]
    blinds_dfs.append(df[available])

blinds_all = pd.concat(blinds_dfs, ignore_index=True)

# Compute statistics for each feature
feature_separation = []

for feature in COMMON_COLUMNS:
    if feature in kitchen_all.columns and feature in blinds_all.columns:
        # Get statistics
        k_mean = kitchen_all[feature].mean()
        k_std = kitchen_all[feature].std()
        b_mean = blinds_all[feature].mean()
        b_std = blinds_all[feature].std()
        
        # Calculate separation metrics
        # 1. Effect size (Cohen's d)
        pooled_std = np.sqrt((k_std**2 + b_std**2) / 2)
        cohens_d = abs(k_mean - b_mean) / (pooled_std + 1e-10)
        
        # 2. Relative difference
        rel_diff = abs(k_mean - b_mean) / (abs(k_mean) + abs(b_mean) + 1e-10)
        
        # 3. Combined score (higher = better separation)
        separation_score = cohens_d * rel_diff
        
        feature_separation.append({
            'feature': feature,
            'kitchen_mean': k_mean,
            'kitchen_std': k_std,
            'blinds_mean': b_mean,
            'blinds_std': b_std,
            'cohens_d': cohens_d,
            'relative_diff': rel_diff,
            'separation_score': separation_score
        })

# Convert to DataFrame and sort
sep_df = pd.DataFrame(feature_separation).sort_values('separation_score', ascending=False)

print("=" * 100)
print("TOP 20 FEATURES THAT SEPARATE KITCHEN FROM BLINDS")
print("=" * 100)
print(f"\n{'Rank':<5} {'Feature':<50} {'Sep Score':<12} {'Cohen\'s d':<12} {'Rel Diff':<12}")
print("-" * 100)

for i, row in sep_df.head(20).iterrows():
    print(f"{i+1:<5} {row['feature']:<50} {row['separation_score']:<12.4f} {row['cohens_d']:<12.4f} {row['relative_diff']:<12.4f}")

print("\n" + "=" * 100)
print("DETAILED VIEW: TOP 10 FEATURES")
print("=" * 100)

for i, row in sep_df.head(10).iterrows():
    print(f"\n{i+1}. {row['feature']}")
    print(f"   Kitchen: {row['kitchen_mean']:12.4f} ± {row['kitchen_std']:10.4f}")
    print(f"   Blinds:  {row['blinds_mean']:12.4f} ± {row['blinds_std']:10.4f}")
    print(f"   Separation: {row['separation_score']:.4f} (Cohen's d: {row['cohens_d']:.4f})")

# Also check which stats (mean, std, max, etc.) work best for these features
print("\n" + "=" * 100)
print("RECOMMENDED ADDITIONAL STATISTICS FOR TOP 5 FEATURES")
print("=" * 100)

for i, row in sep_df.head(5).iterrows():
    feature = row['feature']
    print(f"\n{feature}:")
    
    # Check different statistics
    k_vals = kitchen_all[feature].dropna()
    b_vals = blinds_all[feature].dropna()
    
    stats = {
        'mean': (k_vals.mean(), b_vals.mean()),
        'std': (k_vals.std(), b_vals.std()),
        'median': (k_vals.median(), b_vals.median()),
        'q25': (k_vals.quantile(0.25), b_vals.quantile(0.25)),
        'q75': (k_vals.quantile(0.75), b_vals.quantile(0.75)),
        'max': (k_vals.max(), b_vals.max()),
        'min': (k_vals.min(), b_vals.min()),
        'range': (k_vals.max() - k_vals.min(), b_vals.max() - b_vals.min()),
        'skew': (k_vals.skew(), b_vals.skew()),
        'kurtosis': (k_vals.kurtosis(), b_vals.kurtosis())
    }
    
    # Calculate separation for each stat
    stat_sep = {}
    for stat_name, (k_val, b_val) in stats.items():
        diff = abs(k_val - b_val) / (abs(k_val) + abs(b_val) + 1e-10)
        stat_sep[stat_name] = diff
    
    # Sort and show top 5
    top_stats = sorted(stat_sep.items(), key=lambda x: x[1], reverse=True)[:5]
    print(f"   Best statistics: {', '.join([s[0] for s in top_stats])}")

TOP 20 FEATURES THAT SEPARATE KITCHEN FROM BLINDS

Rank  Feature                                            Sep Score    Cohen's d    Rel Diff    
----------------------------------------------------------------------------------------------------
6     ALU / Fragment                                     0.7107       0.7107       1.0000      
14    EFU / Fragment                                     0.6515       0.7818       0.8333      
9     Average Vertices / Polygon                         0.3276       0.4488       0.7298      
3     % Stalled on System Memory                         0.2666       0.6198       0.4301      
17    Fragment ALU Instructions / Sec (Half)             0.1705       0.5324       0.3202      
11    Avg Bytes / Vertex                                 0.1677       0.7118       0.2356      
16    Fragment ALU Instructions / Sec (Full)             0.1670       0.5272       0.3167      
28    SP Memory Read (Bytes/Second)                      0.1454       0.4653    

In [1]:
from pathlib import Path
import pandas as pd

def parse_filename(filepath):
    """
    Parse filename to extract room_type, scan_type, and trial_number
    Examples:
    - blinds_1_cleaned.csv -> room: blinds, scan: base, trial: 1
    - blinds_up_1_cleaned.csv -> room: blinds_up, scan: base, trial: 1
    - kitchen_motion_3_cleaned.csv -> room: kitchen, scan: motion, trial: 3
    """
    filename = filepath.stem.replace('_cleaned', '')
    
    # Handle blinds_up as a special case (two-word room type)
    if filename.startswith('blinds_up'):
        room_type = 'blinds_up'
        remainder = filename.replace('blinds_up_', '', 1)
    else:
        # Split and take first part as room type
        parts = filename.split('_')
        room_type = parts[0]
        remainder = '_'.join(parts[1:])
    
    # Now parse the remainder for scan type and trial
    remainder_parts = remainder.split('_')
    
    if len(remainder_parts) == 1:
        # Base scan (just a number)
        scan_type = 'base'
        trial_number = int(remainder_parts[0])
    elif len(remainder_parts) == 2:
        # Scan type + number (e.g., motion_3)
        scan_type = remainder_parts[0]
        trial_number = int(remainder_parts[1])
    else:
        scan_type = 'unknown'
        trial_number = -1
    
    return room_type, scan_type, trial_number

# Scan all CSV files
cleaned_path = Path("meta_scan_csvs/cleaned")
csv_files = list(cleaned_path.rglob("*.csv"))

# Create metadata inventory
metadata = []
for csv_file in csv_files:
    room_type, scan_type, trial_number = parse_filename(csv_file)
    
    # Get number of rows
    df = pd.read_csv(csv_file)
    print(f"  Loaded {csv_file.name}: {df.shape}")
    num_rows = len(df)
    
    metadata.append({
        'filepath': str(csv_file),
        'filename': csv_file.name,
        'room_type': room_type,
        'scan_type': scan_type,
        'trial_number': trial_number,
        'num_rows': num_rows,
        'folder': csv_file.parent.name
    })

# Convert to DataFrame
metadata_df = pd.DataFrame(metadata)

# Display summary
print("=" * 80)
print("DATA INVENTORY SUMMARY")
print("=" * 80)
print(f"\nTotal CSV files: {len(metadata_df)}")
print(f"\nRoom types ({len(metadata_df['room_type'].unique())}): {sorted(metadata_df['room_type'].unique())}")
print(f"Scan types ({len(metadata_df['scan_type'].unique())}): {sorted(metadata_df['scan_type'].unique())}")

print("\n" + "=" * 80)
print("COUNTS BY ROOM TYPE AND SCAN TYPE")
print("=" * 80)
pivot = metadata_df.groupby(['room_type', 'scan_type']).size().unstack(fill_value=0)
print(pivot)
print(f"\nTotal scans per room type:")
print(pivot.sum(axis=1))

print("\n" + "=" * 80)
print("ROW COUNTS STATISTICS BY ROOM TYPE")
print("=" * 80)
print(metadata_df.groupby('room_type')['num_rows'].describe().round(2))

print("\n" + "=" * 80)
print("SAMPLE FILES (sorted by room_type, scan_type, trial)")
print("=" * 80)
sample = metadata_df.sort_values(['room_type', 'scan_type', 'trial_number'])
print(sample[['filename', 'room_type', 'scan_type', 'trial_number', 'num_rows']].head(15))

# Verify we have exactly 105 scans
print("\n" + "=" * 80)
print("VERIFICATION")
print("=" * 80)
print(f"Expected: 100 scans")
print(f"Actual: {len(metadata_df)} scans")
if len(metadata_df) == 100:
    print("✓ Count matches!")
else:
    print("⚠ Count mismatch - please review")

# Save metadata to CSV for reference
metadata_df.to_csv("meta_scan_csvs/data_inventory.csv", index=False)
print(f"\n✓ Saved complete inventory to: meta_scan_csvs/data_inventory.csv")

  Loaded blinds_10_cleaned.csv: (1719, 70)
  Loaded blinds_6_cleaned.csv: (1365, 70)
  Loaded blinds_7_cleaned.csv: (301, 58)
  Loaded blinds_8_cleaned.csv: (312, 59)
  Loaded blinds_9_cleaned.csv: (290, 70)
  Loaded blinds_motion_1_cleaned.csv: (1838, 66)
  Loaded blinds_motion_2_cleaned.csv: (329, 65)
  Loaded blinds_motion_3_cleaned.csv: (335, 59)
  Loaded blinds_motion_4_cleaned.csv: (312, 58)
  Loaded blinds_motion_5_cleaned.csv: (327, 60)
  Loaded blinds_object_1_cleaned.csv: (1821, 62)
  Loaded blinds_object_2_cleaned.csv: (327, 59)
  Loaded blinds_object_3_cleaned.csv: (327, 59)
  Loaded blinds_object_4_cleaned.csv: (325, 59)
  Loaded blinds_object_5_cleaned.csv: (1827, 59)
  Loaded blinds_person_1_cleaned.csv: (350, 68)
  Loaded blinds_person_2_cleaned.csv: (313, 64)
  Loaded blinds_person_3_cleaned.csv: (326, 58)
  Loaded blinds_person_4_cleaned.csv: (312, 59)
  Loaded blinds_person_5_cleaned.csv: (327, 59)
  Loaded blinds_up_10_cleaned.csv: (319, 58)
  Loaded blinds_up_6_cle

In [11]:
cleaned_path = Path("meta_scan_csvs/cleaned")

print("=" * 80)
print("ANALYZING SENSOR COLUMNS ACROSS ALL 100 CSVs")
print("=" * 80)

all_columns_sets = []

for csv_file in cleaned_path.rglob("*.csv"):
    df = pd.read_csv(csv_file)
    cols = set(df.columns)
    all_columns_sets.append(cols)
    print(f"{csv_file.name}: {len(cols)} columns")

# Find common columns across ALL CSVs
common_columns = set.intersection(*all_columns_sets)

print(f"\n" + "=" * 80)
print(f"COMMON COLUMNS ACROSS ALL 100 CSVs: {len(common_columns)}")
print("=" * 80)

# Print in list format ready for copying into code
print("\nCOMMON_COLUMNS = [")
for col in sorted(common_columns):
    print(f"    '{col}',")
print("]")

# Find the range of column counts
all_column_counts = [len(s) for s in all_columns_sets]
print(f"\n" + "=" * 80)
print(f"Column count range: {min(all_column_counts)} to {max(all_column_counts)}")
print("=" * 80)

ANALYZING SENSOR COLUMNS ACROSS ALL 100 CSVs
blinds_10_cleaned.csv: 70 columns
blinds_6_cleaned.csv: 70 columns
blinds_7_cleaned.csv: 58 columns
blinds_8_cleaned.csv: 59 columns
blinds_9_cleaned.csv: 70 columns
blinds_motion_1_cleaned.csv: 66 columns
blinds_motion_2_cleaned.csv: 65 columns
blinds_motion_3_cleaned.csv: 59 columns
blinds_motion_4_cleaned.csv: 58 columns
blinds_motion_5_cleaned.csv: 60 columns
blinds_object_1_cleaned.csv: 62 columns
blinds_object_2_cleaned.csv: 59 columns
blinds_object_3_cleaned.csv: 59 columns
blinds_object_4_cleaned.csv: 59 columns
blinds_object_5_cleaned.csv: 59 columns
blinds_person_1_cleaned.csv: 68 columns
blinds_person_2_cleaned.csv: 64 columns
blinds_person_3_cleaned.csv: 58 columns
blinds_person_4_cleaned.csv: 59 columns
blinds_person_5_cleaned.csv: 59 columns
blinds_up_10_cleaned.csv: 58 columns
blinds_up_6_cleaned.csv: 70 columns
blinds_up_7_cleaned.csv: 69 columns
blinds_up_8_cleaned.csv: 59 columns
blinds_up_9_cleaned.csv: 59 columns
blinds_u

In [13]:
!pip install sktime --break-system-packages



In [17]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the inventory we just created
metadata_df = pd.read_csv("meta_scan_csvs/data_inventory.csv")

# Create a stratification column combining room_type and scan_type
metadata_df['strata'] = metadata_df['room_type'] + '_' + metadata_df['scan_type']

print("=" * 80)
print("MANUAL STRATIFIED TRAIN/TEST SPLIT (~80/20)")
print("=" * 80)

# Manually assign splits to ensure good distribution
train_indices = []
test_indices = []

for strata in metadata_df['strata'].unique():
    strata_df = metadata_df[metadata_df['strata'] == strata]
    n = len(strata_df)
    
    # Calculate split
    n_test = max(1, round(n * 0.2))  # At least 1 for test
    n_train = n - n_test
    
    print(f"{strata}: {n} total → {n_train} train, {n_test} test")
    
    # Shuffle and split
    shuffled = strata_df.sample(frac=1, random_state=42)
    test_indices.extend(shuffled.index[:n_test].tolist())
    train_indices.extend(shuffled.index[n_test:].tolist())

train_df = metadata_df.loc[train_indices]
test_df = metadata_df.loc[test_indices]

print(f"\nTotal files: {len(metadata_df)}")
print(f"Training files: {len(train_df)} ({len(train_df)/len(metadata_df)*100:.1f}%)")
print(f"Testing files: {len(test_df)} ({len(test_df)/len(metadata_df)*100:.1f}%)")

# Save the splits
train_df.to_csv("meta_scan_csvs/train_metadata.csv", index=False)
test_df.to_csv("meta_scan_csvs/test_metadata.csv", index=False)

print("\n" + "=" * 80)
print("FILES SAVED")
print("=" * 80)
print("✓ meta_scan_csvs/train_metadata.csv")
print("✓ meta_scan_csvs/test_metadata.csv")

MANUAL STRATIFIED TRAIN/TEST SPLIT (~80/20)
blinds_base: 5 total → 4 train, 1 test
blinds_motion: 5 total → 4 train, 1 test
blinds_object: 5 total → 4 train, 1 test
blinds_person: 5 total → 4 train, 1 test
blinds_up_base: 5 total → 4 train, 1 test
blinds_up_motion: 5 total → 4 train, 1 test
blinds_up_object: 5 total → 4 train, 1 test
blinds_up_person: 5 total → 4 train, 1 test
hallway_base: 5 total → 4 train, 1 test
hallway_motion: 5 total → 4 train, 1 test
hallway_object: 5 total → 4 train, 1 test
hallway_person: 5 total → 4 train, 1 test
kitchen_base: 7 total → 6 train, 1 test
kitchen_motion: 5 total → 4 train, 1 test
kitchen_object: 5 total → 4 train, 1 test
kitchen_person: 4 total → 3 train, 1 test
lab_base: 5 total → 4 train, 1 test
lab_motion: 5 total → 4 train, 1 test
lab_object: 5 total → 4 train, 1 test
lab_person: 4 total → 3 train, 1 test

Total files: 100
Training files: 80 (80.0%)
Testing files: 20 (20.0%)

FILES SAVED
✓ meta_scan_csvs/train_metadata.csv
✓ meta_scan_csvs/t

In [5]:
from pathlib import Path
import pandas as pd

COMMON_COLUMNS = [
    '% Prims Clipped',
    '% Prims Trivially Rejected',
    '% Stalled on System Memory',
    '% Texture L2 Miss',
    '% Vertex Fetch Stall',
    'ALU / Fragment',
    'ALU / Vertex',
    'Average Polygon Area',
    'Average Vertices / Polygon',
    'Avg Bytes / Fragment',
    'Avg Bytes / Vertex',
    'Avg Preemption Delay',
    'Clocks / Second',
    'EFU / Fragment',
    'EFU / Vertex',
    'Fragment ALU Instructions / Sec (Full)',
    'Fragment ALU Instructions / Sec (Half)',
    'Fragment EFU Instructions / Second',
    'Fragment Instructions / Second',
    'Fragments Shaded / Second',
    'GPU % Bus Busy',
    'GPU Frequency',
    'L1 Texture Cache Miss Per Pixel',
    'Pre-clipped Polygons/Second',
    'Preemptions / second',
    'Read Total (Bytes/sec)',
    'Reused Vertices / Second',
    'SP Memory Read (Bytes/Second)',
    'Texture Memory Read BW (Bytes/Second)',
    'Textures / Fragment',
    'Textures / Vertex',
    'Vertex Instructions / Second',
    'Vertex Memory Read (Bytes/Second)',
    'Vertices Shaded / Second',
    'Write Total (Bytes/sec)',
    'app_rss_mb',
    'app_uss_mb',
    'app_vss_mb',
    'available_memory_mb',
    'cpu_frequency_mhz',
    'cpu_level',
    'cpu_util_0',
    'cpu_util_1',
    'cpu_util_2',
    'cpu_util_3',
    'cpu_util_4',
    'cpu_util_5',
    'display_refresh_rate',
    'gpu_frequency_mhz',
    'gpu_level',
    'gpu_util',
    'mem_frequency_mhz',
    'timewarp_gpu_ms',
]

def create_windows(df, window_size=50):  # Changed from 75 to 50
    """
    Create non-overlapping tumbling windows from a time series dataframe.
    Returns a list of window dataframes.
    """
    num_windows = len(df) // window_size
    windows = []
    
    for i in range(num_windows):
        start_idx = i * window_size
        end_idx = start_idx + window_size
        window = df.iloc[start_idx:end_idx].copy()
        windows.append(window)
    
    return windows

def process_dataset(metadata_df, output_dir, dataset_name):
    """
    Process all CSVs in a dataset (train or test), create windows using ONLY common features.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    
    all_windows_data = []
    
    print(f"\n{'=' * 80}")
    print(f"PROCESSING {dataset_name.upper()} SET (COMMON FEATURES ONLY)")
    print(f"{'=' * 80}\n")
    
    for idx, row in metadata_df.iterrows():
        csv_path = Path(row['filepath'])
        room_type = row['room_type']
        scan_type = row['scan_type']
        trial_number = row['trial_number']
        
        # Load CSV
        df = pd.read_csv(csv_path)
        
        # Select ONLY common columns
        available_common = [col for col in COMMON_COLUMNS if col in df.columns]
        
        if len(available_common) != len(COMMON_COLUMNS):
            missing = set(COMMON_COLUMNS) - set(available_common)
            print(f"WARNING: {row['filename']} missing columns: {missing}")
        
        df = df[available_common]
        
        # Create windows with size 50
        windows = create_windows(df, window_size=50)  # Changed from 75 to 50
        
        print(f"{row['filename']}: {len(df)} rows → {len(windows)} windows (54 features)")
        
        # Save each window with metadata
        for window_idx, window_df in enumerate(windows):
            window_data = {
                'original_filename': row['filename'],
                'room_type': room_type,
                'scan_type': scan_type,
                'trial_number': trial_number,
                'window_id': window_idx,
                'window_data': window_df.values  # Store as numpy array
            }
            all_windows_data.append(window_data)
    
    print(f"\n{'=' * 80}")
    print(f"{dataset_name.upper()} SUMMARY")
    print(f"{'=' * 80}")
    print(f"Total CSVs processed: {len(metadata_df)}")
    print(f"Total windows created: {len(all_windows_data)}")
    print(f"Windows per CSV (avg): {len(all_windows_data) / len(metadata_df):.1f}")
    print(f"Features per window: 54 (common features only)")
    
    # Save as pickle for easy loading later
    windows_df = pd.DataFrame(all_windows_data)
    output_file = output_path / f"{dataset_name}_windows_common.pkl"
    windows_df.to_pickle(output_file)
    print(f"\n✓ Saved to: {output_file}")
    
    # Show distribution
    print(f"\nWindows by room type:")
    print(windows_df.groupby('room_type').size())
    
    return windows_df

# Load train and test metadata
train_metadata = pd.read_csv("meta_scan_csvs/train_metadata.csv")
test_metadata = pd.read_csv("meta_scan_csvs/test_metadata.csv")

# Process training set
train_windows = process_dataset(
    train_metadata, 
    "meta_scan_csvs/windowed_data", 
    "train"
)

# Process testing set
test_windows = process_dataset(
    test_metadata, 
    "meta_scan_csvs/windowed_data", 
    "test"
)

print(f"\n{'=' * 80}")
print("WINDOWING COMPLETE (COMMON FEATURES)")
print(f"{'=' * 80}")
print(f"Training windows: {len(train_windows)}")
print(f"Testing windows: {len(test_windows)}")
print(f"Window size: 50 timesteps")
print(f"Features per window: 54")
print(f"No padding needed - all windows have same shape!")


PROCESSING TRAIN SET (COMMON FEATURES ONLY)

blinds_9_cleaned.csv: 290 rows → 5 windows (54 features)
blinds_7_cleaned.csv: 301 rows → 6 windows (54 features)
blinds_10_cleaned.csv: 1719 rows → 34 windows (54 features)
blinds_8_cleaned.csv: 312 rows → 6 windows (54 features)
blinds_motion_5_cleaned.csv: 327 rows → 6 windows (54 features)
blinds_motion_3_cleaned.csv: 335 rows → 6 windows (54 features)
blinds_motion_1_cleaned.csv: 1838 rows → 36 windows (54 features)
blinds_motion_4_cleaned.csv: 312 rows → 6 windows (54 features)
blinds_object_5_cleaned.csv: 1827 rows → 36 windows (54 features)
blinds_object_3_cleaned.csv: 327 rows → 6 windows (54 features)
blinds_object_1_cleaned.csv: 1821 rows → 36 windows (54 features)
blinds_object_4_cleaned.csv: 325 rows → 6 windows (54 features)
blinds_person_5_cleaned.csv: 327 rows → 6 windows (54 features)
blinds_person_3_cleaned.csv: 326 rows → 6 windows (54 features)
blinds_person_1_cleaned.csv: 350 rows → 7 windows (54 features)
blinds_person