# Setup and Data Load
Import required libraries, configure plotting style, and load the training and test CSV files into pandas DataFrames.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Plot appearance: seaborn white-grid theme and a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read both splits; paths are relative to the notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

# Report (rows, columns) for each split.
print(f"Train shape: {train_df.shape}\nTest shape: {test_df.shape}")

# Dataset Statistics
Show basic dataset statistics including memory usage, data types, and descriptive statistics for numeric columns.

In [None]:
# Print a banner followed by a structural summary of the training data.
# NOTE(review): the banner statements were garbled in this export; the width
# of the "=" rule (80) is a reconstruction -- confirm against the notebook.
print("=" * 80)
print("TRAINING DATA STATISTICS")
print("=" * 80)

print(f"\nShape: {train_df.shape}")
# deep=True inspects object-dtype columns so their real memory is counted;
# the byte total is converted to MB.
print(f"Memory usage: {train_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# How many columns there are of each dtype.
print("\nData types:")
print(train_df.dtypes.value_counts())

# Descriptive statistics (count, mean, std, quartiles, ...) from describe().
print("\nBasic statistics:")
print(train_df.describe())

# Missing Values Analysis
Compute missing value counts and percentages per column, and visualize the top columns with missing data.

In [None]:
# Summarize missing values per column and plot the worst offenders.
# NOTE(review): the first part of this cell was garbled in the export. The
# banner width (80), the 'Missing_Count' column name and the descending sort
# are reconstructions; only 'Column', 'Missing_Percentage' and the "* 100"
# scaling are attested by the surviving code -- confirm against the notebook.
print("=" * 80)
print("MISSING VALUES ANALYSIS")
print("=" * 80)

# Per-column null counts and their share of all rows, in percent.
missing_counts = train_df.isnull().sum()
missing_pct = missing_counts / len(train_df) * 100

missing_df = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percentage': missing_pct.values,
})
# Keep only columns that actually have gaps, worst first.
missing_df = (
    missing_df[missing_df['Missing_Count'] > 0]
    .sort_values('Missing_Percentage', ascending=False)
)

print(f"\nTotal missing values: {train_df.isnull().sum().sum():,}")
print("\nColumns with missing values:")
print(missing_df)

# Only draw and save the chart when at least one column has missing data.
if len(missing_df) > 0:
    fig, ax = plt.subplots(figsize=(12, 6))
    missing_df.head(20).plot(x='Column', y='Missing_Percentage', kind='barh', ax=ax)
    plt.title('Top 20 Columns with Missing Values')
    plt.xlabel('Missing Percentage (%)')
    plt.tight_layout()
    plt.savefig('../plots/01_missing_values.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("\n Saved: plots/01_missing_values.png")

# Target Variable Analysis
Check available columns and analyze the distribution of `sequence_type` if it exists (likely a categorical target or grouping variable).

In [None]:
# Inspect the available columns and, if present, the `sequence_type` column.
# NOTE(review): the banner statements were garbled in this export; the width
# of the "=" rule (80) is a reconstruction -- confirm against the notebook.
print("=" * 80)
print("TARGET VARIABLE ANALYSIS")
print("=" * 80)

print("\nColumn names:")
print(train_df.columns.tolist()[:20])  # first 20 columns

# check sequence_type (possible label)
if 'sequence_type' in train_df.columns:
    print("\nSequence Type Distribution:")
    # Row count per distinct value, most frequent first.
    seq_dist = train_df['sequence_type'].value_counts()
    print(seq_dist)
    print(f"\nTotal unique sequence types: {train_df['sequence_type'].nunique()}")

    # visualization: horizontal bar chart of the counts, saved to disk
    fig, ax = plt.subplots(figsize=(14, 6))
    seq_dist.plot(kind='barh', ax=ax)
    plt.title('Sequence Type Distribution (Training Data)')
    plt.xlabel('Count')
    plt.tight_layout()
    plt.savefig('../plots/02_sequence_type_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("\n Saved: plots/02_sequence_type_distribution.png")

# Sensor Data Identification
Identify sensor feature columns (all columns except metadata) and categorize them by sensor type using column-name substrings: IMU (`acc`, `rot`), temperature (`thm`), and TOF distance (`tof`).

In [None]:
# Identify sensor feature columns and break them down by sensor type.
# NOTE(review): the first part of this cell was garbled in the export and the
# original definition of `sensor_cols` is lost. The banner width (80) and the
# metadata list below are reconstructions; only 'sequence_type' is attested
# elsewhere in this notebook -- confirm the rest against the original.
print("=" * 80)
print("SENSOR DATA ANALYSIS")
print("=" * 80)

# Presumed non-sensor (metadata/label) columns -- TODO confirm.
metadata_cols = {
    'row_id', 'sequence_id', 'sequence_type', 'sequence_counter',
    'subject', 'gesture', 'orientation', 'behavior', 'phase',
}
# Everything that is not metadata is treated as a sensor feature.
sensor_cols = [col for col in train_df.columns if col not in metadata_cols]

print(f"\nSensor columns: {len(sensor_cols)}")
print(f"Examples: {sensor_cols[:10]}")

# Categorize by sensor type (substring match on the column name)
imu_cols = [col for col in sensor_cols if 'acc' in col or 'rot' in col]
temp_cols = [col for col in sensor_cols if 'thm' in col]
tof_cols = [col for col in sensor_cols if 'tof' in col]

print("\nSensor breakdown:")
print(f"  IMU (accel + gyro): {len(imu_cols)} columns")
print(f"  Temperature: {len(temp_cols)} columns")
print(f"  TOF (distance): {len(tof_cols)} columns")

# Sensor statistics
print("\nIMU Statistics:")
if imu_cols:
    print(train_df[imu_cols].describe())
else:
    # describe() raises ValueError on a frame with no columns.
    print("No IMU columns found.")

# Sensor Distributions
Plot sample distributions for selected IMU and temperature sensor columns to inspect value ranges and shapes.

In [None]:
print("=
,
SENSOR DATA DISTRIBUTIONS")
print("=
,
,
,
,
,
,
3
8
,
,
,
,