# Imports and Data Loading
Import common libraries (pandas, numpy, matplotlib, seaborn) and load the training and test CSV files. Set plotting style and default figure size for consistent visuals.

In [None]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply one plotting style and default figure size to every figure in the notebook.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Later cells save figures under ../plots; create the directory up front so
# plt.savefig does not fail when it is missing (no-op if it already exists).
Path("../plots").mkdir(parents=True, exist_ok=True)

# Load both splits; paths are relative to the current working directory.
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Quick sanity check that both files were read.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Training Data Basic Statistics
Display basic dataset statistics: shape, memory usage, data types distribution, and descriptive statistics for numeric columns.

In [None]:
# Overview of the training frame: size, memory footprint, dtype mix and describe().
banner = "=" * 80
print(banner)
print("TRAINING DATA STATISTICS")
print(banner)

print(f"\nShape: {train_df.shape}")

# deep=True also measures the contents of object columns, not just the pointers.
total_bytes = train_df.memory_usage(deep=True).sum()
print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")

# How many columns there are of each dtype.
dtype_counts = train_df.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

# describe() with its default arguments.
summary_stats = train_df.describe()
print("\nBasic statistics:")
print(summary_stats)

# Missing Values Analysis
Identify columns with missing values, compute counts and percentages, and visualize the top columns with missing data.

In [None]:
# Quantify missing data per column, print a table of affected columns and
# chart the (up to) 20 columns with the most missing values.
print("="*80)
print("MISSING VALUES ANALYSIS")
print("="*80)

# Count nulls once; the percentage and the grand total are derived from this Series.
missing_count = train_df.isnull().sum()
missing_pct = (missing_count / len(train_df)) * 100

missing_df = pd.DataFrame({
    'Column': missing_count.index,
    'Missing_Count': missing_count.values,
    'Missing_Percentage': missing_pct.values
})

# Keep only columns that actually have gaps, worst first.
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

print(f"\nTotal missing values: {missing_count.sum():,}")
print("\nColumns with missing values:")
print(missing_df)

# Nothing to plot (and no file is written) when the data has no missing values.
if not missing_df.empty:
    fig, ax = plt.subplots(figsize=(12, 6))
    missing_df.head(20).plot(x='Column', y='Missing_Percentage', kind='barh', ax=ax)
    # barh draws the first row at the bottom; flip the axis so the column
    # with the most missing values is at the top of the chart.
    ax.invert_yaxis()
    plt.title('Top 20 Columns with Missing Values')
    plt.xlabel('Missing Percentage (%)')
    plt.tight_layout()
    plt.savefig('../plots/01_missing_values.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("\n Saved: plots/01_missing_values.png")

# Target Variable Analysis
Inspect the candidate target column `sequence_type`: list the first 20 column names, then, if the column is present, show its value distribution and plot the counts.

In [None]:
# Look at the candidate label column and how its values are distributed.
banner = "=" * 80
print(banner)
print("TARGET VARIABLE ANALYSIS")
print(banner)

# Peek at the schema; only the first 20 names are shown to keep the output short.
print("\nColumn names:")
print(train_df.columns[:20].tolist())

# 'sequence_type' is treated as a possible label; the section is skipped
# silently when the column is absent.
has_sequence_type = 'sequence_type' in train_df.columns
if has_sequence_type:
    seq_dist = train_df['sequence_type'].value_counts()
    print("\nSequence Type Distribution:")
    print(seq_dist)
    n_types = train_df['sequence_type'].nunique()
    print(f"\nTotal unique sequence types: {n_types}")

    # Horizontal bar chart of the counts, saved next to the other plots.
    fig, ax = plt.subplots(figsize=(14, 6))
    seq_dist.plot(kind='barh', ax=ax)
    ax.set_title('Sequence Type Distribution (Training Data)')
    ax.set_xlabel('Count')
    fig.tight_layout()
    fig.savefig('../plots/02_sequence_type_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("\n Saved: plots/02_sequence_type_distribution.png")

# Sensor Data Analysis
Identify sensor columns, categorize them by sensor type (IMU, temperature, TOF), and compute basic statistics for the IMU group.

In [None]:
# Split the columns into sensor groups by name and summarise the IMU group.
banner = "=" * 80
print(banner)
print("SENSOR DATA ANALYSIS")
print(banner)

# Every column not listed here is treated as a sensor column.
non_sensor_cols = ('row_id', 'sequence_type', 'sequence_id',
                   'sequence_counter', 'subject', 'orientation')
sensor_cols = [name for name in train_df.columns if name not in non_sensor_cols]

print(f"\nSensor columns: {len(sensor_cols)}")
print(f"Examples: {sensor_cols[:10]}")

# Grouping is by substring match on the column name.
# NOTE(review): a non-sensor column whose name happens to contain one of
# these tags would be picked up too - confirm against the real schema.
imu_cols = [name for name in sensor_cols if any(tag in name for tag in ('acc', 'rot'))]
temp_cols = [name for name in sensor_cols if 'thm' in name]
tof_cols = [name for name in sensor_cols if 'tof' in name]

print("\nSensor breakdown:")
print(f"  IMU (accel + gyro): {len(imu_cols)} columns")
print(f"  Temperature: {len(temp_cols)} columns")
print(f"  TOF (distance): {len(tof_cols)} columns")

# Descriptive statistics for the IMU group only.
imu_stats = train_df[imu_cols].describe()
print("\nIMU Statistics:")
print(imu_stats)

# Sensor Data Distributions
Plot sample distributions for selected IMU and temperature sensors to inspect value ranges and shapes.

In [None]:
# Histogram a few IMU and temperature columns on a 2x3 grid
# (row 0: up to three IMU columns, row 1: up to two temperature columns).
print("="*80)
print("SENSOR DATA DISTRIBUTIONS")
print("="*80)

# Select sample columns for visualization
sample_imu = imu_cols[:3]
sample_temp = temp_cols[:2]

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
fig.suptitle('Sample Sensor Data Distributions')

# One grid row per sensor family: (columns to draw, histogram colour).
panels = [(sample_imu, 'blue'), (sample_temp, 'orange')]
for row_idx, (cols, colour) in enumerate(panels):
    for col_idx, col in enumerate(cols):
        subplot = axes[row_idx, col_idx]
        # NaNs are dropped so they do not reach the histogram binning.
        subplot.hist(train_df[col].dropna(), bins=50, alpha=0.7, color=colour)
        subplot.set_title(f'{col}')
        subplot.set_xlabel('Value')
        subplot.set_ylabel('Frequency')

    # Hide grid cells that received no data (e.g. the third slot of the
    # temperature row) instead of leaving empty frames in the figure.
    for unused in axes[row_idx, len(cols):]:
        unused.set_visible(False)

plt.tight_layout()
plt.savefig('../plots/03_sensor_distributions.png', dpi=300, bbox_inches='tight')
plt.show()
print("\n Saved: plots/03_sensor_distributions.png")

# Correlation Analysis
Compute and visualize a correlation matrix for a small sample of sensor columns to identify strongly correlated features.

In [None]:
# Correlation heatmap over a small subset of sensor columns.
banner = "=" * 80
print(banner)
print("CORRELATION ANALYSIS")
print(banner)

# Only the first 5 IMU and first 2 temperature columns are used, for speed.
sample_cols = [*imu_cols[:5], *temp_cols[:2]]
corr_matrix = train_df[sample_cols].corr()

# Diverging colormap centred on 0; each cell is annotated to two decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Sensor Data Correlation Matrix (Sample)')
fig.tight_layout()
fig.savefig('../plots/04_correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
print("\n Saved: plots/04_correlation_matrix.png")

# EDA Summary
Summarize the key dataset statistics and list the generated plots for reference.

In [None]:
# Recap the headline numbers gathered above and list the plot files on disk.
print("="*80)
print("EDA SUMMARY")
print("="*80)

summary = {
    'Training Samples': len(train_df),
    'Test Samples': len(test_df),
    'Total Features': len(sensor_cols),
    'IMU Sensors': len(imu_cols),
    'Temperature Sensors': len(temp_cols),
    'TOF Sensors': len(tof_cols),
    'Missing Values': train_df.isnull().sum().sum(),
}

for key, value in summary.items():
    print(f"{key}: {value}")

print("\n EDA Complete!")
print("Generated plots:")

# Plots 01 and 02 are only written when the data has missing values / a
# 'sequence_type' column, so report just the files that exist on disk.
# Files left over from an earlier run are listed as well.
plot_names = [
    "01_missing_values.png",
    "02_sequence_type_distribution.png",
    "03_sensor_distributions.png",
    "04_correlation_matrix.png",
]
for name in plot_names:
    if (Path("../plots") / name).exists():
        print(f"  - {name}")