# Data Loading
This cell loads the raw training and test CSV files into pandas DataFrames, then prints both shapes, the training column names, and the first five rows of the training set as a quick sanity check.

In [None]:
import pandas as pd
import numpy as np
import os
import pickle

# Read the raw splits; paths are relative to the notebook's working directory.
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Quick sanity check: (rows, columns) of each split.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Column names and a small preview of the training split only.
print("\nTrain columns:", train_df.columns.tolist())
print("\nFirst 5 rows:")
print(train_df.head(5))

# Data Overview
Show general information about the training DataFrame and check for missing values in both train and test datasets.

In [None]:
# Summarise the training frame (column dtypes and non-null counts).
print("Train data info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train_df.info()

# Horizontal rule separating the two reports.
# NOTE(review): the rule width of 50 is arbitrary — adjust to taste.
print("\n" + "=" * 50 + "\n")

# Total number of missing cells (summed over every column) in each split.
print("Missing values:")
print(f"Train: {train_df.isnull().sum().sum()}")
print(f"Test: {test_df.isnull().sum().sum()}")

# Label Distribution
If the dataset contains a `label` column (target), display the class distribution and number of unique classes; otherwise, list available columns.

In [None]:
# Report on the target column when it exists; otherwise list what is available.
if 'label' not in train_df.columns:
    print("No 'label' column found")
    print(f"Available columns: {train_df.columns.tolist()}")
else:
    print("Label distribution:")
    # Row count per distinct label value.
    print(train_df['label'].value_counts())
    print("\nTotal classes:", train_df['label'].nunique())

# Save Ingested Data
Save the loaded (and optionally preprocessed) DataFrames back to the `../data` directory as `train_ingested.csv` and `test_ingested.csv`, without the index column, for downstream steps.

In [None]:
# Make sure the output directory is present; no error if it already exists.
os.makedirs("../data", exist_ok=True)

# Persist both splits; index=False keeps the row index out of the CSV files.
train_df.to_csv("../data/train_ingested.csv", index=False)
test_df.to_csv("../data/test_ingested.csv", index=False)

# Confirmation message for the notebook output.
print("Saved to ../data/train_ingested.csv and ../data/test_ingested.csv")