# 01 · Data Ingestion
This notebook loads the raw IPO/delisting dataset (`data/raw/delisted_data.csv`), does a light schema inspection, and writes a cleaned copy (`data/processed/data_clean.csv`) for downstream steps.


In [3]:

import pandas as pd
from pathlib import Path

# Input and output locations, relative to the notebook's working directory.
RAW_CSV = "../data/raw/delisted_data.csv"
CLEAN_CSV = "../data/processed/data_clean.csv"

# Read the raw file with pandas defaults, then report (rows, columns)
# and preview the first five records as the cell's rich output.
df = pd.read_csv(filepath_or_buffer=RAW_CSV)
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)


(1617, 39)


Unnamed: 0,symbol,name,exchange,ipoDate,delistingDate,status,URL,5 Year Annual Info? (y/n),salesQ5,salesQ4,...,opCashflowQ4,opCashflowQ3,opCashflowQ2,opCashflowQ1,netCashflowQ5,netCashflowQ4,netCashflowQ3,netCashflowQ2,netCashflowQ1,Unnamed: 8
0,AACQU,Origin Materials Inc - Units (1 Ord Share Clas...,NASDAQ,2020-07-14,2021-06-24,Delisted,https://www.barchart.com/stocks/quotes/ACNDWS/...,,,,...,,,,,,,,,,
1,AAIN,Arlington Asset Investment Corp,NYSE,2021-07-19,2024-01-30,Delisted,https://www.barchart.com/stocks/quotes/AAIN/fi...,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,AAQC,Accelerate Acquisition Corp - Class A,NYSE,2021-05-10,2022-12-15,Delisted,https://www.barchart.com/stocks/quotes/AAQC/fi...,,0.0,0.0,...,-560.0,-290.0,-1500.0,-1260.0,-360.0,-280.0,-290.0,940.0,1170.0,
3,ABGI,ABG Acquisition Corp I - Class A,NASDAQ,2021-02-17,2023-02-27,Delisted,https://www.barchart.com/stocks/quotes/ABGI/fi...,,0.0,0.0,...,-270.0,-190.0,-980.0,-870.0,-410.0,-310.0,-240.0,450.0,560.0,
4,ACACU,PLAYSTUDIOS Inc - Units (1 Ord Share Class A &...,NASDAQ,2020-10-23,2021-06-21,Delisted,https://www.barchart.com/stocks/quotes/ACACU/f...,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


## Basic cleanup
Drop fully-empty columns and pandas `Unnamed: N` artifact columns, trim whitespace from string columns, and write the cleaned copy to disk.

In [5]:

# Drop columns that are entirely empty or clearly artifacts: any column whose
# label starts with "unnamed" (case-insensitive), e.g. "Unnamed: 0" / "Unnamed: 8".
empty_cols = [c for c in df.columns if df[c].isna().all()]
# str(c) keeps this working even if a column label is not a string.
unnamed_cols = [c for c in df.columns if str(c).lower().startswith("unnamed")]
drop_cols = list(dict.fromkeys(empty_cols + unnamed_cols))  # de-dup, keep first-seen order
print("Dropping columns:", drop_cols)
df = df.drop(columns=drop_cols, errors="ignore")

# Strip whitespace from string columns.
# Only genuine `str` values are stripped; everything else (notably NaN) passes
# through unchanged. Casting the whole column with `.astype(str)` would turn
# missing values into the literal text "nan", so they would stop registering
# as missing in `df`.
for c in df.columns:
    if df[c].dtype == object:
        df[c] = df[c].map(lambda v: v.strip() if isinstance(v, str) else v)

# Save a clean copy. Create the output folder first so the notebook also runs
# on a fresh checkout where ../data/processed does not exist yet.
Path(CLEAN_CSV).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(CLEAN_CSV, index=False)
print("Wrote:", CLEAN_CSV, "with shape", df.shape)


Dropping columns: []
Wrote: ../data/processed/data_clean.csv with shape (1617, 37)


✅ **Output**: `../data/processed/data_clean.csv` (input to the feature-engineering step).