# 01_pipeline.ipynb

Data cleaning and preparation pipeline for the ISI Final Project.

In [None]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
import os

In [None]:
# --- Define Paths ---
# Name of the raw CSV export; keep this in sync with the dataset on disk.
FILE_NAME = "sla_active.csv"
# Directory holding raw inputs (a relative path).
RAW_DATA_PATH = "../data/raw"
# Full path handed to the loader.
FILE_PATH = os.path.join(RAW_DATA_PATH, FILE_NAME)

In [None]:
# --- Load Data ---
# Read the raw CSV at FILE_PATH into `df`. Failures are reported, not raised,
# so the notebook keeps running; in that case `df` is left undefined and the
# cells that check for it are skipped.

# Drop any `df` left over from an earlier run of this notebook. Without this,
# a failed load would leave the old frame in the kernel and later cells would
# clean and save stale data as if it were fresh.
globals().pop("df", None)

try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print("❌ CSV file not found. Check your path or filename.")
except Exception as e:
    # Deliberately broad: any other load problem is reported here instead of
    # aborting the notebook.
    print(f"⚠️ Error loading CSV: {e}")
else:
    # Only reached when the read succeeded.
    print("✅ Data loaded successfully.")
    print(f"Shape: {df.shape}")

In [None]:
# --- Basic Cleaning ---
# Runs only when a `df` exists in this namespace; otherwise the cell is a no-op.
if "df" in locals():
    # Drop exact duplicate rows, then replace every missing value with the
    # literal string "N/A".
    # NOTE(review): filling with a string makes any numeric column that had
    # gaps become object dtype, and pandas.read_csv treats "N/A" as missing by
    # default, so these values read back as NaN unless the saved CSV is loaded
    # with keep_default_na=False. Confirm this is intended.
    df = df.drop_duplicates().fillna("N/A")

    # Trim leading/trailing whitespace from the column labels.
    df.columns = df.columns.str.strip()

    print("✅ Basic cleaning complete.")
    display(df.head())

In [None]:
# --- Save Cleaned Data ---
# Write `df` to CLEAN_PATH/sla_clean.csv without the index column.
CLEAN_PATH = "../data/clean"

# Same guard as the cleaning cell: when `df` does not exist, report it instead
# of failing with a NameError.
if "df" in locals():
    # Create the output directory on first use; no error if it already exists.
    os.makedirs(CLEAN_PATH, exist_ok=True)
    df.to_csv(os.path.join(CLEAN_PATH, "sla_clean.csv"), index=False)
    print("💾 Cleaned dataset saved successfully.")
else:
    print("⚠️ No dataset in memory — nothing was saved. Run the load cell first.")