In [1]:

import os
from pathlib import Path

import numpy as np
import pandas as pd

import sys
sys.path.append(str(Path('.').resolve()))
sys.path.append(str(Path('src').resolve()))

from cleaning import fill_missing_median, drop_missing, normalize_data

# --- Paths (relative to the notebook's working directory) ---
# CSV read by the "Load dataset" cell.
RAW_PATH = Path('starter_data.csv')
# Folder that receives the cleaned CSV; created up front so the save cell cannot fail on a missing directory.
PROCESSED_DIR = Path('data/processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# --- Cleaning options (all forwarded to normalize_data) ---
# Columns passed as `exclude=`; leave empty to let normalize_data consider every column.
EXCLUDE_COLS = []  # e.g., ['id', 'target']
# Scaling strategy passed as `method=`.
NORMALIZE_METHOD = 'zscore'  # 'zscore' or 'minmax'
# Optional quantile pair passed as `clip_quantiles=`; None disables clipping.
CLIP_QUANTILES = None  # e.g., (0.01, 0.99) to winsorize before scaling


## 1) Load dataset

In [2]:

# Fail fast with an actionable message when the raw CSV is absent.
# The resolved path is reported so the reader can see which working
# directory the relative RAW_PATH was interpreted against.
if not RAW_PATH.exists():
    raise FileNotFoundError(
        f"Raw dataset not found at {RAW_PATH.resolve()}. "
        "Place the CSV at that location or update RAW_PATH in the configuration cell."
    )

# Load the raw data untouched; later cells derive new frames from df_raw
# rather than mutating it.
df_raw = pd.read_csv(RAW_PATH)
print('Shape (raw):', df_raw.shape)
display(df_raw.head())


Shape (raw): (10, 3)


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


## 2) Baseline profile

In [4]:

def summarize_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Report how many values are missing in each column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to profile.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with two columns: ``missing`` (count of
        NaN/None entries) and ``missing_frac`` (share of rows that are
        missing). Rows are ordered by ``missing``, largest first.
    """
    null_mask = df.isna()
    report = pd.DataFrame({
        'missing': null_mask.sum(),
        'missing_frac': null_mask.mean(),
    })
    return report.sort_values(by='missing', ascending=False)

# Per-column missingness of the raw frame; show at most the first 20 entries
# (the summary is already sorted with the most-missing columns on top).
summary_raw = summarize_missing(df_raw)
display(summary_raw.iloc[:20])

# Descriptive statistics for every column, numeric and non-numeric alike.
display(df_raw.describe(include='all'))


Unnamed: 0,missing,missing_frac
category,0,0.0
value,0,0.0
date,0,0.0


Unnamed: 0,category,value,date
count,10,10.0,10
unique,3,,10
top,A,,2025-08-01
freq,4,,1
mean,,17.6,
std,,7.381659,
min,,10.0,
25%,,12.25,
50%,,14.5,
75%,,23.25,


## 3) Apply cleaning functions

In [5]:

# 3a) Median-fill missing numeric values
df_imputed = fill_missing_median(df_raw)

# 3b) Discard rows/columns whose missingness is excessive
df_dropped = drop_missing(df_imputed, row_thresh=0.5, col_thresh=0.5)

# 3c) Scale numeric features, leaving out any IDs/targets listed in EXCLUDE_COLS
# NOTE(review): the recorded output reports std ~7.003 for 'value' while
# describe() on the raw frame reports ~7.382, which suggests normalize_data
# divides by the population std (ddof=0) — confirm in cleaning.py.
df_norm, norm_params = normalize_data(
    df_dropped,
    exclude=EXCLUDE_COLS,
    method=NORMALIZE_METHOD,
    clip_quantiles=CLIP_QUANTILES,
)

print('Normalization summary:', norm_params, sep='\n')


Normalization summary:
NormalizationParams(columns=['value'], method='zscore', params={'value': {'mean': 17.6, 'std': 7.002856560004639}})


## 4) Save cleaned dataset

In [6]:

# Persist the normalized frame without the pandas index column.
out_path = PROCESSED_DIR.joinpath('data_cleaned.csv')
df_norm.to_csv(out_path, index=False)
print('Cleaned dataset saved to:', out_path)


Cleaned dataset saved to: data\processed\data_cleaned.csv


## 5) Compare original vs cleaned

In [7]:

def shape_and_missing(df: pd.DataFrame, label: str) -> pd.DataFrame:
    """Summarize the size and overall missingness of ``df`` as a one-row frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarize.
    label : str
        Tag stored in the ``label`` column to identify this frame.

    Returns
    -------
    pd.DataFrame
        A single row with columns ``label``, ``n_rows``, ``n_cols``,
        ``total_missing`` (count of missing cells) and ``missing_frac``
        (mean of the per-column missing fractions).
    """
    n_rows, n_cols = df.shape
    null_mask = df.isna()
    record = {
        'label': label,
        'n_rows': n_rows,
        'n_cols': n_cols,
        'total_missing': int(null_mask.sum().sum()),
        'missing_frac': float(null_mask.mean().mean()),
    }
    # Wrap each scalar in a list so the constructor yields exactly one row.
    return pd.DataFrame({key: [value] for key, value in record.items()})

# Stack the one-row summaries for the raw and cleaned frames into one table.
cmp = pd.concat(
    [
        shape_and_missing(frame, tag)
        for frame, tag in ((df_raw, 'raw'), (df_norm, 'cleaned'))
    ],
    ignore_index=True,
)

display(cmp)

# Column-level missing comparison.
# The two Series are aligned on column name, so a column present in only one
# frame (e.g. dropped during cleaning) shows NaN on the other side.
missing_cmp = pd.DataFrame({
    'raw_missing': df_raw.isna().sum(),
    'cleaned_missing': df_norm.isna().sum(),
}).assign(delta_missing=lambda tbl: tbl['cleaned_missing'] - tbl['raw_missing'])
display(missing_cmp.sort_values(by='delta_missing'))

# Basic distributional check for numeric columns that survived cleaning.
num_cols = df_norm.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
    display(df_norm[num_cols].describe())
else:
    # describe() raises ValueError on a frame with no columns, so report the
    # situation instead of letting the final cell crash.
    print('No numeric columns survived cleaning; skipping distribution summary.')


Unnamed: 0,label,n_rows,n_cols,total_missing,missing_frac
0,raw,10,3,0,0.0
1,cleaned,10,3,0,0.0


Unnamed: 0,raw_missing,cleaned_missing,delta_missing
category,0,0,0
value,0,0,0
date,0,0,0


Unnamed: 0,value
count,10.0
mean,-1.998401e-16
std,1.054093
min,-1.085271
25%,-0.763974
50%,-0.4426765
75%,0.8068136
max,1.770706
