In [1]:
import os
import pandas as pd
import numpy as np

# Define folder paths relative to this notebook
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Create folders if they don't exist
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Define the sample data
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in raw data folder
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')
else:
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')

Sample dataset created and saved to ../data/raw/sample_data.csv


In [2]:
import sys, os
from pathlib import Path

# Make parent folder importable so `from src import cleaning` finds ../src
sys.path.append(os.path.abspath(".."))

import pandas as pd
from src import cleaning

RAW = Path("../data/raw")
PROC = Path("../data/processed")
PROC.mkdir(parents=True, exist_ok=True)

RAW, PROC


(PosixPath('../data/raw'), PosixPath('../data/processed'))

In [3]:
df = pd.read_csv(RAW / "sample_data.csv")
display(df.head())
df.info()
df.isna().sum()


Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         6 non-null      float64
 1   income      4 non-null      float64
 2   score       6 non-null      float64
 3   zipcode     7 non-null      int64  
 4   city        7 non-null      object 
 5   extra_data  2 non-null      float64
dtypes: float64(4), int64(1), object(1)
memory usage: 468.0+ bytes


age           1
income        3
score         1
zipcode       0
city          0
extra_data    5
dtype: int64

In [4]:
# choose numeric columns automatically
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
print("Numeric columns:", num_cols)

df1 = cleaning.fill_missing_median(df, columns=num_cols)
df2 = cleaning.drop_missing(df1, threshold=0.6, axis="rows")
df_clean = cleaning.normalize_data(df2, columns=num_cols)

df_clean.head()


Numeric columns: ['age', 'income', 'score', 'zipcode', 'extra_data']


Unnamed: 0,age,income,score,zipcode,city,extra_data
0,-0.861209,0.767146,0.227593,0.823591,Beverly,0.0
1,0.861209,0.122743,1.37466,-1.526191,New York,1.870829
2,-1.644127,-2.025264,0.036415,-0.043446,Chicago,0.0
3,1.644127,1.411548,-0.537119,0.93764,SF,0.0
4,-0.234875,0.122743,0.992304,0.32823,Austin,0.0


In [5]:
out_path = PROC / "sample_data_cleaned.csv"
df_clean.to_csv(out_path, index=False)
out_path


PosixPath('../data/processed/sample_data_cleaned.csv')

In [6]:
summary = {
    "orig_shape": df.shape,
    "clean_shape": df_clean.shape,
    "orig_na_counts": df.isna().sum().to_dict(),
    "clean_na_counts": df_clean.isna().sum().to_dict(),
}
summary


{'orig_shape': (7, 6),
 'clean_shape': (7, 6),
 'orig_na_counts': {'age': 1,
  'income': 3,
  'score': 1,
  'zipcode': 0,
  'city': 0,
  'extra_data': 5},
 'clean_na_counts': {'age': 0,
  'income': 0,
  'score': 0,
  'zipcode': 0,
  'city': 0,
  'extra_data': 0}}