### Making a sample dataset

In [1]:
import os
import pandas as pd

# Locations of the raw and processed data, relative to the notebook's
# working directory. Later cells reuse both names.
data_raw = "../data/raw/"
data_processed = "../data/processed/"

# Create both directories up front; exist_ok keeps re-runs from failing.
for data_dir in (data_raw, data_processed):
    os.makedirs(data_dir, exist_ok=True)

# Six-row toy dataset with deliberate gaps (None) in age, income and city
# so the cleaning steps have something to act on.
sample_columns = {
    "id": [1, 2, 3, 4, 5, 6],
    "age": [25, None, 30, 22, None, 28],
    "income": [50000, 60000, None, 45000, 52000, None],
    "city": ["NY", "LA", None, "SF", "NY", "LA"],
}
df = pd.DataFrame(sample_columns)

# Persist without the index so the CSV holds only the four data columns.
raw_csv_path = os.path.join(data_raw, "people.csv")
df.to_csv(raw_csv_path, index=False)


### Importing data, cleaning, and comparing datasets

In [6]:
import sys
sys.path.append("..")  

from src.cleaning import fill_missing_median, drop_missing, normalize_data

# The columns handed to both the fill step and the normalize step.
numeric_columns = ("age", "income")

# Input and output files, resolved once.
raw_csv_path = os.path.join(data_raw, "people.csv")
cleaned_csv_path = os.path.join(data_processed, "people_cleaned.csv")

# Load the raw file and show it before any cleaning.
df_raw = pd.read_csv(raw_csv_path)
print("Original Data:")
display(df_raw)

# Cleaning pipeline: fill -> drop -> normalize. The helpers come from
# src.cleaning and are not visible here, so their exact semantics
# (including what threshold=0.7 means) should be checked against that
# module. Each helper receives its own fresh list of column names.
df_filled = fill_missing_median(df_raw, columns=list(numeric_columns))
df_dropped = drop_missing(df_filled, threshold=0.7)
df_cleaned = normalize_data(df_dropped, columns=list(numeric_columns))

print("Cleaned Data:")
display(df_cleaned)

# Save the cleaned frame without the index column.
df_cleaned.to_csv(cleaned_csv_path, index=False)

Original Data:


Unnamed: 0,id,age,income,city
0,1,25.0,50000.0,NY
1,2,,60000.0,LA
2,3,30.0,,
3,4,22.0,45000.0,SF
4,5,,52000.0,NY
5,6,28.0,,LA


Cleaned Data:


Unnamed: 0,id,age,income,city
0,1,0.5,0.333333,NY
1,2,0.75,1.0,LA
3,4,0.0,0.0,SF
4,5,0.75,0.466667,NY
5,6,1.0,0.4,LA


### Comparing shapes and data types

In [9]:
# Report the shape and per-column dtypes of the frame before and after
# cleaning.
print("Original shape:", df_raw.shape)
print("Cleaned shape:", df_cleaned.shape)

# sep="" matters here: print's default separator is a single space, which
# would land right after the "\n" and push the first dtype row one column
# to the right of the rest of the listing.
print("Dtypes before:\n", df_raw.dtypes, sep="")
print("Dtypes after:\n", df_cleaned.dtypes, sep="")


Original shape: (6, 4)
Cleaned shape: (5, 4)
Dtypes before:
 id          int64
age       float64
income    float64
city       object
dtype: object
Dtypes after:
 id          int64
age       float64
income    float64
city       object
dtype: object
