In [16]:
# Generate example data if not exists
import os
import pandas as pd
import warnings
# Suppress every warning for the rest of the session so cell output stays tidy.
warnings.filterwarnings("ignore")

# Location of the demo CSV; make sure the folder that holds it is present.
csv_path = '../data/raw/sample_data.csv'
os.makedirs('../data/raw', exist_ok=True)

if os.path.exists(csv_path):
    # A file is already there: leave it untouched and just report it.
    print(f"CSV already exists at {csv_path}")
else:
    # Five-row toy dataset; four of the five columns contain one missing
    # value each, so the cleaning helpers have something to act on.
    df_demo = pd.DataFrame({
        'numeric_col': [10, None, 40, 55, 70],
        'category_col': ['A', 'B', 'A', 'B', 'C'],
        'price': ['$100', '$200', '$150', None, '$250'],
        'date_str': ['2025-08-01','2025-08-02',None,'2025-08-04','2025-08-05'],
        'category': ['Electronics','Furniture','Toys','Clothing',None]
    })
    # index=False keeps the row index out of the file.
    df_demo.to_csv(csv_path, index=False)
    print(f"Demo CSV created at {csv_path}")

CSV already exists at ../data/raw/sample_data.csv


In [10]:
import sys

# Make the repository root (the parent of this notebook's working directory)
# importable so that the `src` package can be found. The membership check keeps
# the cell idempotent: re-running it no longer piles up duplicate '..' entries.
if '..' not in sys.path:
    sys.path.append('..')
from src import cleaning

# Load Raw Dataset

In [5]:
# Load the raw dataset from the same path the setup cell created/verified
# (`csv_path`), instead of repeating the literal and risking the two diverging.
df = pd.read_csv(csv_path)
# Preview the first rows.
df.head()

Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,$100,2025-08-01,Electronics
1,,B,$200,2025-08-02,Furniture
2,40.0,A,$150,,Toys
3,55.0,B,,2025-08-04,Clothing
4,70.0,C,$250,2025-08-05,


# Apply Cleaning Functions

In [20]:
# Median-imputation demo using the project's `cleaning` helpers.
#   df1  - fill restricted to 'numeric_col'.
#   df11 - no column list passed; presumably the helper then chooses the
#          columns itself (all numeric ones?) -- TODO confirm in src/cleaning.py.
# In the recorded output both frames are identical: the NaN in numeric_col
# becomes 47.5 (the median of 10, 40, 55, 70) and the missing values in the
# non-numeric columns are left as NaN.
df1 = cleaning.fill_missing_median(df, ['numeric_col'])
df11 = cleaning.fill_missing_median(df)
# A tuple as the last expression displays both frames in the cell output.
df1, df11

(   numeric_col category_col price    date_str     category
 0         10.0            A  $100  2025-08-01  Electronics
 1         47.5            B  $200  2025-08-02    Furniture
 2         40.0            A  $150         NaN         Toys
 3         55.0            B   NaN  2025-08-04     Clothing
 4         70.0            C  $250  2025-08-05          NaN,
    numeric_col category_col price    date_str     category
 0         10.0            A  $100  2025-08-01  Electronics
 1         47.5            B  $200  2025-08-02    Furniture
 2         40.0            A  $150         NaN         Toys
 3         55.0            B   NaN  2025-08-04     Clothing
 4         70.0            C  $250  2025-08-05          NaN)

In [36]:
# Apply the missing-value dropping helper in two configurations.
#   df2  - threshold=0.8; presumably a required fraction of non-missing values
#          -- TODO confirm the exact meaning in src/cleaning.py.
#   df22 - columns=[...]; presumably only these columns are checked for
#          missing values -- TODO confirm.
# NOTE(review): the recorded output shows numeric_col already scaled to [0, 1]
# and only rows 0, 1, 3, i.e. this cell (In [36]) was last run after `df` had
# been overwritten by the cleaning-pipeline cell (In [34]). Re-run on a fresh
# kernel to see the effect on the raw data.
df2 = cleaning.drop_missing(df, threshold=0.8)
df22 = cleaning.drop_missing(df, columns=['numeric_col', 'category_col'])
# A tuple as the last expression displays both frames in the cell output.
df2, df22

(   numeric_col category_col price    date_str     category
 0     0.000000            A  $100  2025-08-01  Electronics
 1     0.833333            B  $200  2025-08-02    Furniture
 3     1.000000            B   NaN  2025-08-04     Clothing,
    numeric_col category_col price    date_str     category
 0     0.000000            A  $100  2025-08-01  Electronics
 1     0.833333            B  $200  2025-08-02    Furniture
 3     1.000000            B   NaN  2025-08-04     Clothing)

In [26]:
# Scale 'numeric_col' of the median-filled frame (df1) with the project's
# normalization helper. The recorded output is consistent with min-max scaling
# to [0, 1] (10 -> 0.0, 47.5 -> 0.625, 70 -> 1.0); the other columns are unchanged.
df3 = cleaning.normalize_data(df1, ['numeric_col'])
df3

Unnamed: 0,numeric_col,category_col,price,date_str,category
0,0.0,A,$100,2025-08-01,Electronics
1,0.625,B,$200,2025-08-02,Furniture
2,0.5,A,$150,,Toys
3,0.75,B,,2025-08-04,Clothing
4,1.0,C,$250,2025-08-05,


# Save Cleaned Datasets

In [34]:
# Full cleaning pipeline whose result is written to disk by the save cell:
#   1. median-fill (no column list given),
#   2. drop rows based on columns=['date_str', 'category'],
#   3. normalize 'numeric_col'.
# NOTE(review): every step rebinds `df`, so the raw frame loaded earlier is
# lost and any earlier cell re-run afterwards operates on cleaned data.
# Consider a separate name (e.g. df_cleaned) here and in the save cell.
df = cleaning.fill_missing_median(df)
df = cleaning.drop_missing(df, columns=['date_str', 'category'])
df = cleaning.normalize_data(df, ['numeric_col'])
# Display the cleaned frame.
df

Unnamed: 0,numeric_col,category_col,price,date_str,category
0,0.0,A,$100,2025-08-01,Electronics
1,0.833333,B,$200,2025-08-02,Furniture
3,1.0,B,,2025-08-04,Clothing


In [35]:
# Persist the cleaned frame. Only '../data/raw' is created earlier in the
# notebook, so create the output folder first; otherwise to_csv raises
# OSError on a fresh checkout where '../data/processed' does not exist yet.
os.makedirs('../data/processed', exist_ok=True)
# index=False keeps the (now non-contiguous) row index out of the file.
df.to_csv('../data/processed/sample_data_cleaned.csv', index=False)