# Homework Stage 6: Data Preprocessing
Data cleansing and processing

In [22]:
import pandas as pd
from src import cleaning
import os

In [23]:
#creating the data set because there's nothing provided

csv_path = 'data/raw_data_create.csv'
os.makedirs('data', exist_ok=True)

if not os.path.exists(csv_path):
    df_demo = pd.DataFrame({
        'numeric_col': [10, None, 40, 55, 70],
        'category_col': ['A', 'B', 'A', 'B', 'C'],
        'price': ['$100', '$200', '$150', None, '$250'],
        'date_str': ['2025-08-01','2025-08-02',None,'2025-08-04','2025-08-05'],
        'category': ['Electronics','Furniture','Toys','Clothing',None]
    })
    df_demo.to_csv(csv_path, index=False)
    print(f"Demo CSV created at {csv_path}")
else:
    print(f"CSV already exists at {csv_path}")

CSV already exists at data/raw_data_create.csv


## Load Raw Dataset

In [24]:
df = pd.read_csv('data/raw_data_create.csv')
df.head()

Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,$100,2025-08-01,Electronics
1,,B,$200,2025-08-02,Furniture
2,40.0,A,$150,,Toys
3,55.0,B,,2025-08-04,Clothing
4,70.0,C,$250,2025-08-05,


## Apply Cleaning Functions

In [25]:
import numpy as np

#cleaning type of data
df = df.copy()

# Convert price → float
df["price"] = (
    df["price"]
    .astype(str)                    # ensure everything is string
    .str.replace("$", "", regex=False)  # remove $
    .replace("nan", np.nan)         # turn "nan" string back to real NaN
    .astype(float)                  # convert to float
)


# Convert date_str → datetime
df["date_str"] = pd.to_datetime(df["date_str"], errors="coerce")




In [33]:
# TODO: Apply your functions here
# Example:
# df = cleaning.fill_missing_median(df, ['col1','col2'])
# df = cleaning.drop_missing(df, threshold=0.5)
# df = cleaning.normalize_data(df, ['col1','col2'])

df_clean = cleaning.fill_missing_median(df)
df_clean = cleaning.drop_missing(df_clean, threshold=0.5)
df_clean = cleaning.normalize_data(df_clean, method="standard")
df_clean = cleaning.fill_missing_general(df)


df


Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,100.0,2025-08-01,Electronics
1,,B,200.0,2025-08-02,Furniture
2,40.0,A,150.0,NaT,Toys
3,55.0,B,,2025-08-04,Clothing
4,70.0,C,250.0,2025-08-05,


## Save Cleaned Dataset

In [27]:
# df.to_csv('../data/processed/sample_data_cleaned.csv', index=False)

import os

# Make sure folder exists
os.makedirs("data", exist_ok=True)

# Save
df_clean.to_csv('data/cleaned_data_create.csv', index=False)

df_clean.head()


Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,100.0,2025-08-01,Electronics
1,47.5,B,200.0,2025-08-02,Furniture
2,40.0,A,150.0,2025-08-02,Toys
3,55.0,B,175.0,2025-08-04,Clothing
4,70.0,C,250.0,2025-08-05,unknown


## Comparison between Raw vs Processed

In [None]:


def compare_raw_vs_processed(raw_df, processed_df):
    print("=== Shape Comparison ===")
    print(f"Raw shape:       {raw_df.shape}")
    print(f"Processed shape: {processed_df.shape}")
    print()

    print("=== Missing Values (Raw) ===")
    print(raw_df.isna().sum())
    print()

    print("=== Missing Values (Processed) ===")
    print(processed_df.isna().sum())
    print()

    print("=== Dtypes Comparison ===")
    print(pd.DataFrame({
        "raw_dtype": raw_df.dtypes,
        "processed_dtype": processed_df.dtypes
    }))
    print()

    print("=== Sample Differences (first 5 rows) ===")
    diff = processed_df.head().compare(raw_df.head(), keep_shape=True, keep_equal=False)
    print(diff if not diff.empty else "No differences in first 5 rows!")

    return diff




In [32]:
compare_raw_vs_processed(df, df_clean)

print("Data cleaning and comparison complete!")
print("Existing DF")
df

print("Cleaned DF")
df_clean

=== Shape Comparison ===
Raw shape:       (5, 5)
Processed shape: (5, 5)

=== Missing Values (Raw) ===
numeric_col     1
category_col    0
price           1
date_str        1
category        1
dtype: int64

=== Missing Values (Processed) ===
numeric_col     0
category_col    0
price           0
date_str        0
category        0
dtype: int64

=== Dtypes Comparison ===
                   raw_dtype processed_dtype
numeric_col          float64         float64
category_col          object          object
price                float64         float64
date_str      datetime64[ns]  datetime64[ns]
category              object          object

=== Sample Differences (first 5 rows) ===
  numeric_col       category_col        price         date_str       category  \
         self other         self other   self other       self other     self   
0         NaN   NaN          NaN   NaN    NaN   NaN        NaT   NaT      NaN   
1        47.5   NaN          NaN   NaN    NaN   NaN        NaT   NaT    

Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,100.0,2025-08-01,Electronics
1,47.5,B,200.0,2025-08-02,Furniture
2,40.0,A,150.0,2025-08-02,Toys
3,55.0,B,175.0,2025-08-04,Clothing
4,70.0,C,250.0,2025-08-05,unknown
