In [3]:
import pandas as pd
import numpy as np

# create raw dataset
data = {
    "customer_id": [1,2,2,3,4,5],
    "name": ["Ravi","Sita","Sita","Arun","Kiran","Latha"],
    "age": [25, np.nan, np.nan, 40, 29, 35],
    "city": ["Hyd","Hyd","Hyd",None,"Chennai","Hyd"],
    "purchase_amount": [2000,3000,3000,1500,2200,None],
    "date": ["2025-01-01","02-01-2025","02-01-2025","wrong","2025/01/05","2025-01-06"]
}

df = pd.DataFrame(data)

print("RAW DATA")
print(df)

# cleaning
df = df.drop_duplicates()
df["age"] = df["age"].fillna(df["age"].mean())
df["city"] = df["city"].fillna("Unknown")
df["purchase_amount"] = df["purchase_amount"].fillna(df["purchase_amount"].median())
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# transformation
df["age_group"] = df["age"].apply(lambda x: "Young" if x < 30 else "Adult")

print("\nCLEANED DATA")
print(df)

# save file
df.to_csv("cleaned_data.csv", index=False)

RAW DATA
   customer_id   name   age     city  purchase_amount        date
0            1   Ravi  25.0      Hyd           2000.0  2025-01-01
1            2   Sita   NaN      Hyd           3000.0  02-01-2025
2            2   Sita   NaN      Hyd           3000.0  02-01-2025
3            3   Arun  40.0     None           1500.0       wrong
4            4  Kiran  29.0  Chennai           2200.0  2025/01/05
5            5  Latha  35.0      Hyd              NaN  2025-01-06

CLEANED DATA
   customer_id   name    age     city  purchase_amount       date age_group
0            1   Ravi  25.00      Hyd           2000.0 2025-01-01     Young
1            2   Sita  32.25      Hyd           3000.0        NaT     Adult
3            3   Arun  40.00  Unknown           1500.0        NaT     Adult
4            4  Kiran  29.00  Chennai           2200.0        NaT     Young
5            5  Latha  35.00      Hyd           2100.0 2025-01-06     Adult
