In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Project modules
from src.load_data import load_data
from src.preprocess import clean_return_reason, fill_missing_size
from src.impute_missing import impute_customer_rating


In [None]:
df = pd.read_csv("../data/fashion_boutique_dataset.csv")



In [161]:
print("shape:" ,df.shape)
df.head()

shape: (2176, 14)


Unnamed: 0,product_id,category,brand,season,size,color,original_price,markdown_percentage,current_price,purchase_date,stock_quantity,customer_rating,is_returned,return_reason
0,FB000001,Outerwear,Zara,Spring,XL,Red,196.01,0.0,196.01,2025-07-05,37,3.0,False,
1,FB000002,Tops,Uniqlo,Winter,L,Pink,119.64,0.0,119.64,2025-08-06,2,2.5,False,
2,FB000003,Accessories,Uniqlo,Winter,,Black,33.8,0.0,33.8,2025-08-06,22,4.3,False,
3,FB000004,Shoes,Uniqlo,Spring,XL,Black,75.36,0.0,75.36,2025-07-07,48,2.6,False,
4,FB000005,Tops,Banana Republic,Winter,XL,Black,105.02,0.0,105.02,2025-08-06,10,,False,


In [162]:
na = df.isna().sum().sort_values(ascending=False)
na

return_reason          1856
size                    491
customer_rating         362
product_id                0
category                  0
brand                     0
season                    0
color                     0
original_price            0
markdown_percentage       0
current_price             0
purchase_date             0
stock_quantity            0
is_returned               0
dtype: int64

In [163]:
(na / len(df) * 100).round(2)


return_reason          85.29
size                   22.56
customer_rating        16.64
product_id              0.00
category                0.00
brand                   0.00
season                  0.00
color                   0.00
original_price          0.00
markdown_percentage     0.00
current_price           0.00
purchase_date           0.00
stock_quantity          0.00
is_returned             0.00
dtype: float64

In [164]:
df.dtypes

product_id              object
category                object
brand                   object
season                  object
size                    object
color                   object
original_price         float64
markdown_percentage    float64
current_price          float64
purchase_date           object
stock_quantity           int64
customer_rating        float64
is_returned               bool
return_reason           object
dtype: object

In [165]:
# return_reason is only applicable when is_returned = True
# Missing values are expected for non-returned purchases
df["return_reason"] = df["return_reason"].fillna("Not Returned")
df.loc[df["is_returned"] == 1, "return_reason"] = df.loc[df["is_returned"] == 1, "return_reason"].replace("Not Returned", "Unknown")


In [166]:
df["return_reason"].value_counts(dropna=False).head(10)


return_reason
Not Returned      1856
Changed Mind        68
Size Issue          60
Quality Issue       55
Wrong Item          47
Color Mismatch      46
Damaged             44
Name: count, dtype: int64

In [167]:
(df[(df["is_returned"]) & (df["return_reason"] == "Not Returned")].shape[0])


0

In [168]:
target_col = "customer_rating"

X = df.drop(columns =[target_col])
y = df[target_col]

In [169]:
num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "bool"]).columns.tolist()


In [170]:
# ML-based imputation:
# customer_rating is a numeric column and can be predicted from product + pricing + return behavior

df_train = df[df["customer_rating"].notna()].copy()
df_missing = df[df["customer_rating"].isna()].copy()

X_train = df_train.drop(columns = ["customer_rating"])
y_train = df_train["customer_rating"]

X_missing = df_missing.drop(columns =["customer_rating"])

preprocess = ColumnTransformer(
    transformers = [
        ("num","passthrough", X_train.select_dtypes(include=["int64","float64"]).columns),
        ("cat", OneHotEncoder(handle_unknown="ignore"), X_train.select_dtypes(include = ["object"]).columns)
    ]
)

model = RandomForestRegressor(
    n_estimators = 200,
    random_state = 42,
    n_jobs =-1
)

pipe = Pipeline(steps = [
    ("preprocess", preprocess),
    ("model", model)
])

pipe.fit(X_train, y_train)

#predict missing ratings 
pred_ratings = pipe.predict(X_missing)

df.loc[df["customer_rating"].isna(), "customer_rating"] = pred_ratings






In [171]:
# Keep ratings realistic (1â€“5) and clean decimals for reporting
df["customer_rating"] = df["customer_rating"].clip(1, 5).round(1)

In [172]:
df["customer_rating"].isna().sum()

np.int64(0)

In [173]:
df["customer_rating"].describe()

count    2176.000000
mean        2.986949
std         1.065093
min         1.000000
25%         2.200000
50%         3.000000
75%         3.800000
max         5.000000
Name: customer_rating, dtype: float64

In [174]:
df["size"] = df["size"].fillna("Unknown")

In [175]:
df.isna().sum()

product_id             0
category               0
brand                  0
season                 0
size                   0
color                  0
original_price         0
markdown_percentage    0
current_price          0
purchase_date          0
stock_quantity         0
customer_rating        0
is_returned            0
return_reason          0
dtype: int64

In [176]:
df.to_csv("../outputs/fashion_boutique_clean.csv", index=False)
print("Saved: outputs/fashion_boutique_clean.csv")

Saved: outputs/fashion_boutique_clean.csv
