In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("FINAL_USO.csv")

# -------- Before Cleaning --------
# Report the raw state so it can be compared with the summary printed
# after cleaning.
print("Shape before:", df.shape)
print("Duplicates before:", df.duplicated().sum())
print("Missing values before:\n", df.isnull().sum())

# 1. Remove duplicates (rows identical in every column)
df.drop_duplicates(inplace=True)

# 2. Handle missing values
#    Numeric columns are filled with the column median; text (object)
#    columns are filled with their most frequent value.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Assign the filled column back rather than calling
# `df[col].fillna(..., inplace=True)`: that chained in-place call operates
# on a temporary Series, which pandas warns about and which leaves `df`
# unchanged once Copy-on-Write is active.
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

for col in cat_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        # A column with no non-missing values has no mode, so there is
        # nothing to impute with; leave it as-is instead of raising.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

# 3. Treat outliers (IQR)
#    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the nearest
#    bound; no rows are dropped.
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1

for col in num_cols:
    lower = Q1[col] - 1.5 * IQR[col]
    upper = Q3[col] + 1.5 * IQR[col]
    df[col] = df[col].clip(lower, upper)

# -------- After Cleaning --------
print("Shape after:", df.shape)
print("Duplicates after:", df.duplicated().sum())
print("Missing values after:\n", df.isnull().sum())

# Save cleaned dataset (without the index column)
df.to_csv("FINAL_USO_cleaned.csv", index=False)
