In [1]:
# ---------------------------------------------------------
# Limpeza e Feature Engineering
# ---------------------------------------------------------
import pandas as pd
import numpy as np

RAW_PATH = "../data/raw/global_air_quality.csv"
OUTPUT_PATH = "../data/processed/air_quality_clean.csv"

df = pd.read_csv(RAW_PATH)
print(f"📥 Dataset original: {df.shape}")

# Normalização
df['City'] = df['City'].str.strip().str.title()
df['Country'] = df['Country'].str.strip().str.title()
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Remove duplicatas
dups = df.duplicated(subset=['City','Country','Date']).sum()
df = df.drop_duplicates(subset=['City','Country','Date'])
print(f"🔁 Duplicatas removidas: {dups}")

# Remove outliers (IQR)
num_cols = ['PM2.5','PM10','NO2','SO2','CO','O3','Temperature','Humidity','Wind Speed']
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Novas features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Pollution_Index'] = df[['PM2.5','PM10','NO2','SO2','CO','O3']].mean(axis=1)

# Faixas de temperatura
df['Temp_Bin'] = pd.cut(
    df['Temperature'],
    bins=[-15, 0, 10, 20, 30, 45],
    labels=['Muito Frio','Frio','Ameno','Quente','Muito Quente']
)

# Salva
df.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Dados processados salvos em: {OUTPUT_PATH}")


📥 Dataset original: (10000, 12)
🔁 Duplicatas removidas: 4797
✅ Dados processados salvos em: ../data/processed/air_quality_clean.csv
