# Limpeza — demographics.csv 
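Este caderno ordena os registros por `person_id` em ordem crescente, remove duplicatas exatas, normaliza as colunas categóricas, valida `age`, `household_size` e `dependents`, e sinaliza outliers de `income`.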

In [7]:
from pathlib import Path
# Location of the raw demographics CSV (adjust for your user/system):
CSV_PATH = Path("/Users/augusto/Library/Mobile Documents/com~apple~CloudDocs/git/avalensurance-bia/data/demographics.csv")

# Check the path once up front so a wrong location fails with a clear message.
csv_found = CSV_PATH.exists()
print("CSV_PATH:", CSV_PATH)
print("Existe o arquivo?", csv_found)
if not csv_found:
    raise FileNotFoundError("Arquivo não encontrado. Edite CSV_PATH acima e rode novamente.")

CSV_PATH: /Users/augusto/Library/Mobile Documents/com~apple~CloudDocs/git/avalensurance-bia/data/demographics.csv
Existe o arquivo? True


In [None]:

import pandas as pd
import numpy as np

# Load the raw data and report its initial shape.
df = pd.read_csv(CSV_PATH)
print("Dimensão inicial (linhas, colunas):", df.shape)

# person_id is mandatory; abort early when it is absent.
if "person_id" not in df.columns:
    raise ValueError("A coluna 'person_id' não existe no CSV.")

# Default fills: missing values in these columns become the sentinel -1.
fill_na = dict.fromkeys(
    ("person_id", "age", "income", "household_size", "dependents"), -1
)
for col in fill_na:
    if col not in df.columns:
        continue
    df[col] = df[col].fillna(fill_na[col])

# Categorical columns: empty strings and missing values become "no_data".
categorical_cols = (
    "sex",
    "region",
    "urban_rural",
    "education",
    "marital_status",
    "employment_status",
)
for cat_col in categorical_cols:
    if cat_col not in df.columns:
        continue
    df[cat_col] = df[cat_col].replace("", pd.NA).fillna("no_data")

# =========================
# 2️⃣ Ordenar por person_id e remover duplicatas
# =========================

# Sort by the numeric value of person_id first, then by its string form as a
# tie-breaker; ids that are not numeric go last.
pid_num = pd.to_numeric(df["person_id"], errors="coerce")
df = df.assign(_pid_num=pid_num, _pid_str=df["person_id"].astype(str))
df = df.sort_values(by=["_pid_num", "_pid_str"], kind="mergesort", na_position="last")
# The helper sort keys are temporary; drop them and renumber the rows.
df = df.drop(columns=["_pid_num", "_pid_str"]).reset_index(drop=True)

# Remove rows that are exact copies of an earlier row and report how many went away.
before = len(df)
df = df.drop_duplicates(keep="first")
after = len(df)
print(f"Duplicatas exatas removidas: {before - after} (de {before} → {after})")

# =========================
# 3️⃣ Normalização e validação de domínios
# =========================

def _norm_token(x):
    """Return *x* as a stripped string, or "no_data" for missing values.

    Missing means: a pandas NA/NaN/None value, an empty or whitespace-only
    string, or one of the known placeholder spellings (matched
    case-insensitively).
    """
    if not pd.isna(x):
        text = str(x).strip()
        placeholders = {"na", "n/a", "none", "null", "no data", "sem dado"}
        if text and text.lower() not in placeholders:
            return text
    return "no_data"

def normalize_categorical(series, mapping, allowed):
    """Map every value of *series* onto its canonical label.

    Args:
        series: pandas Series holding the raw categorical values.
        mapping: dict from lower-cased raw spelling to canonical label.
        allowed: set of canonical labels accepted for this column.

    Returns:
        A Series in which each value is either a member of *allowed* or the
        string "no_data" (used for missing, placeholder and unknown values).
    """
    accepted = allowed | {"no_data"}

    def canonical(raw):
        token = _norm_token(raw)
        if token == "no_data":
            return token
        # Lookup is case-insensitive; unmapped tokens are kept as they are.
        return mapping.get(token.lower(), token)

    canon = series.map(canonical)
    # Anything outside the accepted domain is treated as missing.
    return canon.where(canon.isin(accepted), "no_data")

# Mapas e domínios
# Per column: lower-cased raw spelling -> canonical label, and the accepted labels.
sex_map = {
    "male": "Male",
    "m": "Male",
    "female": "Female",
    "f": "Female",
}
sex_allowed = {"Male", "Female"}

region_map = {
    "north": "North",
    "south": "South",
    "east": "East",
    "west": "West",
    "central": "Central",
}
region_allowed = {"North", "South", "East", "West", "Central"}

urban_map = {
    "urban": "Urban",
    "rural": "Rural",
    "suburban": "Suburban",
}
urban_allowed = {"Urban", "Rural", "Suburban"}

edu_map = {
    "no hs": "No HS",
    "hs": "HS",
    "high school": "HS",
    "some college": "Some College",
    "bachelors": "Bachelors",
    "masters": "Masters",
    "doctorate": "Doctorate",
    "phd": "Doctorate",
}
edu_allowed = {"No HS", "HS", "Some College", "Bachelors", "Masters", "Doctorate"}

marital_map = {
    "single": "Single",
    "married": "Married",
    "divorced": "Divorced",
    "widowed": "Widowed",
}
marital_allowed = {"Single", "Married", "Divorced", "Widowed"}

emp_map = {
    "employed": "Employed",
    "unemployed": "Unemployed",
    "self-employed": "Self-employed",
    "self employed": "Self-employed",
    "retired": "Retired",
}
emp_allowed = {"Employed", "Unemployed", "Self-employed", "Retired"}

# Normalize every categorical column that is present in the data.
domain_specs = {
    "sex": (sex_map, sex_allowed),
    "region": (region_map, region_allowed),
    "urban_rural": (urban_map, urban_allowed),
    "education": (edu_map, edu_allowed),
    "marital_status": (marital_map, marital_allowed),
    "employment_status": (emp_map, emp_allowed),
}
for col, (mapping, allowed) in domain_specs.items():
    if col in df.columns:
        df[col] = normalize_categorical(df[col], mapping, allowed)

# =========================
# 4️⃣ Regras para AGE
# =========================

# Parse age as a number; anything that cannot be parsed becomes NaN.
age = pd.to_numeric(df["age"], errors="coerce")
# Only non-negative ages take part in the range and consistency rules below.
age_valid = age.where(age >= 0)

edu = df["education"].fillna("no_data")
emp = df["employment_status"].fillna("no_data")

# Values that cannot be an age at all: unparseable entries and negatives other
# than the -1 sentinel. Without this mask they would compare as NaN below and
# be written to the output unchanged.
unusable_mask = age.isna() | ((age < 0) & (age != -1))

# Ages outside the accepted 13..85 range.
invalid_mask = (age_valid < 13) | (age_valid > 85)

# Half-open bands, so a fractional age such as 15.5 still lands in a band.
# For whole-number ages these are the same as 13-15, 16-17 and 18-20.
age_13_15 = (age_valid >= 13) & (age_valid < 16)
age_16_17 = (age_valid >= 16) & (age_valid < 18)
age_18_20 = (age_valid >= 18) & (age_valid < 21)

# Education levels accepted for each under-18 band; "no_data" is never a mismatch.
edu_ok_13_15 = {"No HS", "HS"}
edu_ok_16_17 = {"No HS", "HS"}
edu_mismatch_13_15 = age_13_15 & (~edu.isin(edu_ok_13_15)) & (edu != "no_data")
edu_mismatch_16_17 = age_16_17 & (~edu.isin(edu_ok_16_17)) & (edu != "no_data")
edu_mismatch_18_20 = age_18_20 & (edu.isin({"Masters", "Doctorate"}))

# Employment statuses rejected for each band.
emp_bad_13_15 = {"Employed", "Self-employed", "Retired"}
emp_bad_16_17 = {"Self-employed", "Retired"}
emp_bad_18_20 = {"Retired"}
emp_mismatch_13_15 = age_13_15 & (emp.isin(emp_bad_13_15))
emp_mismatch_16_17 = age_16_17 & (emp.isin(emp_bad_16_17))
emp_mismatch_18_20 = age_18_20 & (emp.isin(emp_bad_18_20))

teen_inconsistent = (
    edu_mismatch_13_15 | edu_mismatch_16_17 | edu_mismatch_18_20 |
    emp_mismatch_13_15 | emp_mismatch_16_17 | emp_mismatch_18_20
)

# Every row caught by any rule gets the sentinel; rows that already hold -1
# match no mask and are therefore not counted as changed.
reset_mask = unusable_mask | invalid_mask | teen_inconsistent
df.loc[reset_mask, "age"] = -1

print(f"[LIMPEZA AGE] Linhas com age alterado para -1: {reset_mask.sum()}")

# =========================
# 5️⃣ Consistência household_size × dependents
# =========================

# Parse both columns as numbers; unparseable entries become NaN.
hh = pd.to_numeric(df["household_size"], errors="coerce")
dep = pd.to_numeric(df["dependents"], errors="coerce")

# household_size must be a whole number >= 1; dependents a whole number >= 0.
issue_hh = (hh.isna()) | (hh < 1) | (hh % 1 != 0)
issue_dep = (dep.isna()) | (dep < 0) | (dep % 1 != 0)

# Cross-check: dependents may not exceed household_size - 1.
# The check runs only where BOTH values are individually valid. Missing
# household sizes were already replaced by the -1 sentinel, so a plain
# notna() guard does not exclude them: "dep > hh - 1" would become "dep > -2"
# and erase every dependents value on rows whose household_size is unknown.
issue_dep_gt = ~issue_hh & ~issue_dep & (dep > (hh - 1))

df.loc[issue_hh, "household_size"] = -1
df.loc[issue_dep | issue_dep_gt, "dependents"] = -1

# =========================
# 6️⃣ Limpeza de income (manter -1 para ausentes; outlier flag em válidos)
# =========================

# 1) Garantir que income final mantenha -1 para ausentes
# 1) Store income as numeric, with -1 standing in for missing/unparseable values.
inc_raw = pd.to_numeric(df["income"], errors="coerce")
df["income"] = inc_raw.fillna(-1)

# 2) Working series for the statistics: only values >= 0, everything else NaN.
inc_valid = inc_raw.where(inc_raw >= 0, np.nan)

# 3) IQR fences computed from the valid values only.
q1 = inc_valid.quantile(0.25)
q3 = inc_valid.quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

# A value is an outlier when it lies strictly outside the fences; rows without
# a valid income compare as False and are therefore flagged 0.
below_fence = inc_valid < lower
above_fence = inc_valid > upper
outlier_flag = below_fence | above_fence
df["_income_outlier_flag"] = outlier_flag.fillna(False).astype(int)


# =========================
# 7️⃣ Salvar arquivos de saída
# =========================

# The cleaned file is written to the "data" folder two levels above the input CSV.
repo_dir = CSV_PATH.parents[1]
out_clean = repo_dir.joinpath("data", "demographics_cleaned.csv")

# Create the output folder when needed, then write the table without the index column.
out_clean.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_clean, index=False)
print("✅ CSV limpo salvo em:", out_clean)


Dimensão inicial (linhas, colunas): (71280, 11)
Duplicatas exatas removidas: 6480 (de 71280 → 64800)
[LIMPEZA AGE] Linhas com age alterado para -1: 2411
✅ CSV limpo salvo em: /Users/augusto/Library/Mobile Documents/com~apple~CloudDocs/git/avalensurance-bia/data/demographics_cleaned.csv
