In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# ==============================================
# 1-2. Load the data and drop the unused columns
# ==============================================

# Columns removed right after loading. The list is declared first so the
# load and the drop can be written as one chained statement.
cols_to_drop = [
    "customer_id",
    "account_id",
    "city",
    "postal_code",
    "chargeback_resolution_time_days",
    "post_event_status_code",
    "referrer_code",
    "manual_review_result",
    "terms_accepted_flag",
    "occupation",
    "merchant_category",
    "customer_note",
    "last_ticket_subject",
    "region",
    "country",
    "signup_source",
]

# NOTE(review): errors="ignore" means a name that is absent from the CSV
# (including a misspelled one) is skipped without any message.
df = pd.read_csv("../6.Data/kaggle_b2_fraud_train_v3.csv").drop(
    columns=cols_to_drop,
    errors="ignore",
)

# ============================
# 3. Missing-value imputation
# ============================
# NOTE(review): every fill statistic below is computed on the whole loaded
# file; if a validation split is taken later, it has already influenced
# these values.

# Median imputation (robust to outliers).
median_cols = [
    "annual_income_eur",
    "credit_score",
    "avg_amount_30d_eur",
    "max_amount_30d_eur",
]
for name in median_cols:
    column = df[name]
    df[name] = column.fillna(column.median())

# Mean imputation (original author's note: these variables are already
# normalised).
mean_cols = ["device_trust_z", "ip_risk_z"]
for name in mean_cols:
    column = df[name]
    df[name] = column.fillna(column.mean())

# Replace secondary_email by a presence flag: 1 when a value exists,
# 0 when it is missing.
df["secondary_email"] = (~df["secondary_email"].isna()).astype(int)

# Legacy fields: a missing value is replaced by 0.
for name in ("legacy_partner_score", "partner_risk_indicator"):
    df[name] = df[name].fillna(0)

# =========================
# 4. Categorical encoding
# =========================

# One-hot encode plan_type: the source column is removed and its indicator
# columns are appended at the end of the frame.
plan_dummies = pd.get_dummies(df["plan_type"], prefix="plan_type")
df = pd.concat([df.drop(columns=["plan_type"]), plan_dummies], axis=1)

# One-hot encode channel the same way.
channel_dummies = pd.get_dummies(df["channel"], prefix="channel")
df = pd.concat([df.drop(columns=["channel"]), channel_dummies], axis=1)

# Columns that do not make it into the output (dropped in section 5).
#
# An earlier version cast `binary_cols` to int and target-encoded
# `target_cols` with the mean of target_is_fraud, then dropped all six
# columns without ever using the encoded values. Those transforms were dead
# work and have been removed; the resulting frame is unchanged.
#
# NOTE(review): if these features are wanted, remove them from the drop
# below AND fit any target encoding on training folds only. Encoding with
# the label mean over the full dataset, as the removed code did, leaks the
# target into the feature.
binary_cols = ["is_vpn", "is_new_device"]
target_cols = ["payment_method", "browser", "os", "device_type"]

# =========================
# 5. Date feature engineering
# =========================
df["signup_date"] = pd.to_datetime(df["signup_date"])

# Account age in whole days, measured from the moment the cell is run.
# NOTE(review): the raw value therefore depends on the run date; pin a
# reference date if this column is ever used unscaled or if a scaler fitted
# here is reused on data processed on another day.
df["account_age_days"] = (pd.Timestamp.today() - df["signup_date"]).dt.days

# Drop the raw date plus the categorical / binary columns listed above.
df = df.drop(columns=["signup_date", *target_cols, *binary_cols])

# =========================
# 6. Outlier treatment
# =========================

# Columns whose values are clipped to their own 1st-99th percentile range.
# NOTE(review): despite the list name, no log transform is applied here;
# only clipping is performed.
log_clip_cols = [
    "chargebacks_12m",
    "days_since_last_login",
    "tenure_months",
    "max_amount_30d_eur",
    "income_estimate_alt_eur",
    "tx_amount_total_30d_eur",
    "annual_income_eur",
    "avg_amount_30d_eur",
    "num_devices_30d",
    "support_tickets_90d",
    "failed_payments_6m",
    "age",
    "credit_score",
]

for name in log_clip_cols:
    # Fetch both percentile bounds in one call, then clip the column to them.
    low, high = df[name].quantile([0.01, 0.99])
    df[name] = df[name].clip(low, high)

# =========================
# 7. Final standardisation
# =========================

# Every int64 / float64 column except the label is standardised. Columns of
# any other dtype are left untouched by this step.
numeric_cols = (
    df.select_dtypes(include=["int64", "float64"])
    .columns
    .drop("target_is_fraud")
)

# Fit the scaler on the selected columns, then overwrite them with their
# standardised values.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# =========================
# Result
# =========================
print(df.shape)

# Persist the processed frame (row index is not written).
df.to_csv("../6.Data/Yann_Process_Step_3.csv", index=False)

# Preview of the processed frame. It must be the LAST expression of the
# cell: a notebook only renders the value of the final expression, so the
# earlier placement (before to_csv) computed the preview and discarded it.
df.head()



(160000, 40)
