In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# =========================
# 1. Load the data
# =========================
# Raw train / test CSV files; per the original note, the test file carries
# no target column.
train_csv_path = "../6.Data/kaggle_b2_fraud_train_v3.csv"
test_csv_path = "../6.Data/kaggle_b2_fraud_test_v3.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# =========================
# 2. Drop unused columns
# =========================
# Columns excluded from the rest of the pipeline.
cols_to_drop = [
    "account_id",
    "city",
    "postal_code",
    "chargeback_resolution_time_days",
    "post_event_status_code",
    "referrer_code",
    "manual_review_result",
    "terms_accepted_flag",
    "occupation",
    "merchant_category",
    "customer_note",
    "last_ticket_subject",
    "region",
    "country",
    "signup_source",
]

# errors="ignore" lets the same list be applied to both frames even when one
# of them does not contain every listed column.
df_train, df_test = (
    frame.drop(columns=cols_to_drop, errors="ignore")
    for frame in (df_train, df_test)
)

# =========================
# 3. Missing-value imputation
# (every fill value is learned on train only, then applied to both frames)
# =========================
median_cols = ["annual_income_eur", "credit_score", "avg_amount_30d_eur", "max_amount_30d_eur"]
mean_cols = ["device_trust_z", "ip_risk_z"]

# Statistics come from the train frame so nothing from test leaks into them.
median_values = df_train[median_cols].median()
mean_values = df_train[mean_cols].mean()

for cols, fill_values in ((median_cols, median_values), (mean_cols, mean_values)):
    for col in cols:
        df_train[col] = df_train[col].fillna(fill_values[col])
        df_test[col] = df_test[col].fillna(fill_values[col])

# secondary_email is reduced to a presence flag (1 = a value is present).
# The dtype is pinned to int64: a bare astype(int) gives int32 on Windows with
# NumPy < 2, and the standardisation step further down selects columns by the
# exact dtypes int64 / float64, so the flag would be scaled on some platforms
# and skipped on others.
df_train["secondary_email"] = df_train["secondary_email"].notna().astype("int64")
df_test["secondary_email"] = df_test["secondary_email"].notna().astype("int64")

# Legacy partner fields: a missing value is replaced by 0.
for col in ("legacy_partner_score", "partner_risk_indicator"):
    df_train[col] = df_train[col].fillna(0)
    df_test[col] = df_test[col].fillna(0)

# =========================
# 4. Categorical encoding
# =========================
def _aligned_dummies(train_values, test_values, prefix):
    """Return (train, test) one-hot frames that share the train column set."""
    train_dummies = pd.get_dummies(train_values, prefix=prefix)
    test_dummies = pd.get_dummies(test_values, prefix=prefix)
    # Categories unseen in train are discarded; categories absent from test
    # are added as all-zero columns.
    test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)
    return train_dummies, test_dummies


# One-hot: plan_type (test aligned on the train columns)
plan_dummies_train, plan_dummies_test = _aligned_dummies(
    df_train["plan_type"], df_test["plan_type"], "plan_type"
)
df_train = pd.concat([df_train.drop(columns=["plan_type"]), plan_dummies_train], axis=1)
df_test = pd.concat([df_test.drop(columns=["plan_type"]), plan_dummies_test], axis=1)

# One-hot: channel (same treatment)
channel_dummies_train, channel_dummies_test = _aligned_dummies(
    df_train["channel"], df_test["channel"], "channel"
)
df_train = pd.concat([df_train.drop(columns=["channel"]), channel_dummies_train], axis=1)
df_test = pd.concat([df_test.drop(columns=["channel"]), channel_dummies_test], axis=1)

# Boolean flags: cast to integers in both frames
binary_cols = ["is_vpn", "is_new_device"]
for frame in (df_train, df_test):
    for col in binary_cols:
        frame[col] = frame[col].astype(int)

# Target encoding: each category is replaced by the mean of target_is_fraud
# observed for it in the TRAIN frame only.
# NOTE(review): train rows are encoded with rates computed from their own
# labels (no out-of-fold scheme) - confirm this is acceptable.
# NOTE(review): these four columns are removed again by cols_to_drop_post in
# the date section - confirm that is intended.
target_cols = ["payment_method", "browser", "os", "device_type"]
target_encodings = {}  # column name -> category/mean mapping, kept for reuse

# Fallback for values without a mapping entry. It does not depend on the
# column being encoded, so it is computed once, outside the loop.
global_mean = df_train["target_is_fraud"].mean()

for col in target_cols:
    mapping = df_train.groupby(col)["target_is_fraud"].mean()
    target_encodings[col] = mapping
    # groupby skips missing keys, so a missing category has no mapping entry
    # and would stay NaN in train. Apply the same fallback to both frames so
    # train and test are encoded consistently.
    df_train[col] = df_train[col].map(mapping).fillna(global_mean)
    df_test[col] = df_test[col].map(mapping).fillna(global_mean)

# =========================
# 5. Date feature engineering
# =========================
# A single reference timestamp is shared by train and test.
# NOTE(review): the reference is the moment the script runs, so the raw
# account_age_days values change from one run date to the next - confirm a
# fixed reference date is not wanted.
today = pd.Timestamp.today()

for frame in (df_train, df_test):
    frame["signup_date"] = pd.to_datetime(frame["signup_date"])
    # Whole days elapsed between signup and the reference timestamp.
    frame["account_age_days"] = (today - frame["signup_date"]).dt.days

# Removes the raw signup date, plus the four target-encoded columns and the
# two boolean flags prepared in section 4.
cols_to_drop_post = [
    "signup_date",
    "payment_method",
    "browser",
    "os",
    "device_type",
    "is_vpn",
    "is_new_device",
]
df_train = df_train.drop(columns=cols_to_drop_post)
df_test = df_test.drop(columns=cols_to_drop_post)

# =========================
# 6. Outlier handling
# (quantiles are computed on train only)
# =========================
# Despite the "log" in the name, these columns are only clipped here; no log
# transform is applied in this section.
log_clip_cols = [
    "chargebacks_12m",
    "days_since_last_login",
    "tenure_months",
    "max_amount_30d_eur",
    "income_estimate_alt_eur",
    "tx_amount_total_30d_eur",
    "annual_income_eur",
    "avg_amount_30d_eur",
    "num_devices_30d",
    "support_tickets_90d",
    "failed_payments_6m",
    "age",
    "credit_score",
]

# Per-column (1st percentile, 99th percentile) taken from the train frame.
clip_bounds = {
    col: (df_train[col].quantile(0.01), df_train[col].quantile(0.99))
    for col in log_clip_cols
}

# The same train-derived bounds are applied to both frames.
for col, (lower, upper) in clip_bounds.items():
    df_train[col] = df_train[col].clip(lower, upper)
    df_test[col] = df_test[col].clip(lower, upper)

# =========================
# 7. Final standardisation
# (scaler fitted on train only)
# =========================
# Only columns whose dtype is exactly int64 or float64 are scaled; the target
# is removed from that selection. Columns of any other dtype are left as-is.
numeric_cols = (
    df_train.select_dtypes(include=["int64", "float64"])
    .columns
    .drop("target_is_fraud")
)

scaler = StandardScaler()
train_scaled = scaler.fit_transform(df_train[numeric_cols])  # learn + apply on train
test_scaled = scaler.transform(df_test[numeric_cols])        # apply only on test

df_train[numeric_cols] = train_scaled
df_test[numeric_cols] = test_scaled


# =========================
# 8. Convert the boolean one-hot columns (True / False) to numeric (1 / 0)
# =========================

# The column list is taken from the dummies actually produced for the train
# frame in section 4 instead of a hard-coded list: a hand-written list raises
# KeyError when a category is absent from the data and silently skips any
# category it does not mention.
# Presumably this runs after the standardisation step so that the indicators
# stay plain 0/1 - confirm.
colonnes = [*plan_dummies_train.columns, *channel_dummies_train.columns]

# Conversion True -> 1, False -> 0
df_train[colonnes] = df_train[colonnes].astype(int)
df_test[colonnes] = df_test[colonnes].astype(int)



# =========================
# Result
# =========================
# Report the final shapes, then write both processed frames to disk without
# the index column.
for label, frame in (("Train shape:", df_train), ("Test shape: ", df_test)):
    print(label, frame.shape)

output_files = {
    "../6.Data/Yann_Process_train.csv": df_train,
    "../6.Data/Yann_Process_test.csv": df_test,
}
for path, frame in output_files.items():
    frame.to_csv(path, index=False)

Train shape: (160000, 41)
Test shape:  (40000, 40)
