# Preprocessing 

## 1) Import des données 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("../6.Data/kaggle_b2_fraud_train_v3.csv")

## 1) Valeurs null

### 1) On remplace par la médiane 

In [4]:
cols_median = [
    "annual_income_eur",
    "credit_score",
    "avg_amount_30d_eur",
    "max_amount_30d_eur"
]

df[cols_median] = df[cols_median].fillna(
    df[cols_median].median()
)

### 2) On remplace par la moyenne

In [5]:
cols_mean = ["device_trust_z", "ip_risk_z"]

df[cols_mean] = df[cols_mean].fillna(df[cols_mean].mean())


### 3) On change les valeurs en booléens et on remplace par 0

In [6]:
df["secondary_email"] = df["secondary_email"].notnull().astype(int)
df["legacy_partner_score"] = df["legacy_partner_score"].fillna(0)
df["partner_risk_indicator"] = df["partner_risk_indicator"].fillna(0)

### 4) On supprimes les colonnes pas utiles

In [7]:
cols_to_drop = ["occupation", "merchant_category", "customer_note", "last_ticket_subject",'region']
df = df.drop(columns=cols_to_drop)


In [8]:
df

Unnamed: 0,customer_id,account_id,age,tenure_months,annual_income_eur,credit_score,num_transactions_30d,avg_amount_30d_eur,max_amount_30d_eur,days_since_last_login,...,internal_signal_5,internal_signal_6,internal_signal_7,internal_signal_8,terms_accepted_flag,partner_risk_indicator,manual_review_result,post_event_status_code,chargeback_resolution_time_days,legacy_partner_score
0,CUST_6O9Q8D4I36,ACC_TXXXTNEUVKFY,34,108,38635.01,544.0,20,60.92,80.16,4.9,...,0.39006,0.10963,0.55097,-0.56104,1,0.000000,approve,0,7.9,0.0
1,CUST_FGUGTW230C,ACC_70VD7A4FFWCW,48,2,19912.97,703.0,21,112.11,571.12,0.3,...,0.03265,-0.40256,0.36218,0.86583,1,0.000000,approve,0,5.5,0.0
2,CUST_8ZI3LCBZ0W,ACC_AF53381QSC0L,27,0,20326.87,720.0,25,73.61,492.57,4.6,...,-0.15637,0.57818,0.28902,-2.19864,1,0.000000,approve,0,7.2,0.0
3,CUST_5MP3AR41CJ,ACC_U7WZGJ486LIV,45,49,38452.47,703.0,17,47.53,204.18,25.3,...,-1.02145,0.63908,-0.89190,-0.81592,1,0.000000,approve,0,4.4,0.0
4,CUST_GNPL83JB0J,ACC_XW7DS3ED5J4Y,37,46,31943.14,594.0,13,99.95,734.09,12.8,...,-0.65771,0.08020,0.17606,0.86739,1,0.000000,approve,0,4.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,CUST_I81IW5SVRQ,ACC_UPDTFTYTSM0A,56,0,34775.62,727.0,21,51.72,226.11,3.8,...,-2.54086,-0.60747,0.23252,-0.06215,1,-1.535486,approve,0,1.0,0.0
159996,CUST_QT6DDEMKTJ,ACC_97NE0LBL5W9U,41,4,88617.57,770.0,18,52.54,171.07,15.1,...,0.34098,-1.78817,0.31788,0.51072,1,0.000000,approve,0,7.4,0.0
159997,CUST_I0JS1GTS98,ACC_9JJ84W64Z7GX,30,2,41148.54,738.0,20,29.34,119.81,0.7,...,-1.28947,-0.32324,-0.06238,-0.99076,1,0.000000,approve,3,6.6,0.0
159998,CUST_L7GUCJ3TFY,ACC_NGFXDR7HW1ZS,56,6,31943.14,719.0,25,88.56,553.16,22.6,...,0.47179,-0.22090,-1.34239,-0.30513,1,0.000000,approve,0,12.5,0.0


In [9]:
df.to_csv("dataset_preprocessed_1.csv", index=False)


## 2) Gestion des outliers

### 1) Standardisation +- 3 sigma

In [10]:
import numpy as np
import pandas as pd

# Colonnes à standardiser
cols_to_standardize = [
    "internal_signal_6",
    "internal_signal_3",
    "credit_score_norm",
    "internal_signal_5",
    "internal_signal_1",
    "internal_signal_8",
    "internal_signal_2",
    "internal_signal_7",
    "internal_signal_4",
    "num_transactions_30d",
    "ip_risk_z",
    "device_trust_z"
]

def standardisation_sigma_3(series):
    mu = series.mean()
    sigma = series.std()
    z = (series - mu) / sigma
    return np.clip(z, -3, 3)

for col in cols_to_standardize:
    df[col] = standardisation_sigma_3(df[col])


### 2) Log transformation / Winsorisation / Clip

In [11]:
import numpy as np

cols_to_transform = [
    "chargebacks_12m",
    "days_since_last_login",
    "chargeback_resolution_time_days",
    "tenure_months",
    "max_amount_30d_eur",
    "income_estimate_alt_eur",
    "tx_amount_total_30d_eur",
    "annual_income_eur",
    "avg_amount_30d_eur",
    "num_devices_30d",
    "support_tickets_90d",
    "failed_payments_6m",
    "age",
    "credit_score"
]

# 1️ LOG TRANSFORM 

log_cols = [
    "chargebacks_12m",
    "days_since_last_login",
    "chargeback_resolution_time_days",
    "tenure_months",
    "max_amount_30d_eur",
    "income_estimate_alt_eur",
    "tx_amount_total_30d_eur",
    "annual_income_eur",
    "avg_amount_30d_eur",
    "num_devices_30d",
    "support_tickets_90d",
    "failed_payments_6m"
]

for col in log_cols:
    df[col] = np.log1p(df[col])

# 2️ WINSORISATION 

for col in cols_to_transform:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(lower, upper)

# 3️ CLIP MÉTIER

df["age"] = df["age"].clip(18, 100)
df["credit_score"] = df["credit_score"].clip(300, 900)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [14]:
df.to_csv("dataset_preprocessed_2.csv", index=False)

## 3) Gestion des variables corréllées

### 1) suppréssion de colonnes

In [15]:
cols_to_drop = ["chargeback_resolution_time_days", "post_event_status_code"]

df = df.drop(columns=cols_to_drop, errors="ignore")


In [16]:
cols_to_drop = [
    "avg_amount_30d_eur",
    "income_estimate_alt_eur",
]

df = df.drop(columns=cols_to_drop, errors="ignore")


In [17]:
df.to_csv("dataset_preprocessed_3.csv", index=False)

## 4) Encodage

### 1) Encodage binaire

In [18]:
binary_cols = ["is_vpn", "is_new_device"]

for col in binary_cols:
    df[col] = df[col].astype(int)


### 2) One Hot encoding

In [20]:
one_hot_cols = ["channel"]

df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)


### 3) Label encoding

In [23]:
from sklearn.preprocessing import LabelEncoder
label_cols = [
    "plan_type",
    "manual_review_result",
    "partner_risk_indicator"
]

label_encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le


### 4) Target Encoding

In [25]:
target_cols = ["payment_method", "browser", "os", "device_type"]

for col in target_cols:
    target_means = df.groupby(col)["target_is_fraud"].mean()
    df[col] = df[col].map(target_means)


### 5) Feature Engineering sur signup_date

In [26]:
df["signup_date"] = pd.to_datetime(df["signup_date"])

df["signup_year"] = df["signup_date"].dt.year
df["signup_month"] = df["signup_date"].dt.month
df["signup_day"] = df["signup_date"].dt.day
df["signup_weekday"] = df["signup_date"].dt.weekday

# Ancienneté en jours
df["account_age_days"] = (pd.Timestamp.today() - df["signup_date"]).dt.days

# Suppression de la date brute
df = df.drop(columns=["signup_date"])


In [27]:
df.to_csv("dataset_preprocessed_4.csv", index=False)