In [1]:
import pandas as pd
import numpy as np

# Load the dataset and trim stray whitespace from every column name in one
# chained expression, then list the resulting header for inspection.
df = pd.read_csv("../data/fintech_synthetic_5000.csv").rename(columns=str.strip)
list(df.columns)


['customer_id',
 'age',
 'country',
 'customer_segment',
 'kyc_source',
 'primary_device',
 'income_annual_eur',
 'account_age_days',
 'logins_30d',
 'avg_session_min',
 'txn_cnt_30d',
 'avg_txn_amount_eur',
 'cash_in_ratio',
 'cross_border_ratio',
 'failed_txn_rate',
 'support_tickets_90d',
 'chargeback_cnt_90d',
 'disputes_open',
 'pep_flag',
 'sanction_screen_hit',
 'aml_alerts_180d',
 'card_user',
 'fx_trading_user',
 'crypto_user',
 'risk_tier',
 'next_30d_net_revenue_eur']

In [2]:
from sklearn.model_selection import train_test_split

# One shared feature matrix X and two targets y (classification + regression).

# Features: everything except the identifier and the two target columns.
X = df.drop(columns=["customer_id", "risk_tier", "next_30d_net_revenue_eur"])

# Targets
y_classification = df["risk_tier"]
y_regression = df["next_30d_net_revenue_eur"]

# Shapes of the feature matrix and both targets, displayed as the cell output.
X.shape, y_classification.shape, y_regression.shape

((5000, 23), (5000,), (5000,))

In [3]:
# Train/test split: 25% of the rows are held out, with a fixed seed so the
# partitions are reproducible.

# Classification split, stratified on the label because the risk tiers are
# imbalanced.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_classification,
    stratify=y_classification,
    test_size=0.25,
    random_state=42,
)

# Regression split, performed independently of the classification split
# (no stratification).
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X, y_regression,
    test_size=0.25,
    random_state=42,
)



In [4]:
# Targets
categorical_target = "risk_tier"
numerical_target = "next_30d_net_revenue_eur"

# Identifier
id_column = "customer_id"

# Model inputs: every column that is neither a target nor the identifier.
features = [
    col for col in df.columns
    if col not in [categorical_target, numerical_target, id_column]
]

# Separate numerical vs categorical features.
# "number" matches every numeric width (int32, float32, ...), not only
# int64/float64, and "category" is accepted alongside "object" for the
# categorical side.
numerical_features = df[features].select_dtypes(include="number").columns.tolist()
categorical_features = (
    df[features].select_dtypes(include=["object", "category"]).columns.tolist()
)

# Guard: a feature matched by neither selector (e.g. bool or datetime) would
# be left out of both lists and therefore never reach a transformer that is
# driven by them. Fail loudly here instead of losing the column silently.
unassigned_features = [
    col for col in features
    if col not in numerical_features and col not in categorical_features
]
assert not unassigned_features, (
    f"Features with an unhandled dtype: {unassigned_features}"
)

categorical_features, numerical_features

(['country', 'customer_segment', 'kyc_source', 'primary_device'],
 ['age',
  'income_annual_eur',
  'account_age_days',
  'logins_30d',
  'avg_session_min',
  'txn_cnt_30d',
  'avg_txn_amount_eur',
  'cash_in_ratio',
  'cross_border_ratio',
  'failed_txn_rate',
  'support_tickets_90d',
  'chargeback_cnt_90d',
  'disputes_open',
  'pep_flag',
  'sanction_screen_hit',
  'aml_alerts_180d',
  'card_user',
  'fx_trading_user',
  'crypto_user'])

In [5]:
# Preprocessing building blocks used by this cell.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numerical pipeline: fill missing values with the column median, then
# standardise.
numeric_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical pipeline: fill missing values with the most frequent level, then
# one-hot encode. Levels not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combined preprocessor: route each column group to its own pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, numerical_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)


In [6]:
# Sanity check: learn the preprocessing parameters from the classification
# training partition only, then apply the fitted preprocessor to both the
# training and the test partition.
preprocessor.fit(X_train_c)
X_train_processed, X_test_processed = (
    preprocessor.transform(partition) for partition in (X_train_c, X_test_c)
)

# Both partitions should report the same number of output columns.
X_train_processed.shape, X_test_processed.shape


((3750, 35), (1250, 35))