In [1]:
import pandas as pd
import numpy as np

# Load the synthetic fintech dataset and strip leading/trailing whitespace
# from every header name so later lookups by column name are reliable.
df = pd.read_csv("../data/fintech_synthetic_5000.csv").rename(columns=str.strip)

# Basic checks: print (rows, columns), then preview the first five rows.
print(df.shape)
df.head()

(5000, 26)


Unnamed: 0,customer_id,age,country,customer_segment,kyc_source,primary_device,income_annual_eur,account_age_days,logins_30d,avg_session_min,...,chargeback_cnt_90d,disputes_open,pep_flag,sanction_screen_hit,aml_alerts_180d,card_user,fx_trading_user,crypto_user,risk_tier,next_30d_net_revenue_eur
0,1,22,NL,Retail,Web,Android,,892,6,8.71,...,0,0,0,0,0,1,1,0,Low,39.69
1,2,58,UK,Retail,InApp,Web,30051.0,1094,8,12.63,...,0,0,0,0,0,0,0,0,Low,1.8
2,3,52,IT,Retail,InApp,iOS,56313.0,129,13,7.64,...,0,0,0,0,1,1,1,0,Low,32.14
3,4,40,ES,Retail,Web,Android,15917.0,1488,6,8.78,...,1,0,0,0,0,1,0,0,Medium,-12.26
4,5,40,DE,Retail,InApp,iOS,40425.0,914,10,9.53,...,1,0,0,0,0,1,1,0,Medium,-5.78


In [2]:
# Prediction targets: a categorical label and a numerical value.
categorical_target = "risk_tier"
numerical_target = "next_30d_net_revenue_eur"

# Row identifier; excluded from the feature list below.
id_column = "customer_id"

# Every remaining column is a candidate feature (original column order is kept).
non_feature_columns = [categorical_target, numerical_target, id_column]
features = df.columns[~df.columns.isin(non_feature_columns)].tolist()
features

['age',
 'country',
 'customer_segment',
 'kyc_source',
 'primary_device',
 'income_annual_eur',
 'account_age_days',
 'logins_30d',
 'avg_session_min',
 'txn_cnt_30d',
 'avg_txn_amount_eur',
 'cash_in_ratio',
 'cross_border_ratio',
 'failed_txn_rate',
 'support_tickets_90d',
 'chargeback_cnt_90d',
 'disputes_open',
 'pep_flag',
 'sanction_screen_hit',
 'aml_alerts_180d',
 'card_user',
 'fx_trading_user',
 'crypto_user']

In [3]:
# Share of missing values per column, largest first.
missing_share_by_column = df.isna().mean().sort_values(ascending=False)

# Keep only the columns that actually contain missing values.
has_missing = missing_share_by_column > 0
missing_df = missing_share_by_column[has_missing].to_frame(name="missing_ratio")

missing_df

Unnamed: 0,missing_ratio
income_annual_eur,0.1232
avg_session_min,0.0826
cross_border_ratio,0.0598
primary_device,0.0308
failed_txn_rate,0.0214
kyc_source,0.019


In [4]:
# Class balance of the categorical target, expressed as proportions of all rows
# with a non-missing label (value_counts drops NaN by default).
target_class_share = df[categorical_target].value_counts(normalize=True)
target_class_share

risk_tier
Low       0.5266
Medium    0.3650
High      0.1084
Name: proportion, dtype: float64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical target.
revenue_summary = df[numerical_target].describe()
revenue_summary

count    5000.000000
mean       -3.552928
std        38.885284
min       -50.000000
25%       -29.770000
50%        -7.995000
75%        11.105000
max       258.250000
Name: next_30d_net_revenue_eur, dtype: float64

In [6]:
# Quick leakage sanity check via pairwise correlations with the revenue target.
# Expectation: risk signals (chargebacks) correlate negatively with revenue,
# activity metrics (transaction count / amount) correlate positively.
sanity_check_columns = [
    numerical_target,
    "txn_cnt_30d",
    "avg_txn_amount_eur",
    "chargeback_cnt_90d",
]
df[sanity_check_columns].corr()


Unnamed: 0,next_30d_net_revenue_eur,txn_cnt_30d,avg_txn_amount_eur,chargeback_cnt_90d
next_30d_net_revenue_eur,1.0,0.434329,0.415668,-0.070527
txn_cnt_30d,0.434329,1.0,0.531434,-0.000846
avg_txn_amount_eur,0.415668,0.531434,1.0,9.8e-05
chargeback_cnt_90d,-0.070527,-0.000846,9.8e-05,1.0


In [7]:
# Separate numerical vs categorical features.
#
# "number" selects every numeric dtype (int32, float32, nullable integers, ...),
# not only int64/float64, so a numeric feature can no longer be dropped
# because of its exact storage width.
numerical_features = df[features].select_dtypes(include="number").columns.tolist()

# Everything that is not numeric is treated as categorical. Using the complement
# (instead of a hard-coded ["object"] whitelist) guarantees the two lists together
# cover every entry in `features`, whatever dtype pandas assigns to text columns.
# NOTE(review): bool or datetime features, if ever added, would land here —
# confirm that is the desired handling before encoding them.
categorical_features = df[features].select_dtypes(exclude="number").columns.tolist()

categorical_features, numerical_features

(['country', 'customer_segment', 'kyc_source', 'primary_device'],
 ['age',
  'income_annual_eur',
  'account_age_days',
  'logins_30d',
  'avg_session_min',
  'txn_cnt_30d',
  'avg_txn_amount_eur',
  'cash_in_ratio',
  'cross_border_ratio',
  'failed_txn_rate',
  'support_tickets_90d',
  'chargeback_cnt_90d',
  'disputes_open',
  'pep_flag',
  'sanction_screen_hit',
  'aml_alerts_180d',
  'card_user',
  'fx_trading_user',
  'crypto_user'])