# Détection d'anomalies comportementales (scores par scénario)

Scénarios calculés séparément (scores 0–1, 1 = plus atypique). On retourne uniquement un tableau des scores par technique (pas de seuil 1%).


In [1]:
# 📦 Préparation : import, chargement, nettoyage commun
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import IsolationForest

# Tunable parameters shared by every technique below.
WINDOWS = ['30D']          # rolling window sizes; add '7D' or '90D' if needed
CONTAMINATION = 0.01       # contamination rate passed to IsolationForest
THRESHOLD_NEAR = 10_000    # cash structuring threshold (adjustable)

# Candidate input locations, tried in order: current directory first, then
# the data folder; both spellings of the file name are accepted.
candidates = [
    Path(folder) / filename
    for folder in ('', 'data_lauzhack_2')
    for filename in ('joined_with_transactions.csv', 'joined_with_transcation.csv')
]

existing = [candidate for candidate in candidates if candidate.exists()]
if not existing:
    raise FileNotFoundError('Aucun CSV trouv√© dans les emplacements candidats.')
# The first existing candidate wins.
CSV_PATH = existing[0]

print(f"Lecture du fichier : {CSV_PATH}")

# Load the transactions and apply minimal cleaning
tx = pd.read_csv(CSV_PATH)

# Names of the input columns used throughout the notebook
col_date = 'Date'
col_dir = 'Debit/Credit'
col_amount = 'Amount'
col_balance = 'Balance'
col_partner = 'partner_id'
col_counterparty = 'counterparty_Account_ID'
col_ext_counterparty = 'ext_counterparty_Account_ID'
col_country = 'ext_counterparty_country'
col_transfer = 'Transfer_Type'

# Type coercion: unparseable amounts become 0, unparseable balances become
# NaN and unparseable dates become NaT.
tx[col_amount] = pd.to_numeric(tx[col_amount], errors='coerce').fillna(0)
tx[col_balance] = pd.to_numeric(tx[col_balance], errors='coerce')
tx[col_date] = pd.to_datetime(tx[col_date], errors='coerce')
# Rows without a usable date or partner id are discarded.
tx = tx.dropna(subset=[col_date, col_partner])

# Direction flags, compared case-insensitively after trimming whitespace
direction = tx[col_dir].str.lower().str.strip()
tx['is_credit'] = direction == 'credit'
tx['is_debit'] = direction == 'debit'

# Internal counterparty id first, external id as fallback, 'UNKNOWN' otherwise
tx['counterparty'] = (
    tx[col_counterparty]
    .fillna(tx[col_ext_counterparty])
    .fillna('UNKNOWN')
)
# A transaction is flagged cross-border as soon as an external country is set
tx['is_cross_border'] = tx[col_country].notna()
tx['is_cash'] = tx[col_transfer].fillna('').str.upper().str.contains('CASH')
tx['is_cash_credit'] = tx['is_cash'] & tx['is_credit']
# Near-threshold band: [80% of the threshold, threshold)
in_band = (tx[col_amount] >= 0.8*THRESHOLD_NEAR) & (tx[col_amount] < THRESHOLD_NEAR)
tx['is_near_threshold'] = tx['is_cash_credit'] & in_band

# Pre-filtered amount columns (0 where the mask is False) so the later
# aggregations can use plain sums
amount_masks = {
    'amt_credit': tx['is_credit'],
    'amt_debit': tx['is_debit'],
    'amt_credit_cross': tx['is_credit'] & tx['is_cross_border'],
    'amt_cash_credit': tx['is_cash_credit'],
    'amt_cash_debit': tx['is_cash'] & tx['is_debit'],
}
for amount_col, mask in amount_masks.items():
    tx[amount_col] = np.where(mask, tx[col_amount], 0)

# Counterparty split by direction (NaN when the row goes the other way)
tx['counterparty_credit'] = np.where(tx['is_credit'], tx['counterparty'], np.nan)
tx['counterparty_debit'] = np.where(tx['is_debit'], tx['counterparty'], np.nan)

# Numeric country code, -1 when the country is missing
tx['country_code_num'] = (
    tx[col_country]
    .astype('category')
    .cat.codes
    .mask(tx[col_country].isna(), -1)
)

# Calendar day of each transaction
tx['day'] = tx[col_date].dt.floor('D')

# Daily aggregation per partner, to lighten the rolling computations.
# Distinct counts use the built-in 'nunique' aggregation, which already
# ignores NaN, instead of Python-level lambdas evaluated once per group.
daily = tx.groupby([col_partner, 'day']).agg(
    nb_tx_total=(col_amount, 'count'),
    nb_credit=('is_credit', 'sum'),
    nb_debit=('is_debit', 'sum'),
    sum_credit=('amt_credit', 'sum'),
    sum_debit=('amt_debit', 'sum'),
    sum_credit_cross=('amt_credit_cross', 'sum'),
    avg_amount=(col_amount, 'mean'),
    median_amount=(col_amount, 'median'),
    std_amount=(col_amount, 'std'),
    p95_amount=(col_amount, lambda x: np.percentile(x, 95)),
    p99_amount=(col_amount, lambda x: np.percentile(x, 99)),
    # counterparty_credit / counterparty_debit are NaN for rows of the
    # opposite direction, so nunique counts only the relevant side.
    nb_counterparties_entrantes=('counterparty_credit', 'nunique'),
    nb_counterparties_sortantes=('counterparty_debit', 'nunique'),
    # Distinct non-missing countries; same result as counting the distinct
    # non-negative category codes of that column.
    nb_ext_countries=(col_country, 'nunique'),
    avg_balance=(col_balance, 'mean'),
    nb_cash_deposits=('is_cash_credit', 'sum'),
    nb_near_threshold=('is_near_threshold', 'sum'),
    sum_cash_deposits=('amt_cash_credit', 'sum'),
    sum_cash_out=('amt_cash_debit', 'sum'),
    # True when the partner had at least one cash transaction that day.
    nb_cash_days=('is_cash', 'max'),
).reset_index()

# Sorted by partner then day and indexed by day, as required by the
# time-based rolling windows applied per partner later on.
daily_idx = daily.sort_values([col_partner, 'day']).set_index('day')


def minmax_01(series):
    """Rescale *series* linearly so its minimum maps to 0.

    The denominator is the value range plus 1e-9, which avoids a division
    by zero when every value is identical (the result is then all zeros)
    and means the maximum maps to slightly less than 1.

    Works on any object exposing ``min()``, ``max()`` and element-wise
    arithmetic (e.g. a pandas Series or a NumPy array).
    """
    lowest = series.min()
    span = series.max() - lowest
    return (series - lowest) / (span + 1e-9)


def iso_score(df, feature_cols, contamination=CONTAMINATION):
    """Score the rows of *df* with an IsolationForest.

    Args:
        df: DataFrame holding at least the columns in *feature_cols*.
        feature_cols: names of the columns used as model features;
            missing values in them are replaced by 0 before fitting.
        contamination: contamination parameter given to IsolationForest.

    Returns:
        A copy of *df* with an extra ``'score'`` column: the negated
        decision function, rescaled with ``minmax_01``, so that larger
        values correspond to more atypical rows.
    """
    result = df.copy()
    features = df[feature_cols].fillna(0)
    forest = IsolationForest(
        n_estimators=300,
        contamination=contamination,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(features)
    # decision_function is lower for more abnormal samples; flip the sign
    # so that "higher" means "more anomalous" before rescaling.
    anomaly = -forest.decision_function(features)
    result['score'] = minmax_01(anomaly)
    return result

client_period = None  # populated later


FileNotFoundError: Aucun CSV trouvé dans les emplacements candidats.

In [None]:
# Technique 1: baseline behavioural features (rolling windows over the daily aggregates)

# Rolling aggregation applied to each daily column within a window.
# NOTE(review): the daily distinct counterparty/country counts are summed
# over the window, so an entity active on several days is counted once per
# day rather than once per window — confirm this is intended.
base_rolling_spec = {
    'nb_tx_total': 'sum',
    'nb_credit': 'sum',
    'nb_debit': 'sum',
    'sum_credit': 'sum',
    'sum_debit': 'sum',
    'sum_credit_cross': 'sum',
    'avg_amount': 'mean',
    'median_amount': 'mean',
    'std_amount': 'mean',
    'p95_amount': 'mean',
    'p99_amount': 'mean',
    'nb_counterparties_entrantes': 'sum',
    'nb_counterparties_sortantes': 'sum',
    'nb_ext_countries': 'sum',
    'avg_balance': 'mean',
}

# Features fed to the model (avg_balance only enters through turnover_ratio)
feature_cols = [
    'nb_tx_total', 'nb_credit', 'nb_debit', 'sum_credit', 'sum_debit', 'net_flow',
    'avg_amount', 'median_amount', 'std_amount', 'p95_amount', 'p99_amount',
    'nb_counterparties_entrantes', 'nb_counterparties_sortantes', 'nb_ext_countries',
    'turnover_ratio', 'share_international'
]

all_base = []
for w in WINDOWS:
    roll = (
        daily_idx.groupby('partner_id')
        .rolling(w)
        .agg(base_rolling_spec)
        .reset_index()
        .rename(columns={'day': 'window_end'})
    )
    roll['window'] = w

    credits_in = roll['sum_credit']
    debits_out = roll['sum_debit']
    roll['net_flow'] = credits_in - debits_out
    # Ratios: a zero denominator yields NaN, which is then reported as 0
    roll['turnover_ratio'] = (
        (credits_in + debits_out) / roll['avg_balance'].replace(0, np.nan)
    ).fillna(0)
    roll['share_international'] = (
        roll['sum_credit_cross'] / credits_in.replace(0, np.nan)
    ).fillna(0)

    scored = iso_score(roll, feature_cols, contamination=CONTAMINATION)
    all_base.append(scored.rename(columns={'score': 'score_behavioural'}))

base_behavioural = pd.concat(all_base, ignore_index=True)
# client_period starts as a copy of the baseline table
client_period = base_behavioural.copy()
print("Base comportementale calcul√©e ->", client_period.shape)


In [None]:
# Technique 2: structuring (cash deposits just under the threshold)

# Features fed to the model for this scenario
feature_cols = ['nb_cash_deposits', 'nb_near_threshold', 'ratio_near_threshold', 'sum_cash_deposits']

all_struct = []
for w in WINDOWS:
    roll = (
        daily_idx.groupby('partner_id')
        .rolling(w)
        .agg({
            'nb_cash_deposits': 'sum',
            'nb_near_threshold': 'sum',
            'sum_cash_deposits': 'sum',
        })
        .reset_index()
        .rename(columns={'day': 'window_end'})
    )
    roll['window'] = w
    # Share of cash deposits that fall in the near-threshold band;
    # 0 when the window holds no cash deposit at all
    deposit_count = roll['nb_cash_deposits'].replace(0, np.nan)
    roll['ratio_near_threshold'] = (roll['nb_near_threshold'] / deposit_count).fillna(0)

    struct_scored = iso_score(roll, feature_cols, contamination=CONTAMINATION)
    all_struct.append(struct_scored.rename(columns={'score': 'score_structuring'}))

structuring = pd.concat(all_struct, ignore_index=True)
client_period = client_period.merge(
    structuring,
    how='left',
    on=['partner_id', 'window_end', 'window'],
)
print("Structuring ajout√© ->", structuring.shape)


In [None]:
# Technique 3: rapid movement (high turnover relative to the balance)

all_velocity = []
for w in WINDOWS:
    grp = daily_idx.groupby('partner_id')
    roll = grp.rolling(w).agg({
        'sum_credit': 'sum',
        'sum_debit': 'sum',
        'avg_balance': 'mean',
    })
    roll = roll.reset_index().rename(columns={'day': 'window_end'})
    roll['window'] = w
    # Turnover relative to the average balance; a zero balance yields NaN,
    # which is then reported as 0
    roll['turnover_ratio_velocity'] = (roll['sum_credit'] + roll['sum_debit']) / roll['avg_balance'].replace(0, np.nan)
    roll['turnover_ratio_velocity'] = roll['turnover_ratio_velocity'].fillna(0)

    feature_cols = ['sum_credit', 'sum_debit', 'avg_balance', 'turnover_ratio_velocity']
    velocity_scored = iso_score(roll, feature_cols, contamination=CONTAMINATION)
    velocity_scored = velocity_scored.rename(columns={'score': 'score_rapid_movement'})
    all_velocity.append(velocity_scored)

rapid_movement = pd.concat(all_velocity, ignore_index=True)

# Merge only the columns client_period does not hold yet. sum_credit,
# sum_debit and avg_balance are already there from the baseline table
# (same rolling aggregation); merging them again would make pandas rename
# both copies with _x/_y suffixes. This also keeps the cell safe to re-run.
merge_keys = ['partner_id', 'window_end', 'window']
new_cols = [c for c in rapid_movement.columns if c not in client_period.columns]
client_period = client_period.merge(rapid_movement[merge_keys + new_cols], on=merge_keys, how='left')
print("Rapid movement ajout√© ->", rapid_movement.shape)


In [None]:
# Technique 4: round-tripping (overall flow symmetry per counterparty)

# Total sent to / received from each counterparty, per partner.
# NOTE(review): transactions without a counterparty id are all grouped
# under the 'UNKNOWN' placeholder and treated as one counterparty here —
# confirm this is intended.
pair = (
    tx.groupby(['partner_id', 'counterparty'])
    .agg(
        sum_to_counterparty=('amt_debit', 'sum'),
        sum_from_counterparty=('amt_credit', 'sum'),
    )
    .reset_index()
)

# Symmetry = smaller flow / larger flow; 0 when the larger flow is 0
flows = pair[['sum_to_counterparty', 'sum_from_counterparty']]
smaller_flow = flows.min(axis=1)
larger_flow = flows.max(axis=1).replace(0, np.nan)
pair['symmetry_ratio'] = (smaller_flow / larger_flow).fillna(0)

# NOTE(review): total_pair_volume is the number of counterparties of the
# partner (group size), not a monetary volume.
client_round = (
    pair.groupby('partner_id')
    .agg(
        max_symmetry_ratio=('symmetry_ratio', 'max'),
        nb_pairs_high_symmetry=('symmetry_ratio', lambda ratios: ratios.gt(0.8).sum()),
        total_pair_volume=('symmetry_ratio', 'size'),
    )
    .reset_index()
)

round_feature_cols = ['max_symmetry_ratio', 'nb_pairs_high_symmetry', 'total_pair_volume']
round_scored = iso_score(client_round, round_feature_cols, contamination=CONTAMINATION)
round_scored = round_scored.rename(columns={'score': 'score_round_tripping'})

# One score per partner: it is repeated on every row of that partner
client_period = client_period.merge(round_scored, how='left', on='partner_id')
print("Round-tripping ajout√© ->", round_scored.shape)


In [None]:
# Technique 5: unusual cash (level and share of cash)

all_cash = []
for w in WINDOWS:
    grp = daily_idx.groupby('partner_id')
    roll = grp.rolling(w).agg({
        'sum_cash_deposits': 'sum',
        'sum_cash_out': 'sum',
        'nb_cash_days': 'sum',
        'sum_credit': 'sum',
    })
    roll = roll.reset_index().rename(columns={'day': 'window_end'})
    roll['window'] = w
    # Share of incoming funds received as cash; 0 when there is no credit
    roll['share_cash_in'] = roll['sum_cash_deposits'] / roll['sum_credit'].replace(0, np.nan)
    roll['share_cash_in'] = roll['share_cash_in'].fillna(0)

    feature_cols = ['sum_cash_deposits', 'sum_cash_out', 'nb_cash_days', 'share_cash_in']
    cash_scored = iso_score(roll, feature_cols, contamination=CONTAMINATION)
    cash_scored = cash_scored.rename(columns={'score': 'score_unusual_cash'})
    all_cash.append(cash_scored)

unusual_cash = pd.concat(all_cash, ignore_index=True)

# Merge only the columns client_period does not hold yet. Columns such as
# sum_cash_deposits or sum_credit may already be present from earlier
# techniques; merging them again would make pandas rename both copies with
# _x/_y suffixes. This also keeps the cell safe to re-run.
merge_keys = ['partner_id', 'window_end', 'window']
new_cols = [c for c in unusual_cash.columns if c not in client_period.columns]
client_period = client_period.merge(unusual_cash[merge_keys + new_cols], on=merge_keys, how='left')
print("Unusual cash ajout√© ->", unusual_cash.shape)


In [None]:
# Output: table of scores per technique (latest point per partner)

scenario_score_cols = [
    'score_behavioural',
    'score_structuring',
    'score_rapid_movement',
    'score_round_tripping',
    'score_unusual_cash',
]

# Keep the most recent row per partner AND per window. Grouping on the
# partner alone would silently keep a single row when several windows are
# configured in WINDOWS; with one window the result is unchanged.
latest = (
    client_period
    .sort_values('window_end')
    .groupby(['partner_id', 'window'])
    .tail(1)
)

# The window column is only needed to tell rows apart when several windows
# are present; the single-window output keeps its original columns.
id_cols = ['partner_id', 'window_end']
if latest['window'].nunique() > 1:
    id_cols.append('window')
score_table = latest[id_cols + scenario_score_cols].reset_index(drop=True)

# The result is written next to the input CSV.
output_path = CSV_PATH.parent / 'client_scores_by_technique.csv'
score_table.to_csv(output_path, index=False)
print(f"Tableau des scores sauvegard√© dans {output_path} (shape={score_table.shape})")
print(score_table.head())
