<a href="https://colab.research.google.com/github/caetano-dev/PixFraudDetection/blob/main/TCC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install pyarrow



## Data Loading and Preprocessing with DuckDB

1. **Mount Google Drive:** Mounts your Google Drive to access the data files.
2. **Define File Paths:** Sets up paths for the raw data files and the directory for processed data, handling both large and small datasets.
3. **Define Column Structures:** Specifies the standard column names and their expected data types.
4. **Parse Patterns File:** Reads and parses a text file containing money laundering patterns into a pandas DataFrame.
5. **Define Currencies:** Lists the specific currencies that will be processed.
6. **Initialize DuckDB:** Sets up an in-memory DuckDB database for efficient data processing.
7. **Define SQL Queries:** Creates SQL queries using DuckDB syntax to read the raw transaction data, parse timestamps, and cast columns to appropriate types.
8. **Filter and Save Data by Currency:** Iterates through the defined currencies, applies filters to the transaction data (normal and laundering) and account data using DuckDB, and saves the filtered data into separate Parquet files for each currency. This includes joining with the parsed patterns data to identify laundering transactions.

In [None]:
import os
import re
import duckdb
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')
DRIVE_DIR = '/content/drive/MyDrive/AML'

#PROCESSED_DIR = os.path.join(DRIVE_DIR, 'processed/')
#TX_CSV = os.path.join(DRIVE_DIR, 'HI-Large_Trans.csv')
#PATTERNS_TXT = os.path.join(DRIVE_DIR, 'HI-Large_Patterns.txt')
#ACCOUNTS_CSV = os.path.join(DRIVE_DIR, 'HI-Large_Accounts.csv')

PROCESSED_DIR = os.path.join(DRIVE_DIR, 'processed/small')
TX_CSV = os.path.join(DRIVE_DIR, 'HI-Small_Trans.csv')
PATTERNS_TXT = os.path.join(DRIVE_DIR, 'HI-Small_Patterns.txt')
ACCOUNTS_CSV = os.path.join(DRIVE_DIR, 'HI-Small_Accounts.csv')

os.makedirs(PROCESSED_DIR, exist_ok=True)

if not os.path.exists(TX_CSV):
    raise FileNotFoundError(f"Transaction file not found: {TX_CSV}")
else:
    print(f"Found data folder: {DRIVE_DIR}")
    print("-" * 50)

standard_columns = [
    'timestamp', 'from_bank', 'from_account', 'to_bank', 'to_account',
    'amount_received', 'currency_received', 'amount_sent', 'currency_sent',
    'payment_type', 'is_laundering'
]

column_types = {
    'timestamp': 'VARCHAR',
    'from_bank': 'VARCHAR',
    'from_account': 'VARCHAR',
    'to_bank': 'VARCHAR',
    'to_account': 'VARCHAR',
    'amount_received': 'VARCHAR',
    'currency_received': 'VARCHAR',
    'amount_sent': 'VARCHAR',
    'currency_sent': 'VARCHAR',
    'payment_type': 'VARCHAR',
    'is_laundering': 'VARCHAR'
}

def parse_patterns_file(file_path):
    attempts = []
    current_attempt = None
    attempt_counter = 0

    with open(file_path, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.startswith('BEGIN LAUNDERING ATTEMPT'):
                attempt_counter += 1
                m = re.search(r'BEGIN LAUNDERING ATTEMPT\s*-\s*(.+)$', line)
                attempt_type = m.group(1).strip() if m else 'UNKNOWN'
                current_attempt = {
                    'attempt_id': attempt_counter,
                    'attempt_type': attempt_type,
                    'transactions': []
                }
            elif line.startswith('END LAUNDERING ATTEMPT'):
                if current_attempt:
                    attempts.append(current_attempt)
                current_attempt = None
            elif current_attempt:
                parts = [p.strip() for p in line.split(',')]
                if len(parts) >= 11:
                    tx = dict(zip(standard_columns, parts[:11]))
                    tx['attempt_id'] = current_attempt['attempt_id']
                    tx['attempt_type'] = current_attempt['attempt_type']
                    current_attempt['transactions'].append(tx)

    all_transactions = [tx for attempt in attempts for tx in attempt['transactions']]
    return pd.DataFrame(all_transactions, columns=standard_columns + ['attempt_id', 'attempt_type'])

CURRENCIES = [
    "US Dollar",
    "Euro",
    "Yuan",
    "Shekel",
    "Canadian Dollar",
    "UK Pound",
    "Ruble",
    "Australian Dollar",
    "Swiss Franc",
    "Yen",
    "Mexican Peso",
    "Rupee",
    "Brazil Real",
    "Saudi Riyal"
]

con = duckdb.connect(database=':memory:')
con.execute("PRAGMA threads=8")

read_tx_csv_sql = f"""
  SELECT * FROM read_csv_auto(
    '{TX_CSV}',
    delim=',',
    header=false,
    columns={column_types},
    all_varchar=true
  )
"""

ts_parse_sql = """
CASE
  WHEN length(trim(timestamp)) = 16 THEN strptime(trim(timestamp), '%Y/%m/%d %H:%M')
  WHEN length(trim(timestamp)) = 19 THEN strptime(trim(timestamp), '%Y/%m/%d %H:%M:%S')
  ELSE NULL
END
"""

typed_tx_sql = f"""
WITH raw AS ({read_tx_csv_sql})
SELECT
  {ts_parse_sql}::TIMESTAMP AS timestamp,
  trim(from_bank) AS from_bank,
  trim(from_account) AS from_account,
  trim(to_bank) AS to_bank,
  trim(to_account) AS to_account,
  try_cast(nullif(trim(amount_received), '') AS DOUBLE) AS amount_received,
  trim(currency_received) AS currency_received,
  try_cast(nullif(trim(amount_sent), '') AS DOUBLE) AS amount_sent,
  trim(currency_sent) AS currency_sent,
  trim(payment_type) AS payment_type,
  coalesce(try_cast(nullif(trim(is_laundering), '') AS INTEGER), 0) AS is_laundering
FROM raw
"""

def currency_filter_sql(currency_name):
    return f"""
    upper(trim(currency_sent)) = upper('{currency_name}') AND
    upper(trim(currency_received)) = upper('{currency_name}') AND
    upper(trim(payment_type)) = 'ACH'
    """

patterns_df = parse_patterns_file(PATTERNS_TXT)
if patterns_df.empty:
    patterns_df = pd.DataFrame(columns=standard_columns + ['attempt_id', 'attempt_type'])
con.register('patterns_df', patterns_df)

for currency in CURRENCIES:
    cur_dirname = currency.replace(' ', '_')
    OUT_DIR = os.path.join(PROCESSED_DIR, cur_dirname)
    os.makedirs(OUT_DIR, exist_ok=True)

    OUT_STEP1 = os.path.join(OUT_DIR, '1_filtered_normal_transactions.parquet')
    OUT_STEP2 = os.path.join(OUT_DIR, '2_filtered_laundering_transactions.parquet')
    OUT_STEP3 = os.path.join(OUT_DIR, '3_filtered_accounts.parquet')

    filt_sql = currency_filter_sql(currency)

    # Step 1: normal transactions for this currency
    con.execute(f"""
      COPY (
        WITH typed AS ({typed_tx_sql})
        SELECT
          timestamp, from_bank, from_account, to_bank, to_account,
          amount_received, currency_received, amount_sent, currency_sent,
          payment_type, is_laundering
        FROM typed
        WHERE timestamp IS NOT NULL
          AND {filt_sql}
          AND is_laundering = 0
      ) TO '{OUT_STEP1}' (FORMAT PARQUET, COMPRESSION ZSTD)
    """)

    step1_rows = con.execute(f"SELECT COUNT(*) FROM read_parquet('{OUT_STEP1}')").fetchone()[0]
    print(f"[{currency}] Step 1: Saved normal transactions to '{OUT_STEP1}' (rows={step1_rows:,})")

    # Step 2: laundering transactions (from patterns + missing from CSV) for this currency
    con.execute(f"""
      COPY (
        WITH
          pat_raw AS (
            SELECT
              timestamp, from_bank, from_account, to_bank, to_account,
              amount_received, currency_received, amount_sent, currency_sent,
              payment_type, is_laundering,
              attempt_id,
              attempt_type
            FROM patterns_df
          ),
          pat_typed AS (
            SELECT
              {ts_parse_sql}::TIMESTAMP AS timestamp,
              trim(from_bank) AS from_bank,
              trim(from_account) AS from_account,
              trim(to_bank) AS to_bank,
              trim(to_account) AS to_account,
              try_cast(nullif(trim(amount_received), '') AS DOUBLE) AS amount_received,
              trim(currency_received) AS currency_received,
              try_cast(nullif(trim(amount_sent), '') AS DOUBLE) AS amount_sent,
              trim(currency_sent) AS currency_sent,
              trim(payment_type) AS payment_type,
              coalesce(try_cast(nullif(trim(is_laundering), '') AS INTEGER), 0) AS is_laundering,
              try_cast(attempt_id AS BIGINT) AS attempt_id,
              trim(attempt_type) AS attempt_type
            FROM pat_raw
          ),
          pat_filt AS (
            SELECT
              timestamp, from_bank, from_account, to_bank, to_account,
              amount_received, currency_received, amount_sent, currency_sent,
              payment_type, is_laundering, attempt_id, attempt_type,
              CAST(round(amount_sent * 100) AS BIGINT) AS amount_sent_c,
              CAST(round(amount_received * 100) AS BIGINT) AS amount_received_c
            FROM pat_typed
            WHERE timestamp IS NOT NULL
              AND {filt_sql}
              AND is_laundering = 1
          ),
          raw_pos AS (
            WITH typed AS ({typed_tx_sql})
            SELECT
              timestamp, from_bank, from_account, to_bank, to_account,
              amount_received, currency_received, amount_sent, currency_sent,
              payment_type, is_laundering,
              CAST(round(amount_sent * 100) AS BIGINT) AS amount_sent_c,
              CAST(round(amount_received * 100) AS BIGINT) AS amount_received_c
            FROM typed
            WHERE timestamp IS NOT NULL
              AND {filt_sql}
              AND is_laundering = 1
          ),
          missing AS (
            SELECT raw_pos.*
            FROM raw_pos
            LEFT JOIN pat_filt
              ON raw_pos.timestamp = pat_filt.timestamp
              AND raw_pos.from_bank = pat_filt.from_bank
              AND raw_pos.from_account = pat_filt.from_account
              AND raw_pos.to_bank = pat_filt.to_bank
              AND raw_pos.to_account = pat_filt.to_account
              AND raw_pos.amount_received_c = pat_filt.amount_received_c
              AND raw_pos.amount_sent_c = pat_filt.amount_sent_c
            WHERE pat_filt.timestamp IS NULL
          ),
          unioned AS (
            SELECT
              timestamp, from_bank, from_account, to_bank, to_account,
              amount_received, currency_received, amount_sent, currency_sent,
              payment_type, is_laundering,
              attempt_id, attempt_type
            FROM pat_filt
            UNION ALL
            SELECT
              timestamp, from_bank, from_account, to_bank, to_account,
              amount_received, currency_received, amount_sent, currency_sent,
              payment_type, is_laundering,
              NULL::INTEGER AS attempt_id, 'UNLISTED' AS attempt_type
            FROM missing
          )
        SELECT * FROM unioned
      ) TO '{OUT_STEP2}' (FORMAT PARQUET, COMPRESSION ZSTD)
    """)

    base_count = con.execute("""
      WITH x as (SELECT attempt_type FROM read_parquet(?) WHERE attempt_type <> 'UNLISTED')
      SELECT COUNT(*) FROM x
    """, [OUT_STEP2]).fetchone()[0]
    added_count = con.execute("""
      WITH x as (SELECT attempt_type FROM read_parquet(?) WHERE attempt_type = 'UNLISTED')
      SELECT COUNT(*) FROM x
    """, [OUT_STEP2]).fetchone()[0]
    total_count = con.execute(f"SELECT COUNT(*) FROM read_parquet('{OUT_STEP2}')").fetchone()[0]
    print(f"[{currency}] Step 2: Saved laundering transactions to '{OUT_STEP2}' (patterns={base_count:,}, added_from_csv={added_count:,}, total={total_count:,})")

    # Step 3: Filter accounts involved in either step1 or step2 for this currency
    con.execute(f"""
      COPY (
        WITH all_tx AS (
          SELECT
            timestamp, from_bank, from_account, to_bank, to_account,
            amount_received, currency_received, amount_sent, currency_sent,
            payment_type, is_laundering,
            NULL::INTEGER AS attempt_id, NULL::VARCHAR AS attempt_type
          FROM read_parquet('{OUT_STEP1}')
          UNION ALL
          SELECT
            timestamp, from_bank, from_account, to_bank, to_account,
            amount_received, currency_received, amount_sent, currency_sent,
            payment_type, is_laundering,
            attempt_id, attempt_type
          FROM read_parquet('{OUT_STEP2}')
        ),
        involved AS (
          SELECT DISTINCT from_account AS account FROM all_tx WHERE from_account IS NOT NULL
          UNION
          SELECT DISTINCT to_account AS account FROM all_tx WHERE to_account IS NOT NULL
        ),
        accounts AS (
          SELECT * FROM read_csv_auto(
            '{ACCOUNTS_CSV}',
            delim=',',
            header=false,
            columns={{'bank_name': 'VARCHAR', 'bank_id': 'VARCHAR', 'account_id_hex': 'VARCHAR', 'entity_id': 'VARCHAR', 'entity_name': 'VARCHAR'}},
            all_varchar=true
          )
        )
        SELECT a.*
        FROM accounts a
        INNER JOIN involved i
          ON trim(a.account_id_hex) = trim(i.account)
      ) TO '{OUT_STEP3}' (FORMAT PARQUET, COMPRESSION ZSTD)
    """)

    step3_rows = con.execute(f"SELECT COUNT(*) FROM read_parquet('{OUT_STEP3}')").fetchone()[0]
    print(f"[{currency}] Step 3: Saved filtered account details to '{OUT_STEP3}' (rows={step3_rows:,})")

con.close()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found data folder: /content/drive/MyDrive/AML
--------------------------------------------------


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[US Dollar] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/US_Dollar/1_filtered_normal_transactions.parquet' (rows=199,982)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[US Dollar] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/US_Dollar/2_filtered_laundering_transactions.parquet' (patterns=1,178, added_from_csv=485, total=1,663)
[US Dollar] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/US_Dollar/3_filtered_accounts.parquet' (rows=93,102)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Euro] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Euro/1_filtered_normal_transactions.parquet' (rows=125,228)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Euro] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Euro/2_filtered_laundering_transactions.parquet' (patterns=886, added_from_csv=320, total=1,206)
[Euro] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Euro/3_filtered_accounts.parquet' (rows=57,220)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Yuan] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Yuan/1_filtered_normal_transactions.parquet' (rows=21,877)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Yuan] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Yuan/2_filtered_laundering_transactions.parquet' (patterns=107, added_from_csv=55, total=162)
[Yuan] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Yuan/3_filtered_accounts.parquet' (rows=10,088)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Shekel] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Shekel/1_filtered_normal_transactions.parquet' (rows=20,461)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Shekel] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Shekel/2_filtered_laundering_transactions.parquet' (patterns=25, added_from_csv=56, total=81)
[Shekel] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Shekel/3_filtered_accounts.parquet' (rows=9,377)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Canadian Dollar] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Canadian_Dollar/1_filtered_normal_transactions.parquet' (rows=15,732)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Canadian Dollar] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Canadian_Dollar/2_filtered_laundering_transactions.parquet' (patterns=76, added_from_csv=37, total=113)
[Canadian Dollar] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Canadian_Dollar/3_filtered_accounts.parquet' (rows=6,939)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[UK Pound] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/UK_Pound/1_filtered_normal_transactions.parquet' (rows=19,186)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[UK Pound] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/UK_Pound/2_filtered_laundering_transactions.parquet' (patterns=71, added_from_csv=35, total=106)
[UK Pound] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/UK_Pound/3_filtered_accounts.parquet' (rows=8,557)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Ruble] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Ruble/1_filtered_normal_transactions.parquet' (rows=16,430)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Ruble] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Ruble/2_filtered_laundering_transactions.parquet' (patterns=72, added_from_csv=43, total=115)
[Ruble] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Ruble/3_filtered_accounts.parquet' (rows=7,410)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Australian Dollar] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Australian_Dollar/1_filtered_normal_transactions.parquet' (rows=14,522)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Australian Dollar] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Australian_Dollar/2_filtered_laundering_transactions.parquet' (patterns=69, added_from_csv=42, total=111)
[Australian Dollar] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Australian_Dollar/3_filtered_accounts.parquet' (rows=6,711)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Swiss Franc] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Swiss_Franc/1_filtered_normal_transactions.parquet' (rows=25,236)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Swiss Franc] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Swiss_Franc/2_filtered_laundering_transactions.parquet' (patterns=114, added_from_csv=50, total=164)
[Swiss Franc] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Swiss_Franc/3_filtered_accounts.parquet' (rows=11,538)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Yen] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Yen/1_filtered_normal_transactions.parquet' (rows=16,586)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Yen] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Yen/2_filtered_laundering_transactions.parquet' (patterns=89, added_from_csv=43, total=132)
[Yen] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Yen/3_filtered_accounts.parquet' (rows=7,696)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Mexican Peso] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Mexican_Peso/1_filtered_normal_transactions.parquet' (rows=11,552)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Mexican Peso] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Mexican_Peso/2_filtered_laundering_transactions.parquet' (patterns=53, added_from_csv=26, total=79)
[Mexican Peso] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Mexican_Peso/3_filtered_accounts.parquet' (rows=5,323)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Rupee] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Rupee/1_filtered_normal_transactions.parquet' (rows=20,858)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Rupee] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Rupee/2_filtered_laundering_transactions.parquet' (patterns=111, added_from_csv=34, total=145)
[Rupee] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Rupee/3_filtered_accounts.parquet' (rows=9,390)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Brazil Real] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Brazil_Real/1_filtered_normal_transactions.parquet' (rows=7,885)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Brazil Real] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Brazil_Real/2_filtered_laundering_transactions.parquet' (patterns=21, added_from_csv=24, total=45)
[Brazil Real] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Brazil_Real/3_filtered_accounts.parquet' (rows=3,412)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Saudi Riyal] Step 1: Saved normal transactions to '/content/drive/MyDrive/AML/processed/small/Saudi_Riyal/1_filtered_normal_transactions.parquet' (rows=9,197)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[Saudi Riyal] Step 2: Saved laundering transactions to '/content/drive/MyDrive/AML/processed/small/Saudi_Riyal/2_filtered_laundering_transactions.parquet' (patterns=336, added_from_csv=25, total=361)
[Saudi Riyal] Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/small/Saudi_Riyal/3_filtered_accounts.parquet' (rows=4,095)


In [None]:
import os
from pathlib import Path
import os
import re
import duckdb
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')
DRIVE_DIR = '/content/drive/MyDrive/AML'

#PROCESSED_DIR = os.path.join(DRIVE_DIR, 'processed/')
#TX_CSV = os.path.join(DRIVE_DIR, 'HI-Large_Trans.csv')
#PATTERNS_TXT = os.path.join(DRIVE_DIR, 'HI-Large_Patterns.txt')
#ACCOUNTS_CSV = os.path.join(DRIVE_DIR, 'HI-Large_Accounts.csv')

PROCESSED_DIR = os.path.join(DRIVE_DIR, 'processed/small')
TX_CSV = os.path.join(DRIVE_DIR, 'HI-Small_Trans.csv')
PATTERNS_TXT = os.path.join(DRIVE_DIR, 'HI-Small_Patterns.txt')
ACCOUNTS_CSV = os.path.join(DRIVE_DIR, 'HI-Small_Accounts.csv')

os.makedirs(PROCESSED_DIR, exist_ok=True)

DRIVE_BASE = Path('/content/drive/MyDrive/AML/processed/small/US_Dollar')
print("Normal transactions")
df = pd.read_parquet(DRIVE_BASE / '1_filtered_normal_transactions.parquet')
df.info()
print(df.head())
print("Laundering transactions")
df = pd.read_parquet(DRIVE_BASE / '2_filtered_laundering_transactions.parquet')
df.info()
print(df.head())
print("Bank accounts")
df = pd.read_parquet(DRIVE_BASE / '3_filtered_accounts.parquet')
df.info()
print(df.head())

Mounted at /content/drive
Normal transactions
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199982 entries, 0 to 199981
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   timestamp          199982 non-null  datetime64[us]
 1   from_bank          199982 non-null  object        
 2   from_account       199982 non-null  object        
 3   to_bank            199982 non-null  object        
 4   to_account         199982 non-null  object        
 5   amount_received    199982 non-null  float64       
 6   currency_received  199982 non-null  object        
 7   amount_sent        199982 non-null  float64       
 8   currency_sent      199982 non-null  object        
 9   payment_type       199982 non-null  object        
 10  is_laundering      199982 non-null  int32         
dtypes: datetime64[us](1), float64(2), int32(1), object(7)
memory usage: 16.0+ MB
            timestamp from_bank f

In [2]:
!pip install python-igraph leidenalg fastparquet
!pip install -q condacolab
import condacolab

condacolab.install_from_url("https://github.com/conda-forge/miniforge/releases/download/25.3.1-0/Miniforge3-Linux-x86_64.sh")
!mamba install -q graph-tool
import graph_tool.all as gt, igraph as ig, leidenalg as la

✨🍰✨ Everything looks OK!


In [33]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import timedelta
from google.colab import drive
from collections import OrderedDict

import graph_tool.all as gt
import igraph as ig
import leidenalg as la

drive.mount('/content/drive', force_remount=False)

# -----------------------
# Paths and config
# -----------------------
# DRIVE_BASE = Path('/content/drive/MyDrive/AML/processed/small/US_Dollar')
DRIVE_BASE = Path('/content/drive/MyDrive/AML/processed/US_Dollar')
proc = DRIVE_BASE

p_norm = proc / '1_filtered_normal_transactions.parquet'
p_pos  = proc / '2_filtered_laundering_transactions.parquet'
p_acct = proc / '3_filtered_accounts.parquet'

WINDOW_DAYS_LIST = [3, 7]
WINDOW_STRIDE_DAYS = 1
MAX_WINDOWS_PER_SETTING = 5  # set None to process all windows

# Communities
LOUVAIN_RESOLUTION = 1.0  # used by Leiden (RB configuration); Louvain ignores it
LOUVAIN_SEED = 42
COMMUNITY_WEIGHTED = True  # use 'w_amount_log' as weights for Louvain/Leiden

# -----------------------
# Helpers
# -----------------------
def parse_ts(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    dt = pd.to_datetime(s, format='%Y/%m/%d %H:%M', errors='coerce')
    mask = dt.isna()
    if mask.any():
        dt2 = pd.to_datetime(s[mask], format='%Y/%m/%d %H:%M:%S', errors='coerce')
        dt.loc[mask] = dt2
    return dt

def to_cents(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors='coerce').mul(100).round().astype('Int64')

def iter_windows(start, end, window_days=3, stride_days=1):
    cur = start
    while cur < end:
        yield (cur, cur + timedelta(days=window_days))
        cur += timedelta(days=stride_days)

def build_name_index(G: gt.Graph):
    name = G.vp['name']
    return {name[v]: v for v in G.vertices()}

def init_base_props(G: gt.Graph):
    # Vertex props
    G.vp['name']    = G.new_vertex_property('string')
    G.vp['bank_id'] = G.new_vertex_property('string')
    G.vp['entity_id'] = G.new_vertex_property('string')
    G.vp['entity_name'] = G.new_vertex_property('string')
    G.vp['is_laundering_involved'] = G.new_vertex_property('int32_t', vals=0)
    # Edge props for aggregated graphs
    G.ep['w_count'] = G.new_edge_property('int64_t', vals=0)
    G.ep['w_amount'] = G.new_edge_property('int64_t', vals=0) # amount received
    G.ep['w_amount_sent'] = G.new_edge_property('int64_t', vals=0) # amount sent
    G.ep['w_amount_log'] = G.new_edge_property('double', vals=0.0)
    G.ep['reciprocated'] = G.new_edge_property('int32_t', vals=0)  # for directed only

def init_agg_vertex_props(G: gt.Graph):
    G.vp['in_amount_sum'] = G.new_vertex_property('int64_t', vals=0)
    G.vp['out_amount_sum'] = G.new_vertex_property('int64_t', vals=0)
    G.vp['in_deg'] = G.new_vertex_property('int32_t', vals=0)
    G.vp['out_deg'] = G.new_vertex_property('int32_t', vals=0)
    G.vp['in_tx_count'] = G.new_vertex_property('int64_t', vals=0)
    G.vp['out_tx_count'] = G.new_vertex_property('int64_t', vals=0)
    G.vp['in_out_amount_ratio'] = G.new_vertex_property('double', vals=0.0)

def summarize_graph(G: gt.Graph, name='Graph'):
    print(f"{name}: {G.num_vertices():,} nodes, {G.num_edges():,} edges")

# -----------------------
# Data loading
# -----------------------
def load_processed():
    print("Loading processed data...")
    df_n = pd.read_parquet(p_norm)
    df_p = pd.read_parquet(p_pos)
    df = pd.concat([df_n, df_p], ignore_index=True)

    # Ensure timestamp is datetime and handle potential errors
    if not np.issubdtype(df['timestamp'].dtype, np.datetime64):
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        df = df.dropna(subset=['timestamp'])
    df.sort_values('timestamp', inplace=True)

    # Clean and engineer fields
    df['is_laundering'] = pd.to_numeric(df['is_laundering'], errors='coerce').fillna(0).astype('int8')
    df['amount_sent_c'] = to_cents(df['amount_sent'])
    df['amount_received_c'] = to_cents(df['amount_received'])
    df['same_bank'] = (df['from_bank'].astype(str) == df['to_bank'].astype(str))
    # Ensure string ids
    df['from_account'] = df['from_account'].astype(str)
    df['to_account'] = df['to_account'].astype(str)

    acct = pd.read_parquet(p_acct).drop_duplicates(subset=['account_id_hex'])
    acct['account_id_hex'] = acct['account_id_hex'].astype(str)
    acct.set_index('account_id_hex', inplace=True)
    return df, acct

def build_all_light():
    df, acct = load_processed()
    tmin, tmax = (df['timestamp'].min(), df['timestamp'].max())
    print(f"Loaded: {len(df):,} tx; accounts: {len(acct):,}")
    print(f"Time range: {tmin} → {tmax}")
    return df, acct, tmin, tmax

# -----------------------
# Vectorized aggregation from DataFrame
# -----------------------
def aggregate_graph_from_df(df_slice: pd.DataFrame, acct: pd.DataFrame, directed=False) -> gt.Graph:
    """
    Vectorized aggregation:
      - Directed: collapse by (u, v)
      - Undirected: collapse by sorted pair (a, b)
      - Edge props: w_count, w_amount (received), w_amount_sent, w_amount_log, reciprocated (if directed)
      - Vertex props: name, bank_id, entity_id, entity_name, is_laundering_involved
        + in/out amounts, degrees, tx counts, ratio
    """
    if df_slice is None or len(df_slice) == 0:
        H = gt.Graph(directed=directed)
        init_base_props(H); init_agg_vertex_props(H)
        return H

    # Ensure strings and numeric arrays
    u = df_slice['from_account'].astype(str).to_numpy(copy=False)
    v = df_slice['to_account'].astype(str).to_numpy(copy=False)
    amt_recv = pd.to_numeric(df_slice['amount_received_c'], errors='coerce').fillna(0).astype(np.int64).to_numpy(copy=False)
    amt_sent = pd.to_numeric(df_slice['amount_sent_c'], errors='coerce').fillna(0).astype(np.int64).to_numpy(copy=False)


    if directed:
        tmp = pd.DataFrame({'u': u, 'v': v, 'w_recv': amt_recv, 'w_sent': amt_sent})
        agg_e = tmp.groupby(['u', 'v'], sort=False, observed=False).agg(
            w_amount_recv=('w_recv','sum'),
            w_amount_sent=('w_sent','sum'),
            w_count=('u','size')
        ).reset_index()
        # w_amount will represent received amount for compatibility with existing community logic if needed
        agg_e['w_amount'] = agg_e['w_amount_recv']
        # reciprocated flag
        edge_idx = pd.MultiIndex.from_frame(agg_e[['u', 'v']])
        rev_idx  = pd.MultiIndex.from_frame(agg_e[['v', 'u']])
        agg_e['reciprocated'] = edge_idx.isin(rev_idx).astype(np.int32)
        nodes = pd.unique(np.concatenate([agg_e['u'].to_numpy(), agg_e['v'].to_numpy()]))
    else:
        a = np.where(u <= v, u, v)
        b = np.where(u <= v, v, u)
        # In undirected, received and sent amounts are effectively the same from the edge perspective
        tmp = pd.DataFrame({'a': a, 'b': b, 'w': amt_recv}) # Use received for undirected sum
        sum_df = tmp.groupby(['a', 'b'], sort=False, observed=False)['w'].sum().rename('w_amount')
        cnt_df = tmp.groupby(['a', 'b'], sort=False, observed=False).size().rename('w_count')
        agg_e = pd.concat([sum_df, cnt_df], axis=1).reset_index()
        # For undirected, w_amount_sent is not meaningful at the edge level, can add if needed for vertex sums later
        # agg_e['w_amount_sent'] = agg_e['w_amount'] # Keep sent amount same as received for undirected for now
        agg_e['reciprocated'] = 0
        nodes = pd.unique(np.concatenate([agg_e['a'].to_numpy(), agg_e['b'].to_numpy()]))

    agg_e['w_amount_log'] = np.log1p(agg_e['w_amount']).astype(np.float64)

    # Build graph and props
    H = gt.Graph(directed=directed)
    init_base_props(H)
    init_agg_vertex_props(H)

    # Add vertices
    n = len(nodes)
    H.add_vertex(n)

    # Prepare vertex attributes (strings must be set per-vertex)
    names_list = [str(x) for x in nodes]
    sub_acct = acct.reindex(names_list)

    def col_or_default(df, col, default=''):
        if col in df.columns:
            return df[col].fillna('').astype(str).tolist()
        return [default] * len(df)

    bank_list = col_or_default(sub_acct, 'bank_id', '')
    entid_list = col_or_default(sub_acct, 'entity_id', '')
    entname_list = col_or_default(sub_acct, 'entity_name', '')

    name_vp = H.vp['name']
    bank_vp = H.vp['bank_id']
    eid_vp  = H.vp['entity_id']
    enm_vp  = H.vp['entity_name']

    for i in range(n):
        vtx = H.vertex(i)
        name_vp[vtx] = names_list[i]
        bank_vp[vtx] = bank_list[i]
        eid_vp[vtx]  = entid_list[i]
        enm_vp[vtx]  = entname_list[i]

    # Fast index map for nodes -> vertex index
    idx = {names_list[i]: i for i in range(n)}

    # Add edges with numeric properties (vectorized)
    if directed:
        src = agg_e['u'].map(idx).astype(np.int64).to_numpy()
        dst = agg_e['v'].map(idx).astype(np.int64).to_numpy()
        edge_tbl = np.column_stack([
            src, dst,
            agg_e['w_count'].to_numpy(np.int64),
            agg_e['w_amount'].to_numpy(np.int64), # received
            agg_e['w_amount_sent'].to_numpy(np.int64), # sent
            agg_e['w_amount_log'].to_numpy(np.float64),
            agg_e['reciprocated'].to_numpy(np.int32)
        ])
        edge_props = [H.ep['w_count'], H.ep['w_amount'], H.ep['w_amount_sent'], H.ep['w_amount_log'], H.ep['reciprocated']]
    else:
        src = agg_e['a'].map(idx).astype(np.int64).to_numpy()
        dst = agg_e['b'].map(idx).astype(np.int64).to_numpy()
        # Dropping w_amount_sent for undirected graphs
        edge_tbl = np.column_stack([
            src, dst,
            agg_e['w_count'].to_numpy(np.int64),
            agg_e['w_amount'].to_numpy(np.int64), # received
            agg_e['w_amount_log'].to_numpy(np.float64),
            agg_e['reciprocated'].to_numpy(np.int32)
        ])
        edge_props = [H.ep['w_count'], H.ep['w_amount'], H.ep['w_amount_log'], H.ep['reciprocated']]


    H.add_edge_list(
        edge_tbl,
        eprops=edge_props
    )

    # Vertex labels for positives (numeric -> OK to use .a)
    pos_mask = df_slice['is_laundering'] == 1
    if pos_mask.any():
        pos_nodes = pd.unique(np.concatenate([
            df_slice.loc[pos_mask, 'from_account'].astype(str).to_numpy(),
            df_slice.loc[pos_mask, 'to_account'].astype(str).to_numpy()
        ]))
    else:
        pos_nodes = np.array([], dtype=object)

    involved = H.vp['is_laundering_involved']
    arr_involv = np.zeros(n, dtype=np.int32)
    for acc in pos_nodes:
        j = idx.get(acc, None)
        if j is not None:
            arr_involv[j] = 1
    involved.a = arr_involv

    # Vertex-level sums and degrees (vectorized via groupby results)
    in_amount_sum = H.vp['in_amount_sum']; out_amount_sum = H.vp['out_amount_sum']
    in_deg = H.vp['in_deg']; out_deg = H.vp['out_deg']
    in_tx_count = H.vp['in_tx_count']; out_tx_count = H.vp['out_tx_count']
    in_out_ratio = H.vp['in_out_amount_ratio']

    if directed:
        # Use w_amount_recv for in_amount_sum and w_amount_sent for out_amount_sum
        in_amt_map  = agg_e.groupby('v')['w_amount_recv'].sum().to_dict()
        out_amt_map = agg_e.groupby('u')['w_amount_sent'].sum().to_dict()
        in_tx_map   = agg_e.groupby('v')['w_count'].sum().to_dict()
        out_tx_map  = agg_e.groupby('u')['w_count'].sum().to_dict()
        in_deg_map  = agg_e.groupby('v').size().to_dict()
        out_deg_map = agg_e.groupby('u').size().to_dict()

        arr_in_amt  = np.zeros(n, dtype=np.int64); arr_out_amt = np.zeros(n, dtype=np.int64)
        arr_in_tx   = np.zeros(n, dtype=np.int64); arr_out_tx  = np.zeros(n, dtype=np.int64)
        arr_in_deg  = np.zeros(n, dtype=np.int32); arr_out_deg = np.zeros(n, dtype=np.int32)

        for k, val in in_amt_map.items():  arr_in_amt[idx[k]]  = int(val)
        for k, val in out_amt_map.items(): arr_out_amt[idx[k]] = int(val)
        for k, val in in_tx_map.items():   arr_in_tx[idx[k]]   = int(val)
        for k, val in out_tx_map.items():  arr_out_tx[idx[k]]  = int(val)
        for k, val in in_deg_map.items():  arr_in_deg[idx[k]]  = int(val)
        for k, val in out_deg_map.items(): arr_out_deg[idx[k]] = int(val)

        in_amount_sum.a = arr_in_amt;  out_amount_sum.a = arr_out_amt
        in_tx_count.a   = arr_in_tx;   out_tx_count.a   = arr_out_tx
        in_deg.a        = arr_in_deg;  out_deg.a        = arr_out_deg
        in_out_ratio.a  = (arr_in_amt + 1.0) / (arr_out_amt + 1.0)
    else:
        # undirected: in == out
        deg_a = agg_e.groupby('a').size().to_dict()
        deg_b = agg_e.groupby('b').size().to_dict()
        amt_a = agg_e.groupby('a')['w_amount'].sum().to_dict() # Use w_amount (received) for undirected
        amt_b = agg_e.groupby('b')['w_amount'].sum().to_dict() # Use w_amount (received) for undirected
        tx_a  = agg_e.groupby('a')['w_count'].sum().to_dict()
        tx_b  = agg_e.groupby('b')['w_count'].sum().to_dict()

        arr_deg = np.zeros(n, dtype=np.int32)
        arr_amt = np.zeros(n, dtype=np.int64)
        arr_tx  = np.zeros(n, dtype=np.int64)

        for k, val in deg_a.items(): arr_deg[idx[k]] += int(val)
        for k, val in deg_b.items(): arr_deg[idx[k]] += int(val)
        for k, val in amt_a.items(): arr_amt[idx[k]] += int(val)
        for k, val in amt_b.items(): arr_amt[idx[k]] += int(val)
        for k, val in tx_a.items():  arr_tx[idx[k]]  += int(val)
        for k, val in tx_b.items():  arr_tx[idx[k]]  += int(val)

        in_deg.a = arr_deg;  out_deg.a = arr_deg
        in_amount_sum.a = arr_amt; out_amount_sum.a = arr_amt
        in_tx_count.a = arr_tx; out_tx_count.a = arr_tx
        in_out_ratio.a = np.ones(n, dtype=np.float64)

    return H

# -----------------------
# igraph conversion cache (for Louvain/Leiden and fast PPR in Cell 2)
# -----------------------
IG_CACHE = OrderedDict()
IG_CACHE_MAX = 8

def to_igraph(H: gt.Graph, use_weight=False, weight_name='w_amount_log', cache=True):
    key = (id(H), use_weight, weight_name, H.is_directed())
    if cache and key in IG_CACHE:
        g = IG_CACHE.pop(key); IG_CACHE[key] = g; return g

    n = H.num_vertices()
    attrs = []
    if use_weight and (weight_name in H.ep):
        attrs.append(H.ep[weight_name])
    if 'w_amount' in H.ep: # Add both received and sent amounts to igraph conversion
        attrs.append(H.ep['w_amount'])
    if 'w_amount_sent' in H.ep and H.is_directed(): # Only add sent amount if directed
        attrs.append(H.ep['w_amount_sent'])


    if attrs:
        ed = H.get_edges(eprops=attrs)
        edges = [(int(a), int(b)) for a, b in ed[:, :2].astype(int)]
        col = 2
        w = None; amt = None; amt_sent = None
        if use_weight and (weight_name in H.ep):
            w = [float(x) for x in ed[:, col].astype(float)]; col += 1
        if 'w_amount' in H.ep:
            amt = [float(x) for x in ed[:, col].astype(float)]; col += 1
        if 'w_amount_sent' in H.ep and H.is_directed():
            amt_sent = [float(x) for x in ed[:, col].astype(float)]; col += 1
    else:
        ed = H.get_edges()
        edges = [(int(a), int(b)) for a, b in ed.astype(int)]
        w = None; amt = None; amt_sent = None

    g = ig.Graph(n=n, edges=edges, directed=H.is_directed())
    if w is not None:
        g.es['weight'] = w
    if amt is not None:
        g.es['amount'] = amt # amount received
    if amt_sent is not None:
        g.es['amount_sent'] = amt_sent # amount sent

    g.vs['name'] = [H.vp['name'][H.vertex(i)] for i in range(n)]
    if cache:
        IG_CACHE[key] = g
        if len(IG_CACHE) > IG_CACHE_MAX:
            IG_CACHE.popitem(last=False)
    return g

# -----------------------
# Communities (Louvain & Leiden)
# -----------------------
def run_louvain_igraph(H: gt.Graph, seed=42, weight='w_amount_log'):
    """
    Louvain via igraph community_multilevel (modularity).
    """
    np.random.seed(seed)
    ig.random.seed(seed)

    g = to_igraph(H, use_weight=(COMMUNITY_WEIGHTED and (weight in H.ep)), weight_name=weight)
    cl = g.community_multilevel(weights=g.es['weight'] if ('weight' in g.es.attributes()) else None, return_levels=False)
    membership = cl.membership
    part = {g.vs[i]['name']: int(membership[i]) for i in range(g.vcount())}
    max_cid = max(membership) if membership else -1
    comms = [set() for _ in range(max_cid + 1)] if max_cid >= 0 else []
    for i, cid in enumerate(membership):
        comms[cid].add(g.vs[i]['name'])
    return part, comms

def run_leiden_igraph(H: gt.Graph, seed=42, weight='w_amount_log', resolution=1.0):
    """
    Leiden (RBConfiguration, tunable resolution).
    """
    np.random.seed(seed)
    ig.random.seed(seed)
    g = to_igraph(H, use_weight=(COMMUNITY_WEIGHTED and (weight in H.ep)), weight_name=weight)
    part = la.RBConfigurationVertexPartition(g,
                                             weights='weight' if ('weight' in g.es.attributes()) else None,
                                             resolution_parameter=resolution)
    opt = la.Optimiser()
    opt.set_rng_seed(seed)
    opt.optimise_partition(part)
    membership = part.membership
    part_map = {g.vs[i]['name']: int(membership[i]) for i in range(g.vcount())}
    max_cid = max(membership) if membership else -1
    comms = [set() for _ in range(max_cid + 1)] if max_cid >= 0 else []
    for i, cid in enumerate(membership):
        comms[cid].add(g.vs[i]['name'])
    return part_map, comms

# -----------------------
# Unsupervised community scoring
# -----------------------
def score_communities_unsupervised(H: gt.Graph, comms: list, min_size=3):
    scores = {}
    name_to_v = build_name_index(H)
    w_amount = H.ep.get('w_amount', None) # Use received amount for community score consistency
    for cid, nodes in enumerate(comms):
        if not nodes or len(nodes) < min_size:
            scores[cid] = 0.0
            continue
        mask = H.new_vertex_property('bool', vals=False)
        for n in nodes:
            v = name_to_v.get(n, None)
            if v is not None:
                mask[v] = True
        sub = gt.GraphView(H, vfilt=mask)
        n = int(sum(1 for _ in sub.vertices()))
        E = int(sum(1 for _ in sub.edges()))
        max_edges = n*(n-1)/2 if not H.is_directed() else n*(n-1)
        internal_density = (E / max_edges) if max_edges else 0.0
        try:
            lc = gt.local_clustering(sub)
            avg_clust = float(np.mean(lc.a)) if lc.a.size > 0 else 0.0
        except Exception:
            avg_clust = 0.0
        total_amount = 0
        if w_amount is not None:
            total_amount = int(sum(int(w_amount[e]) for e in sub.edges()))
        amount_score = min(1.0, float(np.log1p(total_amount)/20.0))
        size_boost = float(1 - np.exp(-n/10.0))
        scores[cid] = (0.35*internal_density + 0.2*avg_clust + 0.45*amount_score) * size_boost
    return scores

# -----------------------
# Attempt mapping (vectorized)
# -----------------------
def get_attempt_nodes_map_df(df_slice: pd.DataFrame) -> dict:
    if df_slice is None or len(df_slice) == 0:
        return {}
    pos = df_slice['is_laundering'] == 1
    dfp = df_slice.loc[pos, ['attempt_id', 'from_account', 'to_account']].dropna(subset=['attempt_id'])
    if dfp.empty:
        return {}
    dfp['attempt_id'] = dfp['attempt_id'].astype(str)
    g = dfp.groupby('attempt_id')
    att_nodes = {}
    for att_id, grp in g:
        att_nodes[att_id] = set(grp['from_account'].astype(str)).union(set(grp['to_account'].astype(str)))
    return att_nodes

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from pathlib import Path
from google.colab import drive
import os

import graph_tool.all as gt
import igraph as ig
import leidenalg as la

SKLEARN_OK = True
RUN_HITS = True

# -----------------------
# Paths and config
# -----------------------
drive.mount('/content/drive', force_remount=False)
DRIVE_BASE = Path('/content/drive/MyDrive/AML/processed/small/US_Dollar')
# DRIVE_BASE = Path('/content/drive/MyDrive/AML/processed/US_Dollar')
proc = DRIVE_BASE # Define proc here

# -----------------------
# Metrics configs
# -----------------------
METRICS_DIR = proc / "metrics"
METRICS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_CSV = METRICS_DIR / "window_metrics.csv"

K_FRACS = (0.005, 0.01, 0.02)  # 0.5%, 1%, 2% for precision@k and attempt coverage
SEED_CUTOFF_FRAC = 0.2         # first 20% of timeline for global seeds

MAX_WINDOWS_PER_SETTING = None

# --------- PPR speed config ---------
PPR_USE_IGRAPH = True         # use igraph's fast PPR
PPR_WEIGHTED = False          # if True, use 'w_amount_log' as edge weights; unweighted is faster
PPR_ALPHA = 0.85              # 0.85 converges faster than 0.9
PPR_EPS = 1e-4                # relaxed tolerance for faster convergence
PPR_MAX_ITER = 50             # cap iterations
PPR_HOPS = 2                  # k-hop subgraph around seeds (2 or 3 recommended)
PPR_BIDIR = False             # include IN neighbors too? (union of OUT and IN); False is faster
PPR_MAX_NODES = 40000         # cap subgraph size
PPR_MAX_SEEDS = 1000          # cap number of seed nodes (keep first N)

# -----------------------
# Metrics helpers
# -----------------------
def precision_at_k(y_true, y_score, k_frac=0.01):
    y_true = np.asarray(y_true); y_score = np.asarray(y_score)
    n = max(1, int(len(y_true) * k_frac))
    idx = np.argsort(-y_score)[:n]
    return float(y_true[idx].mean())

def get_node_names(G: gt.Graph):
    return [G.vp['name'][v] for v in G.vertices()]

def vprop_to_dict(G: gt.Graph, prop_name: str):
    prop = G.vp[prop_name]
    name = G.vp['name']
    return {name[v]: prop[v] for v in G.vertices()}

def eval_scores(nodes, y_true_dict, score_dict, k_fracs=(0.005, 0.01, 0.02), exclude_nodes=None):
    if exclude_nodes is None: exclude_nodes = set()
    eval_nodes = [n for n in nodes if n not in exclude_nodes]
    y_true = np.array([int(y_true_dict.get(n, 0)) for n in eval_nodes], dtype=int)
    res = {}
    for name, s in score_dict.items():
        scores = np.array([float(s.get(n, 0.0)) for n in eval_nodes], dtype=float)
        ap = average_precision_score(y_true, scores) if SKLEARN_OK and len(set(y_true)) > 1 else None
        metrics = {'ap': ap}
        metrics['_eval_nodes'] = len(eval_nodes)
        metrics['_eval_pos'] = int(y_true.sum())
        for k in k_fracs:
            metrics[f"p_at_{pct_key(k)}"] = precision_at_k(y_true, scores, k)
        order = np.argsort(-scores)
        metrics['_ranked_nodes'] = [eval_nodes[i] for i in order]
        res[name] = metrics
    return res

def run_centrality_baselines(H_dir: gt.Graph):
    scores = {}
    w = H_dir.ep.get('w_amount_log', None)
    try:
        pr = gt.pagerank(H_dir, damping=0.9, weight=w)
        names = H_dir.vp['name']
        scores['pagerank_wlog'] = {names[v]: float(pr[v]) for v in H_dir.vertices()}
    except Exception:
        scores['pagerank_wlog'] = {}

    if RUN_HITS:
        try:
            hubs, auth = gt.hits(H_dir, weight=w)
            names = H_dir.vp['name']
            scores['hits_hub'] = {names[v]: float(hubs[v]) for v in H_dir.vertices()}
            scores['hits_auth'] = {names[v]: float(auth[v]) for v in H_dir.vertices()}
        except Exception:
            scores['hits_hub'] = {}; scores['hits_auth'] = {}
    else:
        scores['hits_hub'] = {}; scores['hits_auth'] = {}

    for k_prop, name in [
        ('in_deg', 'in_deg'), ('out_deg', 'out_deg'),
        ('in_tx_count', 'in_tx'), ('out_tx_count', 'out_tx'),
        ('in_amount_sum', 'in_amt'), ('out_amount_sum', 'out_amt'),
    ]:
        try:
            scores[name] = vprop_to_dict(H_dir, k_prop)
        except Exception:
            scores[name] = {}
    try:
        in_amt = H_dir.vp['in_amount_sum']; out_amt = H_dir.vp['out_amount_sum']; names = H_dir.vp['name']
        scores['collector'] = {names[v]: float(in_amt[v]) / (float(out_amt[v]) + 1.0) for v in H_dir.vertices()}
        scores['distributor'] = {names[v]: float(out_amt[v]) / (float(in_amt[v]) + 1.0) for v in H_dir.vertices()}
    except Exception:
        scores['collector'] = {}; scores['distributor'] = {}
    return scores

def membership_to_comms(membership, names):
    k = max(membership) + 1 if membership else 0
    comms = [set() for _ in range(k)]
    for i, cid in enumerate(membership):
        comms[cid].add(names[i])
    return comms

def score_communities_fast_igraph(g, membership, min_size=2, amount_attr='amount', use_global_clustering=True):
    """
    Fast community scorer:
      - density: #intra edges / max edges
      - amount: sum of edge attribute (default: 'amount' = raw w_amount)
      - clustering: optional, computed once globally then averaged per community
    """
    n = g.vcount()
    if n == 0:
        return {}
    memb = np.asarray(membership, dtype=np.int64)
    K = int(memb.max()) + 1 if memb.size else 0
    size_by_c = np.bincount(memb, minlength=K)

    # Edge list and weights
    E = np.array(g.get_edgelist(), dtype=np.int64)
    if E.size == 0:
        return {cid: 0.0 for cid in range(K)}
    has_amount = amount_attr in g.es.attributes()
    w = np.asarray(g.es[amount_attr], dtype=float) if has_amount else np.ones(E.shape[0], dtype=float)

    cid_u = memb[E[:, 0]]
    cid_v = memb[E[:, 1]]
    mask_intra = (cid_u == cid_v)

    # Intra-community edge count and amount
    e_intra = np.bincount(cid_u[mask_intra], minlength=K)
    amt_intra = np.bincount(cid_u[mask_intra], weights=w[mask_intra], minlength=K)

    # Optional: global node clustering once
    if use_global_clustering:
        # Undirected local clustering; isolates -> 0
        try:
            cl = np.array(g.transitivity_local_undirected(mode="zero"), dtype=float)
            cl = np.nan_to_num(cl, nan=0.0, posinf=0.0, neginf=0.0)
        except Exception:
            cl = np.zeros(n, dtype=float)
        # Sum per community and then average
        cl_sum = np.bincount(memb, weights=cl, minlength=K)
        cl_avg = np.divide(cl_sum, np.maximum(1, size_by_c), where=(size_by_c > 0))
    else:
        cl_avg = np.zeros(K, dtype=float)

    # Build scores
    scores = {}
    for cid in range(K):
        n_c = int(size_by_c[cid])
        if n_c < min_size:
            scores[cid] = 0.0
            continue
        max_edges = n_c * (n_c - 1) / 2.0  # undirected aggregated graph
        density = float(e_intra[cid]) / max_edges if max_edges > 0 else 0.0
        amount_score = min(1.0, float(np.log1p(amt_intra[cid]) / 20.0))  # same scaling you had
        size_boost = float(1.0 - np.exp(-n_c / 10.0))
        # weights: 0.35 density, 0.2 clustering, 0.45 amount (same blend you used)
        scores[cid] = (0.35 * density + 0.2 * float(cl_avg[cid]) + 0.45 * amount_score) * size_boost
    return scores

def compute_communities_fast(H_agg, resolution=1.0, seed=42):
    """
    Returns:
      {
        'louvain': {'comms': list[set], 'scores': dict, 'avg': float},
        'leiden':  {'comms': list[set], 'scores': dict, 'avg': float},
        'ranked_cache': dict[(tag, kf)] -> list[node_name]
      }
    """
    # Build igraph once (undirected aggregated graph); to_igraph comes from Cell 1
    g_und = to_igraph(H_agg, use_weight=(COMMUNITY_WEIGHTED and ('w_amount_log' in H_agg.ep)), weight_name='w_amount_log', cache=False)
    names_und = g_und.vs['name']
    has_weight = 'weight' in g_und.es.attributes()
    has_amount = 'amount' in g_und.es.attributes()
    try:
        ig.random.seed(seed)
    except Exception:
        try:
            from igraph import RandomNumberGenerator
            ig.set_random_number_generator(RandomNumberGenerator(seed))
        except Exception:
            pass
    np.random.seed(seed)

    # Louvain
    cl_louv = g_und.community_multilevel(weights=g_und.es['weight'] if has_weight else None)
    memb_louv = cl_louv.membership
    comms_louv = membership_to_comms(memb_louv, names_und)
    scores_louv = score_communities_fast_igraph(g_und, memb_louv, min_size=3,
                                                amount_attr='amount' if has_amount else ('weight' if has_weight else None),
                                                use_global_clustering=True)
    avg_louv = float(np.mean(list(scores_louv.values()))) if scores_louv else 0.0

    # Leiden
    part = la.RBConfigurationVertexPartition(g_und,
                                             weights='weight' if has_weight else None,
                                             resolution_parameter=resolution)
    opt = la.Optimiser()
    opt.set_rng_seed(seed)
    opt.optimise_partition(part)
    memb_leid = part.membership
    comms_leid = membership_to_comms(memb_leid, names_und)
    scores_leid = score_communities_fast_igraph(g_und, memb_leid, min_size=3,
                                                amount_attr='amount' if has_amount else ('weight' if has_weight else None),
                                                use_global_clustering=True)
    avg_leid = float(np.mean(list(scores_leid.values()))) if scores_leid else 0.0

    # Pre-rank for your coverage metrics
    ranked_cache = {}
    for tag, comms, scores in [('louvain', comms_louv, scores_louv), ('leiden', comms_leid, scores_leid)]:
        if scores:
            comm_order = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            total_nodes = g_und.vcount()
            acc = set()
            for kf in K_FRACS:
                target = max(1, int(total_nodes * kf))
                acc.clear()
                for cid, _score in comm_order:
                    acc |= comms[cid]
                    if len(acc) >= target:
                        break
                ranked_cache[(tag, kf)] = list(acc)

    return {
        'louvain': {'comms': comms_louv, 'scores': scores_louv, 'avg': avg_louv},
        'leiden':  {'comms': comms_leid, 'scores': scores_leid, 'avg': avg_leid},
        'ranked_cache': ranked_cache
    }

# Uses to_igraph from Cell 1 (cached conversion)
def get_seeded_pagerank_scores(H_agg_dir: gt.Graph, seed_nodes: set, weight='w_amount_log', alpha=None):
    """
    Fast personalized PageRank:
      - subgraph around seeds (k-hop, capped size)
      - igraph backend
      - relaxed tolerance (via limited iterations approximation)
      - no reverse-graph averaging by default

    Returns: dict name -> score
    """
    if not seed_nodes:
        return {}
    # Trim seeds to cap, keep deterministic order
    if PPR_MAX_SEEDS is not None and len(seed_nodes) > PPR_MAX_SEEDS:
        seed_nodes = set(list(sorted(seed_nodes))[:PPR_MAX_SEEDS])

    if not PPR_USE_IGRAPH:
        # fallback: graph-tool with relaxed params, no reverse
        w = H_agg_dir.ep.get(weight, None) if PPR_WEIGHTED else None
        pers = H_agg_dir.new_vertex_property('double', vals=0.0)
        name_to_v = build_name_index(H_agg_dir)
        for n in seed_nodes:
            v = name_to_v.get(n, None)
            if v is not None:
                pers[v] = 1.0 / len(seed_nodes)
        alpha_eff = PPR_ALPHA if alpha is None else alpha
        pr = gt.pagerank(H_agg_dir, damping=alpha_eff, weight=w, pers=pers,
                         epsilon=PPR_EPS, max_iter=PPR_MAX_ITER)
        names = H_agg_dir.vp['name']
        return {names[v]: float(pr[v]) for v in H_agg_dir.vertices()}

    # igraph path
    g = to_igraph(H_agg_dir, use_weight=PPR_WEIGHTED, weight_name=weight, cache=False)
    # seed indices present in g
    seeds_idx = g.vs.select(name_in=list(seed_nodes)).indices
    if len(seeds_idx) == 0:
        return {}

    # k-hop neighborhood
    mode_out = ig.OUT
    keep = set()
    # OUT neighborhood
    for lst in g.neighborhood(seeds_idx, order=PPR_HOPS, mode=mode_out):
        keep.update(lst)
    # optionally include IN neighborhood
    if PPR_BIDIR:
        for lst in g.neighborhood(seeds_idx, order=PPR_HOPS, mode=ig.IN):
            keep.update(lst)
    # always keep seeds
    keep.update(seeds_idx)

    # Cap subgraph size
    keep_idx = list(keep)
    if PPR_MAX_NODES is not None and len(keep_idx) > PPR_MAX_NODES:
        # keep seeds plus highest-degree nodes
        deg = g.degree(keep_idx, mode=ig.ALL)
        order = np.argsort(-np.asarray(deg))
        cap = max(1, PPR_MAX_NODES - len(seeds_idx))
        selected = set(seeds_idx) | {keep_idx[i] for i in order[:cap]}
        keep_idx = list(selected)

    # Build subgraph with names preserved
    sub = g.induced_subgraph(keep_idx)
    sub_names = sub.vs['name']
    # seed indices in subgraph
    sub_seeds_idx = sub.vs.select(name_in=list(seed_nodes)).indices
    if len(sub_seeds_idx) == 0:
        return {name: 0.0 for name in sub_names}

    # Personalized restart vector
    reset = np.zeros(sub.vcount(), dtype=float)
    reset[sub_seeds_idx] = 1.0 / len(sub_seeds_idx)

    # Personalized PageRank on subgraph
    alpha_eff = PPR_ALPHA if alpha is None else alpha
    weights_key = 'weight' if (PPR_WEIGHTED and 'weight' in sub.es.attributes()) else None
    pr = sub.personalized_pagerank(damping=alpha_eff, reset=reset, weights=weights_key, directed=True)

    # Map back to full names; nodes outside subgraph get 0
    scores = {name: float(score) for name, score in zip(sub_names, pr)}
    return scores

def attempt_coverage(nodes_ranked, attempt_nodes_map: dict, k_frac=0.01):
    if not attempt_nodes_map:
        return None
    N = len(nodes_ranked); k = max(1, int(N * k_frac))
    top = set(nodes_ranked[:k])
    covered = sum(1 for nodes in attempt_nodes_map.values() if top & nodes)
    return covered / max(1, len(attempt_nodes_map))

def pretty_metrics(results: dict):
    def is_num(x):
        return isinstance(x, (int, float, np.integer, np.floating))
    out = {}
    for method, metr in results.items():
        out[method] = {}
        for k, v in metr.items():
            if str(k).startswith('_'):
                continue
            if v is None:
                out[method][k] = None
            elif is_num(v):
                out[method][k] = round(float(v), 4)
            else:
                out[method][k] = v
    return out

def pct_key(kf: float) -> str:
    """Formats percentage keys consistently."""
    return f"{kf*100:.1f}pct"

# -----------------------
# Build data
# -----------------------
df, acct, tmin, tmax = build_all_light()

# -----------------------
# Temporal windows quick summary (vectorized)
# -----------------------
for window_days in WINDOW_DAYS_LIST:
    print(f"\n-- {window_days}-day windows, stride={WINDOW_STRIDE_DAYS}d --")
    for i, (ws, we) in enumerate(iter_windows(tmin, tmax, window_days=window_days, stride_days=WINDOW_STRIDE_DAYS)):
        df_slice = df[(df['timestamp'] >= ws) & (df['timestamp'] < we)]
        if df_slice.empty:
            continue
        nodes_win = pd.unique(np.concatenate([df_slice['from_account'].to_numpy(), df_slice['to_account'].to_numpy()]))
        pos_e = int((df_slice['is_laundering'] == 1).sum())
        pos_nodes_win = len(pd.unique(np.concatenate([
            df_slice.loc[df_slice['is_laundering']==1, 'from_account'].to_numpy(),
            df_slice.loc[df_slice['is_laundering']==1, 'to_account'].to_numpy()
        ])))
        print(f"[{i:03d}] {ws:%Y-%m-%d} → {we:%Y-%m-%d}: nodes={len(nodes_win):,}, edges={len(df_slice):,}, pos_edges={pos_e:,}, pos_nodes={pos_nodes_win:,}")
        if MAX_WINDOWS_PER_SETTING is not None and i + 1 >= MAX_WINDOWS_PER_SETTING:
            break

# -----------------------
# Full-period community baselines (Louvain + Leiden)
# -----------------------
H_full = aggregate_graph_from_df(df, acct, directed=False)
print(f"\nEnhanced community baselines on full period:")
print(f"Aggregated graph: {H_full.num_vertices():,} nodes, {H_full.num_edges():,} edges")

comm_full = compute_communities_fast(H_full, resolution=LOUVAIN_RESOLUTION, seed=LOUVAIN_SEED)

print("Louvain (igraph) communities (top 5 by heuristic score):")
scores_louv = comm_full['louvain']['scores']
comms_louv = comm_full['louvain']['comms']
for cid, score in sorted(scores_louv.items(), key=lambda x: x[1], reverse=True)[:5]:
    size = len(comms_louv[cid]) if cid < len(comms_louv) else 0
    print(f"  LVN cid={cid:>4}  score={score:.3f}  size={size:>6}")

print("Leiden (igraph) communities (top 5 by heuristic score):")
scores_leid = comm_full['leiden']['scores']
comms_leid = comm_full['leiden']['comms']
for cid, score in sorted(scores_leid.items(), key=lambda x: x[1], reverse=True)[:5]:
    size = len(comms_leid[cid]) if cid < len(comms_leid) else 0
    print(f"  LDN cid={cid:>4}  score={score:.3f}  size={size:>6}")
# -----------------------
# Build fixed time-based seeds (no leakage)
# -----------------------
if tmin is None or tmax is None:
    raise RuntimeError("Time range unavailable; cannot build seeds.")
T = tmin + (tmax - tmin) * SEED_CUTOFF_FRAC
df_seed = df[(df['timestamp'] >= tmin) & (df['timestamp'] < T)]
seed_nodes_global = set(pd.unique(np.concatenate([
    df_seed.loc[df_seed['is_laundering']==1, 'from_account'].astype(str).to_numpy(),
    df_seed.loc[df_seed['is_laundering']==1, 'to_account'].astype(str).to_numpy()
])))
print(f"Global seeds cutoff T={T} | seed_nodes={len(seed_nodes_global)}")

# -----------------------
# Per-window enhanced analysis (prints)
# -----------------------
print("\nPer-window enhanced analysis (first few windows per setting):")
for window_days in WINDOW_DAYS_LIST:
    print(f"\n-- {window_days}-day windows --")
    count = 0
    for ws, we in iter_windows(tmin, tmax, window_days=window_days, stride_days=WINDOW_STRIDE_DAYS):
        df_slice = df[(df['timestamp'] >= ws) & (df['timestamp'] < we)]
        if df_slice.empty:
            continue
        H_agg = aggregate_graph_from_df(df_slice, acct, directed=False)
        H_agg_dir = aggregate_graph_from_df(df_slice, acct, directed=True)

        # Centralities
        nodes = get_node_names(H_agg_dir)
        y_true_dict = vprop_to_dict(H_agg_dir, 'is_laundering_involved')
        score_dict = run_centrality_baselines(H_agg_dir)
        results = eval_scores(nodes, y_true_dict, score_dict, k_fracs=(0.005, 0.01, 0.02))
        print("  Centrality baselines:", pretty_metrics(results))

        # Communities: Louvain and Leiden (fast scorer)
        comm = compute_communities_fast(H_agg, resolution=LOUVAIN_RESOLUTION, seed=LOUVAIN_SEED)
        avg_comm_score_louv = comm['louvain']['avg']
        avg_comm_score_leid = comm['leiden']['avg']

        # Seeded PR (time-based seeds)
        pr_auc_win = None
        if ws >= T and seed_nodes_global:
            seeded_scores = get_seeded_pagerank_scores(H_agg_dir, seed_nodes_global, weight='w_amount_log', alpha=0.9)
            eval_nodes = [n for n in nodes if n not in seed_nodes_global]
            y_true = [int(y_true_dict.get(n, 0)) for n in eval_nodes]
            y_score = [seeded_scores.get(n, 0.0) for n in eval_nodes]
            if SKLEARN_OK and len(set(y_true)) > 1:
                pr_auc_win = average_precision_score(y_true, y_score)

        print(f"[{ws:%Y-%m-%d} → {we:%Y-%m-%d}] nodes={H_agg.num_vertices():,}, edges={H_agg.num_edges():,}")
        print(f"  Avg community score (Louvain): {avg_comm_score_louv:.4f} | (Leiden): {avg_comm_score_leid:.4f}")
        if pr_auc_win is not None:
            print(f"  PersonalizedPageRank PR-AUC: {pr_auc_win:.4f}")

        count += 1
        if MAX_WINDOWS_PER_SETTING is not None and count >= MAX_WINDOWS_PER_SETTING:
            break

# -----------------------
# Full per-window metrics -> CSV
# -----------------------
rows = []
for window_days in WINDOW_DAYS_LIST:
    count = 0
    for ws, we in iter_windows(tmin, tmax, window_days=window_days, stride_days=WINDOW_STRIDE_DAYS):
        df_slice = df[(df['timestamp'] >= ws) & (df['timestamp'] < we)]
        if df_slice.empty:
            continue

        H_agg = aggregate_graph_from_df(df_slice, acct, directed=False)
        H_agg_dir = aggregate_graph_from_df(df_slice, acct, directed=True)
        nodes = get_node_names(H_agg_dir)
        y_true_dict = vprop_to_dict(H_agg_dir, 'is_laundering_involved')
        att_nodes_map = get_attempt_nodes_map_df(df_slice)

        score_dict = run_centrality_baselines(H_agg_dir)
        results = eval_scores(nodes, y_true_dict, score_dict, k_fracs=K_FRACS)

        if ws >= T and seed_nodes_global:
            seeded_scores = get_seeded_pagerank_scores(H_agg_dir, seed_nodes_global, weight='w_amount_log', alpha=0.9)
            seeded_res = eval_scores(nodes, y_true_dict, {'seeded_pr': seeded_scores}, k_fracs=K_FRACS, exclude_nodes=seed_nodes_global)
            results.update(seeded_res)

        # Communities (Louvain + Leiden), fast scorer
        comm = compute_communities_fast(H_agg, resolution=LOUVAIN_RESOLUTION, seed=LOUVAIN_SEED)
        comm_ranked_nodes_cache = comm['ranked_cache']

        base = {
            'window_days': window_days, 'ws': ws, 'we': we,
            'nodes': int(H_agg_dir.num_vertices()), 'edges': int(H_agg_dir.num_edges()),
            'pos_nodes': int(sum(int(y_true_dict.get(n, 0)) for n in nodes))
        }
        for method, m in results.items():
            row = dict(base); row['method'] = method; row['ap'] = m.get('ap', None)

            eval_nodes_count = m.get('_eval_nodes', len(nodes))
            eval_pos_count = m.get('_eval_pos', int(sum(y_true_dict.get(n, 0) for n in nodes)))
            row['eval_nodes'] = eval_nodes_count
            row['eval_pos_nodes'] = eval_pos_count
            row['prevalence_eval'] = (eval_pos_count / eval_nodes_count) if eval_nodes_count > 0 else np.nan

            for kf in K_FRACS:
                # Use the consistent pct_key formatter for p_at and attcov
                p_at_key = f"p_at_{pct_key(kf)}"
                attcov_key = f"attcov_at_{pct_key(kf)}"
                row[p_at_key] = m.get(p_at_key, None)
                ranked_nodes = m.get('_ranked_nodes', [])
                cov = attempt_coverage(ranked_nodes, att_nodes_map, k_frac=kf)
                row[attcov_key] = cov
            rows.append(row)

        # Community coverage rows (Louvain and Leiden)
        for tag in ['louvain', 'leiden']:
            if any((tag, kf) in comm_ranked_nodes_cache for kf in K_FRACS):
                row = dict(base); row['method'] = f'communities_unsup_{tag}'; row['ap'] = None
                row['eval_nodes'] = base['nodes']; row['eval_pos_nodes'] = base['pos_nodes']
                row['prevalence_eval'] = (base['pos_nodes'] / base['nodes']) if base['nodes'] > 0 else np.nan
                for kf in K_FRACS:
                    p_at_key = f"p_at_{pct_key(kf)}"
                    attcov_key = f"attcov_at_{pct_key(kf)}"
                    row[p_at_key] = None
                    # Community coverage is based on the top k% of *communities* by score,
                    # not the top k% of nodes by an individual node score.
                    # The cache `comm_ranked_nodes_cache` already contains the nodes from
                    # the top k% of communities, so we use k_frac=1.0 in attempt_coverage
                    # for this list of nodes.
                    cov = attempt_coverage(comm_ranked_nodes_cache[(tag, kf)], att_nodes_map, k_frac=1.0) if (tag, kf) in comm_ranked_nodes_cache else None
                    row[attcov_key] = cov
                rows.append(row)

        count += 1
        if MAX_WINDOWS_PER_SETTING is not None and count >= MAX_WINDOWS_PER_SETTING:
            break

df_metrics = pd.DataFrame(rows)

def add_random_baseline(dfm: pd.DataFrame) -> pd.DataFrame:
    cols = list(dfm.columns)
    rows = []
    for _, r in dfm.groupby(['window_days', 'ws', 'we']).head(1).iterrows():
        base = {c: r.get(c, np.nan) for c in cols} # Use np.nan instead of None
        base['method'] = 'random'
        prev_eval = base.get('prevalence_eval', np.nan) # Use np.nan as default
        if pd.isna(prev_eval):
             prev_eval = (base.get('pos_nodes', 0) / base.get('nodes', 1)) if base.get('nodes', 0) else np.nan
             base['prevalence_eval'] = prev_eval
        base['ap'] = prev_eval
        for kf in K_FRACS:
            p_at_key = f"p_at_{pct_key(kf)}"
            attcov_key = f"attcov_at_{pct_key(kf)}"
            base[p_at_key] = prev_eval
            base[attcov_key] = np.nan # Random baseline doesn't have a concept of ranked communities
        rows.append(base)
    # Ensure all columns are present in rand_df
    rand_df = pd.DataFrame(rows)
    for col in dfm.columns:
        if col not in rand_df.columns:
            rand_df[col] = np.nan
    rand_df = rand_df[cols] # Ensure column order is the same
    return pd.concat([dfm, rand_df], ignore_index=True)

df_metrics = add_random_baseline(df_metrics)

# Prevalence and lift
df_metrics['prevalence'] = df_metrics['pos_nodes'] / df_metrics['nodes']
for kf in K_FRACS:
    p_at_key = f"p_at_{pct_key(kf)}"
    if p_at_key in df_metrics.columns:
        df_metrics[f'lift_{p_at_key}'] = df_metrics[p_at_key] / df_metrics['prevalence']
        df_metrics[f'lift_eval_{p_at_key}'] = df_metrics[p_at_key] / df_metrics['prevalence_eval']

# -----------------------
# Validation checks
# -----------------------
chk = (df_metrics.groupby(['window_days','ws','we'])
       .agg(nodes_nunique=('nodes','nunique'),
            pos_nodes_nunique=('pos_nodes','nunique'))
       .reset_index())
bad = chk[(chk.nodes_nunique != 1) | (chk.pos_nodes_nunique != 1)]
if not bad.empty:
    print("WARNING: nodes/pos_nodes inconsistent across methods:")
    print(bad.to_string(index=False))

random_rows = df_metrics[df_metrics.method == 'random']
if not random_rows.empty and f'lift_p_at_{pct_key(0.01)}' in random_rows.columns:
    random_lift_median = random_rows[f'lift_p_at_{pct_key(0.01)}'].median()
    if abs(random_lift_median - 1.0) > 0.05:
        print(f"WARNING: Random baseline lift_p_at_1.0pct median = {random_lift_median:.3f}, expected ≈ 1.0")

seeded_rows = df_metrics[df_metrics.method == 'seeded_pr']
if not seeded_rows.empty:
    high_prev = seeded_rows[seeded_rows.prevalence_eval > 0.5]
    if not high_prev.empty:
        print(f"WARNING: {len(high_prev)} seeded_pr rows have prevalence_eval > 0.5 (potentially degenerate)")

df_metrics.to_csv(RESULTS_CSV, index=False)
print(f"\nSaved per-window metrics to {RESULTS_CSV}")

if not df_metrics.empty:
    summary = (df_metrics
               .groupby(['window_days', 'method'])
               .agg(ap_median=('ap','median'),
                    p01_median=(f'p_at_{pct_key(0.01)}','median'),
                    lift_p01_median=(f'lift_p_at_{pct_key(0.01)}','median'),
                    lift_eval_p01_median=(f'lift_eval_p_at_{pct_key(0.01)}','median'),
                    attcov01_median=(f'attcov_at_{pct_key(0.01)}','median'),
                    prevalence_median=('prevalence','median'),
                    windows=('ws','count'))
               .reset_index()
               .sort_values(['window_days', 'ap_median'], ascending=[True, False]))
    print("\nSummary (median across windows):")
    print(summary.to_string(index=False))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading processed data...
Loaded: 7,237,958 tx; accounts: 555,587
Time range: 2022-08-01 00:00:00 → 2023-01-12 16:49:00

-- 3-day windows, stride=1d --
[000] 2022-08-01 → 2022-08-04: nodes=205,499, edges=262,957, pos_edges=761, pos_nodes=1,418
[001] 2022-08-02 → 2022-08-05: nodes=149,257, edges=225,360, pos_edges=823, pos_nodes=1,543
[002] 2022-08-03 → 2022-08-06: nodes=233,458, edges=284,739, pos_edges=904, pos_nodes=1,694
[003] 2022-08-04 → 2022-08-07: nodes=233,565, edges=248,949, pos_edges=936, pos_nodes=1,708
[004] 2022-08-05 → 2022-08-08: nodes=233,433, edges=213,304, pos_edges=957, pos_nodes=1,725
[005] 2022-08-06 → 2022-08-09: nodes=149,492, edges=154,204, pos_edges=961, pos_nodes=1,723
[006] 2022-08-07 → 2022-08-10: nodes=149,860, edges=190,445, pos_edges=1,035, pos_nodes=1,876
[007] 2022-08-08 → 2022-08-11: nodes=149,370, edges=225,891, pos_edges=1,

In [34]:
import pandas as pd
import os

# Define the path to the existing CSV file
DRIVE_DIR = '/content/drive/MyDrive/AML'
PROCESSED_DIR = os.path.join(DRIVE_DIR, 'processed/small')
METRICS_DIR = os.path.join(PROCESSED_DIR, 'US_Dollar', 'metrics')
EXISTING_CSV = os.path.join(METRICS_DIR, 'compare_window_metrics.csv')
# Correct the path to the newly generated CSV
RESULTS_CSV_PATH = os.path.join(METRICS_DIR, 'window_metrics.csv')


# Check if the existing CSV file exists
if not os.path.exists(EXISTING_CSV):
    print(f"Error: Existing CSV file not found at {EXISTING_CSV}")
elif not os.path.exists(RESULTS_CSV_PATH):
    print(f"Error: Newly generated CSV file not found at {RESULTS_CSV_PATH}")

else:
    # Load the existing CSV
    df_existing = pd.read_csv(EXISTING_CSV)

    # Load the newly generated CSV (which is df_metrics saved to RESULTS_CSV)
    df_new = pd.read_csv(RESULTS_CSV_PATH)


    # --- Comparison ---
    # Ensure columns are in the same order and have the same data types for comparison
    # This might require some cleaning based on how the original compare_window_metrics.csv was created
    # For a robust comparison, consider sorting columns and potentially rounding float values

    # Simple comparison after sorting columns
    df_existing_sorted_cols = df_existing.sort_index(axis=1)
    df_new_sorted_cols = df_new.sort_index(axis=1)

    # Check if shapes are the same
    if df_existing_sorted_cols.shape != df_new_sorted_cols.shape:
        print("DataFrames have different shapes!")
        print("Existing shape:", df_existing_sorted_cols.shape)
        print("New shape:", df_new_sorted_cols.shape)
    else:
        # Compare values, ignoring NaNs
        # Use .equals for exact comparison or np.isclose for float comparison
        comparison_result = df_existing_sorted_cols.equals(df_new_sorted_cols)

        if comparison_result:
            print("The newly generated CSV is identical to the existing one.")
        else:
            print("The newly generated CSV is different from the existing one.")
            # Optional: Find differences
            diff_mask = (df_existing_sorted_cols != df_new_sorted_cols) & \
                        (~df_existing_sorted_cols.isnull()) & (~df_new_sorted_cols.isnull())
            if diff_mask.sum().sum() > 0:
                print("\nDifferences found (excluding NaN differences):")
                # Print rows where differences exist
                diff_rows = df_new_sorted_cols[diff_mask.any(axis=1)]
                print(diff_rows)
            else:
                 print("\nDifferences are likely due to NaN handling or floating point precision.")
                 # A more detailed comparison for floats
                 # For simplicity, skipping detailed float comparison difference printing here,
                 # but you could iterate through columns and use np.isclose

The newly generated CSV is different from the existing one.

Differences found (excluding NaN differences):
     ap  attcov_at_0.5pct  attcov_at_1.0pct  attcov_at_2.0pct  edges  \
11  NaN          0.000000          0.019608          0.039216  52569   
12  NaN          0.000000          0.019608          0.039216  52569   
24  NaN          0.000000          0.041096          0.068493  42573   
25  NaN          0.000000          0.041096          0.068493  42573   
37  NaN          0.000000          0.012346          0.012346  27517   
38  NaN          0.000000          0.012346          0.012346  27517   
50  NaN          0.010526          0.010526          0.010526  27162   
51  NaN          0.010526          0.010526          0.010526  27162   
64  NaN          0.000000          0.000000          0.000000  27305   
65  NaN          0.000000          0.000000          0.000000  27305   
78  NaN          0.000000          0.000000          0.009804  27263   
79  NaN          0.000000   

In [35]:
from datetime import datetime


# Define the cutoff date
cutoff_date = datetime(2022, 9, 10)

# Filter transactions after the cutoff date
df_after_cutoff = df[df['timestamp'] >= cutoff_date]

# Identify all unique accounts in this period
all_accounts_after_cutoff = set(df_after_cutoff['from_account']).union(set(df_after_cutoff['to_account']))

# Identify unique laundering accounts in this period
laundering_accounts_after_cutoff = set(
    df_after_cutoff[df_after_cutoff['is_laundering'] == 1]['from_account']
).union(
    set(df_after_cutoff[df_after_cutoff['is_laundering'] == 1]['to_account'])
)

# Identify negative accounts (all accounts minus laundering accounts)
negative_accounts_after_cutoff = all_accounts_after_cutoff - laundering_accounts_after_cutoff

print(f"Number of all accounts after {cutoff_date}: {len(all_accounts_after_cutoff):,}")
print(f"Number of laundering accounts after {cutoff_date}: {len(laundering_accounts_after_cutoff):,}")
print(f"Number of negative accounts after {cutoff_date}: {len(negative_accounts_after_cutoff):,}")

# Display a sample of negative accounts
print("\nSample of negative accounts:")
display(list(negative_accounts_after_cutoff)[:10])


d = df.assign(day=df.timestamp.dt.date)
daily = d.groupby('day').agg(n=('is_laundering','size'),
pos=('is_laundering','sum'))
daily['prevalence'] = daily['pos'] / daily['n']
print(daily.tail(10))

Number of all accounts after 2022-09-10 00:00:00: 16,471
Number of laundering accounts after 2022-09-10 00:00:00: 575
Number of negative accounts after 2022-09-10 00:00:00: 15,896

Sample of negative accounts:


['808460B10',
 '804627160',
 '80C6D5240',
 '800F8BDD0',
 '8021E7A20',
 '80BDF9A30',
 '80AEC34D0',
 '806044840',
 '806A0CAB0',
 '808F747B0']

                n  pos  prevalence
day                               
2022-09-09  30322  152    0.005013
2022-09-10  10409  170    0.016332
2022-09-11    100   88    0.880000
2022-09-12     78   74    0.948718
2022-09-13     33   32    0.969697
2022-09-14     30   27    0.900000
2022-09-15      8    8    1.000000
2022-09-16      8    8    1.000000
2022-09-17      2    2    1.000000
2022-09-18      5    5    1.000000


In [36]:

df_n = pd.read_parquet(p_norm)
df_p = pd.read_parquet(p_pos)
print("Normals:", df_n.timestamp.min(), "→", df_n.timestamp.max(), "| rows:", len(df_n))
print("Positives:", df_p.timestamp.min(), "→", df_p.timestamp.max(), "| rows:", len(df_p))
dn = (df_n.assign(day=df_n.timestamp.dt.date).groupby('day').size().rename('n_norm'))
dp = (df_p.assign(day=df_p.timestamp.dt.date).groupby('day').size().rename('n_pos'))
daily = pd.concat([dn, dp], axis=1).fillna(0).astype(int)
daily['prevalence'] = daily['n_pos'] / (daily['n_norm'] + daily['n_pos']).clip(lower=1)
print(daily.tail(12))

cutoff = pd.Timestamp('2022-09-10')
d0 = df[(df.timestamp >= cutoff) & (df.is_laundering == 0)]
print("Normals after cutoff:", len(d0))
print(d0['payment_type'].value_counts().head())
print(d0[['currency_sent','currency_received']].value_counts().head(10))


Normals: 2022-08-01 00:00:00 → 2023-01-09 12:27:00 | rows: 7156712
Positives: 2022-08-01 00:03:00 → 2023-01-12 16:49:00 | rows: 81246
            n_norm  n_pos  prevalence
day                                  
2022-12-27       0     10         1.0
2022-12-28       0     15         1.0
2022-12-31       0     12         1.0
2023-01-01       0      7         1.0
2023-01-02       0      8         1.0
2023-01-03       0      9         1.0
2023-01-04       0     10         1.0
2023-01-05       0      8         1.0
2023-01-06       0      8         1.0
2023-01-08       0      6         1.0
2023-01-11       0      1         1.0
2023-01-12       0      3         1.0
Normals after cutoff: 10259
payment_type
ACH    10259
Name: count, dtype: int64
currency_sent  currency_received
US Dollar      US Dollar            10259
Name: count, dtype: int64
