<a href="https://colab.research.google.com/github/caetano-dev/PixFraudDetection/blob/main/TCC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install pyarrow



In [2]:
import os
import duckdb
from google.colab import drive

drive.mount('/content/drive')
DRIVE_DIR = '/content/drive/MyDrive/AML'
TX_CSV = os.path.join(DRIVE_DIR, 'HI-Large_Trans.csv')

# Exploratory check: for ACH payments whose sent and received currency are the
# same, count how many transactions exist per currency.
#
# The CSV is read with HEADER=FALSE, so the view below renames DuckDB's
# auto-generated column names (column00 ... column10) to meaningful ones.
# The connection is a throw-away in-memory database; it is closed as soon as
# the result has been materialized into a DataFrame so it does not linger in
# the kernel.
con = duckdb.connect()
try:
    con.execute(f"""
CREATE VIEW tx AS
SELECT
  column00 AS timestamp,
  column01 AS from_bank,
  column02 AS from_account,
  column03 AS to_bank,
  column04 AS to_account,
  column05::DOUBLE AS amount_received,
  column06 AS currency_received,
  column07::DOUBLE AS amount_sent,
  column08 AS currency_sent,
  column09 AS payment_type,
  column10::INTEGER AS is_laundering
FROM read_csv_auto('{TX_CSV}', HEADER=FALSE)
""")
    df_counts = con.execute("""
SELECT currency_sent AS currency, COUNT(*) AS cnt
FROM tx
WHERE currency_received = currency_sent AND payment_type = 'ACH'
GROUP BY currency_sent
ORDER BY cnt DESC
""").fetchdf()
finally:
    # Always release the DuckDB connection, even if the query fails.
    con.close()
print(df_counts)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

             currency      cnt
0           US Dollar  7237958
1                Euro  4582272
2                Yuan  1422620
3              Shekel   876858
4     Canadian Dollar   667281
5            UK Pound   624202
6               Ruble   607117
7   Australian Dollar   563541
8         Swiss Franc   532790
9                 Yen   527523
10       Mexican Peso   522109
11              Rupee   450976
12        Brazil Real   380506
13        Saudi Riyal   347202


In [None]:
import os
import re
import duckdb
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# --- Paths -----------------------------------------------------------------
# Everything lives under one Drive folder; processed outputs go to a subfolder.
DRIVE_DIR = '/content/drive/MyDrive/AML'
PROCESSED_DIR = os.path.join(DRIVE_DIR, 'processed')
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Raw inputs.
TX_CSV = os.path.join(DRIVE_DIR, 'HI-Large_Trans.csv')
PATTERNS_TXT = os.path.join(DRIVE_DIR, 'HI-large_Patterns.txt')
ACCOUNTS_CSV = os.path.join(DRIVE_DIR, 'HI-Large_accounts.csv')

# Outputs of the three processing steps. Defined unconditionally so that later
# cells never hit a NameError that hides the real problem (a missing input).
OUT_STEP1 = os.path.join(PROCESSED_DIR, '1_filtered_normal_transactions.parquet')
OUT_STEP2 = os.path.join(PROCESSED_DIR, '2_filtered_laundering_transactions.parquet')
OUT_STEP3 = os.path.join(PROCESSED_DIR, '3_filtered_accounts.parquet')

# Fail fast: every later step reads these files, so stop here with a clear
# message instead of printing a warning and crashing further down.
missing_inputs = [
    path for path in (TX_CSV, PATTERNS_TXT, ACCOUNTS_CSV)
    if not os.path.exists(path)
]
if missing_inputs:
    missing_names = ", ".join(f"'{os.path.basename(path)}'" for path in missing_inputs)
    raise FileNotFoundError(
        f"Required input file(s) not found: {missing_names}. "
        f"Please make sure they are in the '{DRIVE_DIR}' folder in your Google Drive."
    )

print(f"Successfully located data folder in Google Drive: {DRIVE_DIR}")
print("-" * 50)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully located data folder in Google Drive: /content/drive/MyDrive/AML
--------------------------------------------------


In [None]:
# Field names, in positional order, given to each transaction record.
standard_columns = [
    'timestamp',
    'from_bank',
    'from_account',
    'to_bank',
    'to_account',
    'amount_received',
    'currency_received',
    'amount_sent',
    'currency_sent',
    'payment_type',
    'is_laundering',
]

# Every field is declared as VARCHAR; the mapping keeps the same key order as
# `standard_columns`.
column_types = dict.fromkeys(standard_columns, 'VARCHAR')

def parse_patterns_file(file_path, columns=None):
    """Parse a laundering-patterns text file into a flat DataFrame.

    The file is expected to consist of blocks delimited by lines starting with
    ``BEGIN LAUNDERING ATTEMPT`` and ``END LAUNDERING ATTEMPT``. Every
    non-empty line inside a block is split on commas and, if it has at least
    ``len(columns)`` fields, becomes one transaction row (extra trailing
    fields are ignored). Lines outside any block are skipped.

    Args:
        file_path: Path of the patterns text file.
        columns: Ordered field names for a transaction line. Defaults to the
            module-level ``standard_columns``.

    Returns:
        pandas.DataFrame with ``columns`` followed by ``attempt_id`` (1-based
        running number of the BEGIN marker) and ``attempt_type`` (the text
        after ``BEGIN LAUNDERING ATTEMPT -``, or ``'UNKNOWN'`` if absent).
        All transaction fields are kept as stripped strings. The frame is
        empty (but has the columns) when no transaction was found.

    Note:
        Transactions of an attempt that is never closed by an END marker
        (end of file, or a new BEGIN marker) are kept rather than silently
        discarded.
    """
    if columns is None:
        columns = standard_columns
    columns = list(columns)
    n_fields = len(columns)

    begin_re = re.compile(r'BEGIN LAUNDERING ATTEMPT\s*-\s*(.+)$')

    rows = []
    attempt_id = 0
    attempt_type = None
    in_attempt = False  # True between a BEGIN marker and the next END marker

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.startswith('BEGIN LAUNDERING ATTEMPT'):
                # A new BEGIN always opens a fresh attempt; rows already
                # collected for an unterminated previous attempt are retained.
                attempt_id += 1
                m = begin_re.search(line)
                attempt_type = m.group(1).strip() if m else 'UNKNOWN'
                in_attempt = True
            elif line.startswith('END LAUNDERING ATTEMPT'):
                in_attempt = False
            elif in_attempt:
                parts = [p.strip() for p in line.split(',')]
                # Skip malformed lines that do not carry every expected field.
                if len(parts) >= n_fields:
                    tx = dict(zip(columns, parts[:n_fields]))
                    tx['attempt_id'] = attempt_id
                    tx['attempt_type'] = attempt_type
                    rows.append(tx)

    return pd.DataFrame(rows, columns=columns + ['attempt_id', 'attempt_type'])


# In-memory DuckDB connection shared by Steps 1-3 below; it is closed at the
# end of Step 3. DuckDB is configured to use 8 threads.
con = duckdb.connect(database=':memory:')
con.execute("PRAGMA threads=8")

# SQL fragment: read the raw transaction CSV without a header row, with every
# field as VARCHAR. The Python dict `column_types` is interpolated through its
# repr as the `columns=` argument, so its keys become the column names.
read_tx_csv_sql = f"""
  SELECT * FROM read_csv_auto(
    '{TX_CSV}',
    delim=',',
    header=false,
    columns={column_types},
    all_varchar=true
  )
"""

# SQL expression: parse the textual `timestamp` column. Only two layouts are
# recognized, selected by the trimmed string length (16 chars without seconds,
# 19 chars with seconds); anything else yields NULL.
ts_parse_sql = """
CASE
  WHEN length(trim(timestamp)) = 16 THEN strptime(trim(timestamp), '%Y/%m/%d %H:%M')
  WHEN length(trim(timestamp)) = 19 THEN strptime(trim(timestamp), '%Y/%m/%d %H:%M:%S')
  ELSE NULL
END
"""

# SQL fragment: typed view over the raw CSV. Text fields are trimmed, amounts
# are try_cast to DOUBLE (empty string or unparsable -> NULL), and
# `is_laundering` falls back to 0 when it is empty or not an integer.
typed_tx_sql = f"""
WITH raw AS ({read_tx_csv_sql})
SELECT
  {ts_parse_sql}::TIMESTAMP AS timestamp,
  trim(from_bank) AS from_bank,
  trim(from_account) AS from_account,
  trim(to_bank) AS to_bank,
  trim(to_account) AS to_account,
  try_cast(nullif(trim(amount_received), '') AS DOUBLE) AS amount_received,
  trim(currency_received) AS currency_received,
  try_cast(nullif(trim(amount_sent), '') AS DOUBLE) AS amount_sent,
  trim(currency_sent) AS currency_sent,
  trim(payment_type) AS payment_type,
  coalesce(try_cast(nullif(trim(is_laundering), '') AS INTEGER), 0) AS is_laundering
FROM raw
"""

# SQL predicate (case-insensitive): both currencies are US Dollar and the
# payment type is ACH. Reused in the WHERE clauses of Steps 1 and 2.
usd_ach_filter = """
upper(trim(currency_sent))='US DOLLAR' AND
upper(trim(currency_received))='US DOLLAR' AND
upper(trim(payment_type))='ACH'
"""

# Step 1: Filter normal transactions (USD+ACH, is_laundering=0)
# The result is streamed straight from the CSV into a ZSTD-compressed Parquet
# file by DuckDB's COPY. Rows whose timestamp could not be parsed
# (timestamp IS NULL) are dropped.
con.execute(f"""
  COPY (
    WITH typed AS ({typed_tx_sql})
    SELECT
      timestamp, from_bank, from_account, to_bank, to_account,
      amount_received, currency_received, amount_sent, currency_sent,
      payment_type, is_laundering
    FROM typed
    WHERE timestamp IS NOT NULL
      AND {usd_ach_filter}
      AND is_laundering = 0
  ) TO '{OUT_STEP1}' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

# Report how many rows were written by counting them back from the Parquet file.
step1_rows = con.execute(f"SELECT COUNT(*) FROM read_parquet('{OUT_STEP1}')").fetchone()[0]
print(f"Step 1: Saved strictly USD/ACH normal transactions to '{OUT_STEP1}' (rows={step1_rows:,})")

# Step 2: Parse patterns, filter USD+ACH positives, then add any missing positives
#         from the main CSV (anti-join on robust keys)
patterns_df = parse_patterns_file(PATTERNS_TXT)
if patterns_df.empty:
    # Create an empty patterns table with matching columns so SQL doesn't break
    patterns_df = pd.DataFrame(columns=standard_columns + ['attempt_id', 'attempt_type'])

# Register patterns into DuckDB so the SQL below can query the DataFrame by
# the name `patterns_df`.
con.register('patterns_df', patterns_df)

# Build the unioned laundering set and write to Parquet.
# CTE overview:
#   pat_raw   - pattern rows as registered (attempt_id cast to VARCHAR so it
#               can be trimmed/parsed like the other text fields)
#   pat_typed - same typing/trimming rules as `typed_tx_sql`
#   pat_filt  - USD/ACH pattern rows with is_laundering = 1, plus amounts
#               rounded to integer cents for use as join keys
#   raw_pos   - USD/ACH rows with is_laundering = 1 from the main CSV,
#               with the same integer-cent keys
#   missing   - anti-join: raw_pos rows with no pat_filt row sharing
#               timestamp, both banks, both accounts and both cent amounts
#   unioned   - pat_filt rows plus `missing` rows; the latter get a NULL
#               attempt_id and attempt_type 'UNLISTED'
con.execute(f"""
  COPY (
    WITH
      pat_raw AS (
        SELECT
          timestamp, from_bank, from_account, to_bank, to_account,
          amount_received, currency_received, amount_sent, currency_sent,
          payment_type, is_laundering,
          CAST(attempt_id AS VARCHAR) AS attempt_id, -- Explicitly cast to VARCHAR
          attempt_type
        FROM patterns_df -- Read directly from the registered DataFrame
      ),
      pat_typed AS (
        SELECT
          {ts_parse_sql}::TIMESTAMP AS timestamp,
          trim(from_bank) AS from_bank,
          trim(from_account) AS from_account,
          trim(to_bank) AS to_bank,
          trim(to_account) AS to_account,
          try_cast(nullif(trim(amount_received), '') AS DOUBLE) AS amount_received,
          trim(currency_received) AS currency_received,
          try_cast(nullif(trim(amount_sent), '') AS DOUBLE) AS amount_sent,
          trim(currency_sent) AS currency_sent,
          trim(payment_type) AS payment_type,
          coalesce(try_cast(nullif(trim(is_laundering), '') AS INTEGER), 0) AS is_laundering,
          try_cast(nullif(trim(attempt_id), '') AS INTEGER) AS attempt_id,
          trim(attempt_type) AS attempt_type
        FROM pat_raw
      ),
      pat_filt AS (
        SELECT
          timestamp, from_bank, from_account, to_bank, to_account,
          amount_received, currency_received, amount_sent, currency_sent,
          payment_type, is_laundering, attempt_id, attempt_type,
          CAST(round(amount_sent * 100) AS BIGINT) AS amount_sent_c,
          CAST(round(amount_received * 100) AS BIGINT) AS amount_received_c
        FROM pat_typed
        WHERE timestamp IS NOT NULL
          AND {usd_ach_filter}
          AND is_laundering = 1
      ),
      raw_pos AS (
        WITH typed AS ({typed_tx_sql})
        SELECT
          timestamp, from_bank, from_account, to_bank, to_account,
          amount_received, currency_received, amount_sent, currency_sent,
          payment_type, is_laundering,
          CAST(round(amount_sent * 100) AS BIGINT) AS amount_sent_c,
          CAST(round(amount_received * 100) AS BIGINT) AS amount_received_c
        FROM typed
        WHERE timestamp IS NOT NULL
          AND {usd_ach_filter}
          AND is_laundering = 1
      ),
      missing AS (
        SELECT raw_pos.*
        FROM raw_pos
        LEFT JOIN pat_filt
          ON raw_pos.timestamp = pat_filt.timestamp
          AND raw_pos.from_bank = pat_filt.from_bank
          AND raw_pos.from_account = pat_filt.from_account
          AND raw_pos.to_bank = pat_filt.to_bank
          AND raw_pos.to_account = pat_filt.to_account
          AND raw_pos.amount_received_c = pat_filt.amount_received_c
          AND raw_pos.amount_sent_c = pat_filt.amount_sent_c
        WHERE pat_filt.timestamp IS NULL -- Check if there was no match in pat_filt
      ),
      unioned AS (
        SELECT
          timestamp, from_bank, from_account, to_bank, to_account,
          amount_received, currency_received, amount_sent, currency_sent,
          payment_type, is_laundering,
          attempt_id, attempt_type
        FROM pat_filt
        UNION ALL
        SELECT
          timestamp, from_bank, from_account, to_bank, to_account,
          amount_received, currency_received, amount_sent, currency_sent,
          payment_type, is_laundering,
          NULL::INTEGER AS attempt_id, 'UNLISTED' AS attempt_type
        FROM missing
      )
    SELECT * FROM unioned
  ) TO '{OUT_STEP2}' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

# Summarize the Step 2 output in a single pass over the Parquet file:
#   base_count  - rows that came from the patterns file (attempt_type is set
#                 and is not 'UNLISTED')
#   added_count - rows back-filled from the main CSV (attempt_type 'UNLISTED')
#   total_count - all rows written
# The file path is bound as a query parameter rather than spliced into the SQL.
base_count, added_count, total_count = con.execute("""
  SELECT
    COUNT(*) FILTER (WHERE attempt_type <> 'UNLISTED') AS from_patterns,
    COUNT(*) FILTER (WHERE attempt_type = 'UNLISTED') AS added_from_csv,
    COUNT(*) AS total
  FROM read_parquet(?)
""", [OUT_STEP2]).fetchone()
print(f"Step 2: Saved USD/ACH laundering transactions to '{OUT_STEP2}' "
      f"(patterns={base_count:,}, added_from_csv={added_count:,}, total={total_count:,})")

# Step 3: Filter account file
# Keep only the account records whose account id appears as sender or receiver
# in the Step 1 or Step 2 outputs.
#   all_tx   - Step 1 and Step 2 rows stacked together (Step 1 has no attempt
#              columns, so NULL placeholders are added to align the schemas)
#   involved - distinct set of account ids seen on either side of a transaction
#   accounts - accounts CSV read without a header row, all fields as VARCHAR
# NOTE(review): the final join matches on the account id only (bank id is not
# part of the join condition) - confirm account ids are unique across banks.
con.execute(f"""
  COPY (
    WITH all_tx AS (
      SELECT
        timestamp, from_bank, from_account, to_bank, to_account,
        amount_received, currency_received, amount_sent, currency_sent,
        payment_type, is_laundering,
        NULL::INTEGER AS attempt_id, NULL::VARCHAR AS attempt_type -- Add these columns with NULLs
      FROM read_parquet('{OUT_STEP1}')
      UNION ALL
      SELECT
        timestamp, from_bank, from_account, to_bank, to_account,
        amount_received, currency_received, amount_sent, currency_sent,
        payment_type, is_laundering,
        attempt_id, attempt_type
      FROM read_parquet('{OUT_STEP2}')
    ),
    involved AS (
      SELECT DISTINCT from_account AS account FROM all_tx WHERE from_account IS NOT NULL
      UNION
      SELECT DISTINCT to_account AS account FROM all_tx WHERE to_account IS NOT NULL
    ),
    accounts AS (
      SELECT * FROM read_csv_auto(
        '{ACCOUNTS_CSV}',
        delim=',',
        header=false,
        columns={{'bank_name': 'VARCHAR', 'bank_id': 'VARCHAR', 'account_id_hex': 'VARCHAR', 'entity_id': 'VARCHAR', 'entity_name': 'VARCHAR'}},
        all_varchar=true
      )
    )
    SELECT a.*
    FROM accounts a
    INNER JOIN involved i
      ON trim(a.account_id_hex) = trim(i.account)
  ) TO '{OUT_STEP3}' (FORMAT PARQUET, COMPRESSION ZSTD)
""")

# Report how many account rows were written by counting them back from the file.
step3_rows = con.execute(f"SELECT COUNT(*) FROM read_parquet('{OUT_STEP3}')").fetchone()[0]
print(f"Step 3: Saved filtered account details to '{OUT_STEP3}' (rows={step3_rows:,})")

# All three steps are done; release the in-memory DuckDB connection.
con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Step 1: Saved strictly USD/ACH normal transactions to '/content/drive/MyDrive/AML/processed/1_filtered_normal_transactions.parquet' (rows=7,156,712)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Step 2: Saved USD/ACH laundering transactions to '/content/drive/MyDrive/AML/processed/2_filtered_laundering_transactions.parquet' (patterns=9,143, added_from_csv=72,103, total=81,246)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Step 3: Saved filtered account details to '/content/drive/MyDrive/AML/processed/3_filtered_accounts.parquet' (rows=559,855)


In [22]:
import os

import pandas as pd
from google.colab import drive

# Imports and base paths must come first: the OUT_STEP* paths below are built
# from `os` and `PROCESSED_DIR`, so this cell also works on a fresh kernel
# without relying on state left behind by earlier cells.
drive.mount('/content/drive')
DRIVE_DIR = '/content/drive/MyDrive/AML'
PROCESSED_DIR = os.path.join(DRIVE_DIR, 'processed')

OUT_STEP1 = os.path.join(PROCESSED_DIR, '1_filtered_normal_transactions.parquet')
OUT_STEP2 = os.path.join(PROCESSED_DIR, '2_filtered_laundering_transactions.parquet')
OUT_STEP3 = os.path.join(PROCESSED_DIR, '3_filtered_accounts.parquet')

# Sanity check: show the last rows of the Step 1 (normal transactions) output.
df = pd.read_parquet(OUT_STEP1)
df.tail()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,timestamp,from_bank,from_account,to_bank,to_account,amount_received,currency_received,amount_sent,currency_sent,payment_type,is_laundering
7156707,2022-11-05 23:37:00,64485,82E5ED150,251127,8510A6C80,29.18,US Dollar,29.18,US Dollar,ACH,0
7156708,2022-11-05 23:45:00,2132088,83945A130,197710,851283090,726.13,US Dollar,726.13,US Dollar,ACH,0
7156709,2022-11-05 23:40:00,261404,8343BE3A0,1153944,8514B7B00,0.16,US Dollar,0.16,US Dollar,ACH,0
7156710,2022-11-05 23:56:00,261641,82AC63FB0,11304,851833A60,220.79,US Dollar,220.79,US Dollar,ACH,0
7156711,2022-11-05 23:57:00,215070,832442180,170779,851EAEB50,514.69,US Dollar,514.69,US Dollar,ACH,0


In [20]:

# Sanity check: load the Step 2 (laundering transactions) output and show its
# last five rows.
df = pd.read_parquet(path=OUT_STEP2)
df.tail(n=5)

Unnamed: 0,timestamp,from_bank,from_account,to_bank,to_account,amount_received,currency_received,amount_sent,currency_sent,payment_type,is_laundering,attempt_id,attempt_type
81241,2022-11-05 19:30:00,250555,81A9DDCF0,262899,81A9DDF10,1368.94,US Dollar,1368.94,US Dollar,ACH,1,,UNLISTED
81242,2022-11-05 19:57:00,241036,81E300750,2776,81E3007F0,632.02,US Dollar,632.02,US Dollar,ACH,1,,UNLISTED
81243,2022-11-05 19:44:00,11260,8505DF760,284408,8505DF7B0,2448.11,US Dollar,2448.11,US Dollar,ACH,1,,UNLISTED
81244,2022-08-05 21:18:00,867,8005F7EE0,952,800876DD0,1816.29,US Dollar,1816.29,US Dollar,ACH,1,,UNLISTED
81245,2022-08-08 17:52:00,20,800AE2D90,11798,800AE5450,1850.08,US Dollar,1850.08,US Dollar,ACH,1,,UNLISTED


In [23]:
# Sanity check: load the Step 3 (filtered accounts) output and show its last
# five rows.
df = pd.read_parquet(path=OUT_STEP3)
df.tail(n=5)

Unnamed: 0,bank_name,bank_id,account_id_hex,entity_id,entity_name
559850,First Bank of Huron,47750,81948E2E0,2AA06A38B40,Corporation #1116
559851,Savings Bank of Seattle,233990,80FD4A2E0,2AA066DC290,Sole Proprietorship #23632
559852,First Bank of Newbury,25580,8120C57E0,2AA06784970,Corporation #20097
559853,Brook Community Bank,127707,80E54CC60,2AA06B42E00,Partnership #8821
559854,Desert Federal Bank,115541,806EC0B60,2AA0653ACD0,Partnership #4555
