In [None]:
# pip install datatable
# pip install networkx
# pip install pandas
# pip install tqdm
# pip install google
# pip install google-auth google-cloud-storage
# pip install gcsfs
# pip install fsspec
# pip install scikit-learn

# 1. This will be the data_preprocessing.py file

In [None]:
import datatable as dt
import numpy as np
from datetime import datetime
from datatable import f, join, sort
import sys
import os
import networkx as nx
import pandas as pd
import google.auth
from google.cloud import storage

In [None]:


def preprocess_data(input_file):
    """
    Preprocesses the input CSV file.

    Every column of the input is read as a string. Each row is rewritten as
    one numeric CSV line in which currencies, payment formats and accounts
    are replaced by integer ids assigned in order of first appearance, and
    the timestamp is made relative to a reference point derived from the
    first row. The result is written to "aml_preprocessed_train_data.csv"
    (relative to the current working directory), re-read, sorted by the
    Timestamp column and written back to the same path.

    Args:
    input_file (str): Path to the input CSV file.

    Returns:
    datatable.Frame: Preprocessed data frame, sorted by Timestamp.

    Raises:
    ValueError: If a timestamp does not match '%Y/%m/%d %H:%M' or an amount /
        label field cannot be converted to a number.
    """
    output_path = "aml_preprocessed_train_data.csv"

    raw_data = dt.fread(input_file, columns=dt.str32, fill=True)
    currency_dict = {}
    payment_format_dict = {}
    account_dict = {}

    def get_dict_value(name, collection):
        # Return the id already assigned to `name`, or assign the next free one.
        if name in collection:
            value = collection[name]
        else:
            value = len(collection)
            collection[name] = value
        return value

    # Build the header from a list instead of a backslash-continued literal:
    # the continuation form embedded the source indentation into the column
    # names (e.g. "    Amount Sent").
    header = ",".join([
        "EdgeID", "from_id", "to_id", "Timestamp",
        "Amount Sent", "Sent Currency", "Amount Received", "Received Currency",
        "Payment Format", "Is Laundering",
    ]) + "\n"

    # Reference point for relative timestamps; set from the first row.
    first_timestamp = None

    with open(output_path, 'w') as writer:
        writer.write(header)
        for i in range(raw_data.nrows):
            datetime_object = datetime.strptime(raw_data[i, "Timestamp"], '%Y/%m/%d %H:%M')
            # NOTE: the parsed datetime is naive, so .timestamp() interprets
            # it in the machine's local time zone.
            timestamp = datetime_object.timestamp()

            if first_timestamp is None:
                # Midnight of the first row's date, minus 10 seconds, so that
                # every relative timestamp is strictly positive.
                start_time = datetime(datetime_object.year, datetime_object.month, datetime_object.day)
                first_timestamp = start_time.timestamp() - 10

            timestamp = timestamp - first_timestamp

            # Receiving and payment currencies share one id space.
            cur1 = get_dict_value(raw_data[i, "Receiving Currency"], currency_dict)
            cur2 = get_dict_value(raw_data[i, "Payment Currency"], currency_dict)

            fmt = get_dict_value(raw_data[i, "Payment Format"], payment_format_dict)

            # Columns 2 and 4 are addressed by position; presumably these are
            # the sender / receiver account columns -- TODO confirm against
            # the input schema.
            from_acc_id_str = raw_data[i, "From Bank"] + raw_data[i, 2]
            from_id = get_dict_value(from_acc_id_str, account_dict)

            to_acc_id_str = raw_data[i, "To Bank"] + raw_data[i, 4]
            to_id = get_dict_value(to_acc_id_str, account_dict)

            amount_received_orig = float(raw_data[i, "Amount Received"])
            amount_paid_orig = float(raw_data[i, "Amount Paid"])

            isl = int(raw_data[i, "Is Laundering"])

            # %d truncates the relative timestamp to whole seconds.
            line = '%d,%d,%d,%d,%f,%d,%f,%d,%d,%d\n' % \
                  (i, from_id, to_id, timestamp, amount_paid_orig, cur2, amount_received_orig, cur1, fmt, isl)

            writer.write(line)

    # Re-read the numeric file and order rows by column index 3 (Timestamp).
    formatted_data = dt.fread(output_path)
    formatted_data = formatted_data[:, :, sort(3)]

    formatted_data.to_csv(output_path)
    return formatted_data


# 2. This is the feature_extraction.py file

In [None]:


def extract_features(G, transactions_df):
    """
    Extracts features from the transactions DataFrame.

    Calls extract_features_for_transaction once per row and assembles the
    resulting dictionaries into a single DataFrame, one row per transaction,
    in the same order as transactions_df.

    Args:
    G (networkx.DiGraph): Directed graph network.
    transactions_df (pandas.DataFrame): DataFrame containing transaction data.

    Returns:
    pandas.DataFrame: DataFrame with extracted features (empty when
        transactions_df has no rows).
    """
    # Collect the per-row dicts and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and appending row by row copied the whole frame
    # on every iteration.
    feature_rows = [
        extract_features_for_transaction(G, row)
        for _, row in transactions_df.iterrows()
    ]
    return pd.DataFrame(feature_rows)

def extract_features_for_transaction(G, transaction):
    """
    Extracts features for a single transaction.

    Args:
    G (networkx.DiGraph): Directed graph network.
    transaction (pandas.Series): Series representing a single transaction;
        must provide 'from_id' and 'to_id', both of which must be nodes of G.

    Returns:
    dict: Dictionary containing extracted features (degree, clustering
        coefficient and degree centrality for both endpoints).

    Raises:
    KeyError: If 'from_id' / 'to_id' is missing from the transaction or is
        not a node of G.
    """
    from_id = transaction['from_id']
    to_id = transaction['to_id']

    # degree_centrality is computed for the whole graph, so evaluate it once
    # and look up both endpoints instead of recomputing it per endpoint.
    centrality = nx.degree_centrality(G)

    return {
        'From Degree': G.degree[from_id],
        'To Degree': G.degree[to_id],
        'From clustering_coefficient': nx.clustering(G, from_id),
        # Previously computed for from_id (copy-paste slip); the "To" feature
        # must describe the receiving node.
        'To clustering_coefficient': nx.clustering(G, to_id),
        'From degree_centrality': centrality[from_id],
        'To degree_centrality': centrality[to_id],
    }


# 3. This is the main.py file

In [None]:
import numpy as np
import pandas as pd
import networkx as nx
# import data_preprocessing
# import feature_extraction
# import model_training
from datetime import datetime
from tqdm import tqdm
from sklearn.model_selection import train_test_split


def main(input_file, output_file):
    """
    Run the pipeline end to end: preprocess, build the graph, extract features.

    Args:
    input_file (str): Path to the input CSV file.
    output_file (str): Path to save the output CSV file. Accepted for
        interface compatibility; the current body does not read it.
    """
    # Stage 1: turn the raw CSV into the numeric, time-sorted frame.
    preprocessed = preprocess_data(input_file)

    # Stage 2: derive the directed transaction graph and a pandas view of the rows.
    graph, transactions_df = process_data(preprocessed)

    # Stage 3: one feature row per transaction.
    features_df = extract_features(graph, transactions_df)

    # Stage 4: feature matrix and target for model training.
    # TODO: split into train/test sets and train a model; persist the results.
    X, y = features_df, transactions_df['Is Laundering']

def process_data(raw_data):
    """
    Processes the preprocessed data to create a Directed Graph Network and DataFrame.

    Each row becomes an edge from 'from_id' to 'to_id'. Because the graph is
    a plain DiGraph, repeated transactions between the same ordered pair of
    accounts share one edge whose attributes are those of the last such row.

    Args:
    raw_data (datatable.Frame): Frame as returned by preprocess_data, with the
        columns from_id, to_id, Timestamp, Amount Sent, Amount Received,
        Received Currency and Payment Format.

    Returns:
    networkx.DiGraph: Directed graph network whose nodes are the account ids.
    pandas.DataFrame: DataFrame containing transaction data.

    Raises:
    KeyError: If one of the required columns is missing.
    """
    transactions_df = pd.DataFrame(raw_data.to_dict())
    # Tolerate stray whitespace around column names in the intermediate CSV
    # header so the lookups below do not depend on how the file was parsed.
    transactions_df = transactions_df.rename(
        columns=lambda c: c.strip() if isinstance(c, str) else c
    )

    G = nx.DiGraph()

    # The preprocessed frame already carries integer account ids; use them
    # directly as node keys so they match the 'from_id' / 'to_id' values that
    # feature extraction looks up in G. (The raw 'From Bank' / 'To Bank'
    # columns this function used to read do not exist at this stage.)
    for row in transactions_df.to_dict("records"):
        G.add_edge(row['from_id'], row['to_id'],
                   timestamp=row['Timestamp'], amount_sent=row['Amount Sent'],
                   amount_received=row['Amount Received'],
                   received_currency=row['Received Currency'],
                   payment_format=row['Payment Format'])

    return G, transactions_df

def get_dict_value(name, collection):
    """
    Look up the integer id for a name, assigning the next free id on first sight.

    Ids are handed out sequentially: a name that is not yet in the collection
    receives the collection's current size as its id and is stored in place.

    Args:
    name (str): Name to retrieve or add.
    collection (dict): Dictionary collection; mutated when name is new.

    Returns:
    int: Value associated with the name.
    """
    # len(collection) is evaluated before any insertion, so a new name gets
    # the pre-insertion size; an existing name keeps its stored value.
    return collection.setdefault(name, len(collection))

if __name__ == "__main__":
    # Script entry point: run the pipeline on the bundled sample file.
    # bucket_name is defined here but not passed to main().
    bucket_name = 'aml_mlops_bucket'
    input_file, output_file = "HI-Small_Trans.csv", "aml_preprocessed_train_data.csv"
    main(input_file, output_file)
