In [1]:
import datatable as dt
import numpy as np
from datetime import datetime
from datatable import f, join, sort
import pandas as pd
import dask.dataframe as dd
import sys
import os
from sklearn.model_selection import train_test_split

def initial_preprocessing(raw_data, first_timestamp):

    data = []

    currency_dict = {}
    payment_format_dict = {}
    bank_account_dict = {}
    account_dict = {}

    def get_dict_value(name, collection):
        if name in collection:
            value = collection[name]
        else:
            value = len(collection)
            collection[name] = value
        return value

    for i in range(raw_data.nrows):
        datetime_object = datetime.strptime(raw_data[i, "Timestamp"], '%Y/%m/%d %H:%M')
        timestamp = datetime_object.timestamp()
        day = datetime_object.day
        month = datetime_object.month
        year = datetime_object.year

        if first_timestamp == -1:
            start_time = datetime(year, month, day)
            first_timestamp = start_time.timestamp() - 10

        timestamp = timestamp - first_timestamp

        receiving_currency = get_dict_value(raw_data[i, "Receiving Currency"], currency_dict)
        payment_currency = get_dict_value(raw_data[i, "Payment Currency"], currency_dict)

        payment_format = get_dict_value(raw_data[i, "Payment Format"], payment_format_dict)

        from_acc_id_str = raw_data[i, "From Bank"] + raw_data[i, 2]
        from_id = get_dict_value(from_acc_id_str, account_dict)

        to_acc_id_str = raw_data[i, "To Bank"] + raw_data[i, 4]
        to_id = get_dict_value(to_acc_id_str, account_dict)

        amount_received = float(raw_data[i, "Amount Received"])
        amount_paid = float(raw_data[i, "Amount Paid"])

        is_laundering = int(raw_data[i, "Is Laundering"])
        
        data.append([i, from_id, to_id, timestamp, amount_paid, payment_currency, amount_received, receiving_currency,
                     payment_format, is_laundering])
        
    # Creating a pandas DataFrame
    pandas_df = pd.DataFrame(data, columns=['Index', 'From_ID', 'To_ID', 'Timestamp', 'Amount_Paid', 'Payment_Currency',
                                     'Amount_Received', 'Receiving_Currency', 'Payment_Format', 'Is_Laundering'])

    ddf = dd.from_pandas(pandas_df, npartitions=2)

    return ddf, first_timestamp, currency_dict, payment_format_dict, bank_account_dict, account_dict


  from pandas.core import (
In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


In [2]:
def add_edges_to_graph(G, ddf):
    def add_edges(partition):
        for index, row in partition.iterrows():
            G.add_edge(row['From_ID'], row['To_ID'], 
                       timestamp=row['Timestamp'], 
                       amount_sent=row['Amount_Paid'], 
                       amount_received=row['Amount_Received'], 
                       received_currency=row['Receiving_Currency'], 
                       payment_format=row['Payment_Format'])

    ddf.map_partitions(add_edges).compute()
    return ddf

In [3]:
import networkx as nx
import pandas as pd
import dask.dataframe as dd
import numpy as np

def create_graph(ddf):
    
    G = nx.DiGraph()
    ddf = add_edges_to_graph(G, ddf)
    
    return G, ddf

In [4]:
import networkx as nx
import pandas as pd
import dask.dataframe as dd
import numpy as np

def extract_features(node):
    features = {}
    #Node
    features['Node'] = node
    # Degree
    features['degree'] = G.degree[node]
    # In Degree
    features['in_degree'] = G.in_degree[node]
    # Out Degree
    features['out_degree'] = G.out_degree[node]
    # Clustering Coefficient
    features['clustering_coefficient'] = nx.clustering(G, node)
    # Degree Centrality
    features['degree_centrality'] = nx.degree_centrality(G)[node]
    
    return features


In [5]:
def merge_trans_with_gf(transactions_ddf, graph_ddf):
    
    # Merge on From_ID
    merged_ddf = dd.merge(train_graph_ddf, graph_features_ddf, left_on='From_ID', right_on='Node', how='left')

    # Rename columns to avoid conflicts
    merged_ddf = merged_ddf.rename(columns={
        'degree': 'from_degree',
        'in_degree': 'from_in_degree',
        'out_degree': 'from_out_degree',
        'clustering_coefficient': 'from_clustering_coeff',
        'degree_centrality': 'from_degree_centrality'
    })
    
    # Merge on To_ID
    merged_ddf = dd.merge(merged_ddf, graph_features_ddf, left_on='To_ID', right_on='Node', how='left')

    # Rename columns again
    merged_ddf = merged_ddf.rename(columns={
        'degree': 'to_degree',
        'in_degree': 'to_in_degree',
        'out_degree': 'to_out_degree',
        'clustering_coefficient': 'to_clustering_coeff',
        'degree_centrality': 'to_degree_centrality'
    })
    
    # Drop redundant columns
    merged_ddf = merged_ddf.drop(columns=['Node_x', 'Node_y'])
    
    return merged_ddf

# read data and train test split

In [6]:
import datatable as dt
from sklearn.model_selection import train_test_split


input_file = "HI-Small_Trans.csv"
raw_data = dt.fread(input_file, columns=dt.str32)

# Convert the raw_data DataTable to a pandas DataFrame
raw_data_df = raw_data.to_pandas()
# Splitting the raw_data into train and test sets
train_df, test_df = train_test_split(raw_data_df, test_size=0.2, random_state=42, stratify=raw_data_df['Is Laundering'])

# Convert the splits back to DataTable if necessary
train_dt = dt.Frame(train_df)
test_dt = dt.Frame(test_df)



In [7]:
train_dt = train_dt.head()
test_dt = test_dt.head()

# train set prep

In [8]:
initial_preprocessed_ddf, first_timestamp, currency_dict, payment_format_dict, bank_account_dict, account_dict = initial_preprocessing(train_dt, first_timestamp = -1)
initial_preprocessed_ddf.head()

Unnamed: 0,Index,From_ID,To_ID,Timestamp,Amount_Paid,Payment_Currency,Amount_Received,Receiving_Currency,Payment_Format,Is_Laundering
0,0,0,1,16870.0,280.69,0,280.69,0,0,0
1,1,2,3,-586250.0,6196.94,1,6196.94,1,1,0
2,2,4,5,-677090.0,16750875.23,0,16750875.23,0,1,0
3,3,6,7,-85610.0,1566.88,0,1566.88,0,1,0
4,4,8,9,165070.0,152.96,0,152.96,0,0,0


In [9]:
G, train_graph_ddf = create_graph(initial_preprocessed_ddf)
train_graph_ddf.head()

Unnamed: 0,Index,From_ID,To_ID,Timestamp,Amount_Paid,Payment_Currency,Amount_Received,Receiving_Currency,Payment_Format,Is_Laundering
0,0,0,1,16870.0,280.69,0,280.69,0,0,0
1,1,2,3,-586250.0,6196.94,1,6196.94,1,1,0
2,2,4,5,-677090.0,16750875.23,0,16750875.23,0,1,0
3,3,6,7,-85610.0,1566.88,0,1566.88,0,1,0
4,4,8,9,165070.0,152.96,0,152.96,0,0,0


In [10]:
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())

Number of nodes: 19
Number of edges: 11


In [11]:
import pandas as pd
import ast
import dask

# Convert the list of unique nodes to a Dask DataFrame
unique_nodes = list(set(train_graph_ddf['From_ID']).union(train_graph_ddf['To_ID']))

#append to unique nodes whenever new accounts from test set come up
unique_nodes_dd = dd.from_pandas(pd.DataFrame(unique_nodes, columns=['Node']), npartitions=2)

# Apply extract_features function to each unique node
#graph_features = unique_nodes_dd.map_partitions(lambda df: df.apply(lambda row: extract_features(row['Node']), axis=1), meta=[('Node', 'int64'), ('degree', 'int64'), ('in_degree', 'int64'), ('out_degree', 'int64'), ('clustering_coefficient', 'int64'), ('degree_centrality', 'float64')])

# Apply extract_features function to each partition
graph_features = unique_nodes_dd.map_partitions(lambda df: df.apply(lambda row: extract_features(row['Node']), axis=1))

# Example of delayed graph features
graph_features = [dask.delayed(lambda x: x)(string_data) for string_data in graph_features]

# Compute delayed objects
graph_features_computed = dask.compute(*graph_features)

# Convert each string to a dictionary
dicts = [ast.literal_eval(string_data) for string_data in graph_features_computed]

# Create a list of lists containing the dictionary values for each entry
list_of_lists = [list(data_dict.values()) for data_dict in dicts]

# Create a DataFrame from the list of lists
lists_df = pd.DataFrame(list_of_lists, columns=dicts[0].keys())

# Convert specific columns to the desired data types
convert_dtype = {'Node': 'int64', 'degree': 'int64', 'in_degree': 'int64', 'out_degree': 'int64', 'clustering_coefficient': 'float64', 'degree_centrality': 'float64'}
graph_features_df = lists_df.astype(convert_dtype)
graph_features_ddf = dd.from_pandas(graph_features_df, npartitions=2)
# Print the DataFrame
print(graph_features_ddf.head())

   Node  degree  in_degree  out_degree  clustering_coefficient  \
0     0       1          0           1                     0.0   
1     1       3          2           1                     0.0   
2     2       1          0           1                     0.0   
3     3       1          1           0                     0.0   
4     4       1          0           1                     0.0   

   degree_centrality  
0           0.055556  
1           0.166667  
2           0.055556  
3           0.055556  
4           0.055556  


In [12]:
import pandas as pd
import ast
import dask

# Example of delayed graph features
graph_features = [dask.delayed(lambda x: x)(string_data) for string_data in graph_features]

# Compute delayed objects
graph_features_computed = dask.compute(*graph_features)

# Convert each string to a dictionary
dicts = [ast.literal_eval(string_data) for string_data in graph_features_computed]

# Create a list of lists containing the dictionary values for each entry
list_of_lists = [list(data_dict.values()) for data_dict in dicts]

# Create a DataFrame from the list of lists
lists_df = pd.DataFrame(list_of_lists, columns=dicts[0].keys())

# Convert specific columns to the desired data types
convert_dtype = {'Node': 'int64', 'degree': 'int64', 'in_degree': 'int64', 'out_degree': 'int64', 'clustering_coefficient': 'float64', 'degree_centrality': 'float64'}
graph_features_df = lists_df.astype(convert_dtype)
graph_features_ddf = dd.from_pandas(graph_features_df, npartitions=2)
# Print the DataFrame
print(graph_features_ddf.head())


   Node  degree  in_degree  out_degree  clustering_coefficient  \
0     0       1          0           1                     0.0   
1     1       3          2           1                     0.0   
2     2       1          0           1                     0.0   
3     3       1          1           0                     0.0   
4     4       1          0           1                     0.0   

   degree_centrality  
0           0.055556  
1           0.166667  
2           0.055556  
3           0.055556  
4           0.055556  


In [13]:
graph_features_ddf.head()

Unnamed: 0,Node,degree,in_degree,out_degree,clustering_coefficient,degree_centrality
0,0,1,0,1,0.0,0.055556
1,1,3,2,1,0.0,0.166667
2,2,1,0,1,0.0,0.055556
3,3,1,1,0,0.0,0.055556
4,4,1,0,1,0.0,0.055556


In [14]:
train_graph_ddf.head()

Unnamed: 0,Index,From_ID,To_ID,Timestamp,Amount_Paid,Payment_Currency,Amount_Received,Receiving_Currency,Payment_Format,Is_Laundering
0,0,0,1,16870.0,280.69,0,280.69,0,0,0
1,1,2,3,-586250.0,6196.94,1,6196.94,1,1,0
2,2,4,5,-677090.0,16750875.23,0,16750875.23,0,1,0
3,3,6,7,-85610.0,1566.88,0,1566.88,0,1,0
4,4,8,9,165070.0,152.96,0,152.96,0,0,0


In [15]:
preprocessed_train_df = merge_trans_with_gf(train_graph_ddf, graph_features_ddf)
preprocessed_train_df.compute()
# normalize the dataset then train

Unnamed: 0,Index,From_ID,To_ID,Timestamp,Amount_Paid,Payment_Currency,Amount_Received,Receiving_Currency,Payment_Format,Is_Laundering,from_degree,from_in_degree,from_out_degree,from_clustering_coeff,from_degree_centrality,to_degree,to_in_degree,to_out_degree,to_clustering_coeff,to_degree_centrality
0,3,6,7,-85610.0,1566.88,0,1566.88,0,1,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
1,1,2,3,-586250.0,6196.94,1,6196.94,1,1,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
2,6,12,12,-678170.0,1652.89,2,1652.89,2,3,0,2,1,1,0.0,0.111111,2,1,1,0.0,0.111111
0,0,0,1,16870.0,280.69,0,280.69,0,0,0,1,0,1,0.0,0.055556,3,2,1,0.0,0.166667
1,2,4,5,-677090.0,16750875.23,0,16750875.23,0,1,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
2,4,8,9,165070.0,152.96,0,152.96,0,0,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
3,5,10,11,58450.0,17269.7,1,17269.7,1,2,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
4,8,15,16,-344630.0,69.03,3,69.03,3,0,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
5,9,17,18,-80150.0,12.09,0,12.09,0,1,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
6,7,13,14,-2690.0,981.9,3,981.9,3,0,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556


In [16]:
from sklearn.preprocessing import StandardScaler

# Separate features (train_x) and target variable (train_y)
train_x = preprocessed_train_df.drop(columns=['Is_Laundering', 'Index'])
train_y = preprocessed_train_df['Is_Laundering']

# Normalize the dataset
scaler = StandardScaler()
train_x_normalized = scaler.fit_transform(train_x)
train_x_normalized

array([[-0.50129099, -0.48753809,  0.44408313, -0.33364456, -0.84515425,
        -0.33364456, -0.84515425,  0.10599979, -0.33333333, -0.33333333,
         0.        ,  0.        , -0.33333333, -0.46852129, -0.33333333,
        -0.5       ,  0.        , -0.46852129],
       [ 0.61268899,  0.45003516, -1.49352012, -0.33362745,  0.84515425,
        -0.33362745,  0.84515425,  2.22599555,  3.        ,  3.        ,
         0.        ,  0.        ,  3.        ,  1.09321633, -0.33333333,
         2.        ,  0.        ,  1.09321633],
       [-1.24394431, -1.23759669, -1.19295226, -0.33272304,  0.        ,
        -0.33272304,  0.        ,  0.10599979, -0.33333333, -0.33333333,
         0.        ,  0.        , -0.33333333, -0.46852129, -0.33333333,
        -0.5       ,  0.        , -0.46852129],
       [ 0.24136233,  0.26252051,  0.91514281, -0.33051921,  0.        ,
        -0.33051921,  0.        ,  1.16599767, -0.33333333, -0.33333333,
         0.        ,  0.        , -0.33333333, -0.468

# test set prep

In [17]:
import datatable as dt
import numpy as np
from datetime import datetime
from datatable import f, join, sort
import pandas as pd
import dask.dataframe as dd
import sys
import os
from sklearn.model_selection import train_test_split

def initial_preprocessing(raw_data, first_timestamp, currency_dict, payment_format_dict, bank_account_dict, account_dict):

    data = []

    def get_dict_value(name, collection):
        if name in collection:
            value = collection[name]
        else:
            value = len(collection)
            collection[name] = value
        return value

    for i in range(raw_data.nrows):
        datetime_object = datetime.strptime(raw_data[i, "Timestamp"], '%Y/%m/%d %H:%M')
        timestamp = datetime_object.timestamp()
        day = datetime_object.day
        month = datetime_object.month
        year = datetime_object.year

        if first_timestamp == -1:
            start_time = datetime(year, month, day)
            first_timestamp = start_time.timestamp() - 10

        timestamp = timestamp - first_timestamp

        receiving_currency = get_dict_value(raw_data[i, "Receiving Currency"], currency_dict)
        payment_currency = get_dict_value(raw_data[i, "Payment Currency"], currency_dict)

        payment_format = get_dict_value(raw_data[i, "Payment Format"], payment_format_dict)

        from_acc_id_str = raw_data[i, "From Bank"] + raw_data[i, 2]
        from_id = get_dict_value(from_acc_id_str, account_dict)

        to_acc_id_str = raw_data[i, "To Bank"] + raw_data[i, 4]
        to_id = get_dict_value(to_acc_id_str, account_dict)

        amount_received = float(raw_data[i, "Amount Received"])
        amount_paid = float(raw_data[i, "Amount Paid"])

        is_laundering = int(raw_data[i, "Is Laundering"])
        
        data.append([i, from_id, to_id, timestamp, amount_paid, payment_currency, amount_received, receiving_currency,
                     payment_format, is_laundering])
        
    # Creating a pandas DataFrame
    pandas_df = pd.DataFrame(data, columns=['Index', 'From_ID', 'To_ID', 'Timestamp', 'Amount_Paid', 'Payment_Currency',
                                     'Amount_Received', 'Receiving_Currency', 'Payment_Format', 'Is_Laundering'])

    ddf = dd.from_pandas(pandas_df, npartitions=2)

    return ddf, first_timestamp, currency_dict, payment_format_dict, bank_account_dict, account_dict

In [18]:
test_initial_preprocessed_ddf, first_timestamp, currency_dict, payment_format_dict, bank_account_dict, account_dict = initial_preprocessing(test_dt, first_timestamp, currency_dict, payment_format_dict, bank_account_dict, account_dict)
test_initial_preprocessed_ddf.head()

Unnamed: 0,Index,From_ID,To_ID,Timestamp,Amount_Paid,Payment_Currency,Amount_Received,Receiving_Currency,Payment_Format,Is_Laundering
0,0,19,20,95290.0,208.0,0,208.0,0,1,0
1,1,21,22,-531830.0,131.24,0,131.24,0,0,0
2,2,23,24,-58250.0,5130.11,1,5130.11,1,1,0
3,3,25,26,59890.0,3318.94,3,3318.94,3,4,0
4,4,27,28,-551210.0,1138.77,2,1138.77,2,4,0


In [19]:
test_graph_ddf = add_edges_to_graph(G, test_initial_preprocessed_ddf)

In [20]:
unique_nodes_test = list(set(test_graph_ddf['From_ID']).union(test_graph_ddf['To_ID']))

#apunique_nodes_tesunique_nodes_testto unique nodes whenever new accounts from test set come up
unique_nodes_dd_test = dd.from_pandas(pd.DataFrame(unique_nodes_test, columns=['Node']), npartitions=2)

import pandas as pd
import ast
import dask

graph_features_test = unique_nodes_dd_test.map_partitions(lambda df: df.apply(lambda row: extract_features(row['Node']), axis=1))

# Example of delayed graph features
graph_features_test = [dask.delayed(lambda x: x)(string_data) for string_data in graph_features_test]

# Compute delayed objects
graph_features_computed_test = dask.compute(*graph_features_test)

# Convert each string to a dictionary
dicts_test = [ast.literal_eval(string_data) for string_data in graph_features_computed_test]

# Create a list of lists containing the dictionary values for each entry
list_of_lists_test = [list(data_dict.values()) for data_dict in dicts_test]

# Create a DataFrame from the list of lists
lists_df_test = pd.DataFrame(list_of_lists_test, columns=dicts[0].keys())

# Convert specific columns to the desired data types
convert_dtype = {'Node': 'int64', 'degree': 'int64', 'in_degree': 'int64', 'out_degree': 'int64', 'clustering_coefficient': 'float64', 'degree_centrality': 'float64'}
graph_features_df_test = lists_df_test.astype(convert_dtype)
graph_features_ddf_test = dd.from_pandas(graph_features_df_test, npartitions=2)
# Print the DataFrame
print(graph_features_ddf_test.head())


   Node  degree  in_degree  out_degree  clustering_coefficient  \
0    19       1          0           1                     0.0   
1    20       1          1           0                     0.0   
2    21       1          0           1                     0.0   
3    22       1          1           0                     0.0   
4    23       1          0           1                     0.0   

   degree_centrality  
0           0.027027  
1           0.027027  
2           0.027027  
3           0.027027  
4           0.027027  


In [21]:
# Add new columns to transactions_ddf
test_graph_ddf['from_degree'] = None
test_graph_ddf['from_in_degree'] = None
test_graph_ddf['from_out_degree'] = None
test_graph_ddf['from_clustering_coeff'] = None
test_graph_ddf['from_degree_centrality'] = None
test_graph_ddf['to_degree'] = None
test_graph_ddf['to_in_degree'] = None
test_graph_ddf['to_out_degree'] = None
test_graph_ddf['to_clustering_coeff'] = None
test_graph_ddf['to_degree_centrality'] = None

In [22]:
preprocessed_test_df = merge_trans_with_gf(test_graph_ddf, graph_features)
preprocessed_test_df.compute()

Unnamed: 0,Index,From_ID,To_ID,Timestamp,Amount_Paid,Payment_Currency,Amount_Received,Receiving_Currency,Payment_Format,Is_Laundering,from_degree,from_in_degree,from_out_degree,from_clustering_coeff,from_degree_centrality,to_degree,to_in_degree,to_out_degree,to_clustering_coeff,to_degree_centrality
0,3,6,7,-85610.0,1566.88,0,1566.88,0,1,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
1,1,2,3,-586250.0,6196.94,1,6196.94,1,1,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
2,6,12,12,-678170.0,1652.89,2,1652.89,2,3,0,2,1,1,0.0,0.111111,2,1,1,0.0,0.111111
0,0,0,1,16870.0,280.69,0,280.69,0,0,0,1,0,1,0.0,0.055556,3,2,1,0.0,0.166667
1,2,4,5,-677090.0,16750875.23,0,16750875.23,0,1,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
2,4,8,9,165070.0,152.96,0,152.96,0,0,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
3,5,10,11,58450.0,17269.7,1,17269.7,1,2,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
4,8,15,16,-344630.0,69.03,3,69.03,3,0,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
5,9,17,18,-80150.0,12.09,0,12.09,0,1,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556
6,7,13,14,-2690.0,981.9,3,981.9,3,0,0,1,0,1,0.0,0.055556,1,1,0,0.0,0.055556


In [23]:
from sklearn.preprocessing import StandardScaler

# Separate features (train_x) and target variable (train_y)
test_x = preprocessed_test_df.drop(columns=['Is_Laundering', 'Index'])
test_y = preprocessed_test_df['Is_Laundering']

# Normalize the dataset
scaler = StandardScaler()
test_x_normalized = scaler.fit_transform(test_x)
test_x_normalized

array([[-1.24394431, -1.23759669, -1.19295226, -0.33272304,  0.        ,
        -0.33272304,  0.        ,  0.10599979, -0.33333333, -0.33333333,
         0.        ,  0.        , -0.33333333, -0.46852129, -0.33333333,
        -0.5       ,  0.        , -0.46852129],
       [ 0.61268899,  0.45003516, -1.49352012, -0.33362745,  0.84515425,
        -0.33362745,  0.84515425,  2.22599555,  3.        ,  3.        ,
         0.        ,  0.        ,  3.        ,  1.09321633, -0.33333333,
         2.        ,  0.        ,  1.09321633],
       [-0.50129099, -0.48753809,  0.44408313, -0.33364456, -0.84515425,
        -0.33364456, -0.84515425,  0.10599979, -0.33333333, -0.33333333,
         0.        ,  0.        , -0.33333333, -0.46852129, -0.33333333,
        -0.5       ,  0.        , -0.46852129],
       [ 0.79835232,  0.82506446,  0.71522202, -0.33376099,  1.69030851,
        -0.33376099,  1.69030851, -0.95399809, -0.33333333, -0.33333333,
         0.        ,  0.        , -0.33333333, -0.468

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(train_x_normalized, train_y)

# Predict on the test set
y_pred = rf_classifier.predict(test_x_normalized)

# Evaluate accuracy
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
