In [1]:
import pandas as pd
import numpy as np
import sklearn as skl

In [2]:
# Load the cleaned transaction dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/transaction_dataset_cleaned.csv')

In [3]:
## Separate the target column from the feature matrix
# 'FLAG' is the label. 'ERC20_avg_time_between_rec_2_tnx' is excluded from the
# features as well (the reason is not recorded in this notebook -- TODO confirm).
y = df['FLAG']
X = df.drop(columns=['FLAG', 'ERC20_avg_time_between_rec_2_tnx'])

In [4]:
# Peek at the first five target labels.
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: FLAG, dtype: int64

In [5]:
# Peek at the first five rows of the feature matrix.
X.head(5)

Unnamed: 0,avg_min_between_sent_tnx,avg_min_between_received_tnx,time_diff_between_first_and_last,sent_tnx,received_tnx,num_created_contracts,unique_received_from_addresses,unique_sent_to_addresses,min_value_received,max_value_received,...,ERC20_most_rec_token_iDAG SPACE,ERC20_most_rec_token_iEx.ec Network Token,ERC20_most_rec_token_iXledger,ERC20_most_rec_token_minereum,ERC20_most_rec_token_savedroid,ERC20_most_rec_token_shellchains.com,ERC20_most_rec_token_timereum,ERC20_most_rec_token_vSlice,ERC20_most_rec_token_www.pnztrust.com,ERC20_most_rec_token_yocoinclassic
0,844.26,1093.71,704785.63,721,89,0,40,118,0.0,45.806785,...,0,0,0,0,0,0,0,0,0,0
1,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.613269,...,0,0,0,0,0,0,0,0,0,0
2,246194.54,2434.02,516729.3,2,10,0,10,2,0.113119,1.165453,...,0,0,0,0,0,0,0,0,0,0
3,10219.6,15785.09,397555.9,25,9,0,7,13,0.0,500.0,...,0,0,0,0,0,0,0,0,0,0
4,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,12.802411,...,0,0,0,0,0,0,0,0,0,0


In [6]:
from sklearn.model_selection import train_test_split

## Split the data into training, validation, and test sets (60% / 20% / 20%)
# Step 1: hold out 20% of all rows as the test set, keeping the FLAG ratio intact.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
# Step 2: take 25% of the remaining 80% (i.e. 20% of the full data) as the
# validation set, again stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=1,
)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [8]:
# Candidate values for RandomForestClassifier's max_depth; None means no depth limit.
depths = [5, 10, 20, None]

# Start below any attainable F1 (which lies in [0, 1]) so the first candidate is
# always recorded. With a start value of 0, a run where every score is 0 would
# leave best_depth as None, which is indistinguishable from "unlimited depth".
best_val_score = float('-inf')
best_depth = None
best_val_preds = None

for depth in depths:
    model = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=depth)
    model.fit(X_train, y_train)

    # Keep this candidate's predictions under their own name so they do not
    # overwrite the predictions of the best candidate found so far.
    depth_preds = model.predict(X_val)
    val_score = f1_score(y_val, depth_preds)

    print(f"Validation F1 Score with max_depth = {depth}: {val_score}")

    # Strict '>' keeps the earliest depth when two candidates tie.
    if val_score > best_val_score:
        best_val_score = val_score
        best_depth = depth
        best_val_preds = depth_preds

# Expose the SELECTED depth's validation predictions as `val_preds`. Previously
# `val_preds` held whatever the last loop iteration produced (max_depth=None),
# so anything reading it after the loop was looking at a different model than
# the one reported as best.
val_preds = best_val_preds

print(f"Best max_depth: {best_depth} with Validation F1 Score: {best_val_score}")

Validation F1 Score with max_depth = 5: 0.767277856135402
Validation F1 Score with max_depth = 10: 0.8604954367666232
Validation F1 Score with max_depth = 10: 0.8604954367666232
Validation F1 Score with max_depth = 20: 0.9173859432799013
Validation F1 Score with max_depth = 20: 0.9173859432799013
Validation F1 Score with max_depth = None: 0.9135802469135802
Best max_depth: 20 with Validation F1 Score: 0.9173859432799013
Validation F1 Score with max_depth = None: 0.9135802469135802
Best max_depth: 20 with Validation F1 Score: 0.9173859432799013


In [9]:
# Line up the validation labels with the predictions held in `val_preds`, row by row.
# The original row index is discarded so the labels sit on a fresh 0..n-1 index.
actual_values = y_val.reset_index(drop=True)

val_comparison_df = actual_values.to_frame(name='actual').assign(predicted=val_preds)

print(val_comparison_df)

      actual  predicted
0          0          0
1          1          1
2          1          0
3          1          0
4          0          0
...      ...        ...
1963       0          0
1964       0          0
1965       0          0
1966       0          0
1967       0          0

[1968 rows x 2 columns]


In [10]:
from sklearn.metrics import classification_report

# Refit a forest with the depth selected on the validation set; the other
# hyperparameters are the same ones used during the depth search.
model = RandomForestClassifier(
    max_depth=best_depth,
    n_estimators=100,
    random_state=1,
)
# Fitted on the training split only; left as the last expression so the
# estimator summary is displayed.
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Score the fitted model on the held-out test split.
preds = model.predict(X_test)

# NOTE(review): `final_fi1` looks like a typo for `final_f1`; the name is kept as is.
final_fi1 = f1_score(y_test, preds)
print(f"Final F1 Score: {final_fi1}")

# Header line followed by the per-class precision / recall / F1 table.
print("\nFinal Classification Report:", classification_report(y_test, preds), sep="\n")

Final F1 Score: 0.9036295369211514

Final Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1533
           1       0.99      0.83      0.90       436

    accuracy                           0.96      1969
   macro avg       0.97      0.91      0.94      1969
weighted avg       0.96      0.96      0.96      1969



In [12]:
# Pair each test label with the model's prediction for the same row.
# The original row index is discarded so the labels sit on a fresh 0..n-1 index.
actual = y_test.reset_index(drop=True)

comparison_df = actual.to_frame(name='actual').assign(predicted=preds)

print(comparison_df)

      actual  predicted
0          0          0
1          0          0
2          0          0
3          0          0
4          0          0
...      ...        ...
1964       0          0
1965       0          0
1966       0          0
1967       0          0
1968       0          0

[1969 rows x 2 columns]


In [13]:
# Save the test-set comparison table to the current working directory, without the row index.
comparison_df.to_csv(path_or_buf='results_comparison.csv', index=False)

In [14]:
# List every feature name the model is trained on, in column order.
print(list(X_train.columns))


['avg_min_between_sent_tnx', 'avg_min_between_received_tnx', 'time_diff_between_first_and_last', 'sent_tnx', 'received_tnx', 'num_created_contracts', 'unique_received_from_addresses', 'unique_sent_to_addresses', 'min_value_received', 'max_value_received', 'avg_val_received', 'min_val_sent', 'max_val_sent', 'avg_val_sent', 'min_val_sent_to_contract', 'max_val_sent_to_contract', 'avg_val_sent_to_contract', 'total_transactions', 'total_ether_sent', 'total_ether_received', 'total_ether_sent_to_contract', 'total_ether_balance', 'ERC20_total_tnxs', 'ERC20_total_ether_received', 'ERC20_total_ether_sent', 'ERC20_total_ether_sent_to_contract', 'ERC20_uniq_sent_addr', 'ERC20_uniq_rec_addr', 'ERC20_uniq_sent_contract_addr', 'ERC20_uniq_rec_contract_addr', 'ERC20_avg_time_between_sent_tnx', 'ERC20_avg_time_between_rec_tnx', 'ERC20_avg_time_between_contract_tnx', 'ERC20_min_val_rec', 'ERC20_max_val_rec', 'ERC20_avg_val_rec', 'ERC20_min_val_sent', 'ERC20_max_val_sent', 'ERC20_avg_val_sent', 'ERC20_m

In [15]:
def _write_lines(path, names):
    """Write each entry of ``names`` to ``path`` on its own line, replacing the file."""
    with open(path, 'w') as f:
        f.writelines(f"{name}\n" for name in names)


# Columns that are NOT model features: the target plus the column that is also
# dropped when the feature matrix X is built for training. Exporting only
# `drop(columns=['FLAG'])` produced a master list with one more column than the
# model was fitted on.
NON_FEATURE_COLUMNS = ['FLAG', 'ERC20_avg_time_between_rec_2_tnx']

# Only the header is needed, so read zero data rows instead of reloading the
# whole dataset (and leave the already-loaded `df` untouched).
dataset_header = pd.read_csv('../data/transaction_dataset_cleaned.csv', nrows=0)

all_feature_columns = dataset_header.drop(columns=NON_FEATURE_COLUMNS).columns.tolist()

# One-hot columns for the most-sent / most-received ERC20 token, identified by prefix.
sent_token_columns = [col for col in all_feature_columns if col.startswith('ERC20_most_sent_token_')]
rec_token_columns = [col for col in all_feature_columns if col.startswith('ERC20_most_rec_token_')]

# NOTE(review): these files are written to '../src/', while the model test cell
# reads the token lists from '../src/lists/'. Confirm which location the feature
# pipeline expects and align the two.
_write_lines('../src/master_column_list.txt', all_feature_columns)
_write_lines('../src/master_sent.txt', sent_token_columns)
_write_lines('../src/master_rec.txt', rec_token_columns)

In [25]:
# Model Test
# End-to-end check: build a feature vector for a single address with the
# project's feature pipeline, then score it with the fitted `model`.

import os
import sys
import pandas as pd

# Put the notebook's parent directory on sys.path so that `src` can be imported.
backend_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if backend_path not in sys.path:
    sys.path.append(backend_path)

from src.feature_pipeline import get_feature_vector, load_master_column_list, load_token_vocabulary

test_address = "0xb4c9d8a5812a024bdb177991af256da144776033"

SENT_PATH = '../src/lists/master_sent.txt'
REC_PATH = '../src/lists/master_rec.txt'

# Use the training frame's own columns as the master list for the pipeline.
MASTER_COLUMN_LIST = X_train.columns.tolist()

print("Running get_feature_vector...")
final_vector_df = get_feature_vector(test_address, SENT_PATH, REC_PATH, MASTER_COLUMN_LIST)

print("--- Final Vector Generated ---")
print(final_vector_df)

print("\n--- Running Prediction ---")
# Row 0 is the only row scored; column 1 is read as the probability of FLAG == 1
# (assumes model.classes_ is [0, 1] -- TODO confirm).
fraud_probability = model.predict_proba(final_vector_df)[0, 1]
# Label as fraud (1) when the probability reaches the 0.3 cut-off, otherwise 0.
# NOTE(review): 0.3 is not derived anywhere in this notebook; confirm the intended threshold.
prediction = int(fraud_probability >= 0.3)
print(f"Fraud probability: {fraud_probability:.1%}")
print(f"Prediction: {prediction}" )

Running get_feature_vector...
No ETH transactions found for 0xb4c9d8a5812a024bdb177991af256da144776033. Returning default features.
No ETH transactions found for 0xb4c9d8a5812a024bdb177991af256da144776033. Returning default features.
--- Final Vector Generated ---
   avg_min_between_sent_tnx  avg_min_between_received_tnx  \
0                         0                             0   

   time_diff_between_first_and_last  sent_tnx  received_tnx  \
0                                 0         0             0   

   num_created_contracts  unique_received_from_addresses  \
0                      0                               0   

   unique_sent_to_addresses  min_value_received  max_value_received  ...  \
0                         0                   0                   0  ...   

   ERC20_most_rec_token_iDAG SPACE  ERC20_most_rec_token_iEx.ec Network Token  \
0                                0                                          0   

   ERC20_most_rec_token_iXledger  ERC20_most_rec