In [None]:
import pandas as pd
import numpy as np
import sklearn as skl

In [None]:
# Load the cleaned transaction dataset from the project's data directory.
df = pd.read_csv(
    filepath_or_buffer='../data/transaction_dataset_cleaned.csv',
)

In [None]:
## Separate the target column (FLAG) from the feature matrix.
## 'ERC20_avg_time_between_rec_2_tnx' is excluded from the features as well.
y = df['FLAG']
X = df.drop(columns=['FLAG', 'ERC20_avg_time_between_rec_2_tnx'])

In [None]:
# Peek at the first few target values.
y.head(n=5)

In [None]:
# Peek at the first few feature rows.
X.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

## Stratified train / validation / test split.
## Step 1 holds out 20% of the data as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
## Step 2 takes 25% of the remaining 80% (i.e. 20% of the full data) for
## validation, leaving roughly 60% / 20% / 20% for train / val / test.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=1,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [None]:
# Search over max_depth for the random forest, scoring each candidate by F1
# on the validation split. In scikit-learn, max_depth=None means no depth limit.
depths = [5, 10, 20, None]

# Start below any attainable F1 so the first candidate always becomes the
# incumbent. This matters because None is itself a valid max_depth and
# therefore cannot double as a "nothing selected yet" marker.
best_val_score = float("-inf")
best_depth = None
best_val_preds = None


for depth in depths:
    model = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=depth)

    model.fit(X_train, y_train)

    depth_val_preds = model.predict(X_val)
    val_score = f1_score(y_val, depth_val_preds)

    print(f"Validation F1 Score with max_depth = {depth}: {val_score}")

    # Strict '>' keeps the earliest (shallowest) depth on ties.
    if val_score > best_val_score:
        best_val_score = val_score
        best_depth = depth
        best_val_preds = depth_val_preds

# Expose the predictions of the *selected* depth as val_preds, so later cells
# that read val_preds see the best model's output rather than whichever depth
# happened to be tried last.
val_preds = best_val_preds

print(f"Best max_depth: {best_depth} with Validation F1 Score: {best_val_score}")

In [None]:
# Side-by-side view of validation labels and the predictions in val_preds.
# The labels' index is reset so both columns line up positionally.
actual_values = y_val.reset_index(drop=True)

val_comparison_df = pd.DataFrame({'actual': actual_values}).assign(
    predicted=val_preds,
)

print(val_comparison_df)

In [None]:
from sklearn.metrics import classification_report

# Refit a forest on the training split using the depth chosen by the search.
model = RandomForestClassifier(
    max_depth=best_depth,
    n_estimators=100,
    random_state=1,
)
model.fit(X_train, y_train)

In [None]:
# Evaluate the fitted model on the held-out test split.
preds = model.predict(X_test)

final_fi1 = f1_score(y_true=y_test, y_pred=preds)
print(f"Final F1 Score: {final_fi1}")

# Per-class precision / recall / F1 breakdown.
print("\nFinal Classification Report:")
print(classification_report(y_true=y_test, y_pred=preds))

In [None]:
# Pair each test label with its prediction; the labels' index is reset so the
# two columns line up positionally.
actual = y_test.reset_index(drop=True)

comparison_df = pd.DataFrame({'actual': actual}).assign(predicted=preds)

print(comparison_df)

In [None]:
# Write the actual-vs-predicted table to the working directory, without the index.
comparison_df.to_csv(
    path_or_buf='results_comparison.csv',
    index=False,
)

In [None]:
# List every feature column the model was trained on.
print(X_train.columns.to_list())


In [None]:
# Record the training feature names (one per line) in text files under
# ../src/lists: the full list, plus the subsets whose names start with the
# ERC20 "most sent token" / "most received token" prefixes.
all_feature_columns = X_train.columns.tolist()

sent_token_columns = [col for col in all_feature_columns if col.startswith('ERC20_most_sent_token_')]

rec_token_columns = [col for col in all_feature_columns if col.startswith('ERC20_most_rec_token_')]

# One (destination, names) pair per output file, so all three files go through
# the same write path instead of three copy-pasted loops.
column_list_files = (
    ('../src/lists/master_column_list.txt', all_feature_columns),
    ('../src/lists/master_sent.txt', sent_token_columns),
    ('../src/lists/master_rec.txt', rec_token_columns),
)

for list_path, column_names in column_list_files:
    # Explicit UTF-8: without it the encoding is the platform default, and a
    # column name containing a character outside that encoding would raise
    # UnicodeEncodeError.
    with open(list_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{col_name}\n" for col_name in column_names)

In [None]:
# Save model
import joblib

# Destination for the serialized classifier, relative to this notebook.
filename = '../models/fraud_model.joblib'
joblib.dump(value=model, filename=filename)