In [None]:
library(tidyverse)
library(data.table)
library(caret)
library(randomForest)
library(xgboost)
library(corrplot)
library(ggplot2)
library(dplyr)

In [None]:
# Load the raw transaction dataset from the Kaggle input directory.
df <- read.csv(
  file = "/kaggle/input/ethereum-fraud-detection/transaction_dataset.csv",
  header = TRUE,
  row.names = NULL
)


In [None]:
# Per-column summary statistics, stored as a data frame for inspection.
summary_df <- data.frame(summary(df))

## Observations from the summary:
## - The first two columns (X, Index) are indices only and add no information.
## - 7 columns are all either null or 0:
##   ERC20.avg.time.between.sent.tnx, ERC20.avg.time.between.rec.tnx,
##   ERC20.avg.time.between.rec.2.tnx, ERC20.avg.time.between.contract.tnx,
##   ERC20.min.val.sent.contract, ERC20.max.val.sent.contract,
##   ERC20.avg.val.sent.contract
## These columns can be removed.

# Data Cleaning

In [None]:
# Display every row that exactly repeats an earlier row (all columns equal).
df[which(duplicated(df)), ]

# Shows no duplicates

In [None]:
# Frequency table of the Address column: one row per distinct address.
Addresses <- data.frame(table(df$Address))

## 9816 unique addresses in 9841 records. Some addresses therefore occur more
## than once, with at least one other column differing between the copies.

In [None]:
# Show all records belonging to one example address.
df[df$Address == "0x4c7520df888aa4569a37ac7d132f89c65821f0af", ]

## All values are identical except the two string columns at the end:
## 'ERC20.most.sent.token.type' and 'ERC20_most_rec_token_type'.
## These columns are null in more than 50% of rows, and the remaining values
## are unlikely to help numerical models, so they can be safely removed.

In [None]:
# Remove the index columns, the seven all-null/zero ERC20 columns and the two
# sparse token-type string columns identified above.
df <- dplyr::select(
  df,
  -c(
    X,
    Index,
    ERC20.avg.time.between.sent.tnx,
    ERC20.avg.time.between.rec.tnx,
    ERC20.avg.time.between.rec.2.tnx,
    ERC20.avg.time.between.contract.tnx,
    ERC20.min.val.sent.contract,
    ERC20.max.val.sent.contract,
    ERC20.avg.val.sent.contract,
    ERC20.most.sent.token.type,
    ERC20_most_rec_token_type
  )
)

## 40 variables remaining (including our response variable)

In [None]:
# Re-check for exact duplicate rows now that the string columns are gone.
df[which(duplicated(df)), ]

In [None]:
# Keep only the first occurrence of each fully identical row.
df <- distinct(df)

## 9816 observations remain, corresponding to unique addresses

In [None]:
# Number of NA entries in each column.
missing_vals <- data.frame(colSums(is.na(df)))

## Missing values appear in the columns related to ERC20 token transactions.
## A null may simply mean that no such transaction happened, so the nulls can
## be filled with 0.

In [None]:
# Replace every NA cell in the data frame with 0.
df <- replace(df, is.na(df), 0)

# Exploratory Data Analysis

In [None]:
## The summary showed several variables that are almost always 0, with very few
## non-zero values. Their variance is therefore low, and they can be removed
## because they will not help in detecting fraud.

# Variance of every column (NAs ignored), one row per column.
variances <- data.frame(sapply(df, var, na.rm = TRUE))

In [None]:
## Removing columns with low variance
df <- dplyr::select(
  df,
  -c(
    min.value.sent.to.contract,
    avg.value.sent.to.contract,
    max.val.sent.to.contract,
    total.ether.sent.contracts,
    ERC20.uniq.sent.addr.1
  )
)

In [None]:
## Normalizing the data before calculating correlation coefficients

# Select the predictors by name instead of `-which(names(df) == "FLAG")`:
# if "FLAG" were ever absent, `-which()` would become `-integer(0)` and select
# zero columns rather than all of them.
predictor_cols <- setdiff(names(df), "FLAG")

# Centre and scale the predictors; the FLAG response is left untouched.
preProc <- preProcess(df[, predictor_cols, drop = FALSE],
                      method = c("center", "scale"))

# Apply the fitted transformation, then put FLAG back as the first column.
df_normalized <- predict(preProc, df[, predictor_cols, drop = FALSE])
df_normalized <- cbind(FLAG = df$FLAG, df_normalized)

In [None]:
# With all records unique, the Address column is no longer needed: it is an
# identifier and of no use to the numerical models built later.
df_normalized <- dplyr::select(df_normalized, -Address)

In [None]:
# Pairwise correlation matrix over all columns of df_normalized (FLAG included).
cor_matrix <- cor(x = df_normalized)

In [None]:
threshold <- 0.85  # Absolute correlation above which a pair is reported

# Collect each highly correlated pair as c(name_i, name_j, correlation).
highly_correlated_pairs <- list()

# Walk the strict lower triangle only: every unordered pair is visited once and
# the diagonal is never touched. No `!= 1` float comparison is needed, and
# perfectly correlated pairs are no longer hidden.
for (i in seq_len(ncol(cor_matrix))) {
  for (j in seq_len(i - 1)) {
    r <- cor_matrix[i, j]
    # `&&` is the scalar operator for `if`; NA correlations are skipped.
    if (!is.na(r) && abs(r) > threshold) {
      pair <- c(colnames(cor_matrix)[i],
                colnames(cor_matrix)[j],
                r)
      highly_correlated_pairs[[length(highly_correlated_pairs) + 1]] <- pair
    }
  }
}

# Display the list of highly correlated pairs
for (pair in highly_correlated_pairs) {
  cat(pair[1], "+", pair[2], ":", round(as.numeric(pair[3]), 2), "\n")
}


In [None]:
## Remove one variable from each highly correlated pair, keeping the other.
## This is done manually because the dataset is small (fewer than 100 features)
## and a hand-picked feature set helps the interpretability of the model.
df_normalized <- dplyr::select(
  df_normalized,
  -c(
    ERC20.max.val.rec,
    ERC20.avg.val.rec,
    ERC20.min.val.sent,
    ERC20.max.val.sent,
    ERC20.avg.val.sent,
    ERC20.uniq.rec.contract.addr
  )
)

In [None]:
# Correlation matrix of the remaining features, excluding the FLAG response.
# FLAG is selected by name; it is the first column of df_normalized.
# (The stray `{r}` chunk header was removed: as R code it evaluates the
# undefined symbol `r` and aborts the cell.)
cor_matrix_new <- cor(df_normalized[, names(df_normalized) != "FLAG"])

# Visualizations

In [None]:
library(reshape2)

# Heatmap of the feature correlation matrix. melt() reshapes the matrix to long
# form (Var1, Var2, value), one row per cell, which geom_tile() then draws.
ggplot(data = melt(cor_matrix_new), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  # geom_text(aes(label = ifelse(abs(value) > 0.4, round(value, 2), " ")), size = 2) +
  # `limits` is spelled out in full; the earlier `limit =` only worked through
  # partial argument matching.
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limits = c(-1, 1), space = "Lab",
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1)) +
  coord_fixed() +
  ggtitle("Correlation Matrix of Numerical Features") +
  xlab("") + ylab("")

In [None]:
# Overlaid histograms (one colour per FLAG value) on a log scale.
# NOTE(review): log() of 0 is -Inf, and NAs were filled with 0 earlier, so
# zero-valued rows are presumably dropped from these plots -- confirm.
ggplot(
  data = df,
  mapping = aes(x = log(ERC20.total.Ether.received), fill = as.factor(FLAG))
) +
  geom_histogram(position = "identity", alpha = 0.6, bins = 50) +
  ggtitle("Distribution of Total Ether Received in exchange of ERC-20 tokens") +
  xlab("log(Total Ether Received)") +
  ylab("Count") +
  theme_minimal()

ggplot(
  data = df,
  mapping = aes(x = log(Time.Diff.between.first.and.last..Mins.), fill = as.factor(FLAG))
) +
  geom_histogram(position = "identity", alpha = 0.6, bins = 50) +
  ggtitle("Distribution of time difference between first and last transaction") +
  xlab("log(Time Difference)") +
  ylab("Count") +
  theme_minimal()

In [None]:
## Analysing the FLAG variable, which marks each record as fraud or not

# Class shares in percent.
prop.table(table(df_normalized$FLAG)) * 100

# Bar chart of class counts with the count printed above each bar.
# after_stat(count) replaces the deprecated `..count..` notation.
ggplot(df_normalized, aes(x = factor(FLAG))) +
  geom_bar(fill = c("lightgreen", "pink")) +
  ggtitle("Distribution of Fraud vs Non-Fraud") +
  xlab("FRAUD") +
  theme_minimal() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5)

In [None]:
# Per-class row counts, their share in percent, and the midpoint of each
# class's slice on the cumulative percentage scale (ypos).
count_data <- df_normalized %>%
  group_by(FLAG) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  mutate(ypos = cumsum(percentage) - 0.5 * percentage)

# Donut chart: a stacked bar at x = 2 bent into polar coordinates.
ggplot(data = count_data, mapping = aes(x = 2, y = percentage, fill = factor(FLAG))) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  geom_text(
    aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  coord_polar(theta = "y", start = 0) +
  xlim(0.5, 2.5) +  # The empty range below x = 2 forms the hole
  scale_fill_manual(
    values = c("lightgreen", "pink"),
    labels = c("Legitimate", "Fraudulent")
  ) +
  guides(fill = guide_legend(title = "Labels")) +
  ggtitle("Distribution of Fraudulent vs Legitimate") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    legend.position = "right"
  )


The above figure shows a mild class imbalance (about 3.5 : 1). Because this is a fraud-detection task, the data may need to be oversampled or downsampled so that less robust classifiers, such as logistic regression, train well. For more robust algorithms such as Random Forest and XGBoost, the imbalance is small enough that no action should be required.

1. Random oversampling is good for small datasets: it retains all of the original data and adds randomly chosen copies of minority-class samples.

2. SMOTE is another oversampling method. Instead of duplicating existing minority-class samples, it generates new synthetic samples by interpolating between a minority-class sample and its nearest minority-class neighbours. It is not ideal for high-dimensional datasets, but it remains one of the most widely used methods for imbalanced data.

3. For large datasets, oversampling can increase training time and also cause overfitting. As our minority class is not that small, we can downsample the majority class instead of oversampling. This reduces computational cost while avoiding overfitting to some extent.

We can try a hybrid approach - SMOTE + downsampling (if the training set becomes very large).

We will first try training the models without changing the sample ratio and evaluate how the model performs.

In [None]:
# Treat the response as a categorical variable.
df_normalized[["FLAG"]] <- as.factor(df_normalized[["FLAG"]])

# 80/20 train/test split by random row sampling, seeded for reproducibility.
set.seed(7406)
train_indices <- sample(
  seq_len(nrow(df_normalized)),
  size = 0.8 * nrow(df_normalized)
)
df_train <- df_normalized[train_indices, ]
df_test <- df_normalized[-train_indices, ]

In [None]:
# Logistic regression on the unbalanced training set with a hand-picked
# subset of predictors.
log_reg_model <- glm(
  FLAG ~ Avg.min.between.sent.tnx +
    Avg.min.between.received.tnx +
    Time.Diff.between.first.and.last..Mins. +
    min.val.sent +
    avg.val.sent +
    total.transactions..including.tnx.to.create.contract +
    ERC20.total.ether.sent +
    ERC20.uniq.sent.token.name,
  data = df_train,
  family = "binomial"
)
# summary(log_reg_model)

In [None]:
# Predicted fraud probabilities on the held-out unbalanced test set.
log_reg_pred_imbal <- predict(log_reg_model, newdata = df_test, type = "response")

# Classify as fraud (1) at a 0.3 probability cut-off. Both levels are declared
# explicitly so the factor keeps levels 0 and 1 even if every prediction falls
# on one side of the cut-off.
log_reg_pred_class_imbal <- factor(
  ifelse(log_reg_pred_imbal >= 0.3, 1, 0),
  levels = c(0, 1)
)

# Evaluate performance. `positive = "1"` makes sensitivity, precision, etc.
# refer to the fraud class; by default caret takes the first level ("0").
confusionMatrix(log_reg_pred_class_imbal, df_test$FLAG, positive = "1")

# Balancing data

Since the logistic regression model does not perform well on the imbalanced data, we will use SMOTE to balance the data first and then build the models again.

In [None]:
library(smotefamily)

# Oversample with SMOTE, using every column except FLAG as features.
# The features are selected with a logical mask rather than `-which(...)`,
# which would select zero columns if "FLAG" were ever missing.
# NOTE(review): SMOTE runs on the full dataset here and the train/test split is
# taken from df_balanced afterwards, so synthetic rows can end up in the test
# set -- confirm this is intended.
df_balanced <- SMOTE(
  X = df_normalized[, names(df_normalized) != "FLAG", drop = FALSE],
  target = df_normalized$FLAG
)$data

# SMOTE returns the target in a column named "class"; restore the FLAG name
# and convert it back to a factor.
names(df_balanced)[names(df_balanced) == "class"] <- "FLAG"
df_balanced$FLAG <- as.factor(df_balanced$FLAG)

In [None]:
# Draw the same number of rows from each class: the size of the smaller class.
df_downsampled <- ungroup(
  sample_n(
    group_by(df_balanced, FLAG),
    size = min(table(df_balanced$FLAG))
  )
)

In [None]:
# Class counts at each stage of rebalancing. table() orders the FLAG levels
# as 0 then 1, matching the Legitimate/Fraudulent labels.
df_before <- data.frame(
  Class = c("Legitimate", "Fraudulent"),
  Count = as.numeric(table(df_normalized$FLAG)),
  Dataset = "Before SMOTE"
)

df_after <- data.frame(
  Class = c("Legitimate", "Fraudulent"),
  Count = as.numeric(table(df_balanced$FLAG)),
  Dataset = "After SMOTE"
)

df_after_downsampling <- data.frame(
  Class = c("Legitimate", "Fraudulent"),
  Count = as.numeric(table(df_downsampled$FLAG)),
  Dataset = "After Downsampling"
)

In [None]:
library(gridExtra)
library(patchwork)

# Build a donut chart of class shares.
#   data:  data frame with columns Class (label) and Count (rows per class)
#   title: plot title
# Returns the ggplot object; each slice is labelled with its percentage.
create_pie <- function(data, title) {
  donut <- ggplot(data, aes(x = 2, y = Count, fill = Class)) +
    geom_bar(stat = "identity", width = 1, color = "white") +
    geom_text(
      aes(label = paste0(round(Count / sum(Count) * 100, 1), "%")),
      position = position_stack(vjust = 0.5),
      size = 4
    ) +
    coord_polar(theta = "y", start = 0) +
    xlim(0.5, 2.5) +  # Donut hole
    scale_fill_manual(values = c("pink", "lightgreen")) +
    ggtitle(title)

  donut +
    theme_void() +
    theme(
      plot.title = element_text(hjust = 0.5, size = 12),
      legend.position = "right"
    )
}

# Generate plots
# One donut per rebalancing stage
before_plot <- create_pie(data = df_before, title = "Before SMOTE")
after_plot <- create_pie(data = df_after, title = "After SMOTE")
after_downsampling <- create_pie(
  data = df_after_downsampling,
  title = "After Downsampling"
)

# Place the three donuts side by side (patchwork) with one shared legend
combined_plots <- (before_plot + after_plot + after_downsampling) +
  plot_layout(guides = "collect")

# Display
combined_plots


The data is now ready for modeling.

In [None]:
# 80/20 train/test split of the SMOTE-balanced data, seeded for reproducibility.
set.seed(7406)
train_indices <- sample(
  seq_len(nrow(df_balanced)),
  size = 0.8 * nrow(df_balanced)
)
df_bal_train <- df_balanced[train_indices, ]
df_bal_test <- df_balanced[-train_indices, ]

# Logistic Regression Model

In [None]:
# Logistic regression on the balanced training set with a wider, hand-picked
# set of predictors.
log_reg_model_2 <- glm(
  FLAG ~ Avg.min.between.sent.tnx +
    Avg.min.between.received.tnx +
    Time.Diff.between.first.and.last..Mins. +
    Sent.tnx +
    Unique.Received.From.Addresses +
    min.value.received +
    avg.val.received +
    min.val.sent +
    avg.val.sent +
    total.transactions..including.tnx.to.create.contract +
    total.ether.balance +
    Total.ERC20.tnxs +
    ERC20.total.ether.sent +
    ERC20.total.Ether.sent.contract +
    ERC20.uniq.sent.addr +
    ERC20.uniq.rec.addr +
    ERC20.uniq.sent.token.name +
    ERC20.uniq.rec.token.name,
  data = df_bal_train,
  family = "binomial"
)

In [None]:
# Predicted fraud probabilities on the balanced test set.
log_reg_pred <- predict(log_reg_model_2, newdata = df_bal_test, type = "response")

# Classify as fraud (1) at a 0.3 probability cut-off, keeping both levels even
# if every prediction falls on one side of the cut-off.
log_reg_pred_class <- factor(
  ifelse(log_reg_pred >= 0.3, 1, 0),
  levels = c(0, 1)
)

# Evaluate performance. `positive = "1"` makes sensitivity, precision, etc.
# refer to the fraud class; by default caret takes the first level ("0").
confusionMatrix(log_reg_pred_class, as.factor(df_bal_test$FLAG), positive = "1")

# Random Forest Model

In [None]:
# 5-fold cross-validation settings. The object has its own name so it no longer
# shadows caret's trainControl() function in the global environment.
rf_cv_control <- trainControl(method = "cv", number = 5)

# Seed the fold assignment and forest randomness so tuning is repeatable,
# matching the seed used for the data splits.
set.seed(7406)

# Train the Random Forest model with automatic tuning
# (tuneLength = 10 candidate settings).
rf_model_2 <- train(FLAG ~ ., data = df_bal_train,
                    method = "rf",
                    trControl = rf_cv_control,
                    tuneLength = 10)

In [None]:
# Inspect the random forest stored in the caret fit's `finalModel` element:
# first its variable-importance plot, then the default plot() of the forest
# (presumably error rate against number of trees -- confirm).
varImpPlot(rf_model_2$finalModel)
plot(rf_model_2$finalModel)
# Optional text summaries of the final forest and of the caret tuning results:
#print(rf_model_2$finalModel)
#print(rf_model_2)

In [None]:
# Final random forest with fixed hyperparameters (200 trees, node size 3,
# 9 candidate variables per split), fitted on the balanced training set.
rf_model_final <- randomForest(
  FLAG ~ .,
  data = df_bal_train,
  ntree = 200,
  nodesize = 3,
  mtry = 9
)
print(rf_model_final)

In [None]:
# Class predictions of the final random forest on the balanced test set.
rf_pred <- predict(rf_model_final, newdata = df_bal_test)

# `positive = "1"` makes sensitivity, precision, etc. refer to the fraud class;
# by default caret takes the first factor level ("0").
confusionMatrix(rf_pred, df_bal_test$FLAG, positive = "1")

In [None]:
# Variable-importance plot for the final random forest, followed by the
# default plot() of the fitted forest (presumably error rate against number
# of trees -- confirm).
varImpPlot(rf_model_final, main="Variable Importance in Random Forest Model")
plot(rf_model_final)

# Gradient Boosting Model

In [None]:
# 10-fold cross-validation settings. The object has its own name so it does not
# shadow caret's trainControl() function in the global environment.
gbm_cv_control <- trainControl(method = "cv", number = 10)

# Seed the fold assignment and boosting randomness so tuning is repeatable,
# matching the seed used for the data splits.
set.seed(7406)

# Train the Boosting model (Gradient Boosting)
boosting_model <- train(FLAG ~ ., data = df_bal_train,
                        method = "gbm",
                        trControl = gbm_cv_control,
                        tuneLength = 10,  # Try 10 different combinations of parameters
                        verbose = FALSE)

In [None]:
library(gbm)

# Fit gbm on a copy of the training data whose FLAG is numeric 0/1.
# Working on a copy keeps df_bal_train$FLAG a factor for the other models, and
# as.numeric(as.character(.)) gives the same result if the cell is re-run
# (the earlier in-place `as.numeric(FLAG) - 1` shifted the values on each run).
df_bal_train_gbm <- df_bal_train
df_bal_train_gbm$FLAG <- as.numeric(as.character(df_bal_train$FLAG))

# Final boosting model with fixed hyperparameters.
boosting_model_final <- gbm(FLAG ~ ., data = df_bal_train_gbm,
                            distribution = "bernoulli",
                            n.trees = 400, interaction.depth = 10,
                            shrinkage = 0.1, verbose = FALSE)

print(boosting_model_final)

In [None]:
# Predicted fraud probabilities on the balanced test set.
# type = "response" is required: predict() on a gbm model returns the link
# scale (log-odds) by default, so the 0.3 cut-off below would otherwise be
# applied to log-odds instead of probabilities. n.trees is passed explicitly
# so all fitted trees are used.
boosting_pred <- predict(boosting_model_final,
                         newdata = df_bal_test,
                         n.trees = boosting_model_final$n.trees,
                         type = "response")

# Classify as fraud (1) at a 0.3 probability cut-off, keeping both levels.
boosting_pred_class <- factor(
  ifelse(boosting_pred >= 0.3, 1, 0),
  levels = c(0, 1)
)

# `positive = "1"` makes sensitivity, precision, etc. refer to the fraud class.
confusionMatrix(boosting_pred_class, df_bal_test$FLAG, positive = "1")

In [None]:
# Relative influence of each variable in the final boosting model.
# This summarises boosting_model_final (the model evaluated on the test set)
# rather than the caret tuning object, and spells out `plotit` instead of
# relying on partial matching of `plot`.
boosting_importance <- summary(boosting_model_final, plotit = FALSE)

# Load ggplot2
library(ggplot2)

# Horizontal bar chart, variables ordered by relative influence
ggplot(boosting_importance, aes(x = reorder(var, rel.inf), y = rel.inf)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip for better readability
  labs(title = "Variable Importance in Boosting Model",
       x = "Variables", y = "Relative Importance") +
  theme_minimal()

# Evaluation metrics

We are using accuracy and F1 scores for model evaluation. To formally compare model performance, we perform 10-fold cross-validation, followed by the application of statistical tests like paired t-tests and Wilcoxon tests on the evaluation metrics across the folds.

In [None]:
# Ten folds over the balanced data; each element holds the TRAINING row
# indices of one fold (returnTrain = TRUE).
set.seed(7406)
folds <- createFolds(y = df_balanced$FLAG, k = 10, returnTrain = TRUE)

# Preallocate one accuracy slot and one F1 slot per fold for each model.
log_acc <- numeric(length(folds))
rf_acc <- numeric(length(folds))
gbm_acc <- numeric(length(folds))
log_f1 <- numeric(length(folds))
rf_f1 <- numeric(length(folds))
gbm_f1 <- numeric(length(folds))

In [None]:
library(MLmetrics)

# 10-fold cross-validation of the three models with the chosen settings.
# For each fold, fit on the training rows, predict the held-out rows, and store
# accuracy and F1 (fraud = "1" as the positive class) for that fold.
#
# Fold-level predictions use *_fold_* names so the loop does not overwrite the
# hold-out prediction `rf_pred`, which the confusion-matrix plots read later.
# The 0.3 cut-off uses `>=`, matching the hold-out evaluations.
for (i in seq_along(folds)) {
  train_idx <- folds[[i]]
  train_data <- df_balanced[train_idx, ]
  test_data <- df_balanced[-train_idx, ]

  # Held-out labels as numeric 0/1 for the probability-based models.
  truth_num <- as.numeric(as.character(test_data$FLAG))

  # Logistic Regression (no hyperparams to tune)
  log_model <- glm(FLAG ~ ., data = train_data, family = binomial)
  log_fold_prob <- predict(log_model, test_data, type = "response")
  log_class <- ifelse(log_fold_prob >= 0.3, 1, 0)
  log_acc[i] <- mean(log_class == truth_num)
  log_f1[i] <- F1_Score(y_pred = log_class, y_true = truth_num, positive = "1")

  # Random Forest with chosen hyperparams
  rf_model <- randomForest(FLAG ~ ., data = train_data, ntree = 200,
                           mtry = 9, nodesize = 3)
  rf_fold_pred <- predict(rf_model, test_data)
  rf_acc[i] <- mean(rf_fold_pred == test_data$FLAG)
  rf_f1[i] <- F1_Score(y_pred = rf_fold_pred, y_true = test_data$FLAG, positive = "1")

  # GBM with chosen hyperparams; fitted on a copy with a numeric 0/1 response
  # so train_data itself is left unchanged.
  gbm_train <- train_data
  gbm_train$FLAG <- as.numeric(as.character(train_data$FLAG))
  gbm_model <- gbm(FLAG ~ ., data = gbm_train, distribution = "bernoulli",
                   n.trees = 400, interaction.depth = 10,
                   shrinkage = 0.1, verbose = FALSE)
  gbm_fold_prob <- predict(gbm_model, test_data,
                           n.trees = gbm_model$n.trees, type = "response")
  gbm_class <- ifelse(gbm_fold_prob >= 0.3, 1, 0)
  gbm_acc[i] <- mean(gbm_class == truth_num)
  gbm_f1[i] <- F1_Score(y_pred = gbm_class, y_true = truth_num, positive = "1")
}

In [None]:
# Paired t-tests on per-fold accuracy, one test per pair of models.
t.test(x = log_acc, y = rf_acc, paired = TRUE)
t.test(x = log_acc, y = gbm_acc, paired = TRUE)
t.test(x = rf_acc, y = gbm_acc, paired = TRUE)

In [None]:
# Paired t-tests on per-fold F1 score, one test per pair of models.
t.test(x = log_f1, y = rf_f1, paired = TRUE)
t.test(x = log_f1, y = gbm_f1, paired = TRUE)
t.test(x = rf_f1, y = gbm_f1, paired = TRUE)

In [None]:
# Paired Wilcoxon tests on per-fold accuracy, one test per pair of models.
wilcox.test(x = log_acc, y = rf_acc, paired = TRUE)
wilcox.test(x = log_acc, y = gbm_acc, paired = TRUE)
wilcox.test(x = rf_acc, y = gbm_acc, paired = TRUE)

In [None]:
# Paired Wilcoxon tests on per-fold F1 score, one test per pair of models.
wilcox.test(x = log_f1, y = rf_f1, paired = TRUE)
wilcox.test(x = log_f1, y = gbm_f1, paired = TRUE)
wilcox.test(x = rf_f1, y = gbm_f1, paired = TRUE)

## Some more visualizations

In [None]:
# Draw a confusion matrix as a heatmap of cell percentages.
#   predicted: factor of predicted classes (levels "0"/"1")
#   actual:    factor of true classes (same levels)
#   title:     plot title
# Returns a ggplot object with predicted class on x and actual class on y;
# each tile is labelled with its share of all test rows.
plot_confusion_heatmap <- function(predicted, actual, title) {
  cm_cells <- as.data.frame(confusionMatrix(predicted, actual)$table) %>%
    mutate(Percent = Freq / sum(Freq) * 100,
           Label = paste0(round(Percent, 1), "%"))

  ggplot(cm_cells, aes(x = Prediction, y = Reference)) +
    geom_tile(aes(fill = Percent), color = "white") +
    geom_text(aes(label = Label), vjust = 0.5, fontface = "bold") +
    scale_fill_gradient(low = "lightblue", high = "steelblue") +
    labs(title = title) +
    xlab("Predicted Class") +
    scale_x_discrete(labels = c("0" = "Legit", "1" = "Fraud")) +
    ylab("Actual Class") +
    scale_y_discrete(labels = c("0" = "Legit", "1" = "Fraud")) +
    theme_minimal(base_size = 14) +
    theme(legend.position = "none")
}

# Recompute the hold-out predictions of the final models here instead of
# reusing globals:
#   - `rf_pred` is overwritten inside the cross-validation loop, so it no
#     longer lines up with df_bal_test;
#   - `boosting_pred` is a numeric score vector, not a class factor, which
#     confusionMatrix() cannot compare with the factor labels.
rf_test_pred <- predict(rf_model_final, newdata = df_bal_test)
gbm_test_prob <- predict(boosting_model_final,
                         newdata = df_bal_test,
                         n.trees = boosting_model_final$n.trees,
                         type = "response")
gbm_test_class <- factor(ifelse(gbm_test_prob >= 0.3, 1, 0), levels = c(0, 1))

g1 <- plot_confusion_heatmap(log_reg_pred_class_imbal, df_test$FLAG,
                             "Logistic Regression (imbalanced data)")

g2 <- plot_confusion_heatmap(log_reg_pred_class, df_bal_test$FLAG,
                             "Logistic Regression (balanced data)")

g3 <- plot_confusion_heatmap(rf_test_pred, df_bal_test$FLAG,
                             "Random Forest")

g4 <- plot_confusion_heatmap(gbm_test_class, df_bal_test$FLAG,
                             "Gradient Boosting")

#grid.arrange(g1, g2, g3, g4, ncol = 2)