In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle


# Load data
data = pd.read_csv("/content/sample_data/metaverse_transactions_dataset.csv")
data.drop(columns=['timestamp'], inplace=True)

# Drop unnecessary columns
data.drop(columns=['sending_address', 'receiving_address'], inplace=True)

# Separate features and target
X = data.drop(columns=['anomaly'])
y = data['anomaly']

categorical_cols=X.select_dtypes(include=['object'])
categorical_cols=pd.get_dummies(categorical_cols, drop_first=False)

numerical_cols=X.select_dtypes(include=['int64'])
X=pd.concat([numerical_cols,categorical_cols],axis=1)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_data=X[['hour_of_day','login_frequency','session_duration']]
numerical_data_scaled = scaler.fit_transform(numerical_data)
scaled_num_df=pd.DataFrame(data=numerical_data_scaled, columns=['hrod','logfrq','sesdur'], index=X.index)
X.drop(columns=['hour_of_day','login_frequency','session_duration'], inplace=True)
X=pd.concat([scaled_num_df,X],axis=1)
X.rename(columns={'hrod': 'hour_of_day'}, inplace=True)
X.rename(columns={'logfrq': 'login_frequency'}, inplace=True)
X.rename(columns={'sesdur': 'session_duration'}, inplace=True)

# Shuffle the data
X, y_shuffled = shuffle(X, y, random_state=42)

# Split the shuffled data into training and testing sets
X_train_shuffled, X_test_shuffled, y_train_shuffled, y_test_shuffled = train_test_split(X, y_shuffled, test_size=0.25, random_state=42)


# Apply SMOTE oversampling
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_shuffled, y_train_shuffled)

from sklearn.neighbors import KNeighborsClassifier
knn_model= KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_resampled,y_train_resampled)
y_pred=knn_model.predict(X_test_shuffled)

from sklearn.metrics import confusion_matrix, classification_report
knn_model_train_accuracy=accuracy_score(y_train_resampled, knn_model.predict(X_train_resampled))
knn_model_test_accuracy=accuracy_score(y_test_shuffled, y_pred)

print("Training Accuracy: ", knn_model_train_accuracy)
print("Testing Accuracy: ", knn_model_test_accuracy)
print(classification_report(y_test_shuffled, y_pred))
print(confusion_matrix(y_test_shuffled, y_pred))





Training Accuracy:  0.9710962851068591
Testing Accuracy:  0.9012213740458015
               precision    recall  f1-score   support

    high_risk       1.00      1.00      1.00      1659
     low_risk       0.97      0.90      0.94     15830
moderate_risk       0.53      0.81      0.64      2161

     accuracy                           0.90     19650
    macro avg       0.84      0.91      0.86     19650
 weighted avg       0.93      0.90      0.91     19650

[[ 1659     0     0]
 [    0 14294  1536]
 [    0   405  1756]]
