In [1]:
# Import Libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Initialize an empty DataFrame to store evaluation results
evaluation_df = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score"])

# Function to evaluate model performance
def evaluate_model(y_true, y_pred, model_name="Model"):
    global evaluation_df

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    print(f"--- {model_name} ---")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("Confusion Matrix:")
    print(cm)
    print("\n")

    # Create a dictionary of the evaluation metrics
    metrics = {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

    # Convert the dictionary to a DataFrame
    metrics_df = pd.DataFrame([metrics])

    # Append the metrics to the DataFrame using concat
    evaluation_df = pd.concat([evaluation_df, metrics_df], ignore_index=True)

# 1.0 Without Scalling or Dimention Redution

In [4]:
# Upload the dataset to drive and provide the path here

bitcoin_dataset = pd.read_csv('/content/drive/MyDrive/CO544/MLproject/ProcessedData/BitcoinNoScaler.csv')

## 1.1 Preperation of data for testing

In [5]:
# Split the data into features (X) and target (y)
target = 'label'

X = bitcoin_dataset.drop(columns=[target])
y = bitcoin_dataset[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
print(X_train.shape)
print(y_test.shape)

(2304395, 8)
(576099,)


## 1.2 Model Training

In [7]:
# Convert Data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set Parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eta': 0.3,
    'max_depth': 6,
    'eval_metric': 'mlogloss'
}

# Train the Model

# Specify the number of boosting rounds
num_round = 100
bst = xgb.train(params, dtrain, num_round)

In [8]:
# Predict the test set
preds = bst.predict(dtest)
best_preds = [int(np.argmax(line)) for line in preds]

## 1.3 Model Evaluation

In [9]:
# XGBoost Model Evaluation
evaluate_model(y_test, best_preds, "XGBoost-NoScalingNoDimRed")

--- XGBoost-NoScalingNoDimRed ---
Accuracy: 0.989
Precision: 0.79
Recall: 0.23
F1 Score: 0.36
Confusion Matrix:
[[567644    499]
 [  6120   1836]]




# 2.0 Without Scalling and with Oversampling

In [None]:
# Upload the dataset to drive and provide the path here

bitcoin_dataset_oversampled = pd.read_csv('/content/drive/MyDrive/CO544/MLproject/ProcessedData/BitcoinNoScalerOversampled.csv')

## 2.1 Preperation of data for testing

In [None]:
# Split the data into features (X) and target (y)
target = 'label'

X = bitcoin_dataset_oversampled.drop(columns=[target])
y = bitcoin_dataset_oversampled[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X.shape)
print(y.shape)

(5709906, 8)
(5709906,)


## 2.2 Model Training

In [None]:
# Convert Data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set Parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eta': 0.3,
    'max_depth': 6,
    'eval_metric': 'mlogloss'
}

# Train the Model

# Specify the number of boosting rounds
num_round = 100
bst = xgb.train(params, dtrain, num_round)

In [None]:
# Predict the test set
preds = bst.predict(dtest)
best_preds = [int(np.argmax(line)) for line in preds]

## 2.3 Model Evaluation

In [None]:
# XGBoost Model Evaluation
evaluate_model(y_test, best_preds, "XGBoos-NoScalingWithOversampling")

--- XGBoos-NoScalingWithOversampling ---
Accuracy: 0.942
Precision: 0.93
Recall: 0.95
F1 Score: 0.94
Confusion Matrix:
[[531546  39042]
 [ 26804 544590]]




# 3.0 Without Scalling and with Undersampling

In [None]:
# Upload the dataset to drive and provide the path here

bitcoin_dataset_undersampled = pd.read_csv('/content/drive/MyDrive/CO544/MLproject/ProcessedData/BitcoinNoScalerUndersampled.csv')

## 3.1 Preperation of data for testing

In [None]:
# Split the data into features (X) and target (y)
target = 'label'

X = bitcoin_dataset_undersampled.drop(columns=[target])
y = bitcoin_dataset_undersampled[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X.shape)
print(y.shape)

(81604, 8)
(81604,)


## 3.2 Model Training

In [None]:
# Convert Data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set Parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eta': 0.3,
    'max_depth': 6,
    'eval_metric': 'mlogloss'
}

# Train the Model

# Specify the number of boosting rounds
num_round = 100
bst = xgb.train(params, dtrain, num_round)

In [None]:
# Predict the test set
preds = bst.predict(dtest)
best_preds = [int(np.argmax(line)) for line in preds]

## 3.3 Model Evaluation

In [None]:
# XGBoost Model Evaluation
evaluate_model(y_test, best_preds, "XGBoost-NoScalingWithUndersampling")

--- XGBoost-NoScalingWithUndersampling ---
Accuracy: 0.884
Precision: 0.87
Recall: 0.91
F1 Score: 0.89
Confusion Matrix:
[[7045 1144]
 [ 748 7384]]




In [None]:
print(evaluation_df)

                                Model  Accuracy  Precision    Recall  F1 Score
0           XGBoost-NoScalingNoDimRed  0.988511   0.786296  0.230769  0.356817
1    XGBoos-NoScalingWithOversampling  0.942341   0.933105  0.953090  0.942992
2  XGBoost-NoScalingWithUndersampling  0.884076   0.865854  0.908018  0.886435


# 4.0 With Standard Scaling

In [None]:
# Upload the dataset to drive and provide the path here

bitcoin_dataset_standard = pd.read_csv('/content/drive/MyDrive/CO544/MLproject/ProcessedData/BitcoinStandardScaler.csv')

## 4.1 Preperation of data for testing

In [None]:
# # Remove all rows with any NaN values
# df_cleaned = df.dropna()

# Split the data into features (X) and target (y)
target = 'label'

X = bitcoin_dataset_standard.drop(columns=[target])
y = bitcoin_dataset_standard[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X.shape)
print(y.shape)

(2895755, 8)
(2895755,)


## 4.2 Model Training

In [None]:
# Convert Data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set Parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eta': 0.3,
    'max_depth': 6,
    'eval_metric': 'mlogloss'
}

# Train the Model

# Specify the number of boosting rounds
num_round = 100
bst = xgb.train(params, dtrain, num_round)

In [None]:
# Predict the test set
preds = bst.predict(dtest)
best_preds = [int(np.argmax(line)) for line in preds]

## 4.3 Model Evaluation

In [None]:
# XGBoost Model Evaluation
evaluate_model(y_test, best_preds, "XGBoost-StandardScaling")

--- XGBoost-StandardScaling ---
Accuracy: 0.989
Precision: 0.79
Recall: 0.24
F1 Score: 0.36
Confusion Matrix:
[[570681    505]
 [  6076   1889]]




In [None]:
evaluation_df = evaluation_df.drop([3])
print(evaluation_df)

                                Model  Accuracy  Precision    Recall  F1 Score
0           XGBoost-NoScalingNoDimRed  0.988511   0.786296  0.230769  0.356817
1    XGBoos-NoScalingWithOversampling  0.942341   0.933105  0.953090  0.942992
2  XGBoost-NoScalingWithUndersampling  0.884076   0.865854  0.908018  0.886435
4             XGBoost-StandardScaling  0.988637   0.789056  0.237163  0.364707


# 5.0 With MinMax Scaling

In [None]:
# Upload the dataset to drive and provide the path here

bitcoin_dataset_minmax = pd.read_csv('/content/drive/MyDrive/CO544/MLproject/ProcessedData/BitcoinMinMaxScaler.csv')

## 5.1 Preperation of data for testing

In [None]:
# Split the data into features (X) and target (y)
target = 'label'

X = bitcoin_dataset_minmax.drop(columns=[target])
y = bitcoin_dataset_minmax[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X.shape)
print(y.shape)

(2875005, 8)
(2875005,)


## 5.2 Model Training

In [None]:
# Convert Data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set Parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eta': 0.3,
    'max_depth': 6,
    'eval_metric': 'mlogloss'
}

# Train the Model

# Specify the number of boosting rounds
num_round = 100
bst = xgb.train(params, dtrain, num_round)

In [None]:
# Predict the test set
preds = bst.predict(dtest)
best_preds = [int(np.argmax(line)) for line in preds]

## 5.3 Model Evaluation

In [None]:
# XGBoost Model Evaluation
evaluate_model(y_test, best_preds, "XGBoost-MinMaxScaling")

--- XGBoost-MinMaxScaling ---
Accuracy: 0.987
Precision: 0.70
Recall: 0.10
F1 Score: 0.17
Confusion Matrix:
[[566717    329]
 [  7169    786]]




In [None]:
print(evaluation_df)

                                Model  Accuracy  Precision    Recall  F1 Score
0           XGBoost-NoScalingNoDimRed  0.988511   0.786296  0.230769  0.356817
1    XGBoos-NoScalingWithOversampling  0.942341   0.933105  0.953090  0.942992
2  XGBoost-NoScalingWithUndersampling  0.884076   0.865854  0.908018  0.886435
3             XGBoost-StandardScaling  0.988637   0.789056  0.237163  0.364707
4               XGBoost-MinMaxScaling  0.986960   0.704933  0.098806  0.173319


In [None]:
evaluation_df.to_csv('/content/drive/MyDrive/CO544/MLproject/Evaluation.csv', index=False)

# 6.0 With Standard Scaling and Oversampling

In [None]:
# Upload the dataset to drive and provide the path here

bitcoin_dataset_standard_oversampled = pd.read_csv('/content/drive/MyDrive/CO544/MLproject/ProcessedData/BitcoinStandardScalerOversampled.csv')

## 6.1 Preperation of data for testing

In [None]:
# Split the data into features (X) and target (y)
target = 'label'

X = bitcoin_dataset_standard_oversampled.drop(columns=[target])
y = bitcoin_dataset_standard_oversampled[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X.shape)
print(y.shape)

(5709906, 8)
(5709906,)


## 6.2 Model Training

In [None]:
# Convert Data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set Parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eta': 0.3,
    'max_depth': 6,
    'eval_metric': 'mlogloss'
}

# Train the Model

# Specify the number of boosting rounds
num_round = 100
bst = xgb.train(params, dtrain, num_round)

In [None]:
# Predict the test set
preds = bst.predict(dtest)
best_preds = [int(np.argmax(line)) for line in preds]

## 6.3 Model Evaluation

In [None]:
# XGBoost Model Evaluation
evaluate_model(y_test, best_preds, "XGBoost-StandardScalarOversampled")

--- XGBoost-StandardScalarOversampled ---
Accuracy: 0.937
Precision: 0.93
Recall: 0.95
F1 Score: 0.94
Confusion Matrix:
[[528379  42209]
 [ 29571 541823]]




In [None]:
print(evaluation_df)

                                Model  Accuracy  Precision    Recall  F1 Score
0           XGBoost-NoScalingNoDimRed  0.988511   0.786296  0.230769  0.356817
1    XGBoos-NoScalingWithOversampling  0.942341   0.933105  0.953090  0.942992
2  XGBoost-NoScalingWithUndersampling  0.884076   0.865854  0.908018  0.886435
3             XGBoost-StandardScaling  0.988637   0.789056  0.237163  0.364707
4               XGBoost-MinMaxScaling  0.986960   0.704933  0.098806  0.173319
5   XGBoost-StandardScalarOversampled  0.937144   0.927728  0.948248  0.937876


In [None]:
evaluation_df.to_csv('/content/drive/MyDrive/CO544/MLproject/Evaluation.csv', index=False)

# 7.0 Common Preprocessed Dataset

In [10]:
bitcoin_dataset_train = pd.read_csv('/content/drive/MyDrive/CO544/MLproject/ProcessedData/bitcoin_training_dataset.csv')

In [11]:
bitcoin_dataset_test = pd.read_csv('/content/drive/MyDrive/CO544/MLproject/ProcessedData/bitcoin_test_dataset.csv')

In [12]:
target = 'label'

X_train = bitcoin_dataset_train.drop(columns=[target])
y_train = bitcoin_dataset_train[target]

X_test = bitcoin_dataset_test.drop(columns=[target])
y_test = bitcoin_dataset_test[target]

# Convert Data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set Parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eta': 0.3,
    'max_depth': 6,
    'eval_metric': 'mlogloss'
}

# Train the Model

# Specify the number of boosting rounds
num_round = 100
bst = xgb.train(params, dtrain, num_round)

# Predict the test set
preds = bst.predict(dtest)
best_preds = [int(np.argmax(line)) for line in preds]

# XGBoost Model Evaluation
evaluate_model(y_test, best_preds, "XGBoost")

--- XGBoost ---
Accuracy: 0.744
Precision: 0.76
Recall: 0.72
F1 Score: 0.74
Confusion Matrix:
[[6400 1879]
 [2359 5928]]


