<a href="https://colab.research.google.com/github/cepdnaclk/e19-co544-Bitcoin-Transaction-Analysis-for-Ransomware-Identification/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn import preprocessing, neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [24]:
# Mount Google Drive

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [34]:
# Upload the dataset to drive and provide the path here

bitcoin_dataset = pd.read_csv('/content/drive/MyDrive/Machine Learning Project/BitcoinHeistData.csv')

### Convert the labels to binary labels

In [35]:
# Define a function to convert labels to binary
def convert_to_binary(label):
    if label == 'white':
        return 0  # Assign 0 for 'White' class
    else:
        return 1  # Assign 1 for 'Ransomware' class

# Apply the function to create a new binary label column
bitcoin_dataset['Binary_Label'] = bitcoin_dataset['label'].apply(convert_to_binary)

# Check for missing values in the target variable
missing_target = bitcoin_dataset['Binary_Label'].isna().sum()
print(f"Number of missing values in target: {missing_target}")

# Check the updated DataFrame
print(bitcoin_dataset.head())

Number of missing values in target: 0
                              address  year  day  length    weight  count  \
0   111K8kZAEnJg245r2cM6y9zgJGHZtJPy6  2017   11      18  0.008333      1   
1  1123pJv8jzeFQaCV4w644pzQJzVWay2zcA  2016  132      44  0.000244      1   
2  112536im7hy6wtKbpH1qYDWtTyMRAcA2p7  2016  246       0  1.000000      1   
3  1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7  2016  322      72  0.003906      1   
4  1129TSjKtx65E35GiUo4AYVeyo48twbrGX  2016  238     144  0.072848    456   

   looped  neighbors       income            label  Binary_Label  
0       0          2  100050000.0  princetonCerber             1  
1       0          1  100000000.0   princetonLocky             1  
2       0          2  200000000.0  princetonCerber             1  
3       0          2   71200000.0  princetonCerber             1  
4       0          1  200000000.0   princetonLocky             1  


## Feature Transformation

By comparing skewness and Kurtosis of different transformations, following transformations are applied to features

```
length - yeojohnson
weight - yeojohnson
count - boxcox
looped - boxcox
neighbors - yeojohnson
income  - yeojohnson
```



In [36]:
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis, boxcox, yeojohnson
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer


# Define transformation functions

def boxcox_transform(x):
    return boxcox(x + 1e-9)[0]  # Shift values to be positive if necessary

def yeojohnson_transform(x):
    return yeojohnson(x)[0]

# Apply specified transformations to the features
bitcoin_dataset['length_transformed'] = yeojohnson_transform(bitcoin_dataset['length'])
bitcoin_dataset['weight_transformed'] = yeojohnson_transform(bitcoin_dataset['weight'])
bitcoin_dataset['count_transformed'] = boxcox_transform(bitcoin_dataset['count'])
bitcoin_dataset['looped_transformed'] = boxcox_transform(bitcoin_dataset['looped'])
bitcoin_dataset['neighbors_transformed'] = yeojohnson_transform(bitcoin_dataset['neighbors'])
bitcoin_dataset['income_transformed'] = yeojohnson_transform(bitcoin_dataset['income'])

## Eliminate Outliers

In [37]:
from scipy import stats

# Define the threshold for detecting outliers
threshold = 3

# Calculate Z-scores for each transformed feature
z_scores = stats.zscore(bitcoin_dataset[['length_transformed', 'weight_transformed', 'count_transformed', 'looped_transformed', 'neighbors_transformed', 'income_transformed']])

# Identify outliers based on Z-scores
outliers = (np.abs(z_scores) > threshold).any(axis=1)

# # Show the indices of outliers
# print("Indices of outliers:", np.where(outliers)[0])

# Remove outliers from the dataset
bitcoin_dataset_no_outliers = bitcoin_dataset[~outliers]

# # Visualize the transformed features after removing outliers
# for feature in features:
#     plot_transformations(bitcoin_dataset_no_outliers[f'{feature}_transformed'], f'{feature} (No Outliers)')

## Final Dataset (transformed features and outlier handled)

In [38]:
# Construct the dataset using transformed features with renamed columns
bitcoin_df = bitcoin_dataset_no_outliers[['year', 'day', 'length_transformed', 'weight_transformed', 'count_transformed', 'looped_transformed', 'neighbors_transformed', 'income_transformed','Binary_Label']].copy()

# Rename the columns
bitcoin_df.columns = ['year', 'day', 'length', 'weight', 'count', 'looped', 'neighbors', 'income','label']

# Print the DataFrame
print(bitcoin_df.head())

   year  day    length    weight         count      looped  neighbors  \
0  2017   11  2.817168  0.008247  1.000000e-09 -993.329103   0.641675   
1  2016  132  3.595750  0.000244  1.000000e-09 -993.329103   0.487116   
2  2016  246 -0.000000  0.428792  1.000000e-09 -993.329103   0.641675   
3  2016  322  4.023807  0.003887  1.000000e-09 -993.329103   0.641675   
4  2016  238  4.620371  0.066695  2.583543e+00 -993.329103   0.487116   

     income  label  
0  4.117890      1  
1  4.117884      1  
2  4.125573      1  
3  4.113623      1  
4  4.125573      1  


Feature Scaling:

For the four different models - SVM, Logistic Regression, XGBoost, and Random Forest - the choice of scalers may vary depending on the characteristics of your data and the algorithms themselves. Here's a general guideline for selecting scalers for each model:

### SVM:
- **Scaler**: MinMaxScaler or StandardScaler.
- **Reasoning**: SVM is sensitive to the scale of the features since it uses a distance-based metric to classify data points. Scaling the features to a similar range can improve the performance of SVM. MinMaxScaler may be preferable if you're using SVM with a kernel like RBF, as it bounds the data to a fixed range, preventing large values from dominating the distance calculations.

### Logistic Regression:
- **Scaler**: StandardScaler.
- **Reasoning**: Logistic Regression typically assumes that features are normally distributed. StandardScaler, which scales the data to have a mean of 0 and a standard deviation of 1, aligns with this assumption and is commonly used with logistic regression. It preserves the shape of the distribution and is suitable for linear models.

### XGBoost:
- **Scaler**: None or MinMaxScaler.
- **Reasoning**: XGBoost is a tree-based ensemble method and is inherently robust to the scale of features. You can choose not to scale the features when using XGBoost. However, if you want to scale the features, MinMaxScaler can be used to bound the features to a specific range, ensuring consistency across the trees.

### Random Forest:
- **Scaler**: None or MinMaxScaler.
- **Reasoning**: Similar to XGBoost, Random Forest is a tree-based ensemble method and is not sensitive to the scale of features. You can choose not to scale the features when using Random Forest. However, if you prefer to scale the features, MinMaxScaler can be used for consistency across the trees.

### Final Considerations:
- For SVM and Logistic Regression, scaling is generally recommended to improve performance.
- For tree-based models like XGBoost and Random Forest, scaling is optional and may not significantly impact performance. However, if you prefer consistency or plan to compare results with other models, you can use MinMaxScaler.

It's important to experiment with different scalers and evaluate their effects on model performance using cross-validation or other validation techniques to determine the optimal approach for your specific dataset and modeling goals.



## Standard Scaled dataset

In [39]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Select the numerical features to scale
numerical_features = ['length', 'weight', 'count', 'looped', 'neighbors', 'income']

# Check for missing values in the target variable before scaling
missing_target_before = bitcoin_df['label'].isna().sum()
print(f"Number of missing values in target before scaling: {missing_target_before}")

# Initialize the StandardScaler
standard_scaler = StandardScaler()

# Scale the numerical features using StandardScaler
standard_scaled_features = standard_scaler.fit_transform(bitcoin_df[numerical_features])

# Create a DataFrame with scaled features, ensuring the indices match the original dataframe
standard_scaled_df = pd.DataFrame(standard_scaled_features, columns=numerical_features, index=bitcoin_df.index)

# Concatenate the scaled features with the non-scaled features
bitcoin_df_scaled = pd.concat([bitcoin_df.drop(columns=numerical_features), standard_scaled_df], axis=1)

# Check for missing values in the target variable after scaling
missing_target_after = bitcoin_df_scaled['label'].isna().sum()
print(f"Number of missing values in target after scaling: {missing_target_after}")

# Verify that the number of missing values in the target variable hasn't changed
assert missing_target_before == missing_target_after, "Mismatch in missing values count before and after scaling."

# Use the scaled dataset for further processing
bitcoin_df = bitcoin_df_scaled

# Print the first few rows of the scaled DataFrame
print("Standard Scaled DataFrame:")
print(bitcoin_df.head())


Number of missing values in target before scaling: 0
Number of missing values in target after scaling: 0
Standard Scaled DataFrame:
   year  day  label    length    weight     count    looped  neighbors  \
0  2017   11      1  0.290905 -1.139795 -0.767030 -0.395908   0.527530   
1  2016  132      1  0.732148 -1.185167 -0.767030 -0.395908  -1.407922   
2  2016  246      1 -1.305660  1.244529 -0.767030 -0.395908   0.527530   
3  2016  322      1  0.974739 -1.164512 -0.767030 -0.395908   0.527530   
4  2016  238      1  1.312828 -0.808415  1.449737 -0.395908  -1.407922   

     income  
0 -0.558012  
1 -0.558413  
2 -0.045499  
3 -0.842672  
4 -0.045499  


## Min-Max Scaled Dataset


In [31]:
# Select the numerical features to scale
numerical_features = ['length', 'weight', 'count', 'looped', 'neighbors', 'income']

# Initialize the scalers
minmax_scaler = MinMaxScaler()


# Scale the features using MinMaxScaler
minmax_scaled_features = minmax_scaler.fit_transform(bitcoin_df[numerical_features])

# Create DataFrames with scaled features
minmax_scaled_df = pd.DataFrame(minmax_scaled_features, columns=numerical_features)

# Concatenate the scaled features with non-scaled features
minmax_scaled_df = pd.concat([bitcoin_df.drop(columns=numerical_features), minmax_scaled_df], axis=1)

print("MinMax Scaled DataFrame:")
print(minmax_scaled_df.head())

# Use MinMax Scaled dataset from this point onwards
bitcoin_df = minmax_scaled_df

MinMax Scaled DataFrame:
     year    day  label    length    weight     count  looped  neighbors  \
0  2017.0   11.0    1.0  0.609728  0.012518  0.000000     0.0    0.43121   
1  2016.0  132.0    1.0  0.778238  0.000370  0.000000     0.0    0.00000   
2  2016.0  246.0    1.0  0.000000  0.650865  0.000000     0.0    0.43121   
3  2016.0  322.0    1.0  0.870884  0.005900  0.000000     0.0    0.43121   
4  2016.0  238.0    1.0  1.000000  0.101237  0.909528     0.0    0.00000   

     income  
0  0.259310  
1  0.259217  
2  0.377801  
3  0.193497  
4  0.377801  


## Dimension Reduction(if required):

The first principal component explains almost all of the variance in the data (close to 100%), while the subsequent components explain very little variance. In this case, retaining just one principal component would capture the majority of the variance in the data.

Therefore, this dataset can be reduced to one dimension.

In [44]:
from sklearn.decomposition import PCA

# Assume bitcoin_df is your DataFrame containing the features
# Select the numerical features to apply PCA
numerical_features = ['length', 'weight', 'count', 'looped', 'neighbors', 'income']

# Extract the numerical features
X = bitcoin_df[numerical_features]

# Initialize PCA with the desired number of components
pca = PCA(n_components=2)  # You can specify the number of components you want to retain

# Fit PCA to the feature matrix
pca.fit(X)

# Transform the feature matrix to its principal components
X_pca = pca.transform(X)

# X_pca will contain the reduced-dimensional representation of the data
reduced_df = pd.DataFrame(X_pca)

# Use dimension reduced data frame from this point onwards
bitcoin_df = reduced_df

## Balance Dataset with Under Sampling

In [41]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

target = 'label'

# Check for missing values in the target variable
missing_target = bitcoin_df[target].isna().sum()
print(f"Number of missing values in target: {missing_target}")

# Split the data into features (X) and target (y)
X = bitcoin_df.drop(columns=[target])
y = bitcoin_df[target]

# Initialize the resamplers
undersample = RandomUnderSampler(random_state=42)

# Apply undersampling
X_under, y_under = undersample.fit_resample(X, y)

# Print the number of samples after resampling
print("Number of samples after undersampling:", X_under.shape[0])

print("\nClass distribution after undersampling:")
print(pd.Series(y_under).value_counts())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_under, y_under, test_size=0.2, random_state=42)

Number of missing values in target: 0
Number of samples after undersampling: 81604

Class distribution after undersampling:
label
0    40802
1    40802
Name: count, dtype: int64



## Balance Dataset with a combination of undersampling and oversampling



In [42]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

# Assuming 'target' is the name of your target variable column
target = 'label'

# Split the data into features (X) and target (y)
X = bitcoin_df.drop(columns=[target])
y = bitcoin_df[target]

# Initialize the resamplers
smoteenn = SMOTEENN(random_state=42)

# Apply combination (SMOTE + ENN)
X_combined, y_combined = smoteenn.fit_resample(X_train, y_train)


# Print the number of samples after resampling
print("Number of samples after combination:", X_combined.shape[0])

print("\nClass distribution after combination :")
print(pd.Series(y_combined).value_counts())


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)


Number of samples after combination: 37116

Class distribution after combination :
label
1    20496
0    16620
Name: count, dtype: int64


## Balance Dataset with oversampling

In [43]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

# Assuming 'target' is the name of your target variable column
target = 'label'

# Split the data into features (X) and target (y)
X = bitcoin_df.drop(columns=[target])
y = bitcoin_df[target]

# Initialize the resamplers
smote = SMOTE(random_state=42)

# Apply oversampling (SMOTE)
X_over, y_over = smote.fit_resample(X, y)

# Print the number of samples after resampling
print("Number of samples after oversampling (SMOTE):", X_over.shape[0])

print("\nClass distribution after oversampling (SMOTE):")
print(pd.Series(y_over).value_counts())


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)


Number of samples after oversampling (SMOTE): 5709906

Class distribution after oversampling (SMOTE):
label
1    2854953
0    2854953
Name: count, dtype: int64
