In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import warnings
import joblib

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [2]:
# Load the dataset
df = pd.read_csv('Cryptocurrency_cleaned.csv')

# Display the first few rows
df.head()

Unnamed: 0,Class,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22,a23,a24,a25,a26,a27,a28,a29,a30,a31,a32,a33,a34,a35,a36,a37,a38,a39,a40,a41,a42,a43,a44,a45,a46,a47,Class_encoded
0,False,844.26,1093.71,704785.63,721,89,0,40,118,0.0,45.806785,6.589513,0.0,31.22,1.200681,0.0,0.0,0.0,810,865.691093,586.466675,0.0,-279.224419,265.0,35588540.0,35603170.0,0.0,30.0,54.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,15000000.0,265586.1476,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire,0
1,False,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.613269,0.385685,0.0,1.8,0.032844,0.0,0.0,0.0,102,3.087297,3.085478,0.0,-0.001819,8.0,403.4283,2.260809,0.0,1.0,5.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,365.0,57.632615,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token,0
2,False,246194.54,2434.02,516729.3,2,10,0,10,2,0.113119,1.165453,0.358906,0.05,3.538616,1.794308,0.0,0.0,0.0,12,3.588616,3.589057,0.0,0.000441,8.0,521.5121,0.0,0.0,0.0,7.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,442.8198,65.189009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON,0
3,False,10219.6,15785.09,397555.9,25,9,0,7,13,0.0,500.0,99.48884,0.0,450.0,70.001834,0.0,0.0,0.0,34,1750.045862,895.399559,0.0,-854.646303,14.0,17111.05,11412.23,0.0,2.0,11.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,11412.23,1555.550174,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON,0
4,False,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,12.802411,2.671095,0.0,9.0,0.022688,0.0,0.0,0.0,4619,104.318883,53.421897,0.0,-50.896986,42.0,162829.7,123539.9,0.0,4.0,23.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,90000.0,4934.232147,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS,0


In [3]:
# Encode categorical variables
le = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    df[col] = le.fit_transform(df[col])

In [4]:
# Separate features and target variable
X = df.drop(['Class', 'Class_encoded'], axis=1)
y = df['Class'].astype(int)

## Task 1: Feature Selection

- *Manual Selection*: Based on domain knowledge and understanding of the data, manually select a subset of features that are intuitively relevant to fraud detection.
- *Gini Index Method*: Use a decision tree classifier to rank features based on the Gini impurity decrease they bring when used to split the data.
- *Information Gain Method*: Utilize an entropy-based approach to evaluate the reduction in uncertainty about the target variable when given the feature.
- *Correlation Method*: Analyze the correlation matrix to select features that have a high correlation with the target variable but low correlation with each other to avoid multicollinearity.

### 1. Manual Selection: 
Since manual selection requires domain-specific knowledge which might not be directly applicable without further insights into each attribute, we'll focus on other methods. Also we don't haveinformation for each attribute.

### 2. Gini Index Method:
We'll use a decision tree classifier to identify the most important features based on the Gini impurity decrease.

In [5]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X, y)
gini_importances = pd.Series(dt.feature_importances_, index=X.columns)

In [6]:
# Selecting top N features based on each method
N = 10  # Number of features to select

In [7]:
# Top N features based on Gini Importance
gini_top_features = gini_importances.nlargest(N).index.tolist()

In [8]:
gini_top_features

['a46', 'a47', 'a10', 'a23', 'a3', 'a4', 'a11', 'a7', 'a44', 'a25']

In [9]:
gini_subset = df[gini_top_features + ['Class']]

In [10]:
# Save to csv
gini_subset.to_csv('gini_selection.csv', index=False)

### 3. Information Gain Method: 
We'll use mutual information gain as a proxy to select features that share the most information with the target variable.

In [11]:
info_gain_importances = mutual_info_classif(X, y)
info_gain_importances = pd.Series(info_gain_importances, index=X.columns)

In [12]:
# Top N features based on Information Gain
info_gain_top_features = info_gain_importances.nlargest(N).index.tolist()

In [13]:
info_gain_top_features

['a23', 'a45', 'a28', 'a30', 'a46', 'a3', 'a47', 'a10', 'a22', 'a11']

In [14]:
info_gain_subset = df[info_gain_top_features + ['Class']]

In [15]:
info_gain_subset.to_csv('info_gain_selection.csv', index=False)

### 4. Correlation Method: 
We'll compute the correlation between all features and the target variable, selecting those with the highest absolute values, considering multicollinearity among predictors.

In [16]:
# Pearson correlation
correlations = X.corrwith(y)
correlations = correlations.abs().sort_values(ascending=False)

In [17]:
# Top N features based on Correlation
correlation_top_features = correlations.nlargest(N + 1).index.tolist()  # +1 as it includes the target

In [18]:
correlation_top_features

['a46', 'a3', 'a2', 'a18', 'a5', 'a4', 'a14', 'a47', 'a45', 'a30', 'a8']

In [19]:
correlation_subset = df[correlation_top_features + ['Class']]

In [20]:
correlation_subset.to_csv('correlation_selection.csv', index=False)

---
---

## Task 2: Model Building and Evaluation

- `Decision Tree` (CART): Train and test using the CART algorithm.
- `Random Forest`: A more robust ensemble method building upon decision trees.
- `LightGBM` (LGBM): A gradient boosting framework that uses tree-based learning algorithms, known for its efficiency and accuracy.
- `Support Vector Machine` (SVM): A powerful classifier that works well for both linear and non-linear data.

    - We'll split each dataset into 70% training data and 30% testing data.
    - For each dataset and algorithm pair, we'll train the model and compute its performance metrics: True Positives (TP), False Positives (FP), True Negatives (TN), False Negatives (FN), accuracy, precision, and recall.

In [21]:
# Load the datasets
gini_df = pd.read_csv('gini_selection.csv')
info_gain_df = pd.read_csv('info_gain_selection.csv')
correlation_df = pd.read_csv('correlation_selection.csv')

In [22]:
# Define a function to train and evaluate a model
def train_evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    cm = confusion_matrix(y_test, predictions)
    TP, FN, FP, TN = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    return TP, FP, TN, FN, accuracy, precision, recall

In [23]:
# Model training and evaluation function
def evaluate_all_models(datasets):
    results = {}
    for name, df in datasets.items():
        X = df.drop('Class', axis=1)
        y = df['Class']
        
        # Scaling features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Splitting the dataset
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
        
        # Models
        models = {
            'CART': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(random_state=42),
            'LGBM': LGBMClassifier(random_state=42),
            'SVM': SVC(random_state=42)
        }
        
        for model_name, model in models.items():
            results[f"{name}_{model_name}"] = train_evaluate_model(model, X_train, X_test, y_train, y_test)
    
    return results

In [24]:
datasets = {
    'Original': df.drop(['Class_encoded'], axis=1),
    'Gini': gini_df,
    'Info Gain': info_gain_df,
    'Correlation': correlation_df
}

In [25]:
results = evaluate_all_models(datasets)

[LightGBM] [Info] Number of positive: 1529, number of negative: 5359
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001393 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6982
[LightGBM] [Info] Number of data points in the train set: 6888, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221980 -> initscore=-1.254163
[LightGBM] [Info] Start training from score -1.254163
[LightGBM] [Info] Number of positive: 1529, number of negative: 5359
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1962
[LightGBM] [Info] Number of data points in the train set: 6888, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221980 -> initscore=-1.254163
[LightGBM] [Info] Start training from score -1.254163
[LightGBM] [Info] 

In [26]:
results_df = pd.DataFrame.from_dict(results, orient='index',
                                     columns=['TP', 'FP', 'TN', 'FN', 'Accuracy', 'Precision', 'Recall'])

In [27]:
# Resetting index to make it a column and splitting it for better readability
results_df.reset_index(inplace=True)
results_df[['Dataset', 'Model']] = results_df['index'].str.split('_', expand=True)
results_df.drop('index', axis=1, inplace=True)

In [28]:
# Reordering columns for clarity
results_df = results_df[['Dataset', 'Model', 'TP', 'FP', 'TN', 'FN', 'Accuracy', 'Precision', 'Recall']]
print(results_df)

        Dataset          Model   TP  FP    TN   FN  Accuracy  Precision    Recall
0      Original           CART  628  16  2287   22  0.987132   0.975155  0.966154
1      Original  Random Forest  615   1  2302   35  0.987809   0.998377  0.946154
2      Original           LGBM  633   4  2299   17  0.992889   0.993721  0.973846
3      Original            SVM  344  38  2265  306  0.883508   0.900524  0.529231
4          Gini           CART  624  14  2289   26  0.986454   0.978056  0.960000
5          Gini  Random Forest  619   1  2302   31  0.989164   0.998387  0.952308
6          Gini           LGBM  634   4  2299   16  0.993227   0.993730  0.975385
7          Gini            SVM  347   9  2294  303  0.894345   0.974719  0.533846
8     Info Gain           CART  621  16  2287   29  0.984761   0.974882  0.955385
9     Info Gain  Random Forest  616   2  2301   34  0.987809   0.996764  0.947692
10    Info Gain           LGBM  626   9  2294   24  0.988825   0.985827  0.963077
11    Info Gain 

#### Original Dataset
- **CART** shows high performance with an accuracy of 98.71%, precision of 97.52%, and recall of 96.62%. However, it has a slightly higher false positive rate compared to Random Forest.
- **Random Forest** achieves the highest precision (99.84%) with a strong balance of accuracy (98.78%) and recall (94.62%), indicating it is very reliable at predicting true positives while minimizing false positives.
- **LGBM** stands out with the highest recall (97.38%) and very high precision (99.37%), suggesting it's exceptionally effective at identifying fraudulent transactions with minimal false negatives and positives.
- **SVM** has significantly lower performance metrics compared to the tree-based models, indicating it may not be as suitable for this particular task in terms of accuracy (88.35%), precision (90.05%), and recall (52.92%).

#### Gini Feature Subset
- **CART** maintains high performance on the Gini subset, with slightly improved precision and recall compared to the original dataset.
- **Random Forest** shows a small improvement in recall (95.23%) while maintaining almost perfect precision, highlighting its consistency across different feature subsets.
- **LGBM** again shows superior performance, with the best recall (97.54%) and almost perfect precision, indicating its strong capability to generalize well on reduced feature sets.
- **SVM**, while showing an improvement in precision on this subset, still lags significantly behind in recall, making it less effective for practical application.

#### Information Gain Feature Subset
- **CART** and **Random Forest** both show a slight decrease in performance metrics compared to the original and Gini subsets, but still perform robustly with high accuracy and precision.
- **LGBM** presents a balanced performance with excellent precision and recall, slightly outperforming the Random Forest in recall, showcasing its effectiveness in feature-rich environments.
- **SVM** shows a modest improvement in precision but remains the least effective model in terms of recall across the subsets.

#### Correlation Feature Subset
- **CART** shows a slight improvement in precision and maintains high accuracy and recall, indicating its robustness across various feature selection methods.
- **Random Forest** continues to exhibit high precision with a slight improvement in recall, reinforcing its reliability and consistency.
- **LGBM** demonstrates high performance with the best balance between precision and recall among the models, indicating its strong predictive power and efficiency.
- **SVM** remains consistent with its performance, showing slightly improved precision but still lower recall compared to other models.

---
---

## Task 3: Model Comparison and Selection

- Compare the performance metrics across all models to identify the most effective combination of features and algorithm for fraud detection.
- The best model will be selected based on a balance of accuracy, precision, and recall, considering the critical nature of minimizing false negatives (FN) and false positives (FP) in fraud detection.

#### Performance Analysis Overview

The metrics to consider are Accuracy, Precision, and Recall:
- **Accuracy** measures the overall correctness of the model.
- **Precision** measures the accuracy of positive predictions (important to minimize false positives).
- **Recall** measures the ability of the model to detect all positive instances (important to minimize false negatives).

#### Models Performance

- **CART (Decision Tree)**
  - Performs well across all datasets, with slightly better performance on the original dataset.
  - High recall but slightly lower precision compared to Random Forest and LGBM in some cases.

- **Random Forest**
  - Consistently high performance across all metrics and datasets.
  - Offers the best balance between precision and recall, indicating fewer false positives and negatives.

- **LGBM (Light Gradient Boosting Machine)**
  - Shows the best or near-best performance in most metrics across all datasets.
  - Especially high recall values indicate excellent capability in identifying true positive cases.

- **SVM (Support Vector Machine)**
  - Significantly lower performance compared to tree-based models, especially in recall, indicating a higher miss rate for true positive cases.

#### Best Model Selection

Given the metrics, **LGBM** emerges as the top performer, especially considering the high recall and precision across different datasets. Its consistent performance across the original and feature-selected datasets indicates robustness and effectiveness in fraud detection tasks. LGBM's ability to handle large datasets efficiently, along with its superior handling of categorical features and gradient boosting mechanism, likely contributes to its standout performance.

#### Best Dataset

The **Gini** feature selection method with the **LGBM** model shows slightly higher accuracy and recall compared to other feature selection methods, making it the most effective combination. This suggests that the Gini method effectively captures the most relevant features for fraud detection, and when combined with LGBM's modeling capabilities, it maximizes the detection of fraudulent transactions with minimal false positives.

#### Conclusion

The **LGBM model trained on the Gini feature-selected dataset** is recommended as the best model for this fraud detection task, based on your results. This combination provides a balanced approach to maximizing recall and precision, essential for minimizing both false negatives (missing fraudulent transactions) and false positives (falsely flagging legitimate transactions as fraud).

---
---

## Task 4: Re-training and Real-time Application

- Re-train the selected best model on the entire dataset (using the chosen feature subset).
- Demonstrate its application on real-time input to predict fraud.

#### Re-train the best model (LGBM) on the entire dataset using the Gini feature subset.

In [29]:
X_best = gini_df.drop('Class', axis=1)
y_best = gini_df['Class']

In [30]:
# Scaling features
scaler = StandardScaler()
X_best_scaled = scaler.fit_transform(X_best)

In [31]:
# Splitting the dataset into training and testing sets
X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(X_best_scaled, y_best, test_size=0.3, random_state=42)

In [32]:
# Initialize and train the best model (LGBM)
best_model = LGBMClassifier(random_state=42)
best_model.fit(X_train_best, y_train_best)

[LightGBM] [Info] Number of positive: 1529, number of negative: 5359
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1962
[LightGBM] [Info] Number of data points in the train set: 6888, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221980 -> initscore=-1.254163
[LightGBM] [Info] Start training from score -1.254163


In [33]:
# Evaluate the re-trained model on the test set
predictions_best = best_model.predict(X_test_best)
accuracy_best = accuracy_score(y_test_best, predictions_best)
precision_best = precision_score(y_test_best, predictions_best)
recall_best = recall_score(y_test_best, predictions_best)

In [34]:
accuracy_best, precision_best, recall_best

(0.993227226549272, 0.9937304075235109, 0.9753846153846154)

In [35]:
# Export the model
model_filename = 'best_model.joblib'
joblib.dump(best_model, model_filename)

model_filename

'best_model.joblib'

#### Predict Fraud in Real-Time Transactions.
- True:  Fraudulent
- False: Not Fraudulent

In [36]:
# Import the model
loaded_model = joblib.load(model_filename)

In [37]:
sample_data = [
    [0.5, -0.2, 0.3, -0.1, 0.2, -0.3, 0.4, 0.1, -0.4, 0.6],
    [-0.3, 0.6, 0.1, 0.1, -0.7, -0.4, -0.8, 0.8, -0.3, -0.2],
    [0.8, -0.9, -1, 0.3, 0.3, 0.4, -0.1, -0.8, -0.7, 0.8],
    [-0.3, 0.6, -0.4, 0.5, -0.3, -0.8, 0.2, -0.5, 0.3, -0.6],
    [-0.6, -0.7, 0, 0.7, -0.8, -0.8, 1, -0.5, 0.7, 0.2],
    [-0.5, 0.4, -0.2, 0.1, -0.4, 0.3, -0.3, -0.2, 0.6, -0.1],
    [-0.8, -0.2, -0.7, 0.3, 0, 0.4, 0, 0.3, -1, -0.6]
]

# Predict using the loaded model
predictions = loaded_model.predict(sample_data)
predictions

array([False,  True, False, False,  True, False,  True])

In [38]:
# Selecting 5 samples from the test data
X_test_best_samples = X_test_best[:5]
y_test_best_samples = y_test_best[:5]

# Predict on the selected test data samples
test_data_predictions = loaded_model.predict(X_test_best_samples)
print("Pridected: ", test_data_predictions)
print("Acutal: ", y_test_best_samples.values)

Pridected:  [ True  True False False False]
Acutal:  [ True  True False False False]
