In [None]:
# Model Tuning

import pandas as pd
import numpy as np
import datetime
import networkx as nx

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import sys
sys.path.insert(0, '../src')
import helpers as h
import visualizations as v
import time

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

In [114]:
# Load data 
otc_df = h.load_bitcoin_edge_data('../data/soc-sign-bitcoinotc.csv.gz')

# Load preprocess features
df_otc_features = pd.read_csv('../data/df_features_otc.csv')
df_otc_velocity = pd.read_csv('../data/df_otc_vd.csv')

merge_cols = ['ratee','rater','date','rating','class']
features_df = pd.merge(df_otc_features, df_otc_velocity, on=merge_cols)
features_df.drop(['binomial_rating', 'color', 'penwidth'], inplace=True, axis=1)

### drop velocity features

In [115]:
velocity_cols = ['vel_24_in_pos','vel_24_in_all','vel_24_out_pos',
                 'vel_24_out_neg','vel_24_out_all','vel_24_all',
                 'vel_48_in_pos','vel_48_in_all','vel_48_out_pos',
                 'vel_48_out_neg','vel_48_out_all','vel_48_all'] 
features_df.drop(velocity_cols, inplace=True, axis=1)

### Random Forest Classifier

In [107]:
X = features_df.copy()
X = X.drop(['rater', 'ratee', 'rating','date'], axis=1)
y = X.pop('class')
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, shuffle=True,
                                                    random_state=123)

RF = RandomForestClassifier(n_jobs=-1, random_state=123)
RF.fit(X_train, y_train)
y_preds = RF.predict(X_test)
recall = recall_score(y_test, y_preds)
precision = precision_score(y_test, y_preds)

print(f"Recall: {recall}")
print(f"Precision: {precision}")
RF.feature_importances_

Recall: 0.5297418630751964
Precision: 0.8109965635738832


array([0.01860441, 0.03722324, 0.02184222, 0.11695053, 0.11888501,
       0.09094446, 0.05163884, 0.05192537, 0.05446381, 0.09490893,
       0.01630384, 0.01584243, 0.02450624, 0.0140141 , 0.02686423,
       0.02545155, 0.02502995, 0.02722609, 0.02166095, 0.0226645 ,
       0.05830643, 0.06474286])

In [154]:
import seaborn as sns
fig = plt.figure(figsize=(8,8))

name = "Random Forest"
indices = np.argsort(RF.feature_importances_)[::-1][:40]
ax1 = sns.barplot(y=X_train.columns[indices][:40],x = RF.feature_importances_[indices][:40] , orient='h')
ax1.set_xlabel("Relative importance",fontsize=12)
ax1.set_ylabel("Features",fontsize=12)
ax1.tick_params(labelsize=9)
ax1.set_title(name + " feature importance")
plt.tight_layout()

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

<Figure size 576x576 with 0 Axes>

## Model Tuning

### Selecting a hyperparameter grid

In [112]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [150]:
# Random grid to search for best hyperparameters - 100 different combinations, 3 fold cv
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, n_jobs = -1)

start_time = time.time()
rf_random.fit(X_train, y_train)
print(f"{(time.time() - start_time):.0f} seconds execution time")

In [121]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

### Compare Best Random Search Model with Base Model

In [134]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f"Model Performance:")
    print(f"F1_score: {f1:0.4f}")
    print(f"Recall score = {recall:0.4f}")
    print(f"Precision score = {precision:0.4f}\n")   
    return f1
base_model = RandomForestClassifier(oob_score=True, n_estimators=10, random_state=42)
base_model.fit(X_train, y_train)
base_f1_score = evaluate(base_model, X_test, y_test)

best_random = rf_random.best_estimator_
random_f1_score = evaluate(best_random, X_test, y_test)

improvement = (100 * (random_f1_score - base_f1_score) / base_f1_score)
print(f"Improvement of {improvement:0.2f}%")

Model Performance:
F1_score: 0.6402
Recall score = 0.5241
Precision score = 0.8222

Model Performance:
F1_score: 0.6570
Recall score = 0.5342
Precision score = 0.8530

Improvement of 2.63%


Model Performance:
F1_score: 0.6402
Recall score = 0.5241
Precision score = 0.8222

Model Performance:
F1_score: 0.6570
Recall score = 0.5342
Precision score = 0.8530

Improvement of 2.63%

## GridSearch

In [153]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 15],
    'max_features': [4, 5],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [1000, 1200, 1400, 1600]}
    
rf = RandomForestClassifier(oob_score=True)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
start_time = time.time()
grid_search.fit(X_train, y_train)
print(f"{(time.time() - start_time):.0f} seconds execution time")

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 29.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 71.3min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 73.4min finished


4434 seconds execution time


In [155]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 15,
 'max_features': 4,
 'min_samples_leaf': 3,
 'min_samples_split': 7,
 'n_estimators': 1000}

{'bootstrap': True,
 'max_depth': 15,
 'max_features': 4,
 'min_samples_leaf': 3,
 'min_samples_split': 7,
 'n_estimators': 1000}

In [164]:
best_grid = grid_search.best_estimator_
grid_f1_score = evaluate(best_grid, X_test, y_test)

improvement = (100 * (grid_f1_score - base_f1_score) / base_f1_score)
print(f"Improvement of {improvement:0.2f}%")
print(f"\nAccuracy Score: {grid_search.best_estimator_.score(X_test, y_test):0.4f}")
print(f"OOB Score: {grid_search.best_estimator_.oob_score_:0.4f}")


Model Performance:
F1_score: 0.6571
Recall score = 0.5387
Precision score = 0.8421

Improvement of 2.64%

Accuracy Score: 0.9437
OOB Score: 0.9462


Model Performance:
F1_score: 0.6571
Recall score = 0.5387
Precision score = 0.8421

Improvement of 2.64%

Accuracy Score: 0.9437
OOB Score: 0.9462

### Model Exploration

In [None]:
import shap
explainer = shap.TreeExplainer(my_model)
shap_values = explainer.shap_values(val_X)
# Make plot. Index of [1] is explained in text below.
shap.summary_plot(shap_values[1], val_X)

### Example of collusion fraud

In [85]:
features_df[features_df['ratee']==4986]

Unnamed: 0,rater,ratee,rating,date,class,num_ratings_received,num_neg_received,num_pos_received,neg_ratings_pct,rating_received_sum,rating_received_avg,days_since_first_rated,days_since_last_rated,days_active,successive_neg_rating,ego_triad_300,ego_triad_210,ego_triad_201,ego_triad_120,ego_triad_all,ego_cluster_coef,ego_degree,ego_betweeness,ego_closeness,ego_num_cliques,vel_24_in_neg,vel_48_in_neg


In [101]:
# THis is the example!!!
example_user = X[(features_df['ratee']==4986)].copy()
preds = RF.predict_proba(example_user)[:,1]
example_user['prediction'] = preds
example_user.to_csv()

Unnamed: 0,num_ratings_received,num_neg_received,num_pos_received,neg_ratings_pct,rating_received_sum,rating_received_avg,days_since_first_rated,days_since_last_rated,days_active,successive_neg_rating,ego_triad_300,ego_triad_210,ego_triad_201,ego_triad_120,ego_triad_all,ego_cluster_coef,ego_degree,ego_betweeness,ego_closeness,ego_num_cliques,vel_24_in_neg,vel_48_in_neg,prediction
28678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068631
28701,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069282
28707,2.0,0.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0,1.0,1.0,2.0,0.0,0.0,0.023366
28718,3.0,0.0,3.0,0.0,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,3.0,0.333333,6.0,0.666667,1.0,2.0,0.0,0.0,0.445
28721,4.0,0.0,4.0,0.0,4.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,5.0,0.0,6.0,0.166667,8.0,0.833333,1.0,3.0,0.0,0.0,0.035
28729,5.0,0.0,5.0,0.0,5.0,1.0,2.0,0.0,2.0,0.0,1.0,1.0,6.0,0.0,8.0,0.1875,9.0,0.725,0.833333,4.0,0.0,0.0,0.69
28731,6.0,1.0,5.0,0.166667,-3.0,-0.5,2.0,0.0,2.0,0.0,1.0,1.0,6.0,0.0,8.0,0.1875,9.0,0.725,0.833333,4.0,1.0,1.0,0.68


In [45]:
example_user = X[(features_df['ratee']==4524)].copy()
preds = RF.predict_proba(example_user)[:,1]
example_user['prediction'] = preds
example_user[:10]

Unnamed: 0,num_ratings_received,num_neg_received,num_pos_received,neg_ratings_pct,rating_received_sum,rating_received_avg,days_since_first_rated,days_since_last_rated,days_active,successive_neg_rating,ego_triad_300,ego_triad_210,ego_triad_201,ego_triad_120,ego_triad_all,ego_cluster_coef,ego_degree,ego_betweeness,ego_closeness,ego_num_cliques,vel_24_in_neg,vel_48_in_neg,prediction
24474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071278
25221,1.0,0.0,1.0,0.0,3.0,3.0,19.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015
27312,2.0,0.0,2.0,0.0,5.0,2.5,54.0,54.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.5,0.5,2.0,0.0,0.0,0.01
27483,3.0,0.0,3.0,0.0,6.0,2.0,60.0,41.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.333333,0.333333,3.0,0.0,0.0,0.01
28088,4.0,0.0,4.0,0.0,11.0,2.75,92.0,37.0,92.0,0.0,0.0,1.0,1.0,0.0,2.0,0.153846,6.0,0.583333,0.5625,3.0,0.0,0.0,0.67
28091,5.0,1.0,4.0,0.2,1.0,0.2,92.0,32.0,92.0,0.0,0.0,1.0,1.0,0.0,2.0,0.153846,6.0,0.583333,0.5625,3.0,1.0,1.0,0.88
33203,6.0,2.0,4.0,0.333333,-9.0,-1.5,386.0,293.0,386.0,1.0,0.0,2.0,2.0,0.0,4.0,0.307692,6.0,0.5,0.666667,3.0,0.0,0.0,0.6


In [34]:
X_test[(y_preds==1)&(y_test==1)&(X_test['ego_triad_all']>3)&(X_test['ego_triad_all']<10)].head()

Unnamed: 0,num_ratings_received,num_neg_received,num_pos_received,neg_ratings_pct,rating_received_sum,rating_received_avg,days_since_first_rated,days_since_last_rated,days_active,successive_neg_rating,ego_triad_300,ego_triad_210,ego_triad_201,ego_triad_120,ego_triad_all,ego_cluster_coef,ego_degree,ego_betweeness,ego_closeness,ego_num_cliques,vel_24_in_neg,vel_48_in_neg
14277,7.0,3.0,4.0,0.428571,0.0,0.0,2.0,0.0,2.0,1.0,1.0,0.0,2.0,4.0,7.0,0.388889,7.0,0.5,0.8,2.0,2.0,2.0
14093,5.0,2.0,3.0,0.4,4.0,0.8,1.0,0.0,1.0,1.0,2.0,2.0,0.0,0.0,4.0,0.833333,6.0,0.083333,1.0,1.0,2.0,2.0
33203,6.0,2.0,4.0,0.333333,-9.0,-1.5,386.0,293.0,386.0,1.0,0.0,2.0,2.0,0.0,4.0,0.307692,6.0,0.5,0.666667,3.0,0.0,0.0
33227,7.0,1.0,6.0,0.142857,-4.0,-0.571429,573.0,340.0,573.0,0.0,0.0,0.0,3.0,1.0,4.0,0.045455,9.0,0.466667,0.5,5.0,0.0,0.0
26486,11.0,7.0,4.0,0.636364,-66.0,-6.0,9.0,0.0,9.0,3.0,4.0,0.0,0.0,0.0,4.0,0.666667,7.0,0.25,0.75,2.0,3.0,3.0


In [44]:
features_df[(features_df['rating_received_sum']==-9)&(features_df['num_ratings_received']==6)
           &(features_df['num_neg_received']==2)]

Unnamed: 0,rater,ratee,rating,date,class,num_ratings_received,num_neg_received,num_pos_received,neg_ratings_pct,rating_received_sum,rating_received_avg,days_since_first_rated,days_since_last_rated,days_active,successive_neg_rating,ego_triad_300,ego_triad_210,ego_triad_201,ego_triad_120,ego_triad_all,ego_cluster_coef,ego_degree,ego_betweeness,ego_closeness,ego_num_cliques,vel_24_in_neg,vel_48_in_neg
33203,3988,4524,-5,2014-07-28 01:34:15,1,6.0,2.0,4.0,0.333333,-9.0,-1.5,386.0,293.0,386.0,1.0,0.0,2.0,2.0,0.0,4.0,0.307692,6.0,0.5,0.666667,3.0,0.0,0.0
