# Model Tuning

In [1]:
import pandas as pd
import numpy as np
import datetime
import networkx as nx
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import sys
sys.path.insert(0, '../src')
import helpers as h
import visualizations as v
import time

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

In [2]:
df_historical = pd.read_csv('../data/historical_features.csv')

# Create Subset of High Value Rating Transactions
mask = df_historical[(df_historical['num_neg_received']==0) & 
                        (df_historical['num_pos_received']>=3)]

# Create Subset of High Value Rating Transactions with fraud but suspicious removed
# df_mask = df_historical[(df_historical['num_neg_received']==0) & 
#                         (df_historical['num_pos_received']>=3) & 
#                         ((df_historical['rating']>0) | (df_historical['rating']==-10))]

df_historical.drop(['color', 'penwidth'], inplace=True, axis=1)
df_historical = df_historical[df_historical.index.isin(mask.index)]

df_velocity = pd.read_csv('../data/velocity_features.csv')
df_velocity.drop(['color', 'penwidth'], inplace=True, axis=1)
df_velocity = df_velocity[df_velocity.index.isin(mask.index)]

df_graph = pd.read_csv('../data/graph_features.csv')
df_graph.drop(['color', 'penwidth'], inplace=True, axis=1)
df_graph = df_graph[df_graph.index.isin(mask.index)]

# Merge All Three Feature Categories
merge_cols = ['ratee','rater','date','rating']
dfs = [df_historical, df_velocity, df_graph]
df_all = reduce(lambda left,right: pd.merge(left,right,on=merge_cols), dfs)

In [3]:
# # Load data 
# otc_df = h.load_bitcoin_edge_data('../data/soc-sign-bitcoinotc.csv.gz')

# # Load preprocess features
# df_otc_features = pd.read_csv('../data/df_features_otc.csv')
# df_otc_velocity = pd.read_csv('../data/df_otc_vd.csv')

# merge_cols = ['ratee','rater','date','rating','class']
# features_df = pd.merge(df_otc_features, df_otc_velocity, on=merge_cols)
# features_df.drop(['binomial_rating', 'color', 'penwidth'], inplace=True, axis=1)

In [4]:
X = df_all.copy()
y = np.where(X['rating']<0, 1, 0) # set class as all negative ratings
X = X.drop(['rater', 'ratee', 'rating', 'date'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    random_state=123)

### Baseline

In [6]:
RF = RandomForestClassifier(n_jobs=-1, random_state=123)
RF.fit(X_train, y_train)
y_preds = RF.predict(X_test)
recall = recall_score(y_test, y_preds)
precision = precision_score(y_test, y_preds)

print(f"Recall: {recall}")
print(f"Precision: {precision}")

Recall: 0.11450381679389313
Precision: 0.5769230769230769


## Model Tuning

### Selecting a hyperparameter grid

In [7]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [8]:
# Random grid to search for best hyperparameters - 100 different combinations, 3 fold cv
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, scoring=make_scorer(recall_score),
                               random_state=123, n_jobs = -1)

start_time = time.time()
rf_random.fit(X_train, y_train)
print(f"{(time.time() - start_time):.0f} seconds execution time")

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 17.6min finished


1090 seconds execution time


In [9]:
rf_random.best_params_

{'n_estimators': 1400,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 50,
 'bootstrap': False}

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

### Compare Best Random Search Model with Base Model

In [10]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f"Model Performance:")
    print(f"F1_score: {f1:0.4f}")
    print(f"Recall score = {recall:0.4f}")
    print(f"Precision score = {precision:0.4f}\n")   
    return f1
base_model = RandomForestClassifier(oob_score=True, n_estimators=10, random_state=123)
base_model.fit(X_train, y_train)
base_f1_score = evaluate(base_model, X_test, y_test)

best_random = rf_random.best_estimator_
random_f1_score = evaluate(best_random, X_test, y_test)

improvement = (100 * (random_f1_score - base_f1_score) / base_f1_score)
print(f"Improvement of {improvement:0.2f}%")

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Model Performance:
F1_score: 0.2013
Recall score = 0.1221
Precision score = 0.5714

Model Performance:
F1_score: 0.1975
Recall score = 0.1221
Precision score = 0.5161

Improvement of -1.85%


Model Performance:
F1_score: 0.6402
Recall score = 0.5241
Precision score = 0.8222

Model Performance:
F1_score: 0.6570
Recall score = 0.5342
Precision score = 0.8530

Improvement of 2.63%

## GridSearch

In [12]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [45, 50, 55],
    'max_features': [4, 5],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [1400, 1600, 1800]}
    
rf = RandomForestClassifier(oob_score=True)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=make_scorer(recall_score))
start_time = time.time()
grid_search.fit(X_train, y_train)
print(f"{(time.time() - start_time):.0f} seconds execution time")

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 29.1min finished


1784 seconds execution time


In [13]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 55,
 'max_features': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 1800}

##### {'bootstrap': True,
 'max_depth': 55,
 'max_features': 5
 'min_samples_leaf': 1
 'min_samples_split': 3
 'n_estimators': 1800}

In [14]:
best_grid = grid_search.best_estimator_
grid_f1_score = evaluate(best_grid, X_test, y_test)

improvement = (100 * (grid_f1_score - base_f1_score) / base_f1_score)
print(f"Improvement of {improvement:0.2f}%")
print(f"\nAccuracy Score: {grid_search.best_estimator_.score(X_test, y_test):0.4f}")
print(f"OOB Score: {grid_search.best_estimator_.oob_score_:0.4f}")


Model Performance:
F1_score: 0.1887
Recall score = 0.1145
Precision score = 0.5357

Improvement of -6.25%

Accuracy Score: 0.9713
OOB Score: 0.9732


Model Performance:
F1_score: 0.6571
Recall score = 0.5387
Precision score = 0.8421

Improvement of 2.64%

Accuracy Score: 0.9437
OOB Score: 0.9462