In [2]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../src')

from data.loader import DataLoader
from data.preparation import DataPreparation
from models.rfc import RFCModel

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

import pandas as pd

In [27]:
# Texas
X_train, y_train = DataLoader('../data/fraudTrain.csv').load_state_by_id(0)
X_test, y_test = DataLoader('../data/fraudTest.csv').load_state_by_id(0)

In [28]:
prep_smote = DataPreparation()
scaler_smote = StandardScaler()
smote = SMOTE(
    random_state=42,
    sampling_strategy=0.2,
    n_jobs=-1)
rfc_smote = RFCModel(n_estimators=75)
smote_pipeline = make_pipeline(prep_smote, scaler_smote, smote, rfc_smote)

In [29]:
smote_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    3.5s finished


In [30]:
y_pred = smote_pipeline.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  75 out of  75 | elapsed:    0.1s finished


In [31]:
metrics, cm = rfc_smote.evaluate(y_test)
cm_df = pd.DataFrame(cm, index = ['Actual Not Fraud','Actual Fraud'], columns = ['Predicted Not Fraud','Predicted Fraud'])
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,40255,25
Actual Fraud,54,59


In [33]:
scores = pd.DataFrame(metrics, index=[0])
smote_score = pd.DataFrame(metrics, index=[0])
smote_score.insert(0, 'Random Forest with', 'SMOTE_TX')
scores = pd.concat([smote_score])
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,SMOTE_TX,0.998044,0.702381,0.522124,0.598985


In [37]:
# New York
X_train, y_train = DataLoader('../data/fraudTrain.csv').load_state_by_id(1)
X_test, y_test = DataLoader('../data/fraudTest.csv').load_state_by_id(1)

In [35]:
smote_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    3.3s finished


In [38]:
y_pred = smote_pipeline.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  75 out of  75 | elapsed:    0.1s finished


In [39]:
metrics, cm = rfc_smote.evaluate(y_test)
cm_df = pd.DataFrame(cm, index = ['Actual Not Fraud','Actual Fraud'], columns = ['Predicted Not Fraud','Predicted Fraud'])
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,35721,22
Actual Fraud,80,95


In [40]:
smote_score = pd.DataFrame(metrics, index=[0])
smote_score.insert(0, 'Random Forest with', 'SMOTE_NY')
scores = pd.concat([scores, smote_score])
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,SMOTE_TX,0.998044,0.702381,0.522124,0.598985
0,SMOTE_NY,0.99716,0.811966,0.542857,0.650685


# Continual

In [41]:
# Texas
X_train_TX, y_train_TX = DataLoader('../data/fraudTrain.csv').load_state_by_id(0)
X_test_TX, y_test_TX = DataLoader('../data/fraudTest.csv').load_state_by_id(0)

X_train_NY, y_train_NY = DataLoader('../data/fraudTrain.csv').load_state_by_id(1)
X_test_NY, y_test_NY = DataLoader('../data/fraudTest.csv').load_state_by_id(1)

In [3]:
import sys
import sys
sys.path.append('..')

In [4]:
from generative_rf import FeatureGenerator, class_sampling
import numpy as np

In [8]:
def loss(y, y_pred):
    return np.sum(np.abs(y-y_pred))

In [9]:
from sklearn.ensemble import RandomForestClassifier
from generative_rf import FeatureGenerator, class_sampling

gen_rf = FeatureGenerator()
X, y = DataLoader('../data/fraudTrain.csv').load_state_by_id(0)
gen_rf.register(RandomForestClassifier().fit(X, y)).reinforce(X)
gen_rf.update_moments(X)

while True:
  X, y = DataLoader('../data/fraudTrain.csv').load_state_by_id(0)
  y_pred = gen_rf.predict_proba(X)
  # loss() and MAX_LOSS are provided by you
  if loss(y, y_pred).mean() > 80:
    # generate new data
    # please tailor approx_n for the problem at hand
    X2, w2 = gen_rf.generate(approx_n=20000)
    proba = gen_rf.predict_proba(X2)
    X2, y2, sample_weights = class_sampling(X2, proba, w2)

    # merge with current data
    X_all = np.concatenate([X, X2], axis=0)
    y_all = np.concatenate([y, y2], axis=0)
    w = np.concatenate([[1]*len(y), sample_weights], axis=0)

    # train a new forest from all the data
    new_rf = RandomForestClassifier().fit(X_all, y_all, sample_weight=w)
    gen_rf.register(new_rf).reinforce(X2, w2)

  # this is called regardless of retraining
  gen_rf.reinforce(X).update_moments(X)

ValueError: could not convert string to float: '2019-01-01 00:17:40'