In [1]:
import sys
import warnings

warnings.filterwarnings("ignore")
sys.path.append('../src')

import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

from data.loader import DataLoader
from data.preparation import DataPreparation
from models.rfc import RFCModel

In [2]:
# Texas: the same state id selects the rows in both the train and test files
state_id = 0
X_train, y_train = DataLoader('../data/fraudTrain.csv').load_state_by_id(state_id)
X_test, y_test = DataLoader('../data/fraudTest.csv').load_state_by_id(state_id)

In [3]:
# Build the pipeline: project preparation step -> standard scaling -> SMOTE
# oversampling -> project random-forest wrapper.
prep_smote = DataPreparation()
scaler_smote = StandardScaler()
# SMOTE's own `n_jobs` argument was deprecated in imbalanced-learn 0.10 and
# later removed; parallelism is now configured on the nearest-neighbours
# estimator handed in through `k_neighbors`.
# n_neighbors=6 keeps SMOTE's default of 5 neighbours: when an estimator is
# supplied, each sample is counted as its own nearest neighbour, so one extra
# has to be requested.
smote = SMOTE(
    random_state=42,
    sampling_strategy=0.2,
    k_neighbors=NearestNeighbors(n_neighbors=6, n_jobs=-1))
rfc_smote = RFCModel(n_estimators=75)
smote_pipeline = make_pipeline(prep_smote, scaler_smote, smote, rfc_smote)

In [4]:
# Fit every pipeline step (preparation, scaling, SMOTE, forest) on the
# Texas training split loaded above.
smote_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    3.6s finished


In [5]:
# Predict on the Texas test split.
# NOTE(review): y_pred is never passed to evaluate() in the following cell,
# so RFCModel presumably keeps its own copy of the predictions - confirm in
# models/rfc.py.
y_pred = smote_pipeline.predict(X_test)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done  75 out of  75 | elapsed:    0.1s finished


In [6]:
# Score the Texas run and show the confusion matrix as a labelled table.
# NOTE(review): evaluate() receives only the true labels, so it presumably
# scores predictions cached by the preceding predict() call - confirm in
# models/rfc.py.
metrics, cm = rfc_smote.evaluate(y_test)

actual_labels = ['Actual Not Fraud', 'Actual Fraud']
predicted_labels = ['Predicted Not Fraud', 'Predicted Fraud']
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,40261,19
Actual Fraud,48,65


In [7]:
# Start the per-state results table with the Texas row.
# (The previous first assignment to `scores` was overwritten before use, and
# concatenating a single frame only produced a copy, so both are replaced by
# an explicit copy.)
smote_score = pd.DataFrame(metrics, index=[0])
smote_score.insert(0, 'Random Forest with', 'SMOTE_TX')
scores = smote_score.copy()
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,SMOTE_TX,0.998341,0.77381,0.575221,0.659898


In [8]:
# New York: the same state id selects the rows in both the train and test files
state_id = 1
X_train, y_train = DataLoader('../data/fraudTrain.csv').load_state_by_id(state_id)
X_test, y_test = DataLoader('../data/fraudTest.csv').load_state_by_id(state_id)

In [9]:
# Refit the same pipeline object, this time on the New York training split.
smote_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    3.0s finished


In [10]:
# Predict on the New York test split.
# NOTE(review): y_pred is never passed to evaluate() in the following cell,
# so RFCModel presumably keeps its own copy of the predictions - confirm in
# models/rfc.py.
y_pred = smote_pipeline.predict(X_test)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done  75 out of  75 | elapsed:    0.1s finished


In [11]:
# Score the New York run and show the confusion matrix as a labelled table.
# NOTE(review): evaluate() receives only the true labels, so it presumably
# scores predictions cached by the preceding predict() call - confirm in
# models/rfc.py.
metrics, cm = rfc_smote.evaluate(y_test)

actual_labels = ['Actual Not Fraud', 'Actual Fraud']
predicted_labels = ['Predicted Not Fraud', 'Predicted Fraud']
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,35721,22
Actual Fraud,55,120


In [12]:
# Append the New York row to the running results table.
smote_score = pd.DataFrame(metrics, index=[0])
smote_score.insert(0, 'Random Forest with', 'SMOTE_NY')
# Drop any SMOTE_NY row left by an earlier execution so re-running this cell
# does not duplicate it, and renumber the rows so index labels stay unique
# (every per-state frame is built with index 0).
previous_scores = scores[scores['Random Forest with'] != 'SMOTE_NY']
scores = pd.concat([previous_scores, smote_score], ignore_index=True)
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,SMOTE_TX,0.998341,0.77381,0.575221,0.659898
0,SMOTE_NY,0.997856,0.84507,0.685714,0.757098


# Continual

In [19]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../src')
sys.path.append('../generative_rf')
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from generative_rf import FeatureGenerator, class_sampling
from data.loader import DataLoader
from data.preparation import ContinousDataPreparation
from sklearn.ensemble import RandomForestClassifier
from evaluate.evaluate import evaluate_model
from tqdm import tqdm

In [20]:
# Loader for the training file; later cells pull individual states from it.
loader = DataLoader('../data/fraudTrain.csv')
# NOTE(review): fit() is called without any data, so the preparation step is
# presumably configured from fixed settings rather than learned - confirm in
# data/preparation.py.
prep = ContinousDataPreparation()
prep.fit()

<data.preparation.ContinousDataPreparation at 0x1d96f677cd0>

In [21]:
# Load the test file via load() (no state id is passed) and run it through
# the preparation step; the untransformed frame keeps its own name.
X_test_raw, y_test = DataLoader('../data/fraudTest.csv').load()
X_test = prep.transform(X_test_raw)

In [22]:
# gen_rf = FeatureGenerator()
# X, y = loader.load_state_by_id(0)
# X = prep.transform(X)
# X, y = smote = SMOTE(
#     random_state=42,
#     sampling_strategy=0.2,
#     n_jobs=-1).fit_resample(X, y)
# rfc = RandomForestClassifier(n_estimators=75, n_jobs=-1).fit(X, y)

# y_pred = rfc.predict(X_test)
# metrics = evaluate_model(y_test, y_pred)
# scores = pd.DataFrame(metrics, index=[0])

# gen_rf.register(rfc).reinforce(X)
# gen_rf.update_moments(X)

# # loader.get_state_list_size()
# for i in tqdm(range(1, 5)):
#   X_new, y_new = loader.load_state_by_id(i)
#   X_new = prep.transform(X_new)
#   X_new, y_new = SMOTE(
#     random_state=42,
#     sampling_strategy=0.2,
#     n_jobs=-1).fit_resample(X_new, y_new)
#   # generate new data
#   # please tailor approx_n for the problem at hand
#   # Boti: len(y_new) to be the same size as the new data
#   X_gen, w_gen = gen_rf.generate(approx_n=len(y_new))
#   proba = gen_rf.predict_proba(X_gen)
#   y_gen = np.argmax(proba, axis=1)
#   # sample_weights = (-np.sort(-proba, axis=1)[:,:1]).flatten() * w_gen
#   # X_gen, y_gen, sample_weights = class_sampling(X_gen, proba, w_gen)

#   # merge with current data
#   X_all = np.concatenate([X_new, X_gen], axis=0)
#   y_all = np.concatenate([y_new, y_gen], axis=0)
#   # w = np.concatenate([[1]*len(y_new), sample_weights], axis=0)

#   # train a new forest from all the data
#   rfc = RandomForestClassifier(n_estimators=75, n_jobs=-1).fit(X_all, y_all)
#   gen_rf.register(rfc).reinforce(X_gen, w_gen)

#   # this is called regardless of retraining
#   gen_rf.reinforce(X_new).update_moments(X_new)
  
#   y_pred = rfc.predict(X_test)
#   metrics = evaluate_model(y_test, y_pred)
#   smote_score = pd.DataFrame(metrics, index=[i])
#   scores = pd.concat([scores, smote_score])
  
# scores

I guess this didn't quite work out on the first try :D

RollingRF sandbox

In [23]:
from baselines import BalancedForgettingRF

# Empty results table; the training loop adds one row per processed state.
metric_columns = ['accuracy', 'precision', 'recall', 'f1_score']
scores = pd.DataFrame(columns=metric_columns)

# NOTE(review): the meaning of the 0.9 argument is not visible here - see
# the BalancedForgettingRF definition in baselines.
continual = BalancedForgettingRF(0.9)

In [24]:
# For state ids 0-9: train a forest on that state's data, merge it into the
# continual model, then score the merged model on the test set.
for i in tqdm(range(0, 10)):
  X, y = loader.load_state_by_id(i)
  X = prep.transform(X)
  # Fixed seed so each per-state forest is reproducible across kernel
  # restarts; without it the metrics drift from run to run.
  rfc = RandomForestClassifier(
      n_estimators=75, n_jobs=-1, random_state=42).fit(X, y)
  continual.merge(rfc, X)

  # y_pred and metrics stay bound after the loop and hold the values from
  # the final iteration.
  y_pred = continual.predict(X_test)
  metrics = evaluate_model(y_test, y_pred)
  scores.loc[f'rolling_{i}'] = metrics

100%|██████████| 10/10 [00:45<00:00,  4.53s/it]


In [25]:
# X, y = loader.load_state_by_id(0)
# print(y.sum() / len(y))
# X = prep.transform(X)
# X, y = SMOTE(
#     random_state=42,
#     sampling_strategy=0.2,
#     n_jobs=-1).fit_resample(X, y)
# print(y.sum() / len(y))

In [26]:
# Metrics of the merged model after each state (rows rolling_0 .. rolling_9).
scores

Unnamed: 0,accuracy,precision,recall,f1_score
rolling_0,0.976443,0.055619,0.319347,0.094738
rolling_1,0.996099,0.49112,0.296503,0.369767
rolling_2,0.996817,0.698312,0.308625,0.428063
rolling_3,0.997052,0.812577,0.307226,0.445873
rolling_4,0.997054,0.840483,0.292308,0.43376
rolling_5,0.996998,0.887805,0.254545,0.395652
rolling_6,0.99707,0.894656,0.273193,0.418571
rolling_7,0.997074,0.91256,0.267599,0.413843
rolling_8,0.99704,0.908497,0.259207,0.403337
rolling_9,0.99698,0.91771,0.239161,0.379438


In [27]:
from evaluate.evaluate import confusion_matrix

# y_pred is whatever the training loop assigned last, i.e. the predictions
# made after merging the final state (rolling_9).
confusion_matrix(y_test, y_pred)

array([[553528,     46],
       [  1632,    513]], dtype=int64)