In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

import sys
# Put ../src on the import path; presumably that is where the project-local
# `data` and `models` packages imported below live — confirm.
sys.path.append('../src')

from data.loader import DataLoader
from data.preparation import DataPreparation
from models.rfc import RFCModel

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

import pandas as pd

In [27]:
# Texas: pull state id 0 from the training file first, then from the test file.
(X_train, y_train), (X_test, y_test) = (
    DataLoader(csv_path).load_state_by_id(0)
    for csv_path in ('../data/fraudTrain.csv', '../data/fraudTest.csv')
)

In [28]:
# Pipeline for the SMOTE experiment: preparation -> scaling -> oversampling -> forest.
prep_smote = DataPreparation()
scaler_smote = StandardScaler()

# Seeded oversampler; a float sampling_strategy is the minority/majority ratio
# targeted after resampling (see the imbalanced-learn SMOTE reference).
# NOTE(review): newer imbalanced-learn releases deprecate SMOTE's n_jobs —
# confirm against the pinned version before upgrading.
smote = SMOTE(random_state=42, sampling_strategy=0.2, n_jobs=-1)

rfc_smote = RFCModel(n_estimators=75)

smote_pipeline = make_pipeline(
    prep_smote,
    scaler_smote,
    smote,
    rfc_smote,
)

In [29]:
# Fit every pipeline step on the Texas training split loaded above.
smote_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    3.5s finished


In [30]:
# Predict on the Texas test split.
# NOTE(review): y_pred is not read by any later visible cell; rfc_smote.evaluate(y_test)
# presumably scores predictions the model cached during this call — confirm.
y_pred = smote_pipeline.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  75 out of  75 | elapsed:    0.1s finished


In [31]:
# Score the Texas run and lay the confusion matrix out as a labelled table.
metrics, cm = rfc_smote.evaluate(y_test)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual Not Fraud', 'Actual Fraud'],
    columns=['Predicted Not Fraud', 'Predicted Fraud'],
)
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,40255,25
Actual Fraud,54,59


In [33]:
# Start the comparison table with the Texas run: one row of metrics,
# labelled in a leading 'Random Forest with' column.
smote_score = pd.DataFrame(metrics, index=[0])
smote_score.insert(0, 'Random Forest with', 'SMOTE_TX')

# `scores` is an independent copy so later appends never alias `smote_score`.
scores = smote_score.copy()
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,SMOTE_TX,0.998044,0.702381,0.522124,0.598985


In [37]:
# New York: pull state id 1 from the training file first, then from the test file.
(X_train, y_train), (X_test, y_test) = (
    DataLoader(csv_path).load_state_by_id(1)
    for csv_path in ('../data/fraudTrain.csv', '../data/fraudTest.csv')
)

In [35]:
# Refit the same pipeline object on the New York training split.
# NOTE(review): this cell's recorded execution count (35) is lower than the New York
# load cell's (37), so the saved run may have fit on previously loaded data — confirm
# with Restart Kernel -> Run All.
smote_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    3.3s finished


In [38]:
# Predict on the New York test split.
# NOTE(review): y_pred is not read by any later visible cell; rfc_smote.evaluate(y_test)
# presumably scores predictions the model cached during this call — confirm.
y_pred = smote_pipeline.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  75 out of  75 | elapsed:    0.1s finished


In [39]:
# Score the New York run and lay the confusion matrix out as a labelled table.
metrics, cm = rfc_smote.evaluate(y_test)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual Not Fraud', 'Actual Fraud'],
    columns=['Predicted Not Fraud', 'Predicted Fraud'],
)
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,35721,22
Actual Fraud,80,95


In [40]:
# Append the New York run to the comparison table.
# NOTE: the new row keeps index label 0, the same label as the existing row, and
# re-running this cell appends another copy of the row.
smote_score = pd.DataFrame(metrics, index=[0])
smote_score.insert(loc=0, column='Random Forest with', value='SMOTE_NY')
scores = pd.concat([scores, smote_score], axis=0)
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,SMOTE_TX,0.998044,0.702381,0.522124,0.598985
0,SMOTE_NY,0.99716,0.811966,0.542857,0.650685


# Continual learning

In [44]:
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

import sys
# Extend the import path with the project sources and the generative_rf checkout
# (presumably where `data`, `evaluate` and `generative_rf` live — confirm).
sys.path.append('../src')
sys.path.append('../generative_rf')
import numpy as np
import pandas as pd
from generative_rf import FeatureGenerator, class_sampling
from data.loader import DataLoader
from data.preparation import ContinousDataPreparation
from sklearn.ensemble import RandomForestClassifier
from evaluate.evaluate import evaluate_model
from tqdm import tqdm

In [45]:
# Loader over the training CSV; reused below to fetch one state at a time.
loader = DataLoader('../data/fraudTrain.csv')
prep = ContinousDataPreparation()
# fit() is called without any data.
# NOTE(review): presumably it only sets up data-independent state — confirm in data.preparation.
prep.fit()

<data.preparation.ContinousDataPreparation at 0x1b222277d90>

In [46]:
# Test data for the continual experiment: load() is called without a state id,
# then the features go through the preparation step fitted above.
raw_test_features, y_test = DataLoader('../data/fraudTest.csv').load()
X_test = prep.transform(raw_test_features)
# Drop the extra name; only X_test / y_test are used from here on.
del raw_test_features

In [47]:
# Baseline for the continual experiment: train a forest on state 0 only and
# score it on the prepared test set.
gen_rf = FeatureGenerator()
X, y = loader.load_state_by_id(0)
X = prep.transform(X)

# Fixed seed so the baseline row is reproducible across re-runs.
rfc = RandomForestClassifier(n_estimators=75, n_jobs=-1, random_state=42).fit(X, y)

y_pred = rfc.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
# Row 0 of the results table; later steps are appended with their state id as index.
scores = pd.DataFrame(metrics, index=[0])

In [48]:
# Show the baseline metrics (forest trained on state 0 only).
scores

Unnamed: 0,accuracy,precision,recall,f1_score
0,0.971178,0.046904,0.334732,0.082278


In [49]:
# Register the state-0 forest with the generator and feed it the state-0 data.
gen_rf.register(rfc).reinforce(X)
gen_rf.update_moments(X)

# Replay states 1-4 one at a time. The upper bound is hard-coded;
# loader.get_state_list_size() is presumably the way to cover every state — confirm.
step_scores = []
for i in tqdm(range(1, 5)):
  X_new, y_new = loader.load_state_by_id(i)
  X_new = prep.transform(X_new)
  # generate new data
  # please tailor approx_n for the problem at hand
  # Boti: len(y_new) to be the same size as the new data
  X_gen, w_gen = gen_rf.generate(approx_n=len(y_new))
  proba = gen_rf.predict_proba(X_gen)
  X_gen, y_gen, sample_weights = class_sampling(X_gen, proba, w_gen)

  # merge with current data; every real row gets unit weight
  X_all = np.concatenate([X_new, X_gen], axis=0)
  y_all = np.concatenate([y_new, y_gen], axis=0)
  w = np.concatenate([np.ones(len(y_new)), sample_weights], axis=0)

  # train a new forest from all the data; the seed fixes the forest itself
  # (NOTE(review): gen_rf.generate may still be stochastic — confirm)
  rfc = RandomForestClassifier(n_estimators=75, n_jobs=-1, random_state=42).fit(X_all, y_all, sample_weight=w)
  gen_rf.register(rfc).reinforce(X_gen, w_gen)

  # this is called regardless of retraining
  gen_rf.reinforce(X_new).update_moments(X_new)

  # score the refreshed forest on the fixed test set; the row is indexed by state id
  y_pred = rfc.predict(X_test)
  metrics = evaluate_model(y_test, y_pred)
  step_scores.append(pd.DataFrame(metrics, index=[i]))

# One concat after the loop instead of regrowing `scores` on every iteration.
scores = pd.concat([scores, *step_scores])
scores

100%|██████████| 4/4 [01:30<00:00, 22.72s/it]


Unnamed: 0,accuracy,precision,recall,f1_score
0,0.971178,0.046904,0.334732,0.082278
1,0.948989,0.03234,0.422378,0.06008
2,0.809634,0.009131,0.449417,0.017899
3,0.593131,0.005847,0.617716,0.011584
4,0.583628,0.004937,0.532867,0.009783


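I guess this did not work out on the first try :D — precision and F1 drop with every added state (F1 0.082 → 0.010), and accuracy falls from 0.971 to 0.584.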