In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [2]:
from pathlib import Path

In [3]:
dPath = Path("../docs/dumps")

In [4]:
import pickle

In [5]:
with open(dPath / "train_data.pkl", 'rb') as filename:
    train_data = pickle.load(filename)

In [6]:
with open(dPath / "valid_data.pkl", 'rb') as filename:
    valid_data = pickle.load(filename)

In [7]:
with open(dPath / "not_running.pkl", 'rb') as filename:
    not_running = pickle.load(filename)

In [8]:
X_train = train_data.drop("Detected", axis=1)

In [9]:
y_train = train_data.Detected

In [10]:
X_valid = valid_data.drop("Detected", axis=1)

In [11]:
y_valid = valid_data.Detected

In [12]:
with open(dPath / "rf_exp_04_names.pkl", 'rb') as filename:
    names = pickle.load(filename)

In [13]:
X_train = X_train[names] 

In [14]:
X_valid = X_valid[names]

In [20]:
from imblearn.over_sampling import ADASYN

In [21]:
sm = ADASYN(random_state=42, n_jobs=-1,  n_neighbors=5)

In [22]:
%time X_train, y_train = sm.fit_resample(X_train, y_train)

Wall time: 51.8 s


In [23]:
X_train_not_running = not_running.drop("Detected", axis=1)
y_train_not_running = not_running.Detected
X_train_not_running = X_train_not_running[names]

In [24]:
X_train = pd.concat([X_train,X_train_not_running, X_valid], ignore_index=True,axis=0)

In [25]:
y_train = pd.concat([y_train,y_train_not_running, y_valid], ignore_index=True,axis=0)

In [None]:
X_train['returnType'] = X_train.returnType.astype('int64')   

In [26]:
y_train.sum()/y_train.shape[0]

0.48736511651307735

In [27]:
from catboost import CatBoostClassifier

In [40]:
models = []
np.random.seed(42)

for i in tqdm(range(60)):
    
    cb = CatBoostClassifier(
    custom_loss=['AUC'],
    random_seed=np.random.randint(2 ** 10),
    cat_features=[1,22],
    iterations=150,
    logging_level='Silent')

    cb.fit(X_train, y_train)
    models.append(cb.copy())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))




In [41]:
with open(dPath / "cb_exp_05_stacking.pkl", 'wb') as filename:
    pickle.dump(models,filename)

In [42]:
predictions = []
for _model in tqdm(models):
    predictions.append(_model.predict_proba(X_valid)[:,1])
    
predictions = np.vstack(predictions).T

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))




In [43]:
prediction = np.hstack([predictions]).mean(axis=1)

In [44]:
prediction > 0.5

array([ True,  True, False, ..., False, False,  True])

In [45]:
from sklearn.metrics import classification_report

In [46]:
print(classification_report(y_valid, prediction >=0.5))

              precision    recall  f1-score   support

       False       0.70      0.40      0.51     16657
        True       0.83      0.94      0.88     50839

    accuracy                           0.81     67496
   macro avg       0.76      0.67      0.70     67496
weighted avg       0.80      0.81      0.79     67496



In [47]:
from sklearn import metrics

In [48]:
fpr, tpr, thresholds = metrics.roc_curve(y_valid, prediction)

In [49]:
metrics.auc(fpr, tpr)

0.8045244998565659