In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import credit_pipeline.training as tr
import credit_pipeline.evaluate as ev

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from mpi4py import MPI
import socket

In [4]:
# Load the asteroid table; the first CSV column is used as the row index.
df = pd.read_csv(
    'data/pha-asteroids.csv',
    index_col=0,
)

In [5]:
# Target is the `pha` column; `class` is kept aside so it can be
# re-attached to the feature frames in a later cell.
y = df['pha']
classCol = df['class']

# Feature matrix: every column except the target, `class` and `neo`.
X = df.drop(columns=['pha', 'class', 'neo'])

# Split features, target and the class column together so that the rows
# of all three stay aligned (80/20 split, fixed seed).
X_train, X_test, y_train, y_test, C_train, C_test = train_test_split(
    X,
    y,
    classCol,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Parameters of the Gaussian noise: mean and standard deviation.
mu = 0
sigma = 10

# Seed the legacy global NumPy RNG so the noise matrix is reproducible.
np.random.seed(42)

# One noise value per cell of the training feature matrix.
noise = np.random.normal(loc=mu, scale=sigma, size=X_train.shape)
noise


array([[  4.96714153,  -1.38264301,   6.47688538, ...,  -2.34153375,
         -2.34136957,  15.79212816],
       [  7.67434729,  -4.69474386,   5.42560044, ...,  -4.65729754,
          2.41962272, -19.13280245],
       [-17.24917833,  -5.62287529, -10.1283112 , ...,  -9.08024076,
        -14.12303701,  14.65648769],
       ...,
       [ -5.01156421,  -6.02751129,   3.36559513, ..., -15.93566072,
        -10.94954361,  -4.52581385],
       [  3.28019041,   9.16418866, -12.28727187, ...,  -1.80357532,
         -6.90569901,  -4.95870709],
       [ 14.29080638, -13.27926843,  -2.0788138 , ...,  -7.98021547,
         10.24630795,  15.00234551]])

In [9]:
from sklearn.feature_selection import r_regression

# Absolute value of r_regression's per-feature output against the target,
# labelled by column name and ordered from largest to smallest.
r_value = pd.Series(
    abs(r_regression(X_train, y_train)),
    index=X_train.columns,
).sort_values(ascending=False)

# Show both the resulting feature order and the values themselves.
r_value.index, r_value

(Index(['H', 'moid', 'e', 'i', 'ma', 'n', 'a'], dtype='object'),
 H       0.296144
 moid    0.214034
 e       0.157835
 i       0.042526
 ma      0.020235
 n       0.013764
 a       0.000427
 dtype: float64)

In [33]:
# Scratch check: drop the last k entries of the ranked feature list
# (k == 0 keeps the whole list because the slice stop becomes None).
k = 2
['H', 'moid', 'e', 'i', 'ma', 'n', 'a'][:-k if k > 0 else None]

['H', 'moid', 'e', 'i', 'ma']

In [152]:
#Train
X_noisy_train = X_train + noise
XC_train = pd.concat([X_train, C_train], axis=1)
XC_noisy_train = pd.concat([X_noisy_train, C_train], axis=1)

#Test
XC_test = pd.concat([X_test, C_test], axis=1)

In [153]:
# Baseline: clean training features (no `class` column), clean test split.
clf = tr.create_pipeline(
    X_train,
    y_train,
    LGBMClassifier(verbose=-1),
    do_EBE=False,
)
clf.fit(X_train, y_train)

# Last column of predict_proba, presumably the positive-class probability.
y_prob = clf.predict_proba(X_test)[:, -1]
ev.roc_auc_score(y_test, y_prob)

np.float64(0.9999069866326503)

In [154]:
# Noisy training features (no `class` column), scored on the clean test split.
clf = tr.create_pipeline(
    X_noisy_train,
    y_train,
    LGBMClassifier(verbose=-1),
    do_EBE=False,
)
clf.fit(X_noisy_train, y_train)

# Last column of predict_proba, presumably the positive-class probability.
y_prob = clf.predict_proba(X_test)[:, -1]
ev.roc_auc_score(y_test, y_prob)

np.float64(0.6330136451817868)

In [155]:
# Clean training features plus the `class` column, scored on the matching
# clean test frame that also carries `class`.
clf = tr.create_pipeline(
    XC_train,
    y_train,
    LGBMClassifier(verbose=-1),
    do_EBE=False,
)
clf.fit(XC_train, y_train)

# Last column of predict_proba, presumably the positive-class probability.
y_prob = clf.predict_proba(XC_test)[:, -1]
ev.roc_auc_score(y_test, y_prob)

np.float64(0.999894906974553)

In [156]:
# Noisy training features plus the `class` column, scored on the clean
# test frame that also carries `class`.
clf = tr.create_pipeline(
    XC_noisy_train,
    y_train,
    LGBMClassifier(verbose=-1),
    do_EBE=False,
)
clf.fit(XC_noisy_train, y_train)

# Last column of predict_proba, presumably the positive-class probability.
y_prob = clf.predict_proba(XC_test)[:, -1]
ev.roc_auc_score(y_test, y_prob)

np.float64(0.7840857752362177)