In [1]:
import os
import sys
# Register the parent directory on the import path (once), presumably so that the
# project-local `core` package imported by later cells can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import r2_score
import seaborn as sb

In [2]:
from core.utils.preprocessing import df_to_xy
# Fixed seed for the train/test partition so that re-running the notebook reproduces
# the same split (the previous global np.random.seed call was disabled).
SEED = 7

# Read and sanitize the data
df = pd.read_excel("../data/t00/data.xls")

# NOTE(review): df.min()/df.max() are taken over the whole frame, i.e. before the split
# below; if df_to_xy uses them for scaling, test rows influence the training features.
x, y = df_to_xy(df, fuse_risk=True, normalize=True, df_min=df.min(), df_max=df.max())

# Hold out 40% of the samples for testing.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=2/5, random_state=SEED)

In [3]:
# Inspect the first feature row returned by df_to_xy (which was called with normalize=True).
x[0]

array([0.21052632, 0.24324324, 0.26315789, 1.        , 0.05928854,
       1.        , 0.76666658, 0.51351348, 0.39393936, 0.14734101])

**Loss function**

**Metrics**

In [4]:
from core.models.metrics import gain_mean, avg_gain_ratio

**Model: No data Augmentation**

In [5]:
# model = KerasModel(loss=loss_tf, metrics=[gain_tf], batch_size=256, epochs=1000)
# da = DACombine()
# xTrain_a, yTrain_a = xTrain.astype(K.floatx()), yTrain.astype(K.floatx())
# split = int(xTrain.shape[1] * 0.75)
# #xTrain_a, yTrain_a = xTrain[:split], yTrain[:split]
# xVal, yVal = xTrain[split:], yTrain[split:]
# #xTrain_a, yTrain_a = da.fit_predict(xTrain, yTrain, size=1024, distance=10)
# history = model.fit(xTrain_a.astype(K.floatx()), yTrain_a.astype(K.floatx()),  validation_split=0.25, verbose=0)

# #print(history.history)

# loss_hist = pd.DataFrame(data={'loss': history.history['loss'], 'val_loss': history.history['val_loss']})
# loss_hist.plot()

# acc_hist = pd.DataFrame(data={'acc': history.history['gain_tf'], 'val_acc': history.history['val_gain_tf']})
# acc_hist.plot()

# yPred = model.predict(xTest, batch_size=128)

# out_data = pd.DataFrame(data={'y_test': np.ravel(yTest), 'y_pred': np.ravel(yPred)})
# #stl = model.score(xTest, yTest, verbose=0)

In [6]:
# yPred = model.predict(xTest)

# out_data = pd.DataFrame(data={'y_test': np.ravel(yTest), 'y_pred': np.ravel(yPred)})
# out_data.plot()

# yPred = model.predict(xTest)
# print("gain_mean: ", gain_mean(yTest.ravel(), yPred.ravel()))
# print("gain_ratio: ", avg_gain_ratio(yTest.ravel(), yPred.ravel()))

**Model with data augmentation**

In [7]:
# #sci-kit like training
# model = KerasModel(loss=loss_tf, metrics=[gain_tf], batch_size=30, epochs=200)
# da = DACombine()
# xTrain_a, yTrain_a = xTrain, yTrain
# split = int(xTrain.shape[1] * 0.75)
# xTrain_a, yTrain_a = xTrain[:split], yTrain[:split]
# xVal, yVal = xTrain[split:], yTrain[split:]
# xTrain_a, yTrain_a = da.fit_predict(xTrain_a, yTrain_a, size=xTrain_a.shape[1]*16, distance=10, retarget=True, distribution=True, combine=True)
# history = model.fit(xTrain_a.astype('float'), yTrain_a.astype('float'),  validation_data=(xVal, yVal), verbose=0)
# loss_hist = pd.DataFrame(data={'loss': history.history['loss'], 'val_loss': history.history['val_loss']})
# loss_hist.plot()

# acc_hist = pd.DataFrame(data={'acc': history.history['gain_tf'], 'val_acc': history.history['val_gain_tf']})
# acc_hist.plot()

# yPred = model.predict(xTest, batch_size=128)

# out_data = pd.DataFrame(data={'y_test': np.ravel(yTest), 'y_pred': np.ravel(yPred)})
# #stl = model.score(xTest, yTest, verbose=0)

In [8]:
# yPred = model.predict(xTest)

# out_data = pd.DataFrame(data={'y_test': np.ravel(yTest), 'y_pred': np.ravel(yPred)})
# out_data.plot()

# yPred = model.predict(xTest)
# print("gain_mean: ", gain_mean(yTest.ravel(), yPred.ravel()))
# print("gain_ratio: ", avg_gain_ratio(yTest.ravel(), yPred.ravel()))

**Model using _mse_ loss and data augmentation**

In [9]:
# #sci-kit like training
# model = KerasModel(loss='mse', metrics=[gain_tf], batch_size=60, epochs=200)
# da = DACombine()
# split = int(xTrain.shape[1] * 0.75)
# xTrain_a, yTrain_a = xTrain[:split], yTrain[:split]
# xVal, yVal = xTrain[split:], yTrain[split:]
# #xTrain_a, yTrain_a = da.fit_predict(xTrain_a, yTrain_a, size=xTrain_a.shape[1]*16, distance=10, retarget=True, distribution=True, combine=True)
# history = model.fit(xTrain_a.astype('float'), yTrain_a.astype('float'), validation_data=(xVal, yVal))
# loss_hist = pd.DataFrame(data={'loss': history.history['loss'], 'val_loss': history.history['val_loss']})
# loss_hist.plot()

# acc_hist = pd.DataFrame(data={'acc': history.history['gain_tf'], 'val_acc': history.history['val_gain_tf']})
# acc_hist.plot()

# yPred = model.predict(xTest, batch_size=128)

# out_data = pd.DataFrame(data={'y_test': np.ravel(yTest), 'y_pred': np.ravel(yPred)})
# #stl = model.score(xTest, yTest, verbose=0)

In [10]:
# yPred = model.predict(xTest)

# out_data = pd.DataFrame(data={'y_test': np.ravel(yTest), 'y_pred': np.ravel(yPred)})
# out_data.plot()

# yPred = model.predict(xTest)
# print("gain_mean: ", gain_mean(yTest.ravel(), yPred.ravel()))
# print("gain_ratio: ", avg_gain_ratio(yTest.ravel(), yPred.ravel()))

In [11]:
from core.utils.data_augmentation import DASampling, DACombine
from core.utils.preprocessing import df_to_xy, df_to_xydf
from sklearn.preprocessing import StandardScaler

# Augmenters: `da` is applied below; `das` is only referenced by the disabled
# alternative on the last line of this cell.
da = DACombine()
das = DASampling()

# Optional standardization, currently disabled:
#scaler = StandardScaler()
#scaler.fit(xTrain)
#xTrain = scaler.transform(xTrain)

# The first 75% of the training rows are handed to the augmenter; the remaining rows
# are kept aside as a validation set.
split = int(len(xTrain) * 0.75)
xTrain_a = xTrain[:split]
yTrain_a = yTrain[:split]
xVal = xTrain[split:]
yVal = yTrain[split:]

xTrain_a, yTrain_a = da.fit_predict(
    xTrain_a,
    yTrain_a,
    size=10000,
    distance=5,
    retarget=True,
    distribution=True,
    combine=True,
)
# Distinct target values present after augmentation.
print(np.unique(yTrain_a))
#xTrain_a, yTrain_a = das.generate_data(xTrain, yTrain, size=10000)


[  0  10  15  40  45  55  60  65  75  80  85  90  95 100 105 110 125 130
 155]


In [12]:
from core.models.oracle import OracleModel

In [13]:
from core.utils.benchmark import process_benchmark_cv

# Smoke test: fit a default-configured OracleModel on the training split.
model = OracleModel()

# process_benchmark_cv(model, xTrain, yTrain)
# Fit the instance bound to `model`; previously a second, throw-away OracleModel() was
# fitted here and `model` itself was left untrained.
model.fit(xTrain, yTrain)

  y = column_or_1d(y, warn=True)
  self.model.fit(xTrain_a, yTrain_a)


In [14]:
from core.utils.benchmark import process_benchmarks
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.neural_network import MLPClassifier
from core.models.acceptance import AcceptanceModel
from core.models.featureless import EMModel, RandomModel
from core.models.cluster import ClusterExtModel

from copy import deepcopy

# Candidate estimators; every one is tried both in the oracle slot and in the model slot.
MODELS = {
    "random": RandomModel(),
    "svr": LinearSVR(),
    "pagg": PassiveAggressiveClassifier(),
    "forest": RandomForestClassifier(n_estimators=100),
    "cluster": ClusterExtModel(base_model="affinity")
}

# Build every (oracle, model) pairing. Each OracleModel gets its own deep copies of the
# templates: without them a single estimator object is shared by several OracleModels
# (and is simultaneously oracle and model when both keys match), so state fitted for one
# benchmark leaks into the next.
# NOTE(review): the recorded run of this cell failed inside a partial_fit call with
# "classes ... is not the same as on last call to partial_fit"; shared instances are a
# likely contributor — confirm that the error is gone once instances are independent.
benchmark_models = {}
for key_orac, orac in MODELS.items():
    for key_mod, mod in MODELS.items():
        benchmark_models[f"oracle_{key_orac}_{key_mod}"] = OracleModel(deepcopy(orac), deepcopy(mod))

# Run all pairings on the full data set without data augmentation.
results = process_benchmarks(benchmark_models, x, y.ravel(), augment_data=[None])

# Aggregate each benchmark's result table into per-metric mean / standard deviation.
results_mean = {key: item.mean() for key, item in results.items()}
results_std = {key: item.std() for key, item in results.items()}

# One row per benchmark, ordered by ascending "avg_loss_ratio".
results_df = pd.DataFrame(results_mean).T.sort_values("avg_loss_ratio")
results_df

`classes=array([  0,   5,  10,  11,  14,  15,  18,  20,  22,  24,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  63,  64,  65,  66,  67,
        69,  70,  71,  74,  75,  80,  85,  90,  95, 100, 105, 120, 125,
       140, 150, 195])` is not the same as on last call to partial_fit, was: array([  5,  10,  40,  50,  70,  75,  80,  85,  90, 100, 105, 120, 125,
       150])


ValueError: classes should include all valid labels that can be in y

In [36]:
# pd.DataFrame(results_std).T * 100/ pd.DataFrame(results_mean).T
# Fit a standalone ClusterExtModel on the training split, then display the distinct values
# it predicts for the test split next to the distinct targets actually present there.
m = ClusterExtModel(base_model="affinity")
m.fit(xTrain, yTrain)
np.unique(m.predict(xTest)), np.unique(yTest)

(array([ 75.,  80., 100., 105.]),
 array([ 10,  15,  20,  25,  30,  40,  50,  60,  70,  75,  80,  90,  95,
        100, 110, 120, 125, 140, 150, 195]))

In [None]:
from keras import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras import optimizers
# Number of output classes.
# NOTE(review): 200 mirrors the `to_categorical(..., 200)` calls in the disabled training
# cell of this notebook — confirm it covers the largest target value.
N_CLASSES = 200

NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal', input_dim=xTrain.shape[1], activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))

# The Output Layer :
# sparse_categorical_crossentropy needs one probability per class, so the head is a softmax
# over N_CLASSES units. The previous single linear unit could only ever represent class 0
# and made `np.argmax(NN_model.predict(...), axis=1)` meaningless.
NN_model.add(Dense(N_CLASSES, kernel_initializer='normal', activation='softmax'))

# Compile the network :
# With the sparse loss, targets are passed as integer class ids (not one-hot encoded).
NN_model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizers.RMSprop(lr=0.001), metrics=['acc'])
NN_model.summary()

In [None]:
# yPred = np.argmax(NN_model.predict(xTest), axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.neural_network import MLPClassifier
from core.models.acceptance import AcceptanceModel


# Oracle / model pair under test. Alternatives from earlier experiments are kept as
# comments; the MLPClassifier that used to be instantiated here was overwritten by the
# LinearSVC below before it was ever used, so it is now disabled as well.
#orac = MLPClassifier((1000,))
#orac = LinearSVC()
#mod = MLPClassifier((1000,))
#orac = mod = NN_model
orac = LinearSVC()
mod = AcceptanceModel()

# NOTE(review): the arguments are passed as (mod, orac) here, whereas the benchmark cell
# passes them in (orac, mod) order — confirm which order OracleModel expects.
model = OracleModel(mod, orac)
model.fit(xTrain, yTrain.reshape(-1, 1))

# NOTE(review): this prediction is overwritten below before it is scored or displayed.
yPred = model.predict(xTest)

# Reference: a trained AcceptanceModel on its own, scored on train and test splits.
m2 = AcceptanceModel.get_trained_model(xTrain, yTrain)
yPred = m2.predict(xTrain)
print("acc: ", avg_gain_ratio(yTrain, yPred))
yPred = m2.predict(xTest)
print("val_acc: ", avg_gain_ratio(yTest, yPred))

**CMP Model**

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPClassifier
from core.models.acceptance import AcceptanceModel
# Oracle for this experiment: a random forest fitted on xTrain_a / yTrain_a
# (the arrays produced by the data-augmentation cell; targets flattened to 1-D).
oracle = RandomForestClassifier(n_estimators=20, n_jobs=-1, max_depth=32)
# Alternative oracles that were tried and are currently disabled:
#oracle = MLPClassifier()
#oracle = LinearSVR()
#oracle.fit(xTrain_a, yTrain_a.ravel())
#oracle = AcceptanceModel.get_trained_model(xTrain=xTrain_a, yTrain=yTrain_a.ravel(), epochs=3)
#oracle = LinearSVR()
oracle.fit(xTrain_a, yTrain_a.ravel())
#oracle.partial_fit(xTrain, yTrain)


In [None]:
# Diagnostics for the fitted oracle on the data it was trained on.
# Targets are flattened before every comparison: the predictions are 1-D, and comparing
# them with a column-vector target would broadcast to an N x N matrix instead of
# comparing element by element (np.ravel is a no-op for targets that are already 1-D).
yPred = oracle.predict(xTrain_a)
print(np.unique(yPred))
print("train acc: ", gain_mean(np.ravel(yTrain_a), yPred))
# Number of training samples the oracle reproduces exactly.
print((yPred == np.ravel(yTrain_a)).sum())
print("acc: ", avg_gain_ratio(np.ravel(yTrain), oracle.predict(xTrain)))

In [None]:
# Build an oracle-labelled training set: a copy of the xTrain_a inputs paired with the
# labels the fitted oracle predicts for them.
xTrain_o = xTrain_a.copy()
yTrain_o = oracle.predict(xTrain_o)

In [None]:
# Distinct oracle labels, cast to int for display.
np.unique(yTrain_o.astype(int))

In [None]:
# history = NN_model.fit(xTrain_o, to_categorical(yTrain_o, 200), epochs=200, shuffle=True, validation_split=0.2, batch_size=1024, verbose=0)
# NN_model.compile(loss='categorical_crossentropy', optimizer=optimizers.RMSprop(lr=0.00001), metrics=['acc'])
# history2 = NN_model.fit(xTrain, to_categorical(yTrain, 200), epochs=3000, validation_data=(xVal, to_categorical(yVal, 200)), shuffle=True, batch_size=128, verbose=0)

In [None]:
from core.models.deep import KerasModel
from core.models.acceptance import AcceptanceModel
from sklearn.linear_model import (LogisticRegression, LogisticRegressionCV, LinearRegression, ARDRegression,
                                  ElasticNet, ElasticNetCV)
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.model_selection import cross_val_score, cross_validate

# Disabled alternatives from earlier experiments:
#model = KerasModel()
# model.fit(xTrain_o, yTrain_o.ravel().astype(int), batch_size=512)
#model = AcceptanceModel()
#model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
#model = LinearSVR()
#model.fit(xTrain_o, yTrain_o.ravel())
#print(history.history)
# model = LinearRegression()
# model = SVC(gamma="auto")

def avg_gain_score(estimator, x, y):
    """Scorer callable with the ``(estimator, X, y)`` signature used for ``scoring=``.

    Predicts ``x`` with the already fitted ``estimator`` and returns
    ``avg_gain_ratio(y, prediction)``.
    """
    return avg_gain_ratio(y, estimator.predict(x))


# Estimator under evaluation (a LinearRegression assigned earlier in this cell was
# overwritten here before use, so that assignment is now only listed as a comment above).
model = LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto')

# 5-fold cross-validation on the training split; the fitted per-fold estimators are kept
# (return_estimator=True) so they can be scored on the test split afterwards.
cv_res = cross_validate(model, xTrain, yTrain.ravel(), scoring=avg_gain_score, cv=5, return_train_score=True, return_estimator=True)
cv_res

In [None]:
# Score every fold's fitted estimator on the held-out test split.
# A dedicated loop name is used so the notebook-level `model` is not silently rebound to
# the last fold's estimator, and the targets are flattened so the metric receives the same
# 1-D shape it gets when called through the cross-validation scorer.
for fold_estimator in cv_res["estimator"]:
    yPred = fold_estimator.predict(xTest)
    print("acc: ", avg_gain_ratio(yTest.ravel(), yPred))
# loss_hist = pd.DataFrame(data={'loss': history.history['loss'], 'val_loss': history.history['val_loss']})
# loss_hist.plot()

# acc_hist = pd.DataFrame(data={'acc': history.history['acc'], 'val_acc': history.history['val_acc']})
# acc_hist.plot()

# loss_hist = pd.DataFrame(data={'loss': history2.history['loss'], 'val_loss': history2.history['val_loss']})
# loss_hist.plot()

# acc_hist = pd.DataFrame(data={'acc': history2.history['acc'], 'val_acc': history2.history['val_acc']})
# acc_hist.plot()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Exhaustive hyper-parameter search for LogisticRegression, scored with avg_gain_score.
model = LogisticRegression()
# Not every combination below is valid (e.g. some penalty/solver/dual pairings are
# rejected by scikit-learn); error_score=0.0 makes such candidates score 0.0 instead of
# aborting the whole search.
parameters = {
    'penalty': ('l1', 'elasticnet', 'l2', 'none'),
    'dual': (False, True),
    'solver': ('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga' ),
    'multi_class': ('ovr', 'multinomial', 'auto'),
    'max_iter': (100, 500, 1000),
    'class_weight': ('balanced', None),
    'C':[1, 10]
}
grid = GridSearchCV(model, parameters, cv=3, n_jobs=-1, scoring=avg_gain_score, return_train_score=True, error_score=0.0)
# Targets are flattened to 1-D, matching the other scikit-learn fits in this notebook and
# avoiding a column-vector conversion warning on every candidate fit.
grid.fit(xTrain, yTrain.ravel())
print(grid.cv_results_["mean_test_score"].max())

# Evaluate the refitted best estimator on the held-out test split.
yPred = grid.predict(xTest)
print(np.unique(yPred))
avg_gain_ratio(yTest.ravel(), yPred)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Exhaustive hyper-parameter search for RandomForestClassifier, scored with avg_gain_score.
model = RandomForestClassifier()
# `None` is not an accepted value for min_samples_split / min_samples_leaf. Because of
# error_score=0.0 every candidate containing it failed to fit and was silently scored 0.0,
# wasting 5 of every 9 grid points. The defaults (2 and 1) are listed explicitly instead.
# "sqrt" is what "auto" meant for classifiers and is accepted by every scikit-learn release.
parameters = {
    'n_estimators': (10, 100, 500),
    'criterion': ("gini", "entropy"),
    'max_depth': (None, 8, 32),
    'min_samples_split': (2, 16),
    'min_samples_leaf': (1, 16),
    'max_features': ("sqrt", None),
}
grid = GridSearchCV(model, parameters, cv=3, n_jobs=-1, scoring=avg_gain_score, return_train_score=True, error_score=0.0)
# The search is restricted to the first 200 training rows; targets are flattened to 1-D
# as expected by scikit-learn classifiers.
grid.fit(xTrain[:200], yTrain[:200].ravel())

print(grid.cv_results_["mean_test_score"].max())

# Evaluate the refitted best estimator on the held-out test split.
yPred = grid.predict(xTest)
print(np.unique(yPred))
avg_gain_ratio(yTest.ravel(), yPred)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import LinearSVR
# model = LinearSVR()
# parameters = {
#     'loss': ('epsilon_insensitive', 'squared_epsilon_insensitive'),
#     'fit_intercept': (False, True),
# #     'solver': ('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga' ),
# #     'multi_class': ('ovr', 'multinomial', 'auto'),
#     'max_iter': (100, 500, 1000),
# #     'class_weight': ('balanced', None),
#     'C':[1, 10]
# }
# grid = GridSearchCV(model, parameters, cv=3, n_jobs=-1, scoring=avg_gain_score, return_train_score=True, error_score=0.0)
# grid.fit(xTrain, yTrain)
# print(grid.cv_results_["mean_test_score"].max())

# yPred = grid.predict(xTest)
# print(np.unique(yPred))
# avg_gain_ratio(yTest, yPred)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import LinearSVC
# model = LinearSVC()
# parameters = {
#     'penalty': ('l1', 'l2'),
#     'loss': ('hinge', 'squared_hinge'),
#     'dual': (True, False),
# #     'shrinking': (True, False),
# #     'solver': ('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga' ),
#     'multi_class': ('ovr', 'crammer_singer'),
#     'max_iter': (-1, 500, 1000),
# #     'class_weight': ({i*5: 1 for i in range(41)},)
# }
# grid = GridSearchCV(model, parameters, cv=3, n_jobs=-1, scoring=avg_gain_score, return_train_score=True, error_score=0.0)
# grid.fit(xTrain, yTrain)
# print(grid.cv_results_["mean_test_score"].max())

# yPred = grid.predict(xTest)
# print(np.unique(yPred))
# avg_gain_ratio(yTest, yPred)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.neural_network import MLPClassifier
# model = MLPClassifier()
# parameters = {
#     'hidden_layer_sizes': (50, 100, 500),
#     'activation': ('logistic', 'tanh', 'relu'),
#     'solver': ('lbfgs', 'sgd', 'adam'),
#     'batch_size': (64, 128, "auto"),
#     'learning_rate': ('constant', 'invscaling', 'adaptive'),
#     'max_iter': (100, 200, 500),
# #     'kernel':('linear', 'rbf'),
# }
# grid = GridSearchCV(model, parameters, cv=3, n_jobs=-1, scoring=avg_gain_score, return_train_score=True, error_score=0.0)
# grid.fit(xTrain, yTrain)
# grid.cv_results_["mean_test_score"].max()

In [None]:
# Train the project's KerasModel (batch_size=512, everything else left at its defaults)
# on xTrain_a / yTrain_a, then score its test-split predictions with avg_gain_ratio.
model = KerasModel(batch_size=512)
model.fit(xTrain_a, yTrain_a)
yPred = model.predict(xTest)
avg_gain_ratio(yTest, yPred)

In [None]:

# Call fit again on the current `model`, this time with xTrain / yTrain and epochs=300,
# then show the distinct predicted values and the avg_gain_ratio on the test split.
# NOTE(review): whether this continues from the previous weights or restarts training
# depends on the model's fit implementation, which is not visible here — confirm.
model.fit(xTrain, yTrain, epochs=300)
yPred = model.predict(xTest)
print(np.unique(yPred))
avg_gain_ratio(yTest, yPred)

In [None]:
# Show the parameters of the search object itself (deep=True also lists the wrapped
# estimator's parameters).
# NOTE(review): if the winning combination is what is wanted, that is grid.best_params_.
grid.get_params(True)

In [None]:
# Score the current `model` on the test split with gain_mean.
yPred = model.predict(xTest)
gain_mean(yTest, yPred)
# m = MLPClassifier()
# m.fit(xTrain_o, yTrain_o.ravel())
# m.partial_fit(xTrain, yTrain.ravel())
# yPred = m.predict(xVal)
# gain_mean(yVal, yPred)

In [None]:
# from keras.layers import Dense, Dropout, Embedding, LSTM
# from keras.models import Sequential
# model = Sequential()
# model.add(Embedding(input_dim=1000, output_dim=128, input_length=10))
# model.add(LSTM(units=64))
# model.add(Dropout(rate=0.5))
# model.add(Dense(1, activation='sigmoid'))


In [None]:

# model.compile(loss='mse', optimizer=optimizers.RMSprop(lr=0.001), metrics=['mse'])
# model.fit(xTrain_a, yTrain_a, batch_size=1024, epochs=200)
# model.compile(loss='mse', optimizer=optimizers.RMSprop(lr=0.00001), metrics=['mse'])
# model.fit(xTrain, yTrain, validation_data=(xVal, yVal), epochs=100, batch_size=64)

In [None]:
# Same check as in an earlier cell: predict the test split with whatever `model`
# currently is and report gain_mean.
yPred = model.predict(xTest)
gain_mean(yTest, yPred)

In [None]:
# Distinct values in the most recent predictions.
np.unique(yPred)

In [None]:
import numpy as np

# Scratch check: mean and standard deviation of a 12 x 10 matrix of uniform [0, 1) samples.
# Note that this rebinds the notebook-level name `m`.
m = np.random.random(size=(12, 10))
(m.mean(), m.std())

In [None]:
# ##### from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.svm import LinearSVR, LinearSVC
# from sklearn.linear_model import (ARDRegression, PassiveAggressiveClassifier, PassiveAggressiveRegressor,
#                                   LogisticRegression, LogisticRegressionCV, SGDClassifier, SGDRegressor,
#                                  TheilSenRegressor)
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import *
# from sklearn.decomposition import KernelPCA, MiniBatchSparsePCA, FastICA
# from sklearn.preprocessing import PolynomialFeatures

# from core.models import ConservativeModel, RandomModel, EMModel
# pol = PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)

# pol.fit_transform(xTrain).shape

# clf = Pipeline(
#     [
#         ("poly", PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)),
#         ("scaler", StandardScaler()),
#         #("pca", FastICA(8)),
#         #("forest", RandomForestClassifier(n_estimators=100, n_jobs=-1)),
#         #("rnd", RandomModel()),
#         #("conservative", ConservativeModel()),
#         #("em", EMModel()),
#         #("ard", ARDRegression()),
#         #("sgd", SGDClassifier(loss="epsilon_insensitive", penalty="l1")),
#         #("clf", RadiusNeighborsClassifier()),
#         #("bag", BaggingClassifier(n_jobs=-1)),
#         #("linear_svr", LinearSVR()),
#         #("mlp", MLPClassifier(hidden_layer_sizes=(1000, ))),
#         ("voting", VotingClassifier([
#             ("mlp", MLPClassifier()),
#             ("bag", BaggingClassifier()),
#             ("forest", RandomForestClassifier(n_estimators=100,)),
#             ("svc", LinearSVC()),
#             ("sgd", SGDClassifier()),
#             ("passiv", PassiveAggressiveClassifier())
#         ], n_jobs=-1))
        
#     ]
# )

# clf.fit(xTrain_a, yTrain_a.ravel())
# yPred = clf.predict(xTrain)
# print("train acc", avg_gain_ratio(yTrain, yPred))
# yPred = clf.predict(xTest)
# print("val acc", avg_gain_ratio(yTest, yPred))

In [None]:
# Display the first feature row of `x` again.
x[0]