In [1]:
import numpy as np

In [2]:
import os

In [3]:
OUT_DIR = os.path.join("/","data", "km3net", "ml_models_results","energy")

In [4]:
train_test_dir=os.path.join("/", "data","km3net","Xy_multi_data_files", "train_test_files", "log_energies_stratified") 

In [5]:
os.makedirs(OUT_DIR, exist_ok=True)

In [6]:
from data_files import get_train_validation_test_files

In [7]:
from tqdm import tqdm

In [8]:
fnames_train, fnames_val, fnames_test, index_filelist = get_train_validation_test_files(n_files=100, 
                                                                                        train_test_dir=train_test_dir)

In [9]:
len(fnames_train)

100

In [10]:
np.load(fnames_train[0])['y']

array([ 3.15108166,  2.96422002,  2.8765873 , ...,  4.75827133,
        5.28655077,  4.88470079])

### Training Data

In [11]:
def import_X_y(filelist, set_name, fdata = lambda X: X, ftarget = lambda y: y):
    filename = os.path.join(OUT_DIR, "Xy_{}.npz".format(set_name))
    if os.path.exists(filename):
        with np.load(filename) as xy:
            X = xy["X"]
            y = xy["y"]
    else:
        X = None
        y = None
        for i in tqdm(range(100)):
            with np.load(filelist[i]) as f_t:
                X_f = fdata(f_t['x'])
                y_f = ftarget(f_t['y'])
                if X is None:
                    X = X_f
                    y = y_f
                else:
                    X = np.vstack((X, X_f))
                    y = np.hstack((y, y_f))
    
    np.savez(filename, X=X, y=y)
    return X, y

In [12]:
TZ_AXIS = (2, 3)
def get_Time_Coord(X):
    TZ = np.sum(X, axis=TZ_AXIS)
    TZ = TZ.reshape(TZ.shape[0], TZ.shape[1]*TZ.shape[2])
    return TZ

In [14]:
def to_label(y):
    y[y <= 0] = 0
    y[y > 0] = 1
    return y

In [13]:
X_train, y_train = import_X_y(fnames_train, "train", fdata = get_Time_Coord)

100%|██████████| 100/100 [08:23<00:00,  5.04s/it]


In [14]:
X_train.shape

(165610, 1350)

In [15]:
X_train.size

223573500

In [16]:
y_train.shape

(165610,)

### Test Data

In [17]:
X_test, y_test = import_X_y(fnames_test, "test", fdata = get_Time_Coord)

100%|██████████| 100/100 [02:03<00:00,  1.23s/it]


In [18]:
X_test.shape

(51818, 1350)

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

## Random Forest

In [20]:
rf_500 = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42, verbose=1)

In [21]:
rf_500.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 64.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 271.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 605.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 686.3min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=False, random_state=42, verbose=1, warm_start=False)

In [25]:
y_pred_rf_500 = rf_500.predict(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   10.3s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   11.8s finished


In [26]:
mse_rf_500 = mean_squared_error(y_test, y_pred_rf_500)

In [27]:
r2_rf_500 = r2_score(y_test, y_pred_rf_500)

In [28]:
mse_rf_500

0.41391226791253521

In [29]:
print(r2_rf_500)

0.708426082056


In [34]:
np.savetxt(os.path.join(OUT_DIR, "rf_500_test_cls_pred.txt"), y_pred_rf_500)

In [35]:
y_pred_rf500_proba = rf_500.predict_proba(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    4.4s finished


In [36]:
np.savetxt(os.path.join(OUT_DIR, 'rf_500_test_prob_pred.txt'), y_pred_rf500_proba)

In [23]:
from sklearn.externals import joblib

In [24]:
joblib.dump(rf_500, filename=os.path.join(OUT_DIR,'rf_500.pkl'))

['/data/km3net/ml_models_results/energy/rf_500.pkl']

In [39]:
rf_100 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, criterion='entropy', verbose=1)

In [40]:
rf_100.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [41]:
y_pred_rf_100 = rf_100.predict(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.9s finished


In [42]:
acc_rf_100 = accuracy_score(y_test, y_pred_rf_100)
cfm_rf_100 = confusion_matrix(y_test, y_pred_rf_100)

In [43]:
print(acc_rf_100)

0.885599598595


In [44]:
print(cfm_rf_100)

[[20586  3017]
 [ 2911 25304]]


In [45]:
np.savetxt(os.path.join(OUT_DIR, "rf_100_test_cls_pred.txt"), y_pred_rf_100)

In [46]:
y_pred_rf100_proba = rf_100.predict_proba(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.9s finished


In [47]:
np.savetxt(os.path.join(OUT_DIR, 'rf_100_test_prob_pred.txt'), y_pred_rf100_proba)

In [48]:
joblib.dump(rf_100, filename=os.path.join(OUT_DIR,'rf_100.pkl'))

['ml_models_results/updown_classification/rf_100.pkl']

### Adaboost Classifier

In [49]:
bdt = AdaBoostRegressor(random_state=42, n_estimators=500)

In [None]:
bdt.fit(X_train, y_train)

In [69]:
joblib.dump(bdt, filename=os.path.join(OUT_DIR,'bdt.pkl'))

['./ml_models_results/bdt.pkl']

In [70]:
y_pred_bdt = bdt.predict(X_test)

In [71]:
accuracy_score(y_test, y_pred_bdt)

0.67053919487436797

In [72]:
confusion_matrix(y_test, y_pred_bdt)

array([[10946, 12657],
       [ 4415, 23800]])

In [None]:
np.savetxt(os.path.join(OUT_DIR, "bdt_test_cls_pred.txt"), y_pred_bdt)