In [1]:
import numpy as np

In [2]:
import os

In [52]:
# Output directory: the cached Xy_*.npz arrays, the saved predictions and the
# pickled models produced by this notebook are all written below this path.
OUT_DIR = os.path.join(
    "/", "data", "km3net", "ml_models_results", "updown_classification"
)

In [11]:
# Directory handed to get_train_validation_test_files() to locate the
# train / validation / test files.
train_test_dir = os.path.join(
    "/", "data", "km3net", "Xy_multi_data_files", "train_test_files",
    "updown_numu_nue_stratified",
)

In [4]:
os.makedirs(OUT_DIR, exist_ok=True)

In [5]:
from data_files import get_train_validation_test_files

In [6]:
from tqdm import tqdm

In [13]:
# Resolve the train / validation / test file lists (and the index file list).
(fnames_train,
 fnames_val,
 fnames_test,
 index_filelist) = get_train_validation_test_files(
    n_files=100,
    train_test_dir=train_test_dir,
)

In [14]:
len(fnames_train)

100

In [15]:
# np.load() on an .npz archive returns a lazy NpzFile that keeps the file
# open; read the targets inside a context manager so the handle is released.
with np.load(fnames_train[0]) as first_train_file:
    y_first = first_train_file['y']
y_first

array([-0.697193, -0.603984,  0.896572, ..., -0.612716,  0.871776,
        0.665704])

### Training Data

In [26]:
X = None
y = None

def to_label(y):
    """Binarise a target array: values <= 0 become 0, values > 0 become 1.

    The conversion is done on a copy, so the array passed in is left
    untouched (the function used to overwrite its argument in place).

    Parameters
    ----------
    y : array_like
        Numeric targets.

    Returns
    -------
    numpy.ndarray
        New array with the shape of ``y`` (and its dtype when ``y`` is already
        an ndarray) holding 0/1 labels.  Entries for which both comparisons
        are false (e.g. NaN) are kept unchanged.
    """
    labels = np.array(y)  # np.array copies its input by default
    labels[labels <= 0] = 0
    labels[labels > 0] = 1
    return labels

# Per-file pieces are collected in lists and stacked once after the loop:
# growing X / y with vstack / hstack on every iteration re-copies everything
# accumulated so far.
X_parts = []
y_parts = []

# Walk the train / validation lists in lock-step instead of a hard-coded
# range(100), so the loop follows the actual number of files.
n_pairs = min(len(fnames_train), len(fnames_val))
for train_path, val_path in tqdm(zip(fnames_train, fnames_val), total=n_pairs):
    with np.load(train_path) as f_t, np.load(val_path) as f_v:
        # Collapse axes 2 and 3 of the raw 'x' arrays by summation.
        X_t = np.sum(f_t['x'], axis=(2, 3))
        X_v = np.sum(f_v['x'], axis=(2, 3))
        y_t = to_label(f_t['y'])
        y_v = to_label(f_v['y'])
    # Merge the training and validation samples of this file pair.  These two
    # steps used to be commented out, which left X_f / y_f undefined on a
    # fresh kernel (or pointing at stale values from an earlier run).
    X_parts.append(np.vstack((X_t, X_v)))
    y_parts.append(np.hstack((y_t, y_v)))

# X / y stay None when there was nothing to load.
X = np.vstack(X_parts) if X_parts else None
y = np.hstack(y_parts) if y_parts else None

100%|██████████| 100/100 [05:57<00:00,  3.58s/it]


In [16]:
def import_X_y(filelist, set_name, fdata = lambda X: X, ftarget = lambda y: y):
    """Load, or build and cache, the feature and target arrays of one data set.

    If ``Xy_<set_name>.npz`` already exists in ``OUT_DIR`` the arrays are read
    from it and returned unchanged.  Otherwise every file in ``filelist`` is
    opened, ``fdata`` is applied to its ``'x'`` entry and ``ftarget`` to its
    ``'y'`` entry, the per-file results are stacked and the cache file is
    written.

    NOTE: the cache is keyed on ``set_name`` only.  A cache written with a
    different ``filelist`` / ``fdata`` / ``ftarget`` is returned as-is, so
    delete the file whenever one of these changes.

    Parameters
    ----------
    filelist : sequence of str
        Paths of ``.npz`` files, each providing ``'x'`` and ``'y'`` entries.
        All files are used (previously only the first 100 were read).
    set_name : str
        Label used to build the cache file name.
    fdata : callable, optional
        Transformation applied to each file's ``'x'`` array (default: identity).
    ftarget : callable, optional
        Transformation applied to each file's ``'y'`` array (default: identity).

    Returns
    -------
    X, y : numpy.ndarray
        Per-file features stacked with ``np.vstack`` and per-file targets
        stacked with ``np.hstack``.

    Raises
    ------
    ValueError
        If no cache exists and ``filelist`` is empty.
    """
    filename = os.path.join(OUT_DIR, "Xy_{}.npz".format(set_name))

    # Cache hit: return straight away.  The file used to be re-written with
    # np.savez even in this case, which is pure overhead.
    if os.path.exists(filename):
        with np.load(filename) as xy:
            return xy["X"], xy["y"]

    if len(filelist) == 0:
        raise ValueError(
            "import_X_y: empty file list and no cache file {}".format(filename))

    # Collect the per-file arrays and stack them once; stacking inside the
    # loop re-copies the accumulated data on every iteration.
    X_parts = []
    y_parts = []
    for path in tqdm(filelist):
        with np.load(path) as f_t:
            X_parts.append(fdata(f_t['x']))
            y_parts.append(ftarget(f_t['y']))

    X = np.vstack(X_parts)
    y = np.hstack(y_parts)

    np.savez(filename, X=X, y=y)
    return X, y

In [17]:
TZ_AXIS = (2, 3)
def get_Time_Coord(X):
    """Sum ``X`` over the axes listed in ``TZ_AXIS`` and flatten each sample.

    The summed array must be 3-D; its last two axes are merged, so the result
    has shape ``(n_samples, d1 * d2)``.
    """
    summed = np.sum(X, axis=TZ_AXIS)
    n_samples = summed.shape[0]
    n_features = summed.shape[1] * summed.shape[2]
    return summed.reshape((n_samples, n_features))

In [18]:
def to_label(y):
    """Binarise a target array: values <= 0 become 0, values > 0 become 1.

    The conversion is done on a copy, so the array passed in is left
    untouched (the function used to overwrite its argument in place).

    Parameters
    ----------
    y : array_like
        Numeric targets.

    Returns
    -------
    numpy.ndarray
        New array with the shape of ``y`` (and its dtype when ``y`` is already
        an ndarray) holding 0/1 labels.  Entries for which both comparisons
        are false (e.g. NaN) are kept unchanged.
    """
    labels = np.array(y)  # np.array copies its input by default
    labels[labels <= 0] = 0
    labels[labels > 0] = 1
    return labels

In [19]:
# Training set: read from the Xy_train.npz cache in OUT_DIR when it exists,
# otherwise built from the raw training files.
X_train, y_train = import_X_y(
    fnames_train,
    "train",
    fdata=get_Time_Coord,
    ftarget=to_label,
)

100%|██████████| 100/100 [03:52<00:00,  2.32s/it]


In [20]:
X_train.shape

(165610, 1350)

In [21]:
X_train.size

223573500

In [22]:
y_train.shape

(165610,)

### Test Data

In [23]:
# Test set: read from the Xy_test.npz cache in OUT_DIR when it exists,
# otherwise built from the raw test files.
X_test, y_test = import_X_y(
    fnames_test,
    "test",
    fdata=get_Time_Coord,
    ftarget=to_label,
)

100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


In [24]:
X_test.shape

(51818, 1350)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

## Random Forest

In [26]:
# Random forest with 500 trees: entropy split criterion, fixed seed,
# n_jobs=-1 for parallel fitting, verbose progress output.
rf_500 = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

In [28]:
rf_500.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 16.2min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [29]:
y_pred_rf_500 = rf_500.predict(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    4.2s finished


In [30]:
acc_rf_500 = accuracy_score(y_test, y_pred_rf_500)

In [31]:
cfm_rf_500 = confusion_matrix(y_test, y_pred_rf_500)

In [32]:
print(cfm_rf_500)

[[20609  2994]
 [ 2729 25486]]


In [33]:
acc_rf_500

0.88955575282720289

In [34]:
np.savetxt(os.path.join(OUT_DIR, "rf_500_test_cls_pred.txt"), y_pred_rf_500)

In [35]:
y_pred_rf500_proba = rf_500.predict_proba(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    4.4s finished


In [36]:
np.savetxt(os.path.join(OUT_DIR, 'rf_500_test_prob_pred.txt'), y_pred_rf500_proba)

In [37]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; use the standalone joblib package and fall back to the vendored copy
# only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [38]:
joblib.dump(rf_500, filename=os.path.join(OUT_DIR,'rf_500.pkl'))

['ml_models_results/updown_classification/rf_500.pkl']

In [39]:
# Smaller random forest (100 trees) with otherwise the same settings as
# rf_500, for comparison.
rf_100 = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

In [40]:
rf_100.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [41]:
y_pred_rf_100 = rf_100.predict(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.9s finished


In [42]:
acc_rf_100 = accuracy_score(y_test, y_pred_rf_100)
cfm_rf_100 = confusion_matrix(y_test, y_pred_rf_100)

In [43]:
print(acc_rf_100)

0.885599598595


In [44]:
print(cfm_rf_100)

[[20586  3017]
 [ 2911 25304]]


In [45]:
np.savetxt(os.path.join(OUT_DIR, "rf_100_test_cls_pred.txt"), y_pred_rf_100)

In [46]:
y_pred_rf100_proba = rf_100.predict_proba(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.9s finished


In [47]:
np.savetxt(os.path.join(OUT_DIR, 'rf_100_test_prob_pred.txt'), y_pred_rf100_proba)

In [48]:
joblib.dump(rf_100, filename=os.path.join(OUT_DIR,'rf_100.pkl'))

['ml_models_results/updown_classification/rf_100.pkl']

## AdaBoost Classifier

In [49]:
# AdaBoost ensemble with 500 estimators and a fixed seed; the base estimator
# is left at the library default.
bdt = AdaBoostClassifier(
    n_estimators=500,
    random_state=42,
)

In [50]:
bdt.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=500, random_state=42)

In [53]:
joblib.dump(bdt, filename=os.path.join(OUT_DIR,'bdt.pkl'))

['/data/km3net/ml_models_results/updown_classification/bdt.pkl']

In [54]:
y_pred_bdt = bdt.predict(X_test)

In [55]:
accuracy_score(y_test, y_pred_bdt)

0.66812690570844113

In [56]:
confusion_matrix(y_test, y_pred_bdt)

array([[10808, 12795],
       [ 4402, 23813]])

In [57]:
np.savetxt(os.path.join(OUT_DIR, "bdt_test_cls_pred.txt"), y_pred_bdt)