In [2]:
import numpy as np

In [3]:
import os

In [4]:
# Directory where this notebook writes cached arrays, prediction files and
# pickled models.
OUT_DIR = os.path.join("/","data", "km3net", "ml_models_results","numu_nue")

In [5]:
# Directory passed to get_train_validation_test_files as `train_test_dir`.
train_test_dir=os.path.join("/", "data","km3net","Xy_multi_data_files", "train_test_files", "numu_nue_stratified_labels") 

In [6]:
# Create the output directory up front; exist_ok=True keeps re-runs of this
# cell from failing when it already exists.
os.makedirs(OUT_DIR, exist_ok=True)

In [7]:
from data_files import get_train_validation_test_files

In [8]:
from tqdm import tqdm

In [9]:
# Obtain the train / validation / test file lists (plus an index file list)
# from the project-local helper, requesting n_files=100.
# NOTE(review): the helper's exact semantics are not visible in this notebook.
fnames_train, fnames_val, fnames_test, index_filelist = get_train_validation_test_files(n_files=100, 
                                                                                        train_test_dir=train_test_dir)

In [10]:
# Sanity check: number of training files returned.
len(fnames_train)

100

In [11]:
# Peek at the targets stored in the first training file. The archive is opened
# in a context manager so its file handle is closed once the array is read
# (a bare np.load(...)[key] leaves the .npz handle open).
with np.load(fnames_train[0]) as first_train_file:
    first_train_targets = first_train_file['y']
first_train_targets

array([1, 1, 0, ..., 0, 1, 1], dtype=uint8)

### Training Data

In [12]:
def import_X_y(filelist, set_name, fdata = lambda X: X, ftarget = lambda y: y):
    """Load, or build and cache, the stacked data and targets for one split.

    The result is cached in ``OUT_DIR/Xy_<set_name>.npz``. When that file
    exists it is loaded and returned unchanged; ``filelist``, ``fdata`` and
    ``ftarget`` are ignored in that case, so delete the cache file after
    changing any of them.

    Parameters
    ----------
    filelist : sequence of str
        Paths of ``.npz`` files, each providing arrays under the keys
        ``'x'`` and ``'y'``. Every file in the list is read.
    set_name : str
        Label of the split; only used to build the cache file name.
    fdata : callable, optional
        Applied to each file's ``'x'`` array before stacking (identity by
        default).
    ftarget : callable, optional
        Applied to each file's ``'y'`` array before stacking (identity by
        default).

    Returns
    -------
    X, y : numpy.ndarray
        Per-file results combined with ``np.vstack`` (data) and ``np.hstack``
        (targets).

    Raises
    ------
    ValueError
        If no cache file exists and ``filelist`` is empty.
    """
    filename = os.path.join(OUT_DIR, "Xy_{}.npz".format(set_name))

    # Cache hit: return what is stored and do NOT rewrite the file.
    if os.path.exists(filename):
        with np.load(filename) as xy:
            return xy["X"], xy["y"]

    # Cache miss: gather the per-file pieces first and stack once at the end,
    # instead of re-copying the growing array on every iteration.
    X_parts = []
    y_parts = []
    for path in tqdm(filelist):
        with np.load(path) as f_t:
            X_parts.append(fdata(f_t['x']))
            y_parts.append(ftarget(f_t['y']))

    if not X_parts:
        raise ValueError(
            "filelist is empty and no cached file exists at {}".format(filename))

    # A single piece is returned as-is (no stacking), matching what a
    # one-file run produced before.
    X = X_parts[0] if len(X_parts) == 1 else np.vstack(X_parts)
    y = y_parts[0] if len(y_parts) == 1 else np.hstack(y_parts)

    np.savez(filename, X=X, y=y)
    return X, y

In [13]:
# Axes that get_Time_Coord sums away.
TZ_AXIS = (2, 3)
def get_Time_Coord(X):
    """Sum ``X`` over the axes in ``TZ_AXIS`` and flatten the result to 2-D.

    After the reduction, the array is reshaped to
    ``(shape[0], shape[1] * shape[2])``: the leading axis is kept and the
    next two remaining axes are merged into one.

    NOTE(review): the physical meaning of the surviving axes is not visible
    here (the name suggests a time axis and one coordinate) - confirm
    against the data layout.
    """
    collapsed = np.sum(X, axis=TZ_AXIS)
    n_rows = collapsed.shape[0]
    n_cols = collapsed.shape[1] * collapsed.shape[2]
    return collapsed.reshape(n_rows, n_cols)

In [14]:
def to_label(y):
    """Binarize targets: values ``<= 0`` become 0, values ``> 0`` become 1.

    The thresholding is done on a copy, so the array passed in is left
    untouched (assigning into ``y`` directly would overwrite the caller's
    data).

    Parameters
    ----------
    y : array_like
        Numeric target values.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``y`` after thresholding.
    """
    labels = np.array(y, copy=True)
    labels[labels <= 0] = 0
    labels[labels > 0] = 1
    return labels

In [17]:
# Build (or load from cache) the training split. Targets go through to_label
# exactly like the test split, so both splits share one label encoding.
# An existing Xy_train.npz cache is returned as stored; delete it to rebuild.
X_train, y_train = import_X_y(fnames_train, "train", fdata = get_Time_Coord, ftarget=to_label)

100%|██████████| 100/100 [04:05<00:00,  2.46s/it]


In [18]:
# Sanity check: (rows, flattened features) of the training matrix.
X_train.shape

(165610, 1350)

In [19]:
# Total number of elements held in X_train.
X_train.size

223573500

In [20]:
# Should match the number of rows in X_train (one target per row).
y_train.shape

(165610,)

### Test Data

In [21]:
# Build (or load from cache) the test split; features are flattened with
# get_Time_Coord and targets are binarized with to_label.
X_test, y_test = import_X_y(fnames_test, "test", fdata = get_Time_Coord, ftarget=to_label)

100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


In [26]:
# Sanity check: (rows, flattened features) of the test matrix.
X_test.shape

(51818, 1350)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

## Random Forest

In [24]:
# Random forest with 500 trees, entropy split criterion and a fixed seed;
# n_jobs=-1 lets scikit-learn parallelize across all available cores.
rf_500 = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42, criterion='entropy', verbose=1)

In [25]:
# Train the 500-tree forest on the training split (about 11 minutes in the
# recorded run).
rf_500.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 11.2min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [27]:
# Hard class predictions of the 500-tree forest on the test split.
y_pred_rf_500 = rf_500.predict(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.5s finished


In [28]:
# Fraction of test samples classified correctly by the 500-tree forest.
acc_rf_500 = accuracy_score(y_test, y_pred_rf_500)

In [29]:
# Confusion matrix: rows are true classes, columns are predicted classes.
cfm_rf_500 = confusion_matrix(y_test, y_pred_rf_500)

In [30]:
# Show the confusion matrix of the 500-tree forest.
print(cfm_rf_500)

[[20321  1171]
 [ 4355 25971]]


In [31]:
# Show the test accuracy of the 500-tree forest.
acc_rf_500

0.89335752055270368

In [32]:
# Write the predicted classes as text, one value per line (np.savetxt's
# default format writes them as floating-point numbers).
np.savetxt(os.path.join(OUT_DIR, "rf_500_test_cls_pred.txt"), y_pred_rf_500)

In [33]:
# Per-class probabilities of the 500-tree forest on the test split
# (one column per class).
y_pred_rf500_proba = rf_500.predict_proba(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.4s finished


In [34]:
# Write the class probabilities as text, one row per test sample.
np.savetxt(os.path.join(OUT_DIR, 'rf_500_test_prob_pred.txt'), y_pred_rf500_proba)

In [35]:
from sklearn.externals import joblib

In [36]:
# Persist the fitted 500-tree forest so it can be reloaded without retraining.
joblib.dump(rf_500, filename=os.path.join(OUT_DIR,'rf_500.pkl'))

['/data/km3net/ml_models_results/numu_nue/rf_500.pkl']

In [37]:
# Same configuration as rf_500 but with 100 trees, for comparison.
rf_100 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, criterion='entropy', verbose=1)

In [38]:
# Train the 100-tree forest on the training split.
rf_100.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.7min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [43]:
# Hard class predictions of the 100-tree forest on the test split.
y_pred_rf_100 = rf_100.predict(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.5s finished


In [44]:
# Test-set metrics for the 100-tree forest: confusion matrix (rows are true
# classes, columns are predicted classes) and overall accuracy.
cfm_rf_100 = confusion_matrix(y_true=y_test, y_pred=y_pred_rf_100)
acc_rf_100 = accuracy_score(y_true=y_test, y_pred=y_pred_rf_100)

In [45]:
# Show the test accuracy of the 100-tree forest.
print(acc_rf_100)

0.89079084488


In [46]:
# Show the confusion matrix of the 100-tree forest.
print(cfm_rf_100)

[[20274  1218]
 [ 4441 25885]]


In [47]:
# Write the predicted classes of the 100-tree forest as text, one per line.
np.savetxt(os.path.join(OUT_DIR, "rf_100_test_cls_pred.txt"), y_pred_rf_100)

In [48]:
# Per-class probabilities of the 100-tree forest on the test split.
y_pred_rf100_proba = rf_100.predict_proba(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.6s finished


In [49]:
# Write the class probabilities of the 100-tree forest as text, one row per
# test sample.
np.savetxt(os.path.join(OUT_DIR, 'rf_100_test_prob_pred.txt'), y_pred_rf100_proba)

In [50]:
# Persist the fitted 100-tree forest so it can be reloaded without retraining.
joblib.dump(rf_100, filename=os.path.join(OUT_DIR,'rf_100.pkl'))

['/data/km3net/ml_models_results/numu_nue/rf_100.pkl']

## AdaBoost Classifier

In [51]:
# AdaBoost ensemble with up to 500 boosting rounds and a fixed seed; the base
# estimator is left at scikit-learn's default.
bdt = AdaBoostClassifier(random_state=42, n_estimators=500)

In [52]:
# Train the AdaBoost model on the training split.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, yet
# the following cells use `bdt` as if it were fitted - their recorded outputs
# likely come from an earlier session. Re-run to completion before trusting them.
bdt.fit(X_train, y_train)

KeyboardInterrupt: 

In [69]:
# Persist the AdaBoost model under OUT_DIR.
# NOTE(review): the recorded output path ('./ml_models_results/bdt.pkl') does
# not match the current OUT_DIR, so the recorded output appears to be stale.
joblib.dump(bdt, filename=os.path.join(OUT_DIR,'bdt.pkl'))

['./ml_models_results/bdt.pkl']

In [70]:
# Hard class predictions of the AdaBoost model on the test split.
y_pred_bdt = bdt.predict(X_test)

In [71]:
# Test accuracy of the AdaBoost model (displayed only, not stored in a variable).
accuracy_score(y_test, y_pred_bdt)

0.67053919487436797

In [72]:
# Confusion matrix of the AdaBoost model on the test split.
# NOTE(review): the recorded matrix has row totals 23603 / 28215, whereas both
# random-forest matrices have 21492 / 30326 - it appears to have been computed
# against a different y_test. Re-run before comparing the models.
confusion_matrix(y_test, y_pred_bdt)

array([[10946, 12657],
       [ 4415, 23800]])

In [None]:
# Write the AdaBoost class predictions as text, one per line.
# This cell has no execution count, i.e. it was not run in the recorded session.
np.savetxt(os.path.join(OUT_DIR, "bdt_test_cls_pred.txt"), y_pred_bdt)