In [1]:
import numpy as np
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle
import pandas as pd

In [2]:
# Dataset locations; the two sub-paths are derived from data_dir.
data_dir = '/storage/yw18581/data/'
data_folder, clean_dir = (
    os.path.join(data_dir, 'train_validation_test'),
    os.path.join(data_dir, 'train_validation_test', 'clean_300_june'),
)

In [3]:
def cut_X(arr, reshape = None):
    """Crop a stack of images to a fixed window and optionally flatten it.

    The crop keeps indices 960:1300 along axis 1 and 600: along axis 2.

    Parameters
    ----------
    arr : numpy.ndarray
        Array with at least three axes; axis 0 indexes the samples.
    reshape : object, optional
        If truthy, a 4th axis (when present) is reduced to its first entry
        and every cropped sample is flattened, giving a 2-D array of shape
        (n_samples, rows * cols). If falsy (default) the cropped array is
        returned as is.

    Returns
    -------
    numpy.ndarray
        The cropped (and, if requested, flattened) array.
    """
    x_cut = arr[:, 960:1300, 600:]
    if not reshape:
        return x_cut
    if len(x_cut.shape) > 3:
        # Keep only the first entry of the trailing axis.
        x_cut = x_cut[..., 0]
    # Flatten for every input rank. Previously this ran only inside the
    # ">3 axes" branch, so a 3-D input with reshape=True left the result
    # unassigned and raised UnboundLocalError.
    return x_cut.reshape(x_cut.shape[0], x_cut.shape[1] * x_cut.shape[2])

def reshape_RF(arr):
    """Merge axes 1 and 2 of `arr` into one, giving shape (n_samples, n_features)."""
    n_samples = arr.shape[0]
    n_features = arr.shape[1] * arr.shape[2]
    flattened = arr.reshape((n_samples, n_features))
    return flattened

def cut_reshape(arr):
    """Crop `arr` with cut_X (no flattening there) and flatten it with reshape_RF."""
    return reshape_RF(cut_X(arr))

In [None]:
# Hyper-parameter grid for the random-forest search.
# NOTE(review): the following cell repeats these two assignments verbatim;
# one of the two cells can be removed.
estimators = [
    20, 30, 50,
    80, 120, 180,
    250, 300, 500,
]
depths = [10, 50, 100]

In [4]:
# Values tried for n_estimators and max_depth in the grid-search loop.
depths = [10, 50, 100]
estimators = [
    20, 30, 50, 80, 120,
    180, 250, 300, 500,
]

In [8]:
# Built from data_folder so the dataset location is configured in one place;
# this resolves to the same file as the previous hard-coded absolute path.
Xy_train = np.load(os.path.join(data_folder, "Xy_train_clean_300_24_10_25.npz"))

In [9]:
# Built from data_folder so the dataset location is configured in one place;
# this resolves to the same file as the previous hard-coded absolute path.
Xy_val = np.load(os.path.join(data_folder, "Xy_val_clean_300_24_10_25.npz"))

In [10]:
# Built from data_folder so the dataset location is configured in one place;
# this resolves to the same file as the previous hard-coded absolute path.
Xy_test = np.load(os.path.join(data_folder, "Xy_test_clean_300_24_10_25.npz"))

In [11]:
# Test split: the "y" entry of the archive is cropped and flattened into the
# feature matrix, the "dist" entry is the regression target.
X_test, y_test = cut_reshape(Xy_test["y"]), Xy_test["dist"]

In [12]:
# Same extraction as for the test split: "y" -> cropped, flattened features;
# "dist" -> regression target.
X_train, y_train = cut_reshape(Xy_train["y"]), Xy_train["dist"]
X_val, y_val = cut_reshape(Xy_val["y"]), Xy_val["dist"]

In [13]:
# Fold the validation split into the training set.
# The training part is rebuilt from Xy_train instead of from the current
# X_train / y_train, so re-running this cell does not append the validation
# samples a second time.
X_train = np.vstack((cut_reshape(Xy_train["y"]), X_val))
y_train = np.hstack((Xy_train["dist"], y_val))

In [48]:
def import_no_split(pos, keyword):
    """Load one un-split archive from `clean_dir` and prepare it for the forest.

    Parameters
    ----------
    pos : str
        First part of the file name (e.g. "15mm").
    keyword : str
        Second part of the file name; the file read is
        ``Xy_<pos>_<keyword>.npz`` inside `clean_dir`.

    Returns
    -------
    tuple
        ``(X_RF, y)`` where ``X_RF`` is the archive's "y" entry passed
        through `cut_reshape` and ``y`` is its "dist" entry.
    """
    path = os.path.join(clean_dir, "Xy_{}_{}.npz".format(pos, keyword))
    # The context manager closes the underlying file once both arrays have
    # been read; previously the NpzFile handle was left open.
    with np.load(path) as Xy:
        X = Xy["y"]
        y = Xy["dist"]
    X_RF = cut_reshape(X)
    return X_RF, y

# Load both "15mm" archives and concatenate them into one evaluation set.
(X_15_1, y_15_1), (X_15_2, y_15_2) = (
    import_no_split("15mm", "clean300_june"),
    import_no_split("15mm", "second_batch_clean300_june"),
)

X_15_gt = np.vstack([X_15_1, X_15_2])
y_15_gt = np.hstack([y_15_1, y_15_2])


In [12]:
# Collects one (model, n_estimators, max_depth, test MSE, 15mm MSE) tuple per
# grid point; filled by the grid-search loop.
errors_gt = list()

In [24]:
# Grid search: fit one forest per (n_estimators, max_depth) pair and record
# its MSE on the test split and on the 15mm set.
# NOTE(review): errors_gt is created in a separate cell, so re-running this
# loop without resetting it appends duplicate entries.
for est in estimators:
    for dep in depths:
        rf = RandomForestRegressor(random_state=42, n_estimators=est,
                                  max_depth=dep, n_jobs=2, verbose=2)
        rf.fit(X_train, y_train)
        preds_test_gt = rf.predict(X_test)
        preds_test_15mm_gt = rf.predict(X_15_gt)

        # scikit-learn metrics take (y_true, y_pred); the arguments were
        # previously passed the other way round.
        mse_gt = mean_squared_error(y_test, preds_test_gt)
        mse_15_gt = mean_squared_error(y_15_gt, preds_test_15mm_gt)

        # The fitted model itself is kept so it can be reused after reloading.
        errors_gt.append((rf, est, dep, mse_gt, mse_15_gt))
    

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20


KeyboardInterrupt: 

In [27]:
# Persist the grid-search results with pickle (the list holds fitted models).
# The ".npz" file name is kept because later cells read this exact path, but
# the content is a pickle, not a NumPy archive.
# The `with` block closes the file; the previous bare open() never did.
with open(os.path.join(data_dir, "trained_models", "RF_OPTIMISATION_CLEAN_300_GT.npz"), 'wb') as fh:
    pickle.dump(errors_gt, fh)

In [5]:
# This file is written with pickle.dump in the notebook, so read it back with
# pickle rather than np.load: np.load only handled it through its implicit
# pickle fallback, which recent NumPy versions refuse by default
# (allow_pickle=False).
# Security: unpickling executes arbitrary code - only load files you created.
with open(os.path.join(data_dir, "trained_models", "RF_OPTIMISATION_CLEAN_300_GT.npz"), 'rb') as fh:
    errors = pickle.load(fh)

In [34]:
# The loaded object is a plain list of tuples (see the output of the next
# cell), i.e. a pickle rather than a NumPy archive. Read it with pickle
# directly: np.load only handled it through its implicit pickle fallback,
# which recent NumPy versions refuse by default (allow_pickle=False).
# Security: unpickling executes arbitrary code - only load files you created.
with open(os.path.join(data_dir, "trained_models", "RF_OPTIMISATION_CLEAN_300_GT_small_est.npz"), 'rb') as fh:
    errors2 = pickle.load(fh)

In [35]:
# Show the loaded results: a list of tuples starting with a fitted forest and
# its n_estimators. The two trailing numbers are presumably the test-set and
# 15mm MSEs, by analogy with the grid-search loop - TODO confirm.
errors2

[(RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=4, n_jobs=2,
             oob_score=False, random_state=42, verbose=2, warm_start=False),
  4,
  0.0,
  25.0),
 (RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=6, n_jobs=2,
             oob_score=False, random_state=42, verbose=2, warm_start=False),
  6,
  0.0009259259259259268,
  24.53125),
 (RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impur

In [14]:
# Unfitted 20-tree forest.
# NOTE(review): `rf` is reassigned from the loaded results in the next cell,
# so this instance is never fitted or used in the cells shown.
rf = RandomForestRegressor(
    n_estimators=20,
    random_state=42,
    n_jobs=2,
    verbose=2,
)

In [17]:
# Take the fitted model (field 0) of the second saved grid-search entry.
# NOTE(review): index 1 is a magic number; the next cell's output shows it is
# the n_estimators=20, max_depth=50 forest.
rf = errors[1][0]

In [18]:
# Display the selected model's hyper-parameters.
rf

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=2,
           oob_score=False, random_state=42, verbose=2, warm_start=False)

In [19]:
# Path built from data_dir, like the other "trained_models" files, instead of
# a second hard-coded absolute path; it resolves to the same file.
Xy_15mm_unet = np.load(os.path.join(data_dir, "trained_models", "preds_15mm.npz"))

preds_15mm_unet = Xy_15mm_unet["preds_15mm"]

In [20]:
# Crop and flatten the loaded "preds_15mm" array like the other inputs.
X_RF_15_unet = cut_reshape(preds_15mm_unet)

# Cast to uint8, then scale by 255.
# NOTE(review): the cast happens before the multiplication and truncates
# toward zero, so any value in [0, 1) becomes 0. This is only correct if the
# predictions are already binary 0/1 - TODO confirm.
X_RF_15_unet_uint = X_RF_15_unet.astype(np.uint8)*255

In [21]:
# Path built from data_dir, like the other "trained_models" files, instead of
# a second hard-coded absolute path; it resolves to the same file.
Xy_test_unet = np.load(os.path.join(data_dir, "trained_models", "preds_500.npz"))

In [22]:
# Despite the "y_" prefix, this array is used as model *input*: it is cropped,
# flattened and passed to predict() in a later cell.
y_test_unet = Xy_test_unet["preds_500"]

# Crop/flatten, cast to uint8, then scale by 255.
# NOTE(review): the cast happens before the multiplication and truncates
# toward zero, so any value in [0, 1) becomes 0. This is only correct if the
# predictions are already binary 0/1 - TODO confirm.
y_test_unet_reshape = cut_reshape(y_test_unet).astype(np.uint8)*255

In [23]:
# Run the selected forest on both arrays derived from the "preds_500" and
# "preds_15mm" files.
preds_test_unet, preds_test_15mm_unet = (
    rf.predict(y_test_unet_reshape),
    rf.predict(X_RF_15_unet_uint),
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    0.0s finished


In [24]:
# scikit-learn metrics take (y_true, y_pred): y_test is the reference and the
# predictions made from y_test_unet_reshape are the estimate. The arguments
# were previously swapped.
mse_unet = mean_squared_error(y_test, preds_test_unet)


In [26]:
# Constant target of 15 for every 15mm prediction.
# NOTE(review): this reuses the name y_15_gt, which another cell assigns from
# the "dist" arrays loaded from disk; which value is live depends on
# execution order.
y_15_gt = 15 * np.ones(len(preds_test_15mm_unet))

In [31]:
# scikit-learn metrics take (y_true, y_pred); the arguments were previously
# swapped.
# NOTE(review): the constant 2 subtracted from the predictions is not
# explained anywhere in this notebook - TODO document where it comes from.
mse_15_unet = mean_squared_error(y_15_gt, preds_test_15mm_unet - 2)

In [28]:
# MSE between y_test and the `rf` predictions made from y_test_unet_reshape.
mse_unet

0.16402083333333328

In [32]:
# MSE between y_15_gt and the shifted (-2) `rf` predictions for the 15mm input.
mse_15_unet

0.4434375

In [33]:
# Inspect the shifted 15mm predictions that went into mse_15_unet.
# NOTE(review): this prints the whole array; a slice would keep the output short.
preds_test_15mm_unet-2

array([14.75, 14.75, 14.75, 14.75, 15.5 , 14.75, 14.75, 15.5 , 14.75,
       17.  , 15.5 , 15.5 , 14.75, 15.5 , 15.5 , 15.5 , 14.75, 15.5 ,
       15.5 , 15.5 , 15.5 , 14.75, 15.5 , 15.5 , 15.5 , 15.5 , 15.5 ,
       14.  , 14.75, 15.5 , 15.5 , 14.75, 14.75, 15.5 , 15.5 , 15.5 ,
       14.75, 14.75, 14.75, 15.5 , 15.5 , 15.5 , 15.5 , 14.75, 15.5 ,
       15.5 , 15.5 , 15.5 , 15.5 , 15.5 , 14.75, 15.5 , 14.75, 16.25,
       14.75, 14.75, 14.  , 14.75, 14.75, 14.75, 14.75, 14.75, 14.75,
       15.5 , 14.75, 15.5 , 14.75, 15.5 , 15.5 , 16.25, 15.5 , 15.5 ,
       14.75, 15.5 , 14.75, 15.5 , 15.5 , 14.75, 14.75, 14.75, 16.25,
       14.75, 15.5 , 16.25, 15.5 , 15.5 , 14.75, 14.75, 16.25, 15.5 ,
       15.5 , 15.5 , 14.  , 16.25, 15.5 , 15.5 , 15.5 , 14.75, 15.5 ,
       14.75, 14.75, 15.5 , 14.75, 15.5 , 14.75, 15.5 , 15.5 , 14.75,
       14.75, 15.5 , 14.75, 15.5 , 14.75, 16.25, 15.5 , 15.5 , 15.5 ,
       15.5 , 14.75, 14.  , 16.25, 15.5 , 14.75, 15.5 , 15.5 , 15.5 ,
       15.5 , 15.5 ,

In [36]:
# The loaded object is a plain list of tuples (see the output of the next
# cell), i.e. a pickle rather than a NumPy archive. Read it with pickle
# directly: np.load only handled it through its implicit pickle fallback,
# which recent NumPy versions refuse by default (allow_pickle=False).
# Security: unpickling executes arbitrary code - only load files you created.
with open(os.path.join(data_dir, "trained_models", "RF_OPTIMISATION_CLEAN_300_GT_large_est.npz"), 'rb') as fh:
    errors3 = pickle.load(fh)

In [37]:
# Show the loaded results: a list of tuples starting with a fitted forest and
# its n_estimators. The two trailing numbers are presumably the test-set and
# 15mm MSEs, by analogy with the grid-search loop - TODO confirm.
errors3

[(RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=2,
             oob_score=False, random_state=42, verbose=2, warm_start=False),
  50,
  0.0013150000000000024,
  3.6307499999999995),
 (RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=2,
             oob_score=False, random_state=42, verbose=2, warm_start=False),
  60,
  0.0009131944444444442,
  5.826770833333334),
 (RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto'

In [38]:
# Take the fitted model (field 0) of the ninth entry of errors2.
# NOTE(review): index 8 is a magic number; the next cell's output shows it is
# the n_estimators=20 forest with max_depth=None.
rf20 = errors2[8][0]

In [39]:
# Display the selected model's hyper-parameters.
rf20

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=2,
           oob_score=False, random_state=42, verbose=2, warm_start=False)

In [40]:
# Same two prediction runs as before, now with rf20; the results overwrite
# the previous preds_test_unet / preds_test_15mm_unet.
preds_test_unet, preds_test_15mm_unet = (
    rf20.predict(y_test_unet_reshape),
    rf20.predict(X_RF_15_unet_uint),
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    0.0s finished


In [41]:
# scikit-learn metrics take (y_true, y_pred): y_test is the reference and the
# rf20 predictions are the estimate. The arguments were previously swapped.
mse_unet = mean_squared_error(y_test, preds_test_unet)



In [42]:
# scikit-learn metrics take (y_true, y_pred); the arguments were previously
# swapped.
# NOTE(review): the constant 2 subtracted from the predictions is not
# explained anywhere in this notebook - TODO document where it comes from.
mse_15_unet = mean_squared_error(y_15_gt, preds_test_15mm_unet - 2)

In [43]:
# MSE between y_test and the rf20 predictions made from y_test_unet_reshape.
mse_unet

0.16402083333333328

In [44]:
# MSE between y_15_gt and the shifted (-2) rf20 predictions for the 15mm input.
mse_15_unet

0.4434375

### Repeat the evaluation with a freshly trained 20-tree forest and save the predictions

In [45]:
# Fresh 20-tree forest with no depth limit set, to be fitted in the next cell.
rf20_new = RandomForestRegressor(
    n_estimators=20,
    random_state=42,
    n_jobs=2,
    verbose=2,
)

In [46]:
# Fit on X_train / y_train, which another cell stacks from the training and
# validation splits. fit() returns the estimator, which is the cell's output.
rf20_new.fit(X_train, y_train)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20building tree 2 of 20

building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:  1.1min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=2,
           oob_score=False, random_state=42, verbose=2, warm_start=False)

In [49]:
# Predict on the test features and on the concatenated 15mm set with the
# newly fitted forest.
preds_test_gt, preds_test_15mm_gt = (
    rf20_new.predict(X_test),
    rf20_new.predict(X_15_gt),
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    0.0s finished


In [50]:
# scikit-learn metrics take (y_true, y_pred); the arguments were previously
# swapped.
# NOTE(review): y_15_gt is assigned in two different cells (the "dist" labels
# loaded from disk and a constant-15 vector); which one is live here depends
# on execution order.
mse_gt = mean_squared_error(y_test, preds_test_gt)
mse_15_gt = mean_squared_error(y_15_gt, preds_test_15mm_gt)

In [51]:
# MSE between y_test and the rf20_new predictions on X_test.
mse_gt

0.0038020833333333322

In [52]:
# MSE between y_15_gt and the rf20_new predictions on X_15_gt.
mse_15_gt

0.27125

In [53]:
# Same two prediction runs on the "preds_500" / "preds_15mm" derived arrays,
# now with rf20_new; the results overwrite the earlier variables.
preds_test_unet, preds_test_15mm_unet = (
    rf20_new.predict(y_test_unet_reshape),
    rf20_new.predict(X_RF_15_unet_uint),
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    0.0s finished


In [54]:
# scikit-learn metrics take (y_true, y_pred); the arguments were previously
# swapped.
# NOTE(review): the constant 2 subtracted from the 15mm predictions is not
# explained anywhere in this notebook - TODO document where it comes from.
mse_unet = mean_squared_error(y_test, preds_test_unet)
mse_15_unet = mean_squared_error(y_15_gt, preds_test_15mm_unet - 2)


In [55]:
# MSE between y_test and the rf20_new predictions made from y_test_unet_reshape.
mse_unet

0.16402083333333328

In [56]:
# MSE between y_15_gt and the shifted (-2) rf20_new predictions for the 15mm input.
mse_15_unet

0.4434375

In [60]:
# Store the reference values and the rf20_new predictions for all four
# evaluations in one compressed archive in the current working directory.
# NOTE(review): preds_15_unet is saved without the "-2" shift that the MSE
# cells apply to it - confirm this is intended.
np.savez_compressed(
    "rf_predictions_new.npz",
    **{
        "y_gt": y_test,
        "preds_test_gt": preds_test_gt,
        "y_15_gt": y_15_gt,
        "preds_15_gt": preds_test_15mm_gt,
        "y_test_unet": y_test,
        "preds_test_unet": preds_test_unet,
        "y_15_unet": y_15_gt,
        "preds_15_unet": preds_test_15mm_unet,
    }
)