# Initialization

In [1]:
!pip install boto3 progressbar2 sfdmap GPUtil

Collecting boto3
  Downloading boto3-1.23.4-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.5/132.5 kB[0m [31m543.4 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting progressbar2
  Using cached progressbar2-4.0.0-py2.py3-none-any.whl (26 kB)
Collecting sfdmap
  Using cached sfdmap-0.1.1-py3-none-any.whl
Collecting GPUtil
  Using cached GPUtil-1.4.0-py3-none-any.whl
Collecting botocore<1.27.0,>=1.26.4
  Downloading botocore-1.26.4-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Using cached jmespath-1.0.0-py3-none-any.whl (23 kB)
Collecting s3transfer<0.6.0,>=0.5.0
  Using cached s3transfer-0.5.2-py3-none-any.whl (79 kB)
Collecting python-utils>=3.0.0
  Downloading python_utils-3.2.3-py2.py3-none-any.whl (20 kB)
Installing collected packages: GPUtil, sfdmap, python-utils, 

In [2]:
# imports
import os
import pickle
import posixpath
import sys

import boto3
import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# random seed (seeds NumPy's legacy global RNG)
seed = 42
np.random.seed(seed)

# local files paths (real filesystem paths, so os.path is the right tool here)
local_home_dir_path = os.path.expanduser("~")
local_work_dir_path = os.path.join(local_home_dir_path, 'git')
local_code_dir_path = os.path.join(local_work_dir_path, 'code')

# S3 file paths
# S3 object keys are always '/'-separated, independent of the host OS, so they are
# built with posixpath.join instead of os.path.join (which would emit '\\' on Windows).
endpoint_url = 'https://s3-west.nrp-nautilus.io'
bucket_name = 'tau-astro'
prefix = 'almogh'
s3_work_dir_path = posixpath.join(prefix, 'workdir3')
s3_saves_dir_path = posixpath.join(s3_work_dir_path, 'model_saves')
s3_data_dir_path = posixpath.join(s3_work_dir_path, 'data')
s3_v2_data_ver_dir_path = posixpath.join(s3_data_dir_path, '100K_V2')
s3_v4_data_ver_dir_path = posixpath.join(s3_data_dir_path, '100K_V4')

# Client bound to the custom S3 endpoint; it is passed to every S3 helper call below.
s3_client = boto3.client("s3", endpoint_url=endpoint_url)

# adding code folder to path
# The project-local `s3` helper module is only importable after this insert,
# which is why this import cannot live with the other imports above.
sys.path.insert(1, local_code_dir_path)
from s3 import to_s3_npy, to_s3_pkl, from_s3_npy, from_s3_pkl, to_s3_fig

# Infer RF

## Loading data

In [3]:
# load data
print('Loading data and creating dataset')
# S3 object keys are '/'-separated on every platform, hence posixpath.join
# rather than os.path.join.
gs = from_s3_pkl(s3_client, bucket_name, posixpath.join(s3_v4_data_ver_dir_path, 'gs_test_V4.pkl'))
X = from_s3_npy(s3_client, bucket_name, posixpath.join(s3_v2_data_ver_dir_path, 'spec.npy'))
full_wl_grid = from_s3_npy(s3_client, bucket_name, posixpath.join(s3_data_dir_path, 'wl_grid.npy'))
wl_grid = from_s3_npy(s3_client, bucket_name, posixpath.join(s3_v4_data_ver_dir_path, 'wl_100K_V4.npy'))

# Find, on the full wavelength grid, the points closest to the first and last
# wavelengths of the V4 grid. end_i gets +1 so that it is an exclusive slice bound
# and the point closest to the last wavelength is kept.
start_i = np.abs(full_wl_grid - wl_grid[0]).argmin()
end_i = 1 + np.abs(full_wl_grid - wl_grid[-1]).argmin()

# Keep only the rows selected by gs.index and the columns in [start_i, end_i).
# NOTE(review): this assumes gs.index holds positional row numbers into spec.npy — confirm.
X = X[gs.index, start_i:end_i]

Loading data and creating dataset
loading from uri: s3://tau-astro/almogh/workdir3/data/100K_V4/gs_test_V4.pkl
loading from uri: s3://tau-astro/almogh/workdir3/data/100K_V2/spec.npy
loading from uri: s3://tau-astro/almogh/workdir3/data/wl_grid.npy
loading from uri: s3://tau-astro/almogh/workdir3/data/100K_V4/wl_100K_V4.npy


In [4]:
# Sanity check: abort before inference if any entry of X is NaN.
assert not np.isnan(X).any(), 'NaN!'

## Load RF

In [5]:
# Name of the saved RF run to load; everything for that run lives under
# <s3_saves_dir_path>/RF/<load_RF_name> in the bucket.
load_RF_name = 'simple___2022_05_10___11_24_58___100K_V4_full_data_set'
# S3 object keys are '/'-separated on every platform, hence posixpath.join
# rather than os.path.join.
s3_load_dir_path = posixpath.join(s3_saves_dir_path, 'RF', load_RF_name)
print(f'loading from folder (S3): {s3_load_dir_path}')

from CustomRandomForest import CustomRandomForest
# NOTE(review): 'crf.pkl' is presumably unpickled by load_s3 — unpickling can execute
# arbitrary code, so only point this at objects from a trusted bucket.
rf = CustomRandomForest.load_s3(s3_client, bucket_name, posixpath.join(s3_load_dir_path, 'crf.pkl'))

loading from folder (S3): almogh/workdir3/model_saves/RF/simple___2022_05_10___11_24_58___100K_V4_full_data_set
loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2022_05_10___11_24_58___100K_V4_full_data_set/crf.pkl


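**Note:** the forest is loaded from a pickle file (`crf.pkl`), so it should only be loaded from a trusted source and with compatible library versions. See scikit-learn's notes on the security and maintainability limitations of model persistence: https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations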


## Predict on test set

In [6]:
# Helper for the last step; imported up front so the three steps below read uninterrupted.
from CustomRandomForest import build_similarity_matrix

# Step 1: run X through the forest (the "calculate leaves" step).
print('Applying the RF (calculate leaves)')
X_leaves = rf.apply(X)

# Step 2: full prediction, computed from the leaves obtained above.
print('Predicting fully')
Y_hat = rf.predict_full_from_leaves(X_leaves)

# Step 3: similarity matrix built from the leaves and the full predictions.
print('Calculating the similarity matrix')
sim_mat = build_similarity_matrix(X_leaves, Y_hat)

Applying the RF (calculate leaves)
apply: starting 500 jobs


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    6.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.1s finished


Predicting fully
Calculating the similarity matrix


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 850 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 1426 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done 2146 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done 2560 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3010 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 3496 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 4576 tasks      | e

assembling the matrix.


In [7]:
print('Calculating the distance matrix and weirdness scores')
# Distance is defined as the complement of similarity.
dist_mat_hat_test_set = 1 - sim_mat
# Weirdness score of a row = the mean of that row of the distance matrix.
weird_scores_hat_test_set = dist_mat_hat_test_set.mean(axis=1)

Calculating the distance matrix and weirdness scores


## Save results

In [8]:
# Results are written next to the loaded forest (under s3_load_dir_path).
# S3 object keys are '/'-separated on every platform, hence posixpath.join
# rather than os.path.join.
print('Saving the weirdness scores')
to_s3_npy(weird_scores_hat_test_set, s3_client, bucket_name, posixpath.join(s3_load_dir_path, 'weird_scores_hat_test_set.npy'))

print('Saving the dissimilarity matrix')
# Kept as the last expression of the cell so its return value is still displayed.
to_s3_npy(dist_mat_hat_test_set, s3_client, bucket_name, posixpath.join(s3_load_dir_path, 'dist_mat_hat_test_set.npy'))

Saving the weirdness scores
saving to uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2022_05_10___11_24_58___100K_V4_full_data_set/weird_scores_hat_test_set.npy
Saving the dissimilarity matrix
saving to uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2022_05_10___11_24_58___100K_V4_full_data_set/dist_mat_hat_test_set.npy


True