# Predicting galaxy redshift from band magnitudes with random forest

Zooey Nguyen

2021-07-08

- 2021-07-16: Update with photoz_metrics and v3 crossmatched data

- 2021-07-20: Update for the new metric outputs from photoz_metrics

In [1]:
from photoz_utils import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Set up data

In [2]:
# Load the crossmatched HSC v3 spec-z catalogue, clean it, and split it into
# train/test features and targets.
# NOTE(review): the meaning of filters=[1,2] is defined by clean_photoz_data
# (not visible here) — confirm against that helper.
HSC_CATALOG_CSV = '/data/HSC/HSC_v3/all_specz_flag_forced_forced2_spec_z_matched_online.csv'
df = clean_photoz_data(import_photoz_data(HSC_CATALOG_CSV), filters=[1,2])
X_train, X_test, y_train, y_test = split_photoz_data(df)
# Bare last expression so the notebook renders the cleaned frame.
df

Unnamed: 0,g_mag,r_mag,i_mag,z_mag,y_mag,z_spec
1,22.610886,21.079186,20.253893,19.866356,19.743130,0.527950
2,21.891678,20.254738,19.048626,18.613430,18.380713,0.621500
3,21.656437,19.928366,18.916716,18.476753,18.291279,0.526540
4,21.724205,19.995935,19.013975,18.560516,18.372887,0.526540
5,21.748327,21.549549,21.588028,21.292486,21.126167,2.383940
...,...,...,...,...,...,...
801241,21.772613,20.585226,19.878244,19.573465,19.341536,0.508998
801242,25.315399,23.766863,22.617428,22.156572,21.976051,0.658518
801243,24.335815,23.585403,22.994476,22.861565,22.376507,0.654057
801244,24.495150,24.309818,24.181927,23.360998,23.462173,1.410675


## Train random forest regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed for the forest's bootstrap sampling and split selection. Without a fixed
# seed every Restart-and-Run-All grows a different forest, so the metrics and
# plots in the cells below cannot be reproduced.
RF_RANDOM_STATE = 42

rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=RF_RANDOM_STATE,
    # Grow the trees on all available cores; with a fixed random_state the
    # fitted forest does not depend on the number of workers.
    n_jobs=-1,
)
# fit() returns the estimator, so the notebook also renders its repr here.
rf_model.fit(X_train, y_train)

## Evaluate training point estimates

In [None]:
# Forest point estimates for the training set, wrapped in a Series that reuses
# y_train's index so predictions and true values line up by label.
y_train_pred = pd.Series(rf_model.predict(X_train), index=y_train.index)
plot_predictions(y_train_pred, y_train)

In [None]:
# Point-estimate metrics for the training predictions: one call with the
# helper's defaults and one with binned=True.
# NOTE(review): metrics_agg is assigned but never displayed or used in the
# cells shown here — confirm whether it should be shown alongside the binned table.
metrics_agg = get_point_metrics(y_train_pred, y_train)
metrics_binned = get_point_metrics(y_train_pred, y_train, binned=True)
# Bare last expression so the notebook renders the binned metrics.
metrics_binned

In [None]:
# Plot the binned point metrics currently held in metrics_binned
# (the training-set values when the cells are run in order).
plot_point_metrics(metrics_binned)

## Evaluate training density estimates

In [None]:
# Ask every tree in the fitted forest for its own prediction on the training
# features, giving one array of predictions per tree.
predictions = [tree.predict(X_train) for tree in rf_model.estimators_]
# Stack the per-tree arrays and transpose so the tree axis comes last
# (assumes each tree returns a 1-D array, i.e. result is (n_samples, n_trees) — TODO confirm).
y_train_densities = np.asarray(predictions).T

In [None]:
# Score the per-tree training predictions against the training targets.
# NOTE(review): get_density_metrics is not defined in this notebook (presumably
# from the photoz_utils star import); further down its result is indexed with
# 'PIT' and 'CRPS' — confirm the full return schema in that module.
density_metrics = get_density_metrics(y_train_densities, y_train)

In [None]:
# Plot the density metrics currently held in density_metrics
# (the training-set values when the cells are run in order).
plot_density_metrics(density_metrics)

## Evaluate test point estimates

In [None]:
# Forest point estimates for the held-out test set, wrapped in a Series that
# reuses y_test's index so predictions and true values line up by label.
y_test_pred = pd.Series(rf_model.predict(X_test), index=y_test.index)
plot_predictions(y_test_pred, y_test)

In [None]:
# Point-estimate metrics for the test predictions: one call with the helper's
# defaults and one with binned=True. These assignments replace the training-set
# values stored under the same names earlier in the notebook.
# NOTE(review): metrics_agg is assigned but never displayed or used in the
# cells shown here — confirm whether it should be shown alongside the binned table.
metrics_agg = get_point_metrics(y_test_pred, y_test)
metrics_binned = get_point_metrics(y_test_pred, y_test, binned=True)
# Bare last expression so the notebook renders the binned metrics.
metrics_binned

In [None]:
# Plot the binned point metrics currently held in metrics_binned
# (the test-set values when the cells are run in order).
plot_point_metrics(metrics_binned)

## Evaluate test density estimates

In [None]:
# Ask every tree in the fitted forest for its own prediction on the test
# features, giving one array of predictions per tree.
predictions = [tree.predict(X_test) for tree in rf_model.estimators_]
# Stack the per-tree arrays and transpose so the tree axis comes last
# (assumes each tree returns a 1-D array, i.e. result is (n_samples, n_trees) — TODO confirm).
y_test_densities = np.asarray(predictions).T

In [None]:
# Score the per-tree test predictions against the test targets. This replaces
# the training-set result stored under the same name earlier in the notebook.
density_metrics = get_density_metrics(y_test_densities, y_test)

In [None]:
# Pull the two density metrics out of density_metrics by key so they are
# available under their own names.
PIT, CRPS = (density_metrics[metric_name] for metric_name in ('PIT', 'CRPS'))

In [None]:
# Plot the density metrics currently held in density_metrics
# (the test-set values when the cells are run in order).
plot_density_metrics(density_metrics)