# Make predictions for plotting the map

In [1]:
import numpy as np
import pandas as pd
import config as cfg
import utils_ml as mut
from sklearn.metrics import roc_curve
# Location of the ML-ready NDVI data, assembled from the project config.
data_path = cfg.data_path + cfg.ml_ready_ndvi_data


year = 2008
# load data for training and testing the model
x, y, ids = mut.load_ndvi_as_numpy(data_path, mut.as_list(year), balance_flag=0)

# split into test (1000 examples, per the original note) and train (remaining);
# 99 is passed as the last argument -- presumably the split's random seed, TODO confirm
xtr, ytr, idstr, xte, yte, idste = mut.split_for_map(x, y, ids, 99)

# train model
clf = mut.build_model_opt("rf", random_state=99)
clf_trained = clf.fit(xtr, ytr)
# make prediction; column 1 of `proba` is used as the positive-class score below
proba = clf_trained.predict_proba(xte)

# compute the optimal threshold by maximising Youden's J statistic (J = TPR - FPR)
fpr, tpr, thr = roc_curve(yte, proba[:, 1])
jstat = tpr - fpr
opt_thr = thr[np.argmax(jstat)]
print(f'opt thr: {opt_thr}')

# predict test examples
# roc_curve counts a sample as positive when score >= threshold, and every
# returned threshold is an observed score. Use >= here as well, so the sample(s)
# lying exactly on opt_thr are classified the same way as in the ROC point
# that maximised J.
yhatte = proba[:, 1] >= opt_thr
predicted_loss_ids = idste[yhatte]
predicted_noloss_ids = idste[~yhatte]

opt thr: 0.4489324564574025


## Create true and predicted dataframes from true and predicted IDs

In [3]:
# load the test data in dataframe format
df = mut.load_ndvi_as_df(data_path, year)
# keep only the rows whose new_ID belongs to the held-out test split
dfte = df.loc[df.new_ID.isin(idste)]
print(f"true test data shape: {dfte.shape}")

# sort ground truth by new_ID and save
# sort_values returns a new, sorted frame (it does not sort in place), so the
# result has to be bound to a name for the saved CSV to actually be ordered.
df_true = dfte.sort_values(by='new_ID')
df_true.to_csv("ground_truth.csv", index=False)

true test data shape: (1000, 22)


In [None]:
# predicted dataframe

# extract fields which are predicted as loss
# .copy() gives an independent frame, so overwriting labels neither touches dfte
# nor relies on chained in-place mutation of a slice (which raises
# SettingWithCopyWarning and is a no-op under pandas Copy-on-Write).
df_predicted_loss = dfte.loc[dfte.new_ID.isin(predicted_loss_ids)].copy()
# change orig label to predicted label
df_predicted_loss["loss"] = df_predicted_loss["loss"].replace(0, 1)

# extract fields which are predicted as no-loss
df_predicted_noloss = dfte.loc[dfte.new_ID.isin(predicted_noloss_ids)].copy()
# change orig label to predicted label
df_predicted_noloss["loss"] = df_predicted_noloss["loss"].replace(1, 0)

# concat the two, sort by new_ID and save
df_predicted = pd.concat([df_predicted_loss,
                          df_predicted_noloss],
                         ignore_index=True)
print(f"predicted data shape: {df_predicted.shape}")
# sort_values is not in-place: bind the result so predicted.csv is ordered by
# new_ID, matching the intended row order of the ground-truth file.
df_predicted = df_predicted.sort_values(by='new_ID').reset_index(drop=True)
df_predicted.to_csv("predicted.csv", index=False)