# PREDICT SPECIES

Using the model trained in the training notebook, we now make predictions on unseen data. Use this notebook whenever you have new data to predict on.

## Imports

In [7]:
import pandas as pd
from tensorflow.keras.models import load_model

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler
)

## Loading the trained model

In [3]:
# Restore the trained Keras model from its HDF5 file.
# The path is relative, so it resolves against the notebook's working directory.
model_path = 'model/imputation_model.h5'
dl_model = load_model(model_path)

## Read the test data

In [4]:
# Load the records to be scored and preview the first rows.
# NOTE(review): `tag_id_long` looks like an identifier; if it must round-trip
# exactly into the output CSV, consider reading it as text (dtype=str) — confirm.
data_path = "../imputation_model/raw_data/sql_field_imputation_data.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,tag_id_long,date,watershed,river,site,method,local,fork_length_mm,species
0,,2022-05-11,englishman,shelly creek,martindale rd,smolt trap,in-river,105.0,co
1,,2022-05-09,cowichan,cowichan,mainstem fence,rst,in-river,94.0,co
2,,2022-06-23,nanaimo,nanaimo,living forest,beach seine,marine,65.0,ck
3,,2022-06-23,nanaimo,nanaimo,living forest,beach seine,marine,65.0,ck
4,,2022-06-23,nanaimo,nanaimo,living forest,beach seine,marine,64.0,ck


## Pre-processing

In [5]:
# Derive calendar features from the sampling date.
# The `date` column is parsed once and both features are read from the same
# parsed values, rather than parsing the strings twice through two different
# pandas entry points.
sample_dates = pd.to_datetime(df["date"])
df["year"] = sample_dates.dt.year              # later one-hot encoded (listed in categorical_feats)
df["day_of_year"] = sample_dates.dt.dayofyear  # later scaled (listed in numeric_feats)
df.head()

Unnamed: 0,tag_id_long,date,watershed,river,site,method,local,fork_length_mm,species,year,day_of_year
0,,2022-05-11,englishman,shelly creek,martindale rd,smolt trap,in-river,105.0,co,2022,131
1,,2022-05-09,cowichan,cowichan,mainstem fence,rst,in-river,94.0,co,2022,129
2,,2022-06-23,nanaimo,nanaimo,living forest,beach seine,marine,65.0,ck,2022,174
3,,2022-06-23,nanaimo,nanaimo,living forest,beach seine,marine,65.0,ck,2022,174
4,,2022-06-23,nanaimo,nanaimo,living forest,beach seine,marine,64.0,ck,2022,174


In [6]:
# Role of each input column in the preprocessing step.

# Continuous features: standardised with StandardScaler.
numeric_feats = [
    "fork_length_mm",
    "day_of_year",
]

# Discrete features: expanded with OneHotEncoder.
categorical_feats = [
    "watershed",
    "river",
    "site",
    "method",
    "local",
    "year",
]

# Columns excluded from the model input.
drop_feats = [
    "date",
    "tag_id_long",
    "species",
]

In [8]:
# Build the preprocessing pipeline as (transformer, columns) steps:
# scale the numeric columns, one-hot encode the categorical ones, and drop the rest.
scaling_step = (StandardScaler(), numeric_feats)
encoding_step = (OneHotEncoder(), categorical_feats)
dropping_step = ("drop", drop_feats)

ct = make_column_transformer(scaling_step, encoding_step, dropping_step)

In [9]:
# Fit the column transformer on `df` and transform it in the same step.
# NOTE(review): the scaler statistics and the set of one-hot columns are learned
# from the data being predicted here, not reused from training — confirm this
# matches how the model's inputs were prepared in the training notebook.
transformed = ct.fit_transform(df)

In [11]:
# Names for the transformed columns, in output order: the scaled numeric
# features first, followed by the columns generated by the one-hot encoder.
onehot_encoder = ct.named_transformers_["onehotencoder"]
onehot_names = onehot_encoder.get_feature_names_out().tolist()
column_names = [*numeric_feats, *onehot_names]

In [13]:
# Wrap the transformer output in a labelled DataFrame.
# ColumnTransformer only returns a SciPy sparse matrix when the stacked output
# is sparse enough (below its `sparse_threshold`, 0.3 by default). For a small
# or low-cardinality input file it returns a dense ndarray instead, which
# `from_spmatrix` cannot consume, so both cases are handled here.
if hasattr(transformed, "tocsc"):  # SciPy sparse matrices expose .tocsc()
    df_transformed = pd.DataFrame.sparse.from_spmatrix(transformed, columns=column_names)
else:
    df_transformed = pd.DataFrame(transformed, columns=column_names)
df_transformed.head()

Unnamed: 0,fork_length_mm,day_of_year,watershed_black creek,watershed_campbell river,watershed_chemainus,watershed_cowichan,watershed_englishman,watershed_koksilah,watershed_nanaimo,watershed_puntledge,...,method_nan,local_chemainus,local_in-river,local_kelvin creek,local_lower koksilah,local_marine,local_nan,year_2021,year_2022,year_2023
0,0.562196,-0.854225,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.154906,-0.972261,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.918858,1.68355,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.918858,1.68355,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,-0.955884,1.68355,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [16]:
# Columns of the transformed frame that are passed to the model, in order.
# NOTE(review): presumably the exact feature set and order the model was
# trained on — confirm against the training notebook before editing.
columns_to_keep = [
    # scaled numeric features
    "fork_length_mm",
    "day_of_year",
    # one-hot: watershed
    "watershed_nanaimo",
    "watershed_puntledge",
    # one-hot: river
    "river_nanaimo",
    "river_puntledge",
    # one-hot: site
    "site_above tsolum",
    "site_cedar bridge",
    "site_condensory bridge",
    "site_jack point",
    "site_little mexico",
    "site_living forest",
    "site_newcastle",
    "site_snuneymuxw beach",
    "site_tsolum confluence",
    # one-hot: method
    "method_beach seine",
    # one-hot: local
    "local_in-river",
    "local_marine",
    # one-hot: year
    "year_2021",
    "year_2022",
    "year_2023",
]

In [18]:
# Select the model's input columns, in the order listed in `columns_to_keep`.
# A one-hot column only exists when its category occurs in the file being
# scored, so a new file may lack some of them and `.loc` would raise KeyError.
# `reindex` keeps the same selection and order, and adds any absent column as
# all zeros, i.e. "category not observed" for a one-hot indicator.
missing_cols = [col for col in columns_to_keep if col not in df_transformed.columns]
if missing_cols:
    # Surface the gap instead of filling silently, so typos or schema drift are visible.
    print(f"Columns absent from this data, filled with 0: {missing_cols}")
df_filtered = df_transformed.reindex(columns=columns_to_keep, fill_value=0.0)

## Prediction

In [20]:
# Labels for the model's three output columns.
# NOTE(review): the order presumably mirrors the label encoding used in
# training — confirm against the training notebook.
species_labels = ['chinook', 'coho', 'steelhead']

# Score every row, then label the raw output array so it is readable.
pred = dl_model.predict(df_filtered)
prediction = pd.DataFrame(data=pred, columns=species_labels)
prediction.head()



Unnamed: 0,chinook,coho,steelhead
0,0.516241,0.310101,0.173658
1,0.576029,0.257166,0.166805
2,0.875823,0.090771,0.033406
3,0.875823,0.090771,0.033406
4,0.878637,0.08844,0.032923


In [21]:
# For every row of `prediction`, record the largest value and the name of the
# column it came from. Both are computed along axis=1 in a single vectorised
# pass rather than a Python loop over `.iloc[i]`, which builds a temporary
# Series for each row.
# Both results are plain lists: NumPy scalars for the values, column-name
# strings for the labels.
max_val = list(prediction.max(axis=1).to_numpy())
max_label = prediction.idxmax(axis=1).tolist()

## Result

In [22]:
# Assemble the result table: the tag id and recorded species from the input
# data, next to the predicted label and the value of the winning output.
result_columns = dict(
    tag_id_long=df['tag_id_long'],
    species=df["species"],
    predicted_label=max_label,
    confidence=max_val,
)
final_pred_df = pd.DataFrame(result_columns)
final_pred_df

Unnamed: 0,tag_id_long,species,predicted_label,confidence
0,,co,chinook,0.516241
1,,co,chinook,0.576029
2,,ck,chinook,0.875823
3,,ck,chinook,0.875823
4,,ck,chinook,0.878637
...,...,...,...,...
57389,989.001038747135,co,chinook,0.536905
57390,989.001042042947,ck,chinook,0.648501
57391,989.001042516590,ck,chinook,0.739064
57392,989.001042048086,co,coho,0.427051


## Save result to CSV

In [24]:
# Write the result table to disk; the DataFrame index is not written.
output_path = "../imputation_model/prediction/predicted_species_imputation.csv"
final_pred_df.to_csv(output_path, index=False)