# SPECIES PREDICTION MODEL PIPELINE (Deep Learning)

NOTE: The data and EDA are based on a dummy dataset. The pipeline is designed to be flexible enough to handle more data.

## Imports

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    LabelEncoder
)

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
# NOTE(review): the last recorded run could not find this file — confirm the
# path against the repository layout.
DATA_PATH = "../species_prediction_model/data/trial_data_riya.csv"

df = pd.read_csv(DATA_PATH)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../species_prediction_model/data/trial_data_riya.csv'

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Distinct values of the target column.
df["species"].unique()

This dataset has only 2 labels: `co` and `rbt`.

In [None]:
# List the distinct values of every column except the tag identifier.
inspected_cols = [name for name in df.columns if name != 'tag_id_long']
for name in inspected_cols:
    print(name, ":", df[name].unique())

In [None]:
# Remove the two columns that are not used as model features, then discard
# every row that still contains a missing value.
# errors="ignore" keeps this cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['weight_g', 'tag_id_long'], errors="ignore").dropna()

In [None]:
# Display the frame after the column drop and missing-value removal.
df

## Pipeline

In [None]:
# Separate the target column from the predictor columns.
y = df['species']
X = df.drop(columns='species')

In [None]:
# Columns that get scaled.
numeric_feats = [
    "water_temp_start",
    "fork_length_mm",
]

# Columns that get one-hot encoded.
categorical_feats = [
    "watershed",
    "river",
    "site",
    "method",
    "local",
]

# Placeholders for feature groups that are not used yet:
#   passthrough_feats - columns forwarded without any transformation
#   drop_feats        - columns to discard, e.g. "species"
#   levels            - category order for an ordinal encoding

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
# Extension points that are currently unused: a "passthrough" group, a "drop"
# group and an OrdinalEncoder(categories=[levels], dtype=int) step.
ct = make_column_transformer(
    (StandardScaler(), numeric_feats),
    (OneHotEncoder(), categorical_feats),
    # Always return a dense array. With the default threshold (0.3) the output
    # switches to a scipy sparse matrix once the one-hot columns dominate,
    # and a labelled pandas DataFrame cannot be built from that directly.
    sparse_threshold=0,
)

In [None]:
# Fit the column transformer on the feature frame and apply it in one step.
# NOTE(review): the scaler and encoder are fitted on all rows, before the
# train/test split further down, so held-out rows influence the preprocessing
# — consider fitting on the training rows only.
transformed = ct.fit_transform(X)

In [None]:
# Output column labels: the numeric names as-is, followed by the names the
# one-hot encoder generated for its indicator columns.
encoder = ct.named_transformers_["onehotencoder"]
onehot_names = encoder.get_feature_names_out().tolist()
column_names = [*numeric_feats, *onehot_names]
column_names

In [None]:
# The column transformer can hand back a scipy sparse matrix when the one-hot
# block makes the output sparse enough; pandas cannot build a labelled frame
# from that directly, so densify first.
dense_values = transformed.toarray() if hasattr(transformed, "toarray") else transformed

# Reuse X's index so the features stay row-aligned with `y`, whose index may
# have gaps after the earlier dropna().
X_transformed = pd.DataFrame(dense_values, columns=column_names, index=X.index)
X_transformed.head()

### Dummy model

In [None]:
# Baseline: always predict the most frequent label.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_transformed, y)
# Accuracy of that baseline on the same rows it was fitted on.
dummy_clf.score(X_transformed, y)

(Imbalanced dataset: 82% of the rows share the same label, so always predicting the most frequent label already reaches roughly 82% accuracy.)

### Deep learning - tensorflow

In [None]:
# Map the species names to integer codes, then expand the codes into one-hot
# rows for the softmax / categorical cross-entropy setup.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
y_label = tf.keras.utils.to_categorical(y_enc)

In [None]:
# Class names learned by the label encoder; these are later used as the
# column names of the prediction table.
le.classes_ 

In [None]:
# Hold out 20% of the rows for testing. Stratify on the integer-encoded labels
# so both partitions keep the same class proportions; with a heavily
# imbalanced target an unstratified split can leave the minority class
# under-represented (or missing) in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y_label,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and training
# shuffles are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

num_features = X_transformed.shape[1]
# One output unit per class, taken from the width of the one-hot targets so
# the network adapts automatically when more species are present.
num_classes = y_label.shape[1]

# Fully connected classifier: two hidden ReLU layers and a softmax output.
dl_model = tf.keras.Sequential([
    layers.Input(shape=(num_features,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
])

In [None]:
# Adam with a small learning rate; categorical cross-entropy matches the
# one-hot encoded targets, and accuracy is tracked during training.
optimizer = Adam(learning_rate=1e-4)
dl_model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

dl_model.summary()

In [None]:
# Training settings: 20 epochs, mini-batches of 32 rows, and 20% of the
# training rows set aside for validation.
fit_options = dict(epochs=20, batch_size=32, validation_split=0.2)
history = dl_model.fit(X_train, y_train, **fit_options)

In [None]:
def plot_acc(history):
    """Plot training and validation accuracy and loss per epoch, side by side.

    Parameters
    ----------
    history : object returned by ``dl_model.fit``
        Must expose ``epoch`` (x values) and a ``history`` mapping with the
        keys ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.

    Raises
    ------
    KeyError
        If one of the four expected metric keys is missing.
    """
    # Create a fresh figure on every call; a fixed figure number would
    # re-activate an already open figure and draw on top of its curves.
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

    panels = (
        (acc_ax, 'accuracy', 'Accuracy'),
        (loss_ax, 'loss', 'Loss'),
    )
    for ax, metric, title in panels:
        # Explicit ax= keeps each curve on its own panel without relying on
        # pyplot's "current axes" state.
        sns.lineplot(x=history.epoch, y=history.history[metric], label='train', ax=ax)
        sns.lineplot(x=history.epoch, y=history.history['val_' + metric], label='valid', ax=ax)
        ax.set_title(title)
        ax.set_xlabel('epoch')
        ax.set_ylabel(metric)

    fig.tight_layout()
    plt.show()

In [None]:
# Draw the accuracy and loss curves recorded during training.
plot_acc(history)

In [None]:
# Softmax outputs of the trained network for every held-out row.
y_pred = dl_model.predict(X_test)

In [None]:
# First five rows of predicted class scores (the confidence levels).
y_pred[:5]

Two ways to turn the predicted values into labels:
1. Convert every decimal to an int, so each value becomes a whole number (1 or 0). This cannot always give a label (all values in a row may be 0). This could be a good option to make sure that we are not naively assigning labels.
2. Convert the prediction to 1s and 0s by row: the class with the highest value is the label, and that "predicted" number is the confidence level.

In [None]:
# Label each score column with its class name.
prediction = pd.DataFrame(data=y_pred, columns=list(le.classes_))
prediction.head()

In [None]:
# First five one-hot target rows of the test split, for comparison with the
# predictions.
y_test[:5]