In [48]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

In [22]:
# Load the field observations from a CSV path given relative to the
# notebook's working directory.
df = pd.read_csv("../species_prediction_model/data/field_big.csv")
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,watershed,river,site,method,local,water_temp_start,fork_length_mm,species
0,englishman,center creek,center creek,smolt trap,in-river,10.2,85.0,co
1,englishman,center creek,center creek,smolt trap,in-river,10.2,85.0,co
2,englishman,center creek,center creek,smolt trap,in-river,10.2,85.0,co
3,englishman,center creek,center creek,smolt trap,in-river,10.2,86.0,co
4,englishman,center creek,center creek,smolt trap,in-river,10.2,87.0,co


In [23]:
# Discard every row that has a missing value in any column
# (explicitly spelling out the pandas defaults).
df = df.dropna(axis="index", how="any")

In [24]:
# Visual identification traits per species, written one row per species so a
# single fish can be read left to right. The string 'NA' is a literal category
# value here, not a missing-value marker.
_TRAIT_COLUMNS = (
    'species',
    'eye_size',
    'snout_shape',
    'parr_marks',
    'parr_marks_length',
    'spotting_density',
    'fin_type',
    'parr_marks_spacing',
    'spotting_characteristic',
)

_TRAIT_ROWS = (
    ('ck', 'large', 'pointy', 'slightly faded', 'long', 'medium', 'anal fin', 'wider than interspaces', 'circle'),
    ('co', 'large', 'short and blunt', 'slightly faded', 'long', 'medium', 'anal fin', 'narrower than interspaces', 'circle'),
    ('cm', 'medium', 'NA', 'faded', 'short', 'medium', 'caudal fin', 'NA', 'variable'),
    ('pink', 'medium', 'NA', 'NA', 'NA', 'NA', 'caudal fin', 'half', 'NA'),
    ('so', 'very large', 'NA', 'slightly faded', 'irregular', 'NA', 'caudal fin', 'variable', 'row'),
    ('stl', 'small', 'short and rounded', 'faded', 'short', 'high', 'caudal fin', 'variable', 'irregular'),
    ('ct', 'small', 'long and pointy', 'faded', 'short', 'high', 'caudal fin', 'variable', 'irregular'),
    ('rbt', 'small', 'short and rounded', 'NA', 'short', 'high', 'caudal fin', 'NA', 'NA'),
)

# Transpose the rows into the column-oriented dict of lists.
fish_features = {
    column: [row[position] for row in _TRAIT_ROWS]
    for position, column in enumerate(_TRAIT_COLUMNS)
}

# Lookup table with one row per species, used to attach traits to observations.
unique = pd.DataFrame(fish_features)

In [25]:
def one_hot_encoding(df, col, prefix=None):
  """Replace one categorical column with integer indicator (dummy) columns.

  Args:
    df: Source DataFrame; it is not modified.
    col: Name of the column to encode.
    prefix: Prefix for the generated column names. Defaults to ``col``.
      (Previously this argument was accepted but silently ignored.)

  Returns:
    A new DataFrame with ``col`` removed and one ``<prefix>_<value>`` integer
    column per distinct value appended after the remaining columns.
  """
  if prefix is None:
    prefix = col
  dummies = pd.get_dummies(df[col], prefix=prefix, dtype='int')
  # Drop first, then append, so the indicator columns end up at the right edge.
  return pd.concat([df.drop(col, axis=1), dummies], axis=1)

In [26]:
# Attach the per-species trait columns to every observation. A left join keeps
# all observation rows; species missing from `unique` get NaN traits.
full_processed = pd.merge(
    left=df,
    right=unique,
    on='species',
    how='left',
)
full_processed.head()

Unnamed: 0,watershed,river,site,method,local,water_temp_start,fork_length_mm,species,eye_size,snout_shape,parr_marks,parr_marks_length,spotting_density,fin_type,parr_marks_spacing,spotting_characteristic
0,englishman,center creek,center creek,smolt trap,in-river,10.2,85.0,co,large,short and blunt,slightly faded,long,medium,anal fin,narrower than interspaces,circle
1,englishman,center creek,center creek,smolt trap,in-river,10.2,85.0,co,large,short and blunt,slightly faded,long,medium,anal fin,narrower than interspaces,circle
2,englishman,center creek,center creek,smolt trap,in-river,10.2,85.0,co,large,short and blunt,slightly faded,long,medium,anal fin,narrower than interspaces,circle
3,englishman,center creek,center creek,smolt trap,in-river,10.2,86.0,co,large,short and blunt,slightly faded,long,medium,anal fin,narrower than interspaces,circle
4,englishman,center creek,center creek,smolt trap,in-river,10.2,87.0,co,large,short and blunt,slightly faded,long,medium,anal fin,narrower than interspaces,circle


In [27]:
# Count missing values per column after the left merge; non-zero counts in the
# trait columns correspond to rows whose species has no entry in `unique`.
full_processed.isna().sum()

watershed                   0
river                       0
site                        0
method                      0
local                       0
water_temp_start            0
fork_length_mm              0
species                     0
eye_size                   76
snout_shape                76
parr_marks                 76
parr_marks_length          76
spotting_density           76
fin_type                   76
parr_marks_spacing         76
spotting_characteristic    76
dtype: int64

In [28]:
# Swap NaN placeholders for Python None across the whole frame.
# NOTE(review): how `replace` treats value=None has differed between pandas
# versions - confirm this yields None (and not a fill) on the version in use.
full_processed = full_processed.replace(np.nan, None)

In [29]:
# Build a smaller training frame: cap the two species sampled here ('co' and
# 'ck') at 500 rows each with a fixed seed, and keep every row of the four
# species listed below. Rows of any other species are left out.
co_sampled = full_processed[full_processed['species'] == 'co'].sample(n=500, random_state=42)
ck_sampled = full_processed[full_processed['species'] == 'ck'].sample(n=500, random_state=42)

# Despite the name, these are the labels that are KEPT in full (not removed).
labels_to_filter = ['stl', 'ct', 'rbt', 'cm']
others_combined = full_processed[full_processed['species'].isin(labels_to_filter)]

# Stack the pieces and renumber rows 0..n-1 so the index is unique.
full_small = pd.concat([co_sampled, ck_sampled, others_combined], axis=0)
full_small = full_small.reset_index(drop=True)

# Final class balance (the only value this cell displays).
full_small['species'].value_counts()

species
co     500
ck     500
stl    479
ct     479
rbt    449
cm      10
Name: count, dtype: int64

In [30]:
# Columns that must stay as-is: the target and the two numeric measurements.
passthrough_cols = {'species', 'fork_length_mm', 'water_temp_start'}
# Snapshot the column names up front; the frame is rebuilt on every iteration.
categorical_cols = [name for name in full_small.columns if name not in passthrough_cols]

for col in categorical_cols:
  print(col)
  full_small = one_hot_encoding(full_small, col, col)

watershed
river
site
method
local
eye_size
snout_shape
parr_marks
parr_marks_length
spotting_density
fin_type
parr_marks_spacing
spotting_characteristic


In [35]:
# Water temperature is excluded from the feature set.
# Re-running this cell raises KeyError because the column is already gone.
full_small = full_small.drop(columns=["water_temp_start"])

In [36]:
# Preview the encoded frame: fork length, the species label, and the
# indicator columns produced by the one-hot encoding loop.
full_small.head()

Unnamed: 0,fork_length_mm,species,watershed_cowichan,watershed_englishman,watershed_nanaimo,watershed_puntledge,river_center creek,river_cowichan,river_englishman,river_haslam creek,...,fin_type_anal fin,fin_type_caudal fin,parr_marks_spacing_NA,parr_marks_spacing_narrower than interspaces,parr_marks_spacing_variable,parr_marks_spacing_wider than interspaces,spotting_characteristic_NA,spotting_characteristic_circle,spotting_characteristic_irregular,spotting_characteristic_variable
0,105.0,co,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
1,90.0,co,0,1,0,0,1,0,0,0,...,1,0,0,1,0,0,0,1,0,0
2,95.0,co,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
3,102.0,co,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
4,81.0,co,0,1,0,0,1,0,0,0,...,1,0,0,1,0,0,0,1,0,0


In [37]:
# Target: the species label. Features: every other column.
y = full_small['species']
X = full_small.drop(columns=['species'])

In [39]:
# 80/20 hold-out split with string labels, used by the decision tree.
# NOTE(review): the split is not stratified although the class counts are
# uneven - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [40]:
# Hyper-parameters in use: 'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 6
# (presumably the result of an earlier search - TODO confirm the source).
decision_tree = DecisionTreeClassifier(
    max_depth = 7,
    min_samples_split = 6,
    min_samples_leaf = 4,
    random_state = 42)

decision_tree.fit(X_train, y_train)

# Report both splits: train accuracy alone says nothing about generalization.
# NOTE(review): the trait columns were merged in from a per-species lookup
# table keyed on `species`, and each species has a distinct trait combination,
# so those features encode the label directly. Perfect accuracy is expected
# and is not evidence that the model can identify fish from field data.
print("Train accuracy:", decision_tree.score(X_train, y_train))
print("Test accuracy:", decision_tree.score(X_test, y_test))

Train accuracy: 1.0


In [41]:
import pickle

# Serialize the fitted tree. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (the bare open() call leaked it).
# NOTE(review): the file is a pickle despite its .h5 extension; the name is
# kept unchanged so existing loaders keep working.
with open('model/dt_riya_new.h5', 'wb') as model_file:
    pickle.dump(decision_tree, model_file)

In [45]:
# Standardize the only continuous feature for the neural network. The decision
# tree above was already trained on the unscaled values.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the mean/std; it is also not saved in the visible
# cells, which inference would need - confirm.
scaler = StandardScaler()
scaler.fit(X[['fork_length_mm']])
X['fork_length_mm'] = scaler.transform(X[['fork_length_mm']])

In [46]:
# Preview the feature matrix after fork_length_mm has been standardized.
X.head()

Unnamed: 0,fork_length_mm,watershed_cowichan,watershed_englishman,watershed_nanaimo,watershed_puntledge,river_center creek,river_cowichan,river_englishman,river_haslam creek,river_nanaimo,...,fin_type_anal fin,fin_type_caudal fin,parr_marks_spacing_NA,parr_marks_spacing_narrower than interspaces,parr_marks_spacing_variable,parr_marks_spacing_wider than interspaces,spotting_characteristic_NA,spotting_characteristic_circle,spotting_characteristic_irregular,spotting_characteristic_variable
0,-0.303568,0,0,1,0,0,0,0,0,1,...,1,0,0,1,0,0,0,1,0,0
1,-0.615799,0,1,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
2,-0.511722,0,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
3,-0.366014,0,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
4,-0.803138,0,1,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0


In [49]:
# Map the species strings to integer class ids.
le = LabelEncoder()
y_enc = le.fit_transform(y)
# One-hot encode the ids, matching the categorical_crossentropy loss the
# network is compiled with.
# NOTE(review): `le` is not persisted in the visible cells; it is needed to
# translate predicted class indices back to species - confirm it is saved.
y_label = tf.keras.utils.to_categorical(y_enc)

In [50]:
# Re-split for the neural network using the scaled features and one-hot
# labels. This overwrites the X_train/X_test/y_train/y_test produced for the
# decision tree.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_label,
    random_state=42,
    test_size=0.2,
)

In [51]:
# Layer widths are taken from the data, so the network stays consistent when
# the feature set or the number of species changes.
num_features = X.shape[1]
num_classes = y_label.shape[1]  # width of the one-hot label matrix

# Two hidden ReLU layers followed by a softmax over the species classes.
dl_model = tf.keras.Sequential([
    layers.Input(shape=(num_features,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])

In [52]:
# Configure training: Adam with a small step size, and a multi-class
# cross-entropy loss that expects one-hot targets.
dl_model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(learning_rate=1e-4),
)

dl_model.summary()

In [53]:
# Train for a fixed number of epochs, holding out 20% of the training rows
# for per-epoch validation metrics.
# NOTE(review): X_test/y_test are not evaluated in the visible cells.
history = dl_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=20,
)

Epoch 1/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.2499 - loss: 1.7619 - val_accuracy: 0.7468 - val_loss: 1.5137
Epoch 2/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7398 - loss: 1.4517 - val_accuracy: 0.8837 - val_loss: 1.2344
Epoch 3/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8614 - loss: 1.1730 - val_accuracy: 0.9199 - val_loss: 0.9549
Epoch 4/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9291 - loss: 0.8786 - val_accuracy: 0.9354 - val_loss: 0.6923
Epoch 5/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9355 - loss: 0.6374 - val_accuracy: 0.9483 - val_loss: 0.4886
Epoch 6/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9530 - loss: 0.4527 - val_accuracy: 0.9664 - val_loss: 0.3459
Epoch 7/20
[1m49/49[0m [32m━━━━━━━━

In [55]:
# Persist the trained network to a .keras file at a path relative to the
# working directory (assumes the model/ directory already exists - TODO confirm).
dl_model.save('model/dl_riya_new.keras')