In [21]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import json
import sys
sys.path.append("..")
from scripts.process import *
from scripts.predict import *

In [22]:
field = pd.read_csv('../data/field.csv')

with open('../data/det.json', 'r') as file:
    det_table = json.load(file)



In [23]:
det = pd.DataFrame(det_table)
det.head()

Unnamed: 0,species,eye_size,snout_shape,parr_marks,parr_marks_length,spotting_density,fin_type,parr_marks_spacing,spotting_characteristic
0,ck,large,pointy,slightly faded,long,medium,anal fin,wider than interspaces,circle
1,co,large,short and blunt,slightly faded,long,medium,anal fin,narrower than interspaces,circle
2,cm,medium,,faded,short,medium,caudal fin,,variable
3,pink,medium,,,,,caudal fin,half,
4,so,very large,,slightly faded,irregular,,caudal fin,variable,row


In [24]:
for col in det.columns:
    if col != 'species' and col != 'fork_length_mm' and col != 'water_temp_start':
        det = one_hot_encoding(det, col, col)

In [25]:
det.columns

Index(['species', 'eye_size_large', 'eye_size_medium', 'eye_size_small',
       'eye_size_very large', 'snout_shape_NA', 'snout_shape_long and pointy',
       'snout_shape_pointy', 'snout_shape_short and blunt',
       'snout_shape_short and rounded', 'parr_marks_NA', 'parr_marks_faded',
       'parr_marks_slightly faded', 'parr_marks_length_NA',
       'parr_marks_length_irregular', 'parr_marks_length_long',
       'parr_marks_length_short', 'spotting_density_NA',
       'spotting_density_high', 'spotting_density_medium', 'fin_type_anal fin',
       'fin_type_caudal fin', 'parr_marks_spacing_NA',
       'parr_marks_spacing_half',
       'parr_marks_spacing_narrower than interspaces',
       'parr_marks_spacing_variable',
       'parr_marks_spacing_wider than interspaces',
       'spotting_characteristic_NA', 'spotting_characteristic_circle',
       'spotting_characteristic_irregular', 'spotting_characteristic_row',
       'spotting_characteristic_variable'],
      dtype='object')

In [26]:
field.head()

Unnamed: 0,watershed,river,site,method,local,water_temp_start,fork_length_mm,species
0,puntledge,puntledge,little mexico,beach seine,marine,13.2,88.0,ck
1,puntledge,puntledge,little mexico,beach seine,marine,13.2,88.0,ck
2,puntledge,puntledge,little mexico,beach seine,marine,13.2,89.0,ck
3,puntledge,puntledge,little mexico,beach seine,marine,13.2,89.0,ck
4,puntledge,puntledge,little mexico,beach seine,marine,13.2,89.0,ck


In [27]:
processed_data = processing(data=field, det_data=det_table)

In [28]:
processed_data.head()

Unnamed: 0,water_temp_start,fork_length_mm,species,watershed_black creek,watershed_campbell river,watershed_chemainus,watershed_cowichan,watershed_englishman,watershed_koksilah,watershed_nanaimo,...,parr_marks_spacing_NA,parr_marks_spacing_half,parr_marks_spacing_narrower than interspaces,parr_marks_spacing_variable,parr_marks_spacing_wider than interspaces,spotting_characteristic_NA,spotting_characteristic_circle,spotting_characteristic_irregular,spotting_characteristic_row,spotting_characteristic_variable
0,13.2,88.0,ck,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
1,13.2,88.0,ck,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,13.2,89.0,ck,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,13.2,89.0,ck,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
4,13.2,89.0,ck,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


In [30]:
det_data,prob_data = preprocess_data(processed_data)

In [71]:
def voting_classifier_deterministic(data,columns):
    random_numbers = [42, 231, 351, 701, 996, 523, 710, 686, 568, 268]

    det_data, _ = preprocess_data(data)
    det_data = pd.DataFrame(det_data, columns=columns)
    display(det_data)
    X = det_data.drop('species', axis=1)
    y = det_data['species']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    models = []
    for num in random_numbers:
        max_depth = 8
        dt = DecisionTreeClassifier(max_depth=max_depth, random_state=num)
        dt.fit(X_train, y_train)
        models.append(dt)
        
    all_predictions = []
    for dt in models:
        predictions = dt.predict(X)
        all_predictions.append(predictions)

    final_predictions = []
    for i in range(len(X)):
        row_predictions = [pred[i] for pred in all_predictions]
        prediction = max(set(row_predictions), key=row_predictions.count)
        final_predictions.append(prediction)

    det_data['prediction'] = final_predictions

    return det_data

In [72]:
voting_classifier_deterministic(processed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
1,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
2,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
3,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
4,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62266,stl,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
62267,stl,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
62268,stl,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
62269,stl,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,prediction
0,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,ck
1,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,ck
2,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,ck
3,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,ck
4,ck,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,ck
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62266,stl,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,stl
62267,stl,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,stl
62268,stl,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,stl
62269,stl,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,stl
