In [None]:
# Imports: data handling, model training, and ONNX export
import pandas
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [None]:
# Data
# Load the tab-separated export; lines beginning with '#' are skipped as comments.
# No dtype mapping is passed to read_csv: the readable column names
# ("discharge_cfs", "flow_rate_ft") only exist after the rename below, so a
# mapping keyed on them would not match any column at read time. The numeric
# conversion is done explicitly at the end of this cell instead.
data = pandas.read_csv('./data/james_river_data.txt',
                       comment="#",
                       sep='\t'
                      )
data = data.drop(index=0) # Drop the one time record with metadata under the headers
data['datetime'] = pandas.to_datetime(data['datetime'])
data = data.rename(columns = {"147077_00060": "discharge_cfs", 
                              "147077_00060_cd": "discharge_read_type",
                              "147078_00065": "flow_rate_ft",
                              "147078_00065_cd": "flow_rate_read_type"})

# Clean data
print(f"Before filtering we have {len(data.index)} records")

# Drop rows with any missing value, then keep only rows where both read-type
# columns are exactly 'A'.
# NOTE(review): in USGS NWIS exports 'A' normally means "approved" and 'P'
# "provisional" (not forecast values) -- confirm against the file's header comments.
data = data.dropna()
data = data.loc[(data['flow_rate_read_type'] == 'A') & (data['discharge_read_type'] == 'A')]

# Report the count only after every filter has run, so it matches what is modelled.
print(f"After filtering we have {len(data.index)} records")

# Convert types
# convert_dtypes() picks the best nullable dtypes; the measurement columns are
# then cast explicitly to the numeric types used downstream.
data = data.convert_dtypes()
data = data.astype({"discharge_cfs": int, "flow_rate_ft": float})

In [None]:
# Derive some new variables
# Calendar month and day-of-month are the only model inputs; flow_rate_ft is the target.
data["month"] = data["datetime"].dt.month
data["day"] = data["datetime"].dt.day
supervised_regression_data = data[["month", "day", "flow_rate_ft"]]

# Fixed seed so the train/test split, the fitted tree, and the reported score
# are identical on every Restart & Run All.
SEED = 42

# Split the data
train_input, test_input, train_output, test_output = train_test_split(
    supervised_regression_data[["month", "day"]],
    supervised_regression_data["flow_rate_ft"],
    random_state=SEED,
)

# Train the model
dt = DecisionTreeRegressor(random_state=SEED)
dt_model = dt.fit(train_input, train_output)

# For a regressor, score() returns the coefficient of determination (R^2) on
# the held-out data, not a classification accuracy.
dt_score = dt_model.score(test_input, test_output)
print(f"R^2 score {dt_score}")

In [None]:
# Serialize to onnx

# One float input tensor named 'input': any number of rows, 2 columns
# (the model above is fitted on the "month" and "day" features).
initial_type = [('input', FloatTensorType([None, 2]))]

onx = convert_sklearn(dt_model, initial_types=initial_type, target_opset=13)

# Serialize before opening the destination: opening with "wb" truncates the
# file, so a serialization failure must not be able to wipe an existing model.
onnx_bytes = onx.SerializeToString()
with open("../app/public/dt_james.onnx", "wb") as f:
    f.write(onnx_bytes)