In [1]:
import json
import os
import pathlib
import random
import time
import uuid
import numpy as np

from bs4 import BeautifulSoup
from google.colab import drive
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import altair as alt
from altair import datum
import tensorflow as tf
from tensorflow.keras import layers

In [2]:
# Mount Google Drive at /content/gdrive; the data path used below lives under it.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
# Base Drive folder from which the CSV inputs and the saved model are loaded.
DATA_PATH_FINAL = '/content/gdrive/MyDrive/MIDS/capstone'

# User data load: only available for the last 3 seasons


In [7]:
# keep_default_na=False turns off pandas' default NaN markers, so empty fields
# and strings such as "NA" are kept as text instead of becoming NaN.
user_input = pd.read_csv(f"{DATA_PATH_FINAL}/user_input.csv", keep_default_na=False)

user_input.head()

Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects
0,2021-04-17,Above Treeline,S
1,2021-04-17,Above Treeline,NE
2,2021-04-17,Above Treeline,SE
3,2021-04-17,Above Treeline,E
4,2021-04-17,Above Treeline,W


In [8]:
# Row and column counts of the loaded user input.
user_input.shape

(10104, 3)

In [24]:
# Keep only the user-input rows for 2023-04-27.
user_input = user_input.loc[user_input['date_time'] == '2023-04-27']

# Additional Features

In [4]:
# One row of engineered features per date. keep_default_na=False means no
# value is parsed as NaN on load (empty fields stay as empty strings).
additional_feature_per_date = pd.read_csv(f"{DATA_PATH_FINAL}/additional_feature_per_date.csv", keep_default_na=False)
additional_feature_per_date.head()

Unnamed: 0,date_time,above_treeline_cat,near_treeline_cat,below_treeline_cat,likelihood_0_cat,likelihood_1_cat,likelihood_2_cat,size_0_cat,size_1_cat,size_2_cat,...,problem_discussion_2_bert_all_case_emb_758,problem_discussion_2_bert_all_case_emb_759,problem_discussion_2_bert_all_case_emb_760,problem_discussion_2_bert_all_case_emb_761,problem_discussion_2_bert_all_case_emb_762,problem_discussion_2_bert_all_case_emb_763,problem_discussion_2_bert_all_case_emb_764,problem_discussion_2_bert_all_case_emb_765,problem_discussion_2_bert_all_case_emb_766,problem_discussion_2_bert_all_case_emb_767
0,2021-04-17,2.0,2.0,2.0,2.0,-1.0,-1.0,3.0,-1.0,-1.0,...,0.158837,0.583574,-1.081375,-0.569019,0.808689,-0.241232,-0.506336,-0.373096,-0.38232,0.336898
1,2021-04-16,2.0,2.0,2.0,2.0,-1.0,-1.0,3.0,-1.0,-1.0,...,0.158837,0.583574,-1.081375,-0.569019,0.808689,-0.241232,-0.506336,-0.373096,-0.38232,0.336898
2,2021-04-14,2.0,2.0,2.0,2.0,2.0,-1.0,3.0,3.0,-1.0,...,0.158837,0.583574,-1.081375,-0.569019,0.808689,-0.241232,-0.506336,-0.373096,-0.38232,0.336898
3,2021-04-13,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.158837,0.583574,-1.081375,-0.569019,0.808689,-0.241232,-0.506336,-0.373096,-0.38232,0.336898
4,2021-04-12,2.0,2.0,2.0,2.0,-1.0,-1.0,3.0,-1.0,-1.0,...,0.158837,0.583574,-1.081375,-0.569019,0.808689,-0.241232,-0.506336,-0.373096,-0.38232,0.336898


In [25]:
# Keep only the feature row(s) for 2023-04-27, matching the user-input filter.
additional_feature_per_date = additional_feature_per_date.loc[
    additional_feature_per_date['date_time'] == '2023-04-27'
]

In [26]:
# Sanity check: number of feature rows left after the date filter.
additional_feature_per_date.shape

(1, 3156)

# Merge user input + features

In [27]:
# Attach the per-date features to every user-input row.
# validate="many_to_one" makes pandas raise MergeError if a date has more than
# one feature row, instead of silently duplicating user-input rows.
model_2_feature = user_input.merge(
    additional_feature_per_date,
    how="left",
    on="date_time",
    validate="many_to_one",
)

In [28]:
# Shape of the merged table that is fed to the model.
model_2_feature.shape

(24, 3158)

# Convert df to datasets

In [29]:
import tensorflow as tf

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` from a pandas DataFrame.

    Every column becomes one entry of a feature dictionary keyed by column
    name. If a ``target`` column is present it is removed from the features
    and used as the label component; otherwise the label component is ``None``.

    Args:
        dataframe: Source DataFrame. It is copied, so the caller's frame is
            left unmodified.
        shuffle: Whether to shuffle the rows before batching.
        batch_size: Number of rows per batch.

    Returns:
        A batched, prefetching ``tf.data.Dataset`` built from the
        ``(features, labels)`` pair.
    """
    df = dataframe.copy()

    # Split off the label column when it exists (e.g. it is absent for
    # prediction-only inputs).
    labels = df.pop('target') if 'target' in df.columns else None

    # NOTE: columns are deliberately passed as Python lists. Switching to NumPy
    # arrays would change the dtypes TensorFlow infers (e.g. float64 instead of
    # float32), which may no longer match what a saved model expects.
    features = {name: column.tolist() for name, column in df.items()}

    ds = tf.data.Dataset.from_tensor_slices((features, labels))

    if shuffle:
        # shuffle() rejects a buffer size of 0, so keep it >= 1 for an empty frame.
        ds = ds.shuffle(buffer_size=max(len(df), 1))

    ds = ds.batch(batch_size)
    # prefetch() counts elements of the *batched* dataset, so the previous
    # prefetch(batch_size) buffered up to `batch_size` whole batches. Let
    # tf.data tune the buffer instead; the yielded data is unchanged.
    ds = ds.prefetch(tf.data.AUTOTUNE)

    return ds


In [30]:
# No shuffling: predictions must stay in the same row order as model_2_feature.
ds = df_to_dataset(model_2_feature, shuffle=False)

# Model prediction

In [13]:
# Load the saved Keras model (tf_rand_val_nn_v2) from the Drive folder.
model = tf.keras.models.load_model(f"{DATA_PATH_FINAL}/tf_rand_val_nn_v2")

In [31]:
# Run the model over the batched dataset.
model_output = model.predict(ds)



In [32]:
# Wrap the raw prediction array in a DataFrame for display.
output = pd.DataFrame(model_output)

In [33]:
# Show the predictions.
output

Unnamed: 0,0
0,0.029534
1,0.03354
2,0.031394
3,0.031938
4,0.029534
5,0.031713
6,0.029534
7,0.029534
8,0.026871
9,0.030645


# Finding the yes?

In [21]:
# Give the single prediction column a descriptive name. Reassign rather than
# rename in place, so the frame object displayed by the previous cell is not
# mutated behind the reader's back.
output = output.rename(columns={0: 'score'})
# Last 20 predictions.
output[-20:]

Unnamed: 0,score
10084,0.011722
10085,0.012604
10086,0.011722
10087,0.011722
10088,0.010647
10089,0.012171
10090,0.011332
10091,0.011545
10092,0.010647
10093,0.011449


# ignore

In [None]:
# Every (elevation, aspect) combination: cross join of the distinct values of
# the two terrain columns, with incomplete pairs dropped.
aspects_elevations = (
    base_data_base_model[["combined_terrain_elevations"]]
    .drop_duplicates()
    .merge(
        base_data_base_model[["combined_terrain_aspects"]].drop_duplicates(),
        how="cross",
    )
    .dropna(how="any")
    .reset_index(drop=True)
)

In [None]:
# One row per (date, elevation, aspect): cross join of the distinct dates with
# the terrain combinations, then saved to Drive without the index column.
user_input = (
    base_data_base_model[['date_time']]
    .drop_duplicates()
    .merge(aspects_elevations, how="cross")
    .dropna(how="any")
    .reset_index(drop=True)
)
user_input.to_csv(
    f"{DATA_PATH_FINAL}/user_input.csv",
    index=False,
)

In [None]:
# Reload the saved user input. keep_default_na=False means no value is parsed
# as NaN on load.
user_input = pd.read_csv(f"{DATA_PATH_FINAL}/user_input.csv", keep_default_na=False)

In [None]:
# The CSV was read with keep_default_na=False, so a missing aspect arrives as
# an empty string rather than NaN; an isna()-only check could never match.
user_input[
    user_input['combined_terrain_aspects'].isna()
    | (user_input['combined_terrain_aspects'] == '')
]

Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects


In [None]:
# Row and column counts of the reloaded user input.
user_input.shape

(10104, 3)

In [None]:
# Peek at the first two rows.
user_input.head(2)

Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects
0,2021-04-17,Above Treeline,S
1,2021-04-17,Above Treeline,NE


## Load additional features (TODO: feature engineering for the current season)

In [None]:
# Number of distinct dates in the base data.
base_data_base_model["date_time"].nunique()

421

In [None]:
# Per-date features: drop the two terrain columns and is_avy_obs, then
# de-duplicate the remaining rows and save them without the index column.
additional_feature_per_date = base_data_base_model.drop(
    columns=[
        "combined_terrain_aspects",
        "combined_terrain_elevations",
        "is_avy_obs",
    ],
).drop_duplicates()
additional_feature_per_date.to_csv(
    f"{DATA_PATH_FINAL}/additional_feature_per_date.csv",
    index=False,
)

In [None]:
# Reload the saved per-date features. keep_default_na=False means no value is
# parsed as NaN on load.
additional_feature_per_date = pd.read_csv(f"{DATA_PATH_FINAL}/additional_feature_per_date.csv", keep_default_na=False)

In [None]:
# The frame was read with keep_default_na=False, so missing values arrive as
# empty strings rather than NaN and an isna()-only count is always 0.
# Count both kinds of missing cell.
(additional_feature_per_date.isna() | (additional_feature_per_date == '')).sum().sum()

0

## Concat User information + additional features

In [None]:
# Attach the per-date features to every user-input row.
# validate="many_to_one" makes pandas raise MergeError if a date has more than
# one feature row, instead of silently duplicating user-input rows.
additional_feature = user_input.merge(
    additional_feature_per_date,
    how="left",
    on="date_time",
    validate="many_to_one",
)

In [None]:
# True when the merged frame has no duplicated column names.
len(set(additional_feature.columns)) == len(additional_feature.columns)

True

In [None]:
# Persist the merged table; index=False leaves the row index out of the file.
additional_feature.to_csv(f"{DATA_PATH_FINAL}/additional_feature.csv", index=False)

In [None]:
# Reload the merged table. keep_default_na=False means no value is parsed as
# NaN on load.
additional_feature = pd.read_csv(f"{DATA_PATH_FINAL}/additional_feature.csv", keep_default_na=False)

## Convert the dataframe to datasets

In [None]:
import tensorflow as tf

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` from a pandas DataFrame.

    Every column becomes one entry of a feature dictionary keyed by column
    name. If a ``target`` column is present it is removed from the features
    and used as the label component; otherwise the label component is ``None``.

    Args:
        dataframe: Source DataFrame. It is copied, so the caller's frame is
            left unmodified.
        shuffle: Whether to shuffle the rows before batching.
        batch_size: Number of rows per batch.

    Returns:
        A batched, prefetching ``tf.data.Dataset`` built from the
        ``(features, labels)`` pair.
    """
    df = dataframe.copy()

    # Split off the label column when it exists (e.g. it is absent for
    # prediction-only inputs).
    labels = df.pop('target') if 'target' in df.columns else None

    # NOTE: columns are deliberately passed as Python lists. Switching to NumPy
    # arrays would change the dtypes TensorFlow infers (e.g. float64 instead of
    # float32), which may no longer match what a saved model expects.
    features = {name: column.tolist() for name, column in df.items()}

    ds = tf.data.Dataset.from_tensor_slices((features, labels))

    if shuffle:
        # shuffle() rejects a buffer size of 0, so keep it >= 1 for an empty frame.
        ds = ds.shuffle(buffer_size=max(len(df), 1))

    ds = ds.batch(batch_size)
    # prefetch() counts elements of the *batched* dataset, so the previous
    # prefetch(batch_size) buffered up to `batch_size` whole batches. Let
    # tf.data tune the buffer instead; the yielded data is unchanged.
    ds = ds.prefetch(tf.data.AUTOTUNE)

    return ds


In [None]:
# No shuffling: predictions must stay in the same row order as additional_feature.
ds = df_to_dataset(additional_feature, shuffle=False)

## Load Model and predict

In [None]:
# Load the saved Keras model (tf_rand_val_nn_v1); the prediction section
# earlier in this notebook loads tf_rand_val_nn_v2 instead.
model = tf.keras.models.load_model(f"{DATA_PATH_FINAL}/tf_rand_val_nn_v1")

  inputs = self._flatten_to_reference_inputs(inputs)




array([[0.02322091],
       [0.02672279],
       [0.02476688],
       ...,
       [0.0121054 ],
       [0.01129102],
       [0.01129102]], dtype=float32)