# Imports

In [1]:
import json
import os
import pathlib
import random
import time
import uuid
import numpy as np

from bs4 import BeautifulSoup
from google.colab import drive
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import altair as alt
from altair import datum
import tensorflow as tf
from tensorflow.keras import layers
from math import radians, sin, cos, sqrt, atan2

In [None]:
!python --version

Python 3.10.12


In [2]:
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
DATA_PATH_FINAL = '/content/gdrive/MyDrive/MIDS/capstone'

In [4]:
DATA_PATH_MODEL_3 = '/content/gdrive/MyDrive/MIDS/capstone/Model_3_data'

# STEP 1: User inputs data

In [25]:
user_input_model_3 = pd.read_csv(f"{DATA_PATH_FINAL}/user_input_model_3.csv", keep_default_na=False)

user_input_model_3.head()

Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects,lat,lng
0,2021-04-17,Above Treeline,S,39.301333,-120.315558
1,2021-04-17,Above Treeline,NE,39.301333,-120.315558
2,2021-04-17,Above Treeline,SE,39.301333,-120.315558
3,2021-04-17,Above Treeline,E,39.301333,-120.315558
4,2021-04-17,Above Treeline,W,39.301333,-120.315558


# STEP 2 with examples

## ignore

In [None]:
weather_data = pd.read_csv(f"{DATA_PATH_FINAL}/weather_avg_data.csv", keep_default_na=False)

weather_data.head()

Unnamed: 0,STID,Date,ELEVATION,Variable,Latitude,Longitude,Value
0,001PG,2020-09-13,1409.0,air_temp,38.0589,-120.5344,31.875
1,001PG,2020-09-14,1409.0,air_temp,38.0589,-120.5344,21.97805555555556
2,001PG,2020-09-15,1409.0,air_temp,38.0589,-120.5344,22.06814583333333
3,001PG,2020-09-16,1409.0,air_temp,38.0589,-120.5344,22.2225625
4,001PG,2020-09-17,1409.0,air_temp,38.0589,-120.5344,21.447902777777777


In [None]:
unique_stid_data = weather_data.drop_duplicates(subset=['STID'])
unique_stid_lat_lon = unique_stid_data[['STID', 'Latitude', 'Longitude']]
unique_stid_lat_lon

Unnamed: 0,STID,Latitude,Longitude
0,001PG,38.05890,-120.53440
1256,011PG,38.48640,-120.05260
2513,021PG,38.51908,-122.03184
3785,022PG,38.13141,-120.11951
5049,024PG,39.06731,-122.97210
...,...,...,...
2514621,PC300,41.07232,-122.36596
2515205,PCKCA,40.33000,-121.92000
2518123,PDEC1,39.75361,-121.62472
2522662,PDLC1,38.72833,-121.59444


In [None]:
unique_stid_lat_lon.to_csv(f"{DATA_PATH_FINAL}/unique_stid_lat_lon.csv", index=False)

## load station id lookup table

In [26]:
unique_stid_lat_lon = pd.read_csv(f"{DATA_PATH_FINAL}/unique_stid_lat_lon.csv", keep_default_na=False)

unique_stid_lat_lon.head()

Unnamed: 0,STID,Latitude,Longitude
0,001PG,38.0589,-120.5344
1,011PG,38.4864,-120.0526
2,021PG,38.51908,-122.03184
3,022PG,38.13141,-120.11951
4,024PG,39.06731,-122.9721


## haversine function

In [27]:
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    radius_earth = 3959  # Radius of Earth in kilometers (6371) or for miles (3959)

    # Calculate the distance
    distance = radius_earth * c

    return distance

## example of weather data combination

In [83]:
user_example = user_input_model_3[user_input_model_3['date_time']>'2023-01-01']
user_example

Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects,lat,lng
23064,2023-04-29,Above Treeline,S,38.646159,-119.911092
23065,2023-04-29,Above Treeline,S,39.345402,-120.294259
23066,2023-04-29,Above Treeline,S,38.963809,-120.111139
23067,2023-04-29,Above Treeline,NE,38.646159,-119.911092
23068,2023-04-29,Above Treeline,NE,39.345402,-120.294259
...,...,...,...,...,...
35635,2023-01-02,Below Treeline,SW,38.604404,-119.892359
35636,2023-01-02,Below Treeline,NW,39.097737,-119.893460
35637,2023-01-02,Below Treeline,NW,39.589174,-120.487339
35638,2023-01-02,Below Treeline,NW,38.975439,-120.124545


In [84]:

# Convert latitude and longitude columns to float, handling empty strings
user_example['lat'] = pd.to_numeric(user_example['lat'], errors='coerce')
user_example['lng'] = pd.to_numeric(user_example['lng'], errors='coerce')
unique_stid_lat_lon['Latitude'] = pd.to_numeric(unique_stid_lat_lon['Latitude'], errors='coerce')
unique_stid_lat_lon['Longitude'] = pd.to_numeric(unique_stid_lat_lon['Longitude'], errors='coerce')

# Drop rows with missing latitude or longitude values
user_example = user_example.dropna(subset=['lat', 'lng'])
unique_stid_lat_lon = unique_stid_lat_lon.dropna(subset=['Latitude', 'Longitude'])

user_example['lat_lng'] = list(zip(user_example['lat'], user_example['lng']))
unique_stid_lat_lon['lat_lng'] = list(zip(unique_stid_lat_lon['Latitude'], unique_stid_lat_lon['Longitude']))

nearest_stid = []
for _, row1 in user_example.iterrows():
    distances = [haversine(row1['lat'], row1['lng'], *row2['lat_lng']) for _, row2 in unique_stid_lat_lon.iterrows()]
    nearest_index = np.argmin(distances)
    nearest_stid.append(unique_stid_lat_lon.iloc[nearest_index]['STID'])

# Add the nearest STID to df1
user_example['STID'] = nearest_stid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_example['lat'] = pd.to_numeric(user_example['lat'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_example['lng'] = pd.to_numeric(user_example['lng'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_example['lat_lng'] = list(zip(user_example['lat'], u

## example of finding nearest_stid

In [85]:
user_example

Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects,lat,lng,lat_lng,STID
23064,2023-04-29,Above Treeline,S,38.646159,-119.911092,"(38.646159, -119.911092)",E7441
23065,2023-04-29,Above Treeline,S,39.345402,-120.294259,"(39.345402, -120.294259)",CF046
23066,2023-04-29,Above Treeline,S,38.963809,-120.111139,"(38.963809, -120.111139)",LIB17
23067,2023-04-29,Above Treeline,NE,38.646159,-119.911092,"(38.646159, -119.911092)",E7441
23068,2023-04-29,Above Treeline,NE,39.345402,-120.294259,"(39.345402, -120.294259)",CF046
...,...,...,...,...,...,...,...
35635,2023-01-02,Below Treeline,SW,38.604404,-119.892359,"(38.604404, -119.892359)",BLAC1
35636,2023-01-02,Below Treeline,NW,39.097737,-119.893460,"(39.097737, -119.89346)",JPL02
35637,2023-01-02,Below Treeline,NW,39.589174,-120.487339,"(39.589174, -120.487339)",MYMC1
35638,2023-01-02,Below Treeline,NW,38.975439,-120.124545,"(38.975439, -120.124545)",LIB17


## ignore

In [None]:
pivot_weather = (
    weather_data
    .pivot_table(
        index=["STID", "Date", "ELEVATION", "Latitude", "Longitude"],
        columns=["Variable"],
        values="Value",
        aggfunc="first",
    )
    .reset_index()
)
pivot_weather.rename(columns={'Date': 'date_time'}, inplace=True)

In [None]:
pivot_weather.to_csv(f"{DATA_PATH_FINAL}/pivot_weather.csv", index=False)

## load pivot weather table

In [35]:
pivot_weather = pd.read_csv(f"{DATA_PATH_FINAL}/pivot_weather.csv", keep_default_na=False)

pivot_weather.head()


  pivot_weather = pd.read_csv(f"{DATA_PATH_FINAL}/pivot_weather.csv", keep_default_na=False)


Unnamed: 0,STID,date_time,ELEVATION,Latitude,Longitude,air_temp,precip_accum,precip_accum_24_hour,precip_accum_fifteen_minute,precip_accum_one_hour,snow_accum,snow_depth,snow_water_equiv,solar_radiation,weather_cond_code
0,001PG,2020-09-13,1409.0,38.0589,-120.5344,31.875,,,,,,,,,
1,001PG,2020-09-14,1409.0,38.0589,-120.5344,21.97805555555556,,,,,,,,,
2,001PG,2020-09-15,1409.0,38.0589,-120.5344,22.06814583333333,,,,,,,,,
3,001PG,2020-09-16,1409.0,38.0589,-120.5344,22.2225625,,,,,,,,,
4,001PG,2020-09-17,1409.0,38.0589,-120.5344,21.447902777777777,,,,,,,,,


In [72]:
removed = ['STID','Latitude','Longitude','lat','lng','lat_lng']
merged_df = pd.merge(user_example, pivot_weather, on=['STID', 'date_time'], how='left').drop(columns = removed)
merged_df


Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects,ELEVATION,air_temp,precip_accum,precip_accum_24_hour,precip_accum_fifteen_minute,precip_accum_one_hour,snow_accum,snow_depth,snow_water_equiv,solar_radiation,weather_cond_code
0,2023-04-27,Above Treeline,E,5924.0,,1255.776,,,0.0,,,,,
1,2023-04-27,Near Treeline,E,5924.0,,1255.776,,,0.0,,,,,
2,2023-04-27,Below Treeline,E,5924.0,,1255.776,,,0.0,,,,,


# model_3_additional_features

## ignore

In [5]:
model_3_data_big = pd.read_csv(f"{DATA_PATH_MODEL_3}/model_3_final_data_big.csv",
    keep_default_na=False,
)
model_3_data_big.head()

Unnamed: 0,date_time,is_avy_obs,lat,lng,combined_terrain_aspects,combined_terrain_elevations,above_treeline_cat,near_treeline_cat,below_treeline_cat,likelihood_0_cat,...,precip_accum,precip_accum_24_hour,precip_accum_fifteen_minute,precip_accum_one_hour,precip_accum_one_minute,snow_accum,snow_depth,snow_water_equiv,solar_radiation,weather_cond_code
0,2021-04-17,No,39.301333,-120.315558,SE,Below Treeline,2.0,2.0,2.0,2.0,...,349.758,,,0.0,,,,,,
1,2021-04-17,No,39.301333,-120.315558,SE,Near Treeline,2.0,2.0,2.0,2.0,...,349.758,,,0.0,,,,,,
2,2021-04-17,No,39.301333,-120.315558,SE,Above Treeline,2.0,2.0,2.0,2.0,...,349.758,,,0.0,,,,,,
3,2021-04-17,No,39.301333,-120.315558,NE,Below Treeline,2.0,2.0,2.0,2.0,...,349.758,,,0.0,,,,,,
4,2021-04-17,No,39.301333,-120.315558,NE,Near Treeline,2.0,2.0,2.0,2.0,...,349.758,,,0.0,,,,,,


In [24]:
model_3_data_big[model_3_data_big['is_avy_obs'] == 'Yes'][['is_avy_obs','date_time','lat','lng','combined_terrain_aspects','combined_terrain_elevations']].sort_values(by='date_time', ascending=False)[:20]

Unnamed: 0,is_avy_obs,date_time,lat,lng,combined_terrain_aspects,combined_terrain_elevations
3904,Yes,2023-04-29,38.963809,-120.111139,SE,Near Treeline
3855,Yes,2023-04-29,39.345402,-120.294259,SW,Below Treeline
3860,Yes,2023-04-29,39.345402,-120.294259,E,Near Treeline
3858,Yes,2023-04-29,39.345402,-120.294259,S,Near Treeline
3857,Yes,2023-04-29,39.345402,-120.294259,S,Below Treeline
3856,Yes,2023-04-29,39.345402,-120.294259,SW,Near Treeline
3859,Yes,2023-04-29,39.345402,-120.294259,E,Below Treeline
3854,Yes,2023-04-29,39.345402,-120.294259,NE,Near Treeline
3853,Yes,2023-04-29,39.345402,-120.294259,NE,Below Treeline
3852,Yes,2023-04-29,39.345402,-120.294259,SE,Near Treeline


In [23]:
model_3_data_big[model_3_data_big['is_avy_obs'] == 'No'][['is_avy_obs','date_time','lat','lng','combined_terrain_aspects','combined_terrain_elevations']].sort_values(by='date_time', ascending=False)[:20]

Unnamed: 0,is_avy_obs,date_time,lat,lng,combined_terrain_aspects,combined_terrain_elevations
3840,No,2023-04-29,38.646159,-119.911092,SE,Near Treeline
3846,No,2023-04-29,38.646159,-119.911092,SW,Near Treeline
3850,No,2023-04-29,38.646159,-119.911092,E,Near Treeline
3849,No,2023-04-29,38.646159,-119.911092,E,Below Treeline
3848,No,2023-04-29,38.646159,-119.911092,S,Near Treeline
3839,No,2023-04-29,38.646159,-119.911092,SE,Below Treeline
3847,No,2023-04-29,38.646159,-119.911092,S,Below Treeline
3845,No,2023-04-29,38.646159,-119.911092,SW,Below Treeline
3844,No,2023-04-29,38.646159,-119.911092,W,Near Treeline
3843,No,2023-04-29,38.646159,-119.911092,W,Below Treeline


In [None]:
to_drop = merged_df.columns[4:].tolist() + ['lat', 'lng','Latitude','Longitude','STID','lat_lng']
model_3_add_features= model_3_data_big.drop(columns=to_drop)
model_3_add_features.shape

(6291, 3929)

In [None]:
model_3_add_features

Unnamed: 0,date_time,is_avy_obs,combined_terrain_aspects,combined_terrain_elevations,above_treeline_cat,near_treeline_cat,below_treeline_cat,likelihood_0_cat,likelihood_1_cat,likelihood_2_cat,...,759,760,761,762,763,764,765,766,767,ELEVATION
0,2021-04-17,No,SE,Below Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
1,2021-04-17,No,SE,Near Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
2,2021-04-17,No,SE,Above Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
3,2021-04-17,No,NE,Below Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
4,2021-04-17,No,NE,Near Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6286,2022-11-29,No,NE,Near Treeline,1.0,1.0,1.0,-1.0,-1.0,-1.0,...,-0.053291,-0.012634,-0.077910,0.204538,-0.022752,0.023703,-0.259631,0.445080,-0.015220,6904.0
6287,2022-11-29,No,E,Below Treeline,1.0,1.0,1.0,-1.0,-1.0,-1.0,...,-0.053291,-0.012634,-0.077910,0.204538,-0.022752,0.023703,-0.259631,0.445080,-0.015220,6904.0
6288,2022-11-29,No,E,Near Treeline,1.0,1.0,1.0,-1.0,-1.0,-1.0,...,-0.053291,-0.012634,-0.077910,0.204538,-0.022752,0.023703,-0.259631,0.445080,-0.015220,6904.0
6289,2022-11-29,No,NW,Below Treeline,1.0,1.0,1.0,-1.0,-1.0,-1.0,...,-0.053291,-0.012634,-0.077910,0.204538,-0.022752,0.023703,-0.259631,0.445080,-0.015220,6904.0


In [None]:
model_3_add_features.to_csv(f"{DATA_PATH_FINAL}/model_3_add_features.csv", index=False)

## load additional features

## load the new file

In [43]:
model_3_add_features = pd.read_csv(f"{DATA_PATH_FINAL}/model_3_add_features.csv", keep_default_na=False)

model_3_add_features.head()



Unnamed: 0,date_time,is_avy_obs,combined_terrain_aspects,combined_terrain_elevations,above_treeline_cat,near_treeline_cat,below_treeline_cat,likelihood_0_cat,likelihood_1_cat,likelihood_2_cat,...,759,760,761,762,763,764,765,766,767,ELEVATION
0,2021-04-17,No,SE,Below Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
1,2021-04-17,No,SE,Near Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
2,2021-04-17,No,SE,Above Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
3,2021-04-17,No,NE,Below Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0
4,2021-04-17,No,NE,Near Treeline,2.0,2.0,2.0,2.0,-1.0,-1.0,...,-0.106177,0.128691,0.110274,0.108625,-0.304962,0.188047,-0.174135,0.113692,-0.197008,5924.0


In [73]:
combined_example = pd.merge(merged_df, model_3_add_features, on=['date_time','combined_terrain_aspects','combined_terrain_elevations'], how='left', suffixes=('', '_drop'))
combined_example = combined_example.filter(regex='^(?!.*_drop)')
combined_example

Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects,ELEVATION,air_temp,precip_accum,precip_accum_24_hour,precip_accum_fifteen_minute,precip_accum_one_hour,snow_accum,...,758,759,760,761,762,763,764,765,766,767
0,2023-04-27,Above Treeline,E,5924.0,,1255.776,,,0.0,,...,,,,,,,,,,
1,2023-04-27,Near Treeline,E,5924.0,,1255.776,,,0.0,,...,0.245767,-0.238754,0.284347,-0.131478,0.204196,-0.345296,0.105697,-0.308777,0.074278,-0.060686
2,2023-04-27,Near Treeline,E,5924.0,,1255.776,,,0.0,,...,0.049554,-0.361491,0.064377,0.450251,0.434774,-0.26967,-0.084135,-0.382803,0.660986,0.096422
3,2023-04-27,Below Treeline,E,5924.0,,1255.776,,,0.0,,...,0.245767,-0.238754,0.284347,-0.131478,0.204196,-0.345296,0.105697,-0.308777,0.074278,-0.060686


In [75]:
combined_example = combined_example.drop(combined_example.index[0])
combined_example

Unnamed: 0,date_time,combined_terrain_elevations,combined_terrain_aspects,ELEVATION,air_temp,precip_accum,precip_accum_24_hour,precip_accum_fifteen_minute,precip_accum_one_hour,snow_accum,...,758,759,760,761,762,763,764,765,766,767
1,2023-04-27,Near Treeline,E,5924.0,,1255.776,,,0.0,,...,0.245767,-0.238754,0.284347,-0.131478,0.204196,-0.345296,0.105697,-0.308777,0.074278,-0.060686
2,2023-04-27,Near Treeline,E,5924.0,,1255.776,,,0.0,,...,0.049554,-0.361491,0.064377,0.450251,0.434774,-0.26967,-0.084135,-0.382803,0.660986,0.096422
3,2023-04-27,Below Treeline,E,5924.0,,1255.776,,,0.0,,...,0.245767,-0.238754,0.284347,-0.131478,0.204196,-0.345296,0.105697,-0.308777,0.074278,-0.060686


# Step 4 Scoring

In [45]:
with open(f"{DATA_PATH_MODEL_3}/model_3_feature_dict.yaml", "r") as f:
    feature_dict_3 = json.load(f)

In [46]:
def df_to_dataset(dataframe, feature_dict, buffer_size=None, shuffle=True, batch_size=32):
    """
    buffer_size is for shuffle.
    Bigger buffer_size means everything needs to be in memory then only shuffle.
    Smaller is faster but less random
    """
    labels = dataframe["target"]
    df = {}
    for key, cols in feature_dict.items():
        if key in ["num_vars_norm", "num_vars"]:
            df[key] = dataframe[cols]
        else:
            for col in cols:
                df[col] = dataframe[col].tolist()
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=buffer_size)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [76]:
# Should fix these before saving the CSV files

# Replace empty string with -1 for int column
combined_example[feature_dict_3["num_vars_norm"]] = (
    combined_example[feature_dict_3["num_vars_norm"]].replace("", -1).astype(float).astype(int)
)
combined_example['target'] = combined_example['is_avy_obs'].replace({'Yes': 1, 'No': 0})

In [77]:
ds = df_to_dataset(combined_example, feature_dict_3, shuffle=False)

In [78]:
model = tf.keras.models.load_model(f"{DATA_PATH_FINAL}/model_3_v1")

In [79]:
model.predict(ds)



array([[0.00134262],
       [0.00152055],
       [0.00132055]], dtype=float32)

# IGNORE!!


In [None]:
columns_to_drop = ['air_temp_high_24_hour','air_temp_high_6_hour','air_temp_low_24_hour','air_temp_low_6_hour','dew_point_temperature','precip_accum_one_minute']
combined_example.drop(columns=columns_to_drop, inplace=True)

In [None]:
feature_dict_wo_dup = {}

for key, ori_cols in feature_dict_3.items():
    feature_dict_wo_dup[key] = []
    for col in ori_cols:
        # Should drop these
        if model_3_data_big[col].nunique() != 1:
            feature_dict_wo_dup[key].append(col)
        else:
            print(f'{col} is all unique value. Should drop from feature_dict_3')

air_temp_high_24_hour is all unique value. Should drop from feature_dict_3
air_temp_high_6_hour is all unique value. Should drop from feature_dict_3
air_temp_low_24_hour is all unique value. Should drop from feature_dict_3
air_temp_low_6_hour is all unique value. Should drop from feature_dict_3
dew_point_temperature is all unique value. Should drop from feature_dict_3
precip_accum_one_minute is all unique value. Should drop from feature_dict_3


In [None]:
feature_dict_3['num_vars_norm']

['total_problems',
 'total_shaded',
 'ELEVATION',
 'air_temp',
 'precip_accum',
 'precip_accum_24_hour',
 'precip_accum_fifteen_minute',
 'precip_accum_one_hour',
 'snow_accum',
 'snow_depth',
 'snow_water_equiv',
 'solar_radiation',
 'weather_cond_code']

In [None]:
for value in columns_to_drop:
    feature_dict_3['num_vars_norm'].remove(value)

In [None]:
b

In [None]:
b

# ignore

In [None]:
model = tf.keras.models.load_model(f"{DATA_PATH_FINAL}/model_3_v1")

In [None]:
model_3_data_big = pd.read_csv(f"{DATA_PATH_MODEL_3}/model_3_final_data_big.csv", keep_default_na=False)
model_3_data_big.shape

(6291, 3951)

In [None]:
combined_example.shape

(3, 3945)

In [None]:
def df_to_dataset(dataframe, feature_dict, buffer_size=None, shuffle=True, batch_size=32):
    """
    buffer_size is for shuffle.
    Bigger buffer_size means everything needs to be in memory then only shuffle.
    Smaller is faster but less random
    """
    labels = dataframe["target"]
    df = {}
    for key, cols in feature_dict.items():
        if key in ["num_vars_norm", "num_vars"]:
            df[key] = dataframe[cols]
        else:
            for col in cols:
                df[col] = dataframe[col].tolist()
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=buffer_size)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [None]:
# Should fix these before saving the CSV files

# Replace empty string with -1 for int column
model_3_data_big[feature_dict_3["num_vars_norm"]] = (
    model_3_data_big[feature_dict_3["num_vars_norm"]].replace("", -1).astype(float).astype(int)
)

# Maybe it's okay to keep these
column_to_drop = ['lat', 'lng', 'lat_lng', 'STID', 'Latitude', 'Longitude']
model_3_data_big.drop(columns=column_to_drop, inplace=True)
model_3_data_big.shape

# Can do this before saving the CSV (optional)
model_3_data_big['target'] = model_3_data_big['is_avy_obs'].replace({'Yes': 1, 'No': 0})

In [None]:
model_3_data_big.shape

(6291, 3946)

In [None]:
missing_columns = set(model_3_data_big.columns) - set(combined_example.columns)
missing_columns

{'obs_by_num'}

In [None]:
feature_dict_wo_dup = {}

for key, ori_cols in feature_dict_3.items():
    feature_dict_wo_dup[key] = []
    for col in ori_cols:
        # Should drop these
        if model_3_data_big[col].nunique() != 1:
            feature_dict_wo_dup[key].append(col)
        else:
            print(f'{col} is all unique value. Should drop from feature_dict_3')

air_temp_high_24_hour is all unique value. Should drop from feature_dict_3
air_temp_high_6_hour is all unique value. Should drop from feature_dict_3
air_temp_low_24_hour is all unique value. Should drop from feature_dict_3
air_temp_low_6_hour is all unique value. Should drop from feature_dict_3
dew_point_temperature is all unique value. Should drop from feature_dict_3
precip_accum_one_minute is all unique value. Should drop from feature_dict_3


In [None]:
model.predict(ds)



array([[4.4685623e-04],
       [4.5433195e-04],
       [4.5287109e-04],
       ...,
       [1.8155391e-05],
       [1.7858943e-05],
       [1.8147965e-05]], dtype=float32)

In [None]:
model_3_data_big.shape

(6291, 3946)

In [None]:
pd.DataFrame(model.predict(ds)).shape



(6291, 1)