In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Choosing not to hide API Key with Amazon secret because free API, seems like unnecssary effort? Also no idea how that works tbh
weather_json = pd.read_json("https://api.weatherapi.com/v1/forecast.json?key=70f3d6e807464250af5163556231505&q=Leuven&days=10&aqi=no&alerts=no")


In our historic weather data we have:

*  'StandardScaler__LC_HUMIDITY',
*  'StandardScaler__LC_DWPTEMP',
*   'StandardScaler__LC_n',
*  'StandardScaler__LC_RAD',
*  'StandardScaler__LC_RAININ',
*  'StandardScaler__LC_DAILYRAIN',
*  'StandardScaler__LC_WINDDIR',
*  'StandardScaler__LC_WINDSPEED',
*  'StandardScaler__LC_RAD60',
*  'StandardScaler__LC_TEMP_QCL0',
*  'StandardScaler__LC_TEMP_QCL1',
*  'StandardScaler__LC_TEMP_QCL2',
*  'StandardScaler__LC_TEMP_QCL3'




Which we can match with API:

* humidity
* dewpoint_c
* We should drop _LC_N
* uv ([divide by 40m²/W for per second, then multiply for exposure ](https://www.researchgate.net/post/How-can-I-convert-Ultra-Violet-index-into-Ultra-Violet-irradiation-Dose#:~:text=The%20UV%20index%20is%20a,daily%20mean%20of%20UV%20irradiance.))
* precip_mm 
* totalprecip_mm (from day not hour json part), can only do that for t=0 because otherwise it is a forecast? E.g. could sum over the expected amount of rain till that point on the day
* Need unscaled + there appears to be an issue with the way we handel it because the values make no sense. Can then use wind_degree
* wind_kph*1000/3600
* Should take either this or RAD imo. Just the same thing
* temp_c (Imo use this for all 3 simply take average/median of 4 here)



In [47]:
def fetch_forecast(day:int, hour:int):
  """
  Day must be within next 10 days with 0 being today and 9 being the max 
  hour of day 0 till 23 where hour corresponds to period of e.g. 00:00:00 till 00:59:59 etc.

  Returns df with correctly ordered columns for current model
  """
  # If used in website we could optimize for cache
  api_pull =  pd.read_json("https://api.weatherapi.com/v1/forecast.json?key=70f3d6e807464250af5163556231505&q=London&days=10&aqi=no&alerts=no")
  df = pd.json_normalize(api_pull["forecast"]["forecastday"], "hour") 
  
  # Selecting only relevant date
  df["date"]=pd.to_datetime(df["time"]).dt.date
  df=df.loc[(df["date"]==(dt.date.today()+dt.timedelta(days=day)))]
  
  # Adding up the expected amount of rain on the day so far
  total_rain = (df.iloc[:hour])["precip_mm"].sum()
  df = df.iloc[[hour]]
  df["LC_DAILYRAIN"] = total_rain
  
  # Selecting relevant columns in order and renaming
  df= df[["time", "humidity", "dewpoint_c","uv","precip_mm","LC_DAILYRAIN","wind_degree","wind_kph","temp_c"]]
  df.rename(columns={"humidity":"LC_HUMIDITY","dewpoint_c":"LC_DWPTEMP","uv":"LC_RAD","precip_mm":"LC_RAININ","wind_degree":"LC_WINDDIR",
                     "wind_kph":"LC_WINDSPEED","temp_c":"LC_TEMP_QCL0"},inplace=True)
  
  # Multipliyng UV index with 90 (*0.025 *3600) yields  hourly W/m² radiation = RAD60, normal RAD unclear will use same methodology for now
  df["LC_RAD"] = df["LC_RAD"]*90
  df.insert(8, column="RAD_60", value=df["LC_RAD"])
  df["LC_RAD60"] =df["LC_RAD"]
  
  # Time settings
  df["time"]= pd.to_datetime(df["time"])
  df["hour"]= df["time"].dt.hour
  df["month"]=df["time"].dt.month
  df["weekday"] = df["time"].dt.strftime('%a')
  df= df.drop("time",axis=1)

  # Changing wind into m/s from kmph, doing it this way to make mistakes in formula more obivous
  df["LC_WINDSPEED"] = df["LC_WINDSPEED"]*1000/3600

  # Using temperature we have for all measurments for now
  df["LC_TEMP_QCL1"] = df["LC_TEMP_QCL0"]
  df["LC_TEMP_QCL2"] = df["LC_TEMP_QCL0"]
  df["LC_TEMP_QCL3"] = df["LC_TEMP_QCL0"]

  # Doing this so columns have right order for current model
  df= df[['hour', 'month', 'weekday', 'LC_HUMIDITY', 'LC_DWPTEMP', 'LC_RAD', 'LC_RAININ', 'LC_DAILYRAIN', 
          'LC_WINDSPEED', 'LC_RAD60', 'LC_TEMP_QCL0', 'LC_TEMP_QCL1', 'LC_TEMP_QCL2', 'LC_TEMP_QCL3']]
  return df


In [79]:
df= fetch_forecast(0, 23)
df

Unnamed: 0,hour,month,weekday,LC_HUMIDITY,LC_DWPTEMP,LC_RAD,LC_RAININ,LC_DAILYRAIN,LC_WINDSPEED,LC_RAD60,LC_TEMP_QCL0,LC_TEMP_QCL1,LC_TEMP_QCL2,LC_TEMP_QCL3
23,23,5,Sun,84,7.6,90.0,0.0,0.0,3.388889,90.0,10.2,10.2,10.2,10.2


In [80]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

one_hot_var = ["hour", "month", "weekday"]
numerical_var = [col for col in df.columns if col not in one_hot_var]

t = ColumnTransformer(
    transformers = [
        ('OneHot', OneHotEncoder(handle_unknown='ignore'), one_hot_var),
        ('StandardScaler', StandardScaler(), numerical_var), 
        
    ] )

# fit the encoder
t.fit(df)
x_live = pd.DataFrame(t.transform(df), columns=t.get_feature_names_out())
x_live

Unnamed: 0,OneHot__hour_23,OneHot__month_5,OneHot__weekday_Sun,StandardScaler__LC_HUMIDITY,StandardScaler__LC_DWPTEMP,StandardScaler__LC_RAD,StandardScaler__LC_RAININ,StandardScaler__LC_DAILYRAIN,StandardScaler__LC_WINDSPEED,StandardScaler__LC_RAD60,StandardScaler__LC_TEMP_QCL0,StandardScaler__LC_TEMP_QCL1,StandardScaler__LC_TEMP_QCL2,StandardScaler__LC_TEMP_QCL3
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# Adding other columns, not too elegant tbh
one_hot_cols = ['OneHot__hour_0', 'OneHot__hour_1', 'OneHot__hour_2',
       'OneHot__hour_3', 'OneHot__hour_4', 'OneHot__hour_5',
       'OneHot__hour_6', 'OneHot__hour_7', 'OneHot__hour_8',
       'OneHot__hour_9', 'OneHot__hour_10', 'OneHot__hour_11',
       'OneHot__hour_12', 'OneHot__hour_13', 'OneHot__hour_14',
       'OneHot__hour_15', 'OneHot__hour_16', 'OneHot__hour_17',
       'OneHot__hour_18', 'OneHot__hour_19', 'OneHot__hour_20',
       'OneHot__hour_21', 'OneHot__hour_22', 'OneHot__hour_23',
       'OneHot__month_1', 'OneHot__month_2', 'OneHot__month_3',
       'OneHot__month_4', 'OneHot__month_5', 'OneHot__month_6',
       'OneHot__month_7', 'OneHot__month_8', 'OneHot__month_9',
       'OneHot__month_10', 'OneHot__month_11', 'OneHot__month_12',
       'OneHot__weekday_Fri', 'OneHot__weekday_Mon',
       'OneHot__weekday_Sat', 'OneHot__weekday_Sun',
       'OneHot__weekday_Thu', 'OneHot__weekday_Tue',
       'OneHot__weekday_Wed']

for col in one_hot_cols:
  if col not in x_live.columns:
    x_live[col]=0
x_live = x_live[['OneHot__hour_0', 'OneHot__hour_1', 'OneHot__hour_2', 'OneHot__hour_3',
       'OneHot__hour_4', 'OneHot__hour_5', 'OneHot__hour_6', 'OneHot__hour_7',
       'OneHot__hour_8', 'OneHot__hour_9', 'OneHot__hour_10',
       'OneHot__hour_11', 'OneHot__hour_12', 'OneHot__hour_13',
       'OneHot__hour_14', 'OneHot__hour_15', 'OneHot__hour_16',
       'OneHot__hour_17', 'OneHot__hour_18', 'OneHot__hour_19',
       'OneHot__hour_20', 'OneHot__hour_21', 'OneHot__hour_22',
       'OneHot__hour_23', 'OneHot__month_1', 'OneHot__month_2',
       'OneHot__month_3', 'OneHot__month_4', 'OneHot__month_5',
       'OneHot__month_6', 'OneHot__month_7', 'OneHot__month_8',
       'OneHot__month_9', 'OneHot__month_10', 'OneHot__month_11',
       'OneHot__month_12', 'OneHot__weekday_Fri', 'OneHot__weekday_Mon',
       'OneHot__weekday_Sat', 'OneHot__weekday_Sun', 'OneHot__weekday_Thu',
       'OneHot__weekday_Tue', 'OneHot__weekday_Wed',
       'StandardScaler__LC_HUMIDITY', 'StandardScaler__LC_DWPTEMP',
       'StandardScaler__LC_RAD', 'StandardScaler__LC_RAININ',
       'StandardScaler__LC_DAILYRAIN', 'StandardScaler__LC_WINDSPEED',
       'StandardScaler__LC_RAD60', 'StandardScaler__LC_TEMP_QCL0',
       'StandardScaler__LC_TEMP_QCL1', 'StandardScaler__LC_TEMP_QCL2',
       'StandardScaler__LC_TEMP_QCL3']]

In [82]:
# load model
# Really make sure we use the right versions 
gb = pickle.load(open("gb_50.sav","rb"))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [85]:
gb.predict(x_live)

AttributeError: ignored