# Weather prediction model for regions in Kenya
This weather model is built and trained on historical weather data for regions in Kenya. In addition to the 47 counties, a number of other regions are included, which allows for finer-grained predictions.
<br>

### Data Source:
Data for this task was queried from the ```ERA5_LAND_DAILY``` dataset on **[Google Earth Engine](https://code.earthengine.google.com/)**.
<br>
**Earth Engine** is a useful tool for visualizing maps and exporting data for any region of interest. For this task, historical weather data for approximately 70 regions, covering the years 2000 to 2020, was used.

In [1]:
# . 
# Importing required libraries

import os
import json
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras import layers

# Set the TensorFlow logger's level to 0, which is logging.NOTSET in the
# standard `logging` module.
# NOTE(review): if the intent is to silence TensorFlow messages, a higher level
# such as logging.ERROR is presumably what is wanted — confirm.
tf.get_logger().setLevel(level=0)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

#### Loading the data:
-> The data is sorted by date in ascending order.

-> The ```province```, ```polygon``` and ```date``` columns are not needed for the training task.

-> Loading the data into pandas and removing the unnecessary fields.

In [3]:
# Read the exported weather records and discard the columns that are not used
# for training.
dataset = pd.read_csv("./datasets/gee-weather-data.csv").drop(
    columns=["province", "polygon", "date"]
)

In [4]:
# Preview the first two rows of the dataset.
dataset.head(2)

Unnamed: 0,county,min_2m_temp (K),max_2m_temp (K),mean_2m_temp (K),total_precip,mean_2m_dewpoint (K),mean_sea_level_pressure (hPa),mean_surface_pressure (hPa),mean_u_wind_10m,mean_v_wind_10m
0,Vihiga,287.819366,302.907715,294.954794,0.000583,288.950359,1009.367205,855.294223,0.176298,0.2609
1,Vihiga,287.940735,302.816772,295.125493,8.8e-05,289.081064,1008.52213,854.696495,-0.004849,0.336365


#### Checking available regions

In [5]:
# List the distinct values found in the "county" column.
dataset["county"].unique()

array(['Vihiga', 'Narok', 'Trans Mara', 'Kiambu', 'Thika', 'Meru Central',
       'Meru North', 'Maragua', 'Muranga', 'Embu', 'Mbeere', 'Bondo',
       'Central Kisii', 'Gucha', 'Kisumu', 'Nyando', 'Siaya', 'Bomet',
       'Buret', 'Meru South', 'Tharaka', 'Garissa', 'Ijara',
       'Nandi North', 'Nandi South', 'Kirinyaga', 'Nyandarua', 'Nyeri',
       'Kilifi', 'Kwale', 'Lamu', 'Malindi', 'Mombasa', 'Taita Taveta',
       'Tana River', 'Isiolo', 'Kitui', 'Machakos', 'Makueni', 'Marsabit',
       'Moyale', 'Mwingi', 'Nairobi', 'Mandera', 'Wajir', 'Homa Bay',
       'Kuria', 'Migori', 'Nyamira', 'Rachuonyo', 'Suba', 'Baringo',
       'Kajiado', 'Keiyo', 'Kericho', 'Koibatek', 'Laikipia', 'Marakwet',
       'Nakuru', 'Samburu', 'Trans Nzoia', 'Turkana', 'Uasin Gishu',
       'West Pokot', 'Bungoma', 'Busia', 'Butere Mumias', 'Kakamega',
       'Lugari', 'Mt Elgon', 'Teso'], dtype=object)

### Preprocessing the county/ region names to a standard format
* Stripping leading and trailing whitespace
* Converting the names to lowercase

In [6]:
def process_county_names(county_name: str) -> str:
    """Return the region name with surrounding whitespace removed, in lowercase."""
    trimmed = county_name.strip()
    return trimmed.lower()

# Normalize every value in the "county" column with process_county_names.
dataset["county"] = dataset["county"].apply(process_county_names)

In [7]:
# Preview the dataframe after the region names have been normalized.
dataset.head(2)

Unnamed: 0,county,min_2m_temp (K),max_2m_temp (K),mean_2m_temp (K),total_precip,mean_2m_dewpoint (K),mean_sea_level_pressure (hPa),mean_surface_pressure (hPa),mean_u_wind_10m,mean_v_wind_10m
0,vihiga,287.819366,302.907715,294.954794,0.000583,288.950359,1009.367205,855.294223,0.176298,0.2609
1,vihiga,287.940735,302.816772,295.125493,8.8e-05,289.081064,1008.52213,854.696495,-0.004849,0.336365


### Getting a list of regions

In [8]:
# Distinct region names, in order of first appearance in the dataset.
regions = dataset["county"].unique().tolist()

In [9]:
# Report how many distinct regions are present in the dataset.
f"Available regions: {len(regions)}"

'Available regions: 71'

#### Data normalization:
* Region/ county name encoder: ```region_en = LabelEncoder()```
* Temperature values normalizer: ```temp_norm = MinMaxScaler(feature_range=(0,1))```
* Precipitation values normalizer: ```precip_norm = MinMaxScaler(feature_range=(0,1))```
* Pressure values normalizer: ```pressure_norm = MinMaxScaler(feature_range=(0,1))```
* Wind speed values normalizer: ```wsv_norm = MinMaxScaler(feature_range=(0,1))```

In [10]:
# Encoder that maps region names to integer labels.
region_en = LabelEncoder()

# One independent [0, 1] min-max scaler per feature group, in this order:
# temperature, precipitation, pressure, wind speed.
temp_norm, precip_norm, pressure_norm, wsv_norm = (
    MinMaxScaler(feature_range=(0, 1)) for _ in range(4)
)

In [11]:
# Replace the region names with the labels produced by the encoder.
dataset["county"] = region_en.fit_transform(dataset["county"])

# Column groups; each group is scaled by its own normalizer.
temp_cols = ["min_2m_temp (K)", "max_2m_temp (K)", "mean_2m_temp (K)", "mean_2m_dewpoint (K)"]
pressure_cols = ["mean_sea_level_pressure (hPa)", "mean_surface_pressure (hPa)"]
wind_speed_cols = ["mean_u_wind_10m", "mean_v_wind_10m"]

# Fit each normalizer on its columns and overwrite those columns with the
# scaled values.
for scaler, cols in (
    (temp_norm, temp_cols),
    (precip_norm, ["total_precip"]),
    (pressure_norm, pressure_cols),
    (wsv_norm, wind_speed_cols),
):
    dataset[cols] = scaler.fit_transform(dataset[cols])

In [12]:
# Preview the dataframe after encoding and normalization.
dataset.head(2)

Unnamed: 0,county,min_2m_temp (K),max_2m_temp (K),mean_2m_temp (K),total_precip,mean_2m_dewpoint (K),mean_sea_level_pressure (hPa),mean_surface_pressure (hPa),mean_u_wind_10m,mean_v_wind_10m
0,68,0.510381,0.574655,0.494683,9.6e-05,0.668998,0.247375,0.327376,0.610349,0.898598
1,68,0.514984,0.571626,0.502485,1.5e-05,0.673353,0.20338,0.324865,0.591938,0.898598


#### Saving the normalizers and the region name encoder for later use

In [13]:
def save_normalizers(dest: str, **kwargs):
    """Pickle the region encoder and the feature normalizers into ``dest``.

    Args:
        dest: Directory to write into. It is created (including missing
            parents) when it does not exist. A trailing path separator is
            optional.
        **kwargs: Must contain the keys ``region_en``, ``temp_norm``,
            ``precip_norm``, ``pressure_norm`` and ``wsv_norm``. Each value is
            an ``(object, file_stem)`` pair; the object is pickled to
            ``<dest>/<file_stem>.sav``.

    Raises:
        KeyError: If one of the required keys is missing. Nothing is written
            in that case.
    """
    required = ("region_en", "temp_norm", "precip_norm", "pressure_norm", "wsv_norm")
    # Resolve every entry up front so a missing key fails before any file is written.
    entries = [kwargs[key] for key in required]

    os.makedirs(dest, exist_ok=True)
    for component, file_stem in entries:
        target = os.path.join(dest, "{}.sav".format(file_stem))
        # The context manager guarantees the handle is flushed and closed.
        with open(target, "wb") as handle:
            pickle.dump(component, handle)

In [14]:
# Persist the encoder and every normalizer under ./components/, each in a file
# named after its variable.
save_normalizers(
    "./components/",
    region_en=(region_en, "region_en"),
    temp_norm=(temp_norm, "temp_norm"),
    precip_norm=(precip_norm, "precip_norm"),
    pressure_norm=(pressure_norm, "pressure_norm"),
    wsv_norm=(wsv_norm, "wsv_norm"),
)

In [15]:
# Preview the dataframe again before building the training windows.
dataset.head(2)

Unnamed: 0,county,min_2m_temp (K),max_2m_temp (K),mean_2m_temp (K),total_precip,mean_2m_dewpoint (K),mean_sea_level_pressure (hPa),mean_surface_pressure (hPa),mean_u_wind_10m,mean_v_wind_10m
0,68,0.510381,0.574655,0.494683,9.6e-05,0.668998,0.247375,0.327376,0.610349,0.898598
1,68,0.514984,0.571626,0.502485,1.5e-05,0.673353,0.20338,0.324865,0.591938,0.898598


In [16]:
# nyeri = dataset[dataset["county"] == region_en.transform(['nyeri'])[0]]
# nyeri.head(3)

In [17]:
# Length, in days, of one input window.
WINDOW_SIZE = 7
# Seed passed to the train/test split.
SEED = 42
# Every column still present in the dataframe is used as a feature.
weather_cols = dataset.columns.tolist()

def train_test_sets(dframe, columns: list, window_size: int=7, seed: int=None):
    """Build sliding-window samples from ``dframe`` and split them into train/test sets.

    Each sample pairs ``window_size`` consecutive rows (the input) with the
    row that immediately follows them (the target).

    Args:
        dframe: DataFrame holding the feature columns, in time order.
        columns: Names of the columns used as both inputs and targets.
        window_size: Number of consecutive rows per input window (>= 1).
        seed: Random state for a shuffled split. When ``None`` the split is
            not shuffled and the last 20% of the samples form the test set.
            ``0`` is a valid seed.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)`` as NumPy arrays. The X arrays
        have shape ``(n, window_size, len(columns))`` and the Y arrays
        ``(n, 1, len(columns))``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    n_features = len(columns)

    # Put rows t, t+1, ..., t+window_size side by side so that one row of
    # `data` holds a full input window followed by its target row.
    # NOTE(review): the shift runs over the whole frame; if `dframe` stacks
    # several regions one after another, windows near a boundary mix rows from
    # two regions — confirm this is intended.
    shifted = [dframe[columns].shift(-step) for step in range(window_size + 1)]
    data = pd.concat(shifted, axis=1)
    data.columns = [f"{col}_{step}" for step in range(window_size + 1) for col in columns]
    # Drop rows with any missing value; this includes the trailing rows that
    # have no complete look-ahead after shifting.
    data.dropna(inplace=True)

    # The first window_size * n_features columns are the inputs, the rest the target.
    split_at = window_size * n_features
    X_samples = data.iloc[:, :split_at]
    Y_samples = data.iloc[:, split_at:]

    # Compare against None explicitly so that seed=0 still yields a seeded,
    # shuffled split.
    if seed is not None:
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_samples, Y_samples, test_size=0.2, random_state=seed
        )
    else:
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_samples, Y_samples, test_size=0.2, shuffle=False
        )

    # Reshape the flat rows into 3D arrays: (samples, time steps, features).
    X_train = X_train.values.reshape((-1, window_size, n_features))
    X_test = X_test.values.reshape((-1, window_size, n_features))
    Y_train = Y_train.values.reshape((-1, 1, n_features))
    Y_test = Y_test.values.reshape((-1, 1, n_features))
    return X_train, X_test, Y_train, Y_test

In [18]:
# Build the train/test windows from the full dataset, using the function's
# default window size and SEED for the split.
X_train, X_test, Y_train, Y_test = train_test_sets(dataset, weather_cols, seed=SEED)

In [19]:
# Inspect the shapes of the training inputs and targets.
X_train.shape, Y_train.shape

((425767, 7, 10), (425767, 1, 10))

In [52]:
# Each sample is fed to the network as a single-channel 2D grid of
# WINDOW_SIZE rows by len(weather_cols) columns.
window_shape = (WINDOW_SIZE, len(weather_cols), 1)

# Convolutional stack followed by a dense head that emits one value per
# feature column.
model = tf.keras.models.Sequential([
    layers.BatchNormalization(input_shape=window_shape),
    layers.Conv2D(filters=64, kernel_size=(3, 3), strides=1, padding='same', activation='relu'),
    layers.AveragePooling2D(strides=1, padding='same'),
    layers.Conv2D(filters=128, kernel_size=(3, 3), strides=1, activation='relu'),
    layers.AveragePooling2D(strides=1, padding='same'),
    layers.Conv2D(filters=256, kernel_size=(3, 3), strides=1, activation='relu'),
    layers.AveragePooling2D(strides=1, padding='same'),
    layers.Conv2D(filters=512, kernel_size=(3, 3), strides=1, activation='relu'),
    layers.GlobalAveragePooling2D(),
    # NOTE(review): Flatten directly after global pooling is presumably a no-op — confirm.
    layers.Flatten(),
    layers.Dense(units=512, activation='relu'),
    layers.Dense(units=512, activation='relu'),
    layers.Dense(units=128, activation='relu'),
    layers.Dense(units=len(weather_cols), activation='relu'),
])
model.summary()

Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_8 (Batc  (None, 7, 10, 1)         4         
 hNormalization)                                                 
                                                                 
 conv2d_113 (Conv2D)         (None, 7, 10, 64)         640       
                                                                 
 average_pooling2d_93 (Avera  (None, 7, 10, 64)        0         
 gePooling2D)                                                    
                                                                 
 conv2d_114 (Conv2D)         (None, 5, 8, 128)         73856     
                                                                 
 average_pooling2d_94 (Avera  (None, 5, 8, 128)        0         
 gePooling2D)                                                    
                                                     

In [None]:
# Render a diagram of the model to weather-model.png.
tf.keras.utils.plot_model(model, "weather-model.png")

In [54]:
# Configure training: Adam optimizer, mean absolute error as the loss.
model.compile(optimizer='adam', loss='mae')

In [None]:
# Train for 3 epochs in batches of 64, holding out 20% of the training samples
# for validation.
# NOTE(review): X_train is built as (samples, 7, 10) while the model's declared
# input shape is (7, 10, 1), and Y_train is (samples, 1, 10) while the last
# layer emits len(weather_cols) values per sample — confirm that Keras aligns
# these shapes as intended and that the loss does not broadcast across the batch.
history = model.fit(
    X_train, 
    Y_train, 
    epochs=3, 
    batch_size=64, 
    validation_split=0.2
)

Epoch 1/3
Epoch 2/3

In [None]:
# Compute the loss on the held-out test windows.
evals = model.evaluate(X_test, Y_test)

In [None]:
# Save the trained model under the "weather-model" path.
model.save("weather-model")

In [None]:
# Draft of an inference query for a single region.
# The wind entries presumably correspond to the dataset columns
# mean_u_wind_10m and mean_v_wind_10m — confirm when wiring this up.
# TODO: replace every None placeholder with a real value before using the query.
query = {
    "region": "Nyeri",
    "min_temp": 289,
    "max_temp": None,
    "mean_temp": None,
    "precipitation": None,
    "dewpoint": None,
    "sea_level_pressure": None,
    "surface_pressure": None,
    "u_wind_component": None,
    "v_wind_component": None,
}