# Import libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import tensorflow as tf
import zipfile

# Download and Load the Dataset

In [None]:
# !curl -O https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip
# !unzip jena_climate_2009_2016.csv.zip

In [None]:
# Download the zipped Jena climate CSV into the Keras cache (skipped when it is
# already cached) and unpack it into the current working directory, which is
# where the following cell reads the CSV from.
# The cached file keeps its real `.zip` extension so it is not mistaken for a CSV.
zip_path = tf.keras.utils.get_file(
    fname='jena_climate_2009_2016.csv.zip',
    origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip')
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile(file=zip_path, mode='r') as zip_file:
    zip_file.extractall()

In [None]:
# Load the extracted CSV from the current working directory; every later cell derives its data from this frame.
raw_dataset = pd.read_csv('jena_climate_2009_2016.csv')

# Data Visualization

In [None]:
# Render the raw frame as the cell output for a first look at its columns and values
raw_dataset

In [None]:
# check for null values: info() prints the non-null count and dtype of every column
raw_dataset.info()

In [None]:
# Summary statistics per column, useful for spotting implausible minima / maxima
raw_dataset.describe()

In [None]:
# Human-readable label for every CSV column; the values are used as subplot titles.
# Labels follow the physical quantity implied by each column name and unit
# (e.g. rho in g/m**3 is a density, Tdew is the dew point, Tpot the potential temperature).
col_short_description = {
    'Date Time': 'Date Time',
    'p (mbar)': 'Air Pressure',
    'T (degC)': 'Temperature in Celsius',
    'Tpot (K)': 'Potential Temperature in Kelvin',
    'Tdew (degC)': 'Dew Point Temperature in Celsius',
    'rh (%)': 'Relative Humidity',
    'VPmax (mbar)': 'Saturation vapor pressure',
    'VPact (mbar)': 'Vapor pressure',
    'VPdef (mbar)': 'Vapor pressure deficit',
    'sh (g/kg)': 'Specific humidity',
    'H2OC (mmol/mol)': 'Water vapor concentration',
    'rho (g/m**3)': 'Air Density',
    'wv (m/s)': 'Wind Speed',
    'max. wv (m/s)': 'Maximum Wind Speed',
    'wd (deg)': 'Wind Direction in Degrees',
}

In [None]:
# Every measured column (all CSV columns except 'Date Time'), in file order.
# The exploratory plots iterate over this list.
numerical_features = [
    # pressure and temperatures
    'p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)',
    # humidity and vapor pressure
    'rh (%)', 'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)',
    'sh (g/kg)', 'H2OC (mmol/mol)',
    # density
    'rho (g/m**3)',
    # wind
    'wv (m/s)', 'max. wv (m/s)', 'wd (deg)',
]

In [None]:
# Box plot of every numerical column on a 3x5 grid, to expose outliers
plt.figure(figsize=(20, 15))
for i, column in enumerate(numerical_features):
    # subplot positions are 1-based
    ax = plt.subplot(3, 5, i + 1)
    ax.boxplot(raw_dataset[column])
    ax.set_title(col_short_description[column])

In [None]:
# Histogram of every numerical column on a 3x5 grid, to show value distributions
plt.figure(figsize=(20, 15))
for i, column in enumerate(numerical_features):
    # subplot positions are 1-based
    ax = plt.subplot(3, 5, i + 1)
    ax.hist(raw_dataset[column])
    ax.set_title(col_short_description[column])

In [None]:
# Line plot of every numerical column against the row index, on a 3x5 grid
plt.figure(figsize=(20, 15))
for i, column in enumerate(numerical_features):
    # subplot positions are 1-based
    ax = plt.subplot(3, 5, i + 1)
    ax.plot(raw_dataset[column])
    ax.set_title(col_short_description[column])

In [None]:
# heatmap of pairwise correlations between the measured columns.
# Only numeric columns are correlated: the string 'Date Time' column would make
# DataFrame.corr() raise on pandas >= 2.0, where non-numeric columns are no
# longer dropped silently.
corr = raw_dataset.select_dtypes(include='number').corr()
fig, axs = plt.subplots(figsize=(25, 15))
# Draw on the axes created above explicitly instead of relying on the implicit current axes.
sns.heatmap(data=corr, annot=True, fmt='.2f', annot_kws={'size': 12}, ax=axs)

# Prepare Data

In [None]:
# full dataset shape as (rows, columns)
raw_dataset.shape

In [None]:
# train / test split: the first 71.5% of the rows train the model, the remainder tests it.
from sklearn.model_selection import train_test_split
# shuffle=False is essential for this time series: the windowing cells further
# down build sequences from consecutive rows and slice the targets out of the
# full series by position, which is only valid when the training rows are the
# leading, chronologically ordered part of the data. The default (shuffle=True)
# would scramble the rows and also leak future observations into training.
raw_dataset_train, raw_dataset_test = train_test_split(
    raw_dataset, train_size=0.715, shuffle=False)
raw_dataset_train.shape, raw_dataset_test.shape

In [None]:
# Choose the model input columns and separate out the label column.
# The three wind columns are currently excluded from the inputs.
features = [
    'p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)',
    'rh (%)', 'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)',
    'sh (g/kg)', 'H2OC (mmol/mol)', 'rho (g/m**3)',
    # 'wv (m/s)', 'max. wv (m/s)', 'wd (deg)'
]
# Label of the model (kept as a list so the selection stays two-dimensional).
output_feature = ['T (degC)']


def select_features(dataset):
    """Return `(inputs, labels)` frames holding the `features` and `output_feature` columns of `dataset`."""
    input_frame = dataset.loc[:, features]
    label_frame = dataset.loc[:, output_feature]
    return input_frame, label_frame


dataset_train, output_train = select_features(dataset=raw_dataset_train)
dataset_test, output_test = select_features(dataset=raw_dataset_test)
dataset, output = select_features(dataset=raw_dataset)
(dataset.shape, output.shape,
 dataset_train.shape, output_train.shape,
 dataset_test.shape, output_test.shape)

In [None]:
# Min-max scale the model inputs; the scaler is fitted on the training rows only
# and then applied unchanged to the train, test and full input frames.
from sklearn.preprocessing import MinMaxScaler

in_scaler = MinMaxScaler().fit(dataset_train)  # fit() returns the fitted scaler itself


def input_feature_scaling(dataset):
    """Scale input columns with the scaler fitted on the training inputs."""
    return in_scaler.transform(dataset)


dataset_train_scaled = input_feature_scaling(dataset=dataset_train)
dataset_test_scaled = input_feature_scaling(dataset=dataset_test)
dataset_scaled = input_feature_scaling(dataset=dataset)
dataset_train_scaled.shape, dataset_test_scaled.shape, dataset_scaled.shape

In [None]:
# output features scaling: a separate scaler for the label column, fitted on the
# training labels only, so predictions can later be mapped back with
# out_scaler.inverse_transform.
from sklearn.preprocessing import MinMaxScaler
out_scaler = MinMaxScaler()
out_scaler.fit(output_train)


# Named differently from `input_feature_scaling` on purpose: reusing that name
# here would silently rebind the input helper to `out_scaler`, so re-running the
# input-scaling calls afterwards would apply the wrong scaler.
def output_feature_scaling(dataset):
    """Scale label columns with the scaler fitted on the training labels."""
    return out_scaler.transform(dataset)


output_train_scaled = output_feature_scaling(dataset=output_train)
output_test_scaled = output_feature_scaling(dataset=output_test)
output_scaled = output_feature_scaling(dataset=output)
output_train_scaled.shape, output_test_scaled.shape, output_scaled.shape

In [None]:
# --- Windowing configuration -------------------------------------------------
# Keep every 6th row inside a window (one sample per hour, per the original note;
# presumably the raw rows are 10 minutes apart -- confirm against the data).
sampling_rate = 6
# Sampled steps per window: 5 days * 24 steps.
sequence_length = 5 * 24
# Forecast horizon in sampled steps (12 hours ahead).
predict_length = 12
batch_size = 256

# --- Training targets --------------------------------------------------------
# Target k is the label sampling_rate * (sequence_length + predict_length) rows
# after the first row of window k, i.e. the 5-day history plus the 12-hour gap.
output_start = sampling_rate * (sequence_length + predict_length)
# End chosen so that len(y_train) == len(train) - sequence_length * sampling_rate + 1.
output_end = len(dataset_train_scaled) + sampling_rate * predict_length + 1
# NOTE(review): the targets are sliced from the full-series labels by position,
# which matches the training windows only if the training rows are the leading,
# unshuffled part of the series.
y_train = output_scaled[output_start:output_end, :]

X_train = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=dataset_train_scaled,
    targets=y_train,
    sequence_length=sequence_length,
    sampling_rate=sampling_rate,
    batch_size=batch_size,
)

X_train.cardinality(), y_train.shape

In [None]:
# Test windows: same history length and forecast gap as for training.
# Dropping the first `output_start` labels pairs window k with the label
# sampling_rate * (sequence_length + predict_length) rows after its first row.
output_start = sampling_rate * (sequence_length + predict_length)
y_test = output_test_scaled[output_start:]

X_test = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=dataset_test_scaled,
    targets=y_test,
    sequence_length=sequence_length,
    sampling_rate=sampling_rate,
    batch_size=batch_size,
)

X_test.cardinality(), y_test.shape

# Model Training

In [None]:
# Single-layer LSTM regressor: one window of (sequence_length, n_features) in,
# one scaled temperature value out.
inputs = tf.keras.layers.Input(shape=(sequence_length, len(features)))
lstm_out = tf.keras.layers.LSTM(units=16)(inputs)
# Dropout on the LSTM output as regularisation before the regression head.
regularised = tf.keras.layers.Dropout(rate=.5)(lstm_out)
x = tf.keras.layers.Dense(units=1)(regularised)
model = tf.keras.Model(inputs=inputs, outputs=x)
model.summary()

In [None]:
# Optimise mean squared error on the scaled target and also report its root (RMSE).
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [None]:
# Train for 5 epochs; the test windows serve as validation data, so the
# reported val_* values are test-set metrics.
history = model.fit(
    X_train,
    validation_data=X_test,
    epochs=5,
)

In [None]:
# Compare the per-epoch training and test (validation) loss curves
fig, axs = plt.subplots(figsize=(15, 15))
loss_curves = (
    ('loss', 'b', 'training loss'),
    ('val_loss', 'r', 'test loss'),
)
for history_key, line_color, line_label in loss_curves:
    axs.plot(history.history[history_key], color=line_color, label=line_label)
axs.set_title('Training vs Test loss')
axs.set_xlabel('epochs')
axs.set_ylabel('mse')
axs.legend()

In [None]:
# plot the predictions: for the first window of each of 5 test batches, draw the
# temperature history the model saw, the predicted value and the actual value.

# Position of the label column inside the input features. Looked up by name so
# the "past" curve stays correct if `features` is reordered or extended.
target_col = features.index(output_feature[0])

for batch_features, batch_labels in X_test.take(5):
  fig, axs = plt.subplots()
  # Keep the leading batch axis (slice, not index) because model.predict expects a batch.
  one_ts_observation = batch_features[0:1,]
  prediction = model.predict(one_ts_observation)
  # Hand NumPy arrays to scikit-learn rather than relying on implicit tensor conversion.
  predicted_value = out_scaler.inverse_transform(prediction)[0, 0]
  actual_value = out_scaler.inverse_transform(batch_labels.numpy())[0, 0]
  # in_scaler and out_scaler are both fitted on the training label column, so
  # out_scaler also undoes the scaling of that column inside the input window.
  past_values = out_scaler.inverse_transform(
      one_ts_observation[0, :, target_col:target_col + 1].numpy())[:, 0]
  axs.plot(predict_length, predicted_value, label='prediction', marker='x', color='r', markersize=16)
  axs.plot(predict_length, actual_value, label='actual', marker='o', color='g', markersize=16)
  axs.plot(range(-sequence_length, 0), past_values, label='past', color='b')
  axs.set_xlabel('time step relative to the end of the input window')
  axs.set_ylabel(output_feature[0])
  fig.legend()
