<a href="https://colab.research.google.com/github/awaw24/Weather_Forecast_Project/blob/main/Projekt_przewidywanie_pogody.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Źródło danych:**
https://www.kaggle.com/datasets/selfishgene/historical-hourly-weather-data

Inicjalizacja

In [None]:
!pip install kaggle

In [None]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

# Stop right away when TensorFlow does not report the first GPU device.
device_name = tf.test.gpu_device_name()
gpu_available = device_name == '/device:GPU:0'
if not gpu_available:
  raise SystemError('GPU device not found, przestaw środowisko wykonawcze do obsługi T4 GPU (wykonuje obliczenia ponad 2x szybciej). Kliknij, Środowisko wykonawcze --> Zmień typ środowiska wykonawczego --> Akcelerator sprzętowy == T4 GPU --> ZAPISZ')
print(f'Found GPU at: {device_name}')

# Plot defaults applied to every figure created later in the notebook.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': False,
})

In [None]:
from google.colab import files
# Upload the kaggle.json file
#uploaded = files.upload()
!mkdir -p ~/.kaggle
#!cp kaggle.json ~/.kaggle/
!echo '{"username":"arkadiuszpizon","key":"3c3a9f417acea444ab7079f157abb429"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d selfishgene/historical-hourly-weather-data
!unzip -qn historical-hourly-weather-data.zip

Wczytanie plików

In [None]:
# Load the six per-variable tables of the dataset, one CSV file each.
(df_temperature, df_humidity, df_pressure,
 df_weather_description, df_wind_direction, df_wind_speed) = (
    pd.read_csv(f'{table}.csv')
    for table in ('temperature', 'humidity', 'pressure',
                  'weather_description', 'wind_direction', 'wind_speed'))

Wstępna weryfikacja

In [None]:
# Preview the first rows of the raw temperature table.
df_temperature.head()

Analiza pod względem braku danych dla poszczególnych miast

In [None]:
# Count the missing values of every source column in each of the six tables:
# one row per source column, one result column per table.
dfs = [df_temperature, df_humidity, df_pressure, df_weather_description, df_wind_direction, df_wind_speed]
new_column_names = ["temp", "humid", "press", "descr", "w_dir", "w_speed"]

missing_per_table = [table.isna().sum() for table in dfs]
nan_counts = pd.concat(missing_per_table, axis=1, keys=new_column_names)

# Total number of gaps across all tables, used to rank the rows (fewest first).
nan_counts['Row_Sum'] = nan_counts.sum(axis=1)
nan_counts_sorted = nan_counts.sort_values(by='Row_Sum')

print("NaN occurrences in each column:")
print(nan_counts_sorted)

Wybieranie danych dla **San Diego** i połączenie dataframe'ów w jedno

In [None]:
city = 'San Diego'

# The raw tables are read again here because the df_* names are rebound below
# to single-column frames; re-reading keeps this cell re-runnable.
df_temperature = pd.read_csv('temperature.csv')

# Timestamps are taken from the temperature table only.
df_datetime = pd.to_datetime(df_temperature['datetime'], format='%Y-%m-%d %H:%M:%S')

# Keep just the chosen city's column of each table, renamed after the variable.
df_temperature = df_temperature[[city]].rename(columns={city: 'temperature'})
df_humidity = pd.read_csv('humidity.csv')[[city]].rename(columns={city: 'humidity'})
df_pressure = pd.read_csv('pressure.csv')[[city]].rename(columns={city: 'pressure'})
df_weather_description = pd.read_csv('weather_description.csv')[[city]].rename(columns={city: 'description'})
df_wind_direction = pd.read_csv('wind_direction.csv')[[city]].rename(columns={city: 'wind_dir'})
df_wind_speed = pd.read_csv('wind_speed.csv')[[city]].rename(columns={city: 'wind_speed'})

# Column-wise concatenation aligns on the row index.
# NOTE(review): assumes all six files list the same timestamps in the same
# order - confirm against the dataset.
weather_parts = [df_datetime, df_temperature, df_humidity, df_pressure,
                 df_weather_description, df_wind_direction, df_wind_speed]
df_weather = pd.concat(weather_parts, axis=1)

Dane na których pracuję to __df_weather__ i dotyczą miasta **San Diego**

In [None]:
# Preview the combined table for the selected city.
df_weather.head()

In [None]:
# Frequency of each distinct weather description in the combined table.
weather_count = df_weather[['description']].value_counts()
print(weather_count)

Analiza pod kątem występowania braku danych w DataFramie dla San Diego

Wyświetlenie liczby brakujących danych w poszczególnych kolumnach

In [None]:
# Number of missing values in each column before imputation.
df_weather.isna().sum()

Wyświetlenie wszystkich wierszy w których występują braki

In [None]:
# Lift the row limit so that the table below is printed in full.
# Note: set_option changes the pandas display setting for the whole session.
pd.set_option('display.max_rows', None)

# Keep every row that has a gap in at least one column.
rows_with_gaps = df_weather.isna().any(axis=1)
nan_values = df_weather.loc[rows_with_gaps]

print (nan_values)

In [None]:
# Impute the gaps: column mean for humidity and pressure, fixed defaults for
# the remaining columns.
hum_mean = df_weather["humidity"].mean()
press_mean = df_weather["pressure"].mean()

fill_values = {
    'humidity': hum_mean,
    'pressure': press_mean,
    # NOTE(review): 291.53 (Kelvin, presumably) is a hand-picked constant -
    # confirm where it comes from.
    'temperature': 291.530000,
    'description': "sky is clear",
    'wind_dir': 0.0,
    'wind_speed': 0.0,
}
df_weather = df_weather.fillna(value=fill_values)

Wyświetlenie liczby brakujących danych w poszczególnych kolumnach

In [None]:
# Number of missing values in each column after imputation.
df_weather.isna().sum()

Wyświetlenie wszystkich wierszy w których występują braki

In [None]:
# Lift the row limit so that the table below is printed in full.
# Note: set_option changes the pandas display setting for the whole session.
pd.set_option('display.max_rows', None)

# Re-select the rows that still have a gap in at least one column.
rows_with_gaps = df_weather.isna().any(axis=1)
nan_values = df_weather.loc[rows_with_gaps]

print (nan_values)

Dodanie kolumn: **rain_exists** oraz **cloud_exists**

Sprawdzenie wartości przed zmianą

In [None]:
# Preview the table before the boolean weather flags are added.
df_weather.head()

Tworzenie kolumny **rain_exists**

In [None]:
# Weather descriptions that are treated as precipitation. Keeping them in one
# list replaces two hand-duplicated chains of comparisons (one with ==, one
# with !=) that had to be kept in sync manually.
rain_descriptions = [
    'light rain',
    'moderate rain',
    'light intensity drizzle',
    'drizzle',
    'squalls',
    'heavy intensity rain',
    'thunderstorm',
    'shower rain',
    'very heavy rain',
    'thunderstorm with light rain',
    'thunderstorm with rain',
    'light intensity shower rain',
]

# isin() yields a boolean column directly: True when the description is one of
# the listed values, False otherwise.
df_weather['rain_exists'] = df_weather['description'].isin(rain_descriptions)

Tworzenie kolumny **cloud_exists**

In [None]:
# Weather descriptions that are treated as "clouds present": every
# precipitation description plus the explicit cloud descriptions. Keeping them
# in one list replaces two hand-duplicated chains of comparisons (one with ==,
# one with !=) that had to be kept in sync manually.
cloud_descriptions = [
    'light rain',
    'moderate rain',
    'light intensity drizzle',
    'drizzle',
    'squalls',
    'heavy intensity rain',
    'thunderstorm',
    'shower rain',
    'very heavy rain',
    'thunderstorm with light rain',
    'thunderstorm with rain',
    'light intensity shower rain',
    'few clouds',
    'scattered clouds',
    'broken clouds',
    'overcast clouds',
    'proximity thunderstorm',
]

# isin() yields a boolean column directly: True when the description is one of
# the listed values, False otherwise.
df_weather['cloud_exists'] = df_weather['description'].isin(cloud_descriptions)

Usuwanie kolumny **description**

In [None]:
# The text description has been encoded into the two boolean flags; remove it.
df_weather.drop(columns=['description'], inplace=True)

Sprawdzenie wartości po zmianie

In [None]:
# Preview the table after the flags were added and `description` was removed.
df_weather.head()

Sprawdzenie typów danych w kolumnach **rain_exists** i **cloud_exists**

In [None]:
# Column dtypes and non-null counts of the whole table.
df_weather.info(verbose=True)

In [None]:
plot_cols = ['temperature', 'humidity', 'pressure']

# Full history, one subplot per variable, indexed by timestamp.
plot_features = df_weather.set_index('datetime')[plot_cols]
_ = plot_features.plot(subplots=True)

# Same plot restricted to the first 2400 rows.
plot_features = df_weather.iloc[:2400].set_index('datetime')[plot_cols]
_ = plot_features.plot(subplots=True)

Przeliczenie prędkości i kierunku wiatru (w stopniach) na składowe **x** oraz **y**

In [None]:
# Frequency of each distinct wind-speed value (the name weather_count is
# reused from the earlier description count).
weather_count = df_weather[['wind_speed']].value_counts()
print(weather_count)

In [None]:
# 2-D histogram of wind direction against wind speed; vmax=100 caps the
# colour scale.
plt.hist2d(df_weather['wind_dir'], df_weather['wind_speed'], bins=(50, 15), vmax=100)
plt.colorbar()
plt.xlabel('Wind Direction [deg]')
plt.ylabel('Wind Velocity [m/s]')

In [None]:
# pop() removes the column from df_weather and returns it, so this cell can
# only be run once per freshly built table.
wv = df_weather.pop('wind_speed')

# Convert the direction from degrees to radians
wd_rad = df_weather.pop('wind_dir')*np.pi / 180

# Compute the x and y components of the wind vector
df_weather['Wx'] = wv*np.cos(wd_rad)
df_weather['Wy'] = wv*np.sin(wd_rad)

In [None]:
# 2-D histogram of the wind vector components, limited to [-30, 30] on both
# axes; vmax=30 caps the colour scale.
plt.hist2d(df_weather['Wx'], df_weather['Wy'], bins=(30, 30), vmax=30, range=[[-30, 30], [-30, 30]])
plt.colorbar()
plt.xlabel('Wind X [m/s]')
plt.ylabel('Wind Y [m/s]')
ax = plt.gca()
ax.axis('tight')

In [None]:
# Preview the table after the wind columns were replaced by Wx / Wy.
df_weather.head()

Konwersja czasu na sygnały okresowe **sin** i **cos** (doba i rok)



In [None]:
# Work on an independent copy so that popping the timestamp column leaves
# df_weather (which still needs its 'datetime' column later) untouched.
df = df_weather.copy()

# The 'datetime' column was already parsed when df_weather was assembled, so no
# format string is passed here. The previous '%d.%m.%Y %H:%M:%S' did not match
# the '%Y-%m-%d %H:%M:%S' layout of the source files and would have failed on
# unparsed strings.
date_time = pd.to_datetime(df.pop('datetime'))

In [None]:
# Convert every timestamp to a number via pd.Timestamp.timestamp (POSIX
# seconds), the input of the periodic day/year signals.
timestamp_s = date_time.map(pd.Timestamp.timestamp)

In [None]:
# Period lengths in seconds.
day = 24*60*60
year = (365.2425)*day

# Encode the position within each period as a sine/cosine pair so that the
# end of a period is numerically close to its start.
for label, period in (('day', day), ('year', year)):
    angle = timestamp_s * (2 * np.pi / period)
    df_weather[f'{label} sin'] = np.sin(angle)
    df_weather[f'{label} cos'] = np.cos(angle)

In [None]:
# First 24 samples of the daily sine and cosine signals.
plt.plot(np.array(df_weather['day sin'])[:24])
plt.plot(np.array(df_weather['day cos'])[:24])
plt.xlabel('Time [h]')
plt.title('Time of day signal')

In [None]:
# Frequency spectrum of the temperature series.
fft = tf.signal.rfft(df_weather['temperature'])
f_per_dataset = np.arange(0, len(fft))

n_samples_h = len(df_weather['temperature'])
# Mean length of a year in days. This is the same 365.2425 that defines the
# `year` constant of the time-of-year signal; the value used here before
# (365.2524) was a digit transposition.
days_per_year = 365.2425
hours_per_year = 24*days_per_year
years_per_dataset = n_samples_h/(hours_per_year)

# Rescale the bin index from "cycles per dataset" to "cycles per year".
f_per_year = f_per_dataset/years_per_dataset
plt.step(f_per_year, np.abs(fft))
plt.xscale('log')
plt.ylim(0, 200000)
plt.xlim([0.01, max(plt.xlim())])
# Mark the yearly (1 cycle/year) and daily (days_per_year cycles/year) peaks.
plt.xticks([1, days_per_year], labels=['1/Year', '1/day'])
_ = plt.xlabel('Frequency (log scale)')

Sprawdzenie wartości po zmianie

In [None]:
# Preview the table with the four periodic time columns added.
df_weather.head()

Konwersja temperatury na stopnie Celsjusza

In [None]:
# Shift the temperature by 273.15 (Kelvin -> degrees Celsius) for the whole
# column at once. Running this cell twice applies the shift twice.
df_weather['temperature'] = df_weather['temperature'] - 273.15

In [None]:
# Move the timestamp column out of the feature table into its own one-column
# frame: pop() removes it from df_weather and to_frame() keeps the name
# 'datetime'.
df_timestamp = pd.to_datetime(df_weather.pop('datetime')).to_frame()

Sprawdzenie wartości po zmianie

In [None]:
# Preview the feature table, now without the timestamp column.
df_weather.head()

In [None]:
# Preview the separated timestamp column.
df_timestamp.head()

Podział na zestawy danych

In [None]:
# Column name -> position in the feature table.
column_indices = dict(zip(df_weather.columns, range(len(df_weather.columns))))

# Chronological 70% / 20% / 10% split (no shuffling).
n = len(df_weather)
train_end = int(n*0.7)
val_end = int(n*0.9)

df_train = df_weather[:train_end]
df_val = df_weather[train_end:val_end]
df_test = df_weather[val_end:]

num_features = df_weather.shape[1]

In [None]:
# Preview the first eight rows of the training split.
df_train.head(8)

Normalizacja

In [None]:
# Statistics come from the training split only, so that nothing about the
# validation/test data leaks into the scaling.
train_mean = df_train.mean()
train_std = df_train.std()

columns_to_normalize = ['temperature', 'humidity', 'pressure', 'Wx', 'Wy']
columns_to_leave_unchanged = ['rain_exists', 'cloud_exists', 'day sin', 'day cos', 'year sin', 'year cos']

# Note the naming: the raw splits are df_train / df_val / df_test, the
# processed ones are train_df / val_df / test_df.
train_df = pd.DataFrame()
val_df = pd.DataFrame()
test_df = pd.DataFrame()

# Columns are written in the order "normalised first, untouched second", so
# the column order of train_df differs from the index order of
# train_mean / train_std; look statistics up by name, not by position.
for source, target in ((df_train, train_df), (df_val, val_df), (df_test, test_df)):
    for col in columns_to_normalize:
        target[col] = (source[col] - train_mean[col]) / train_std[col]
    for col in columns_to_leave_unchanged:
        target[col] = source[col]

In [None]:
# Preview the first ten rows of the processed training split.
train_df.head(10)

In [None]:
# Column dtypes and non-null counts of the processed training split.
train_df.info(verbose=True)

In [None]:
from matplotlib.ticker import FixedLocator, FixedFormatter

# Normalise the current feature table with the training statistics.
# `df_weather` is used instead of the earlier `df` snapshot: `df` was taken
# before the temperature was shifted to degrees Celsius and before the
# day/year columns were added, so it does not match train_mean / train_std.
# The cast makes the boolean flag columns numeric before the arithmetic.
feature_names = list(df_weather.columns)
df_std = (df_weather.astype('float64') - train_mean) / train_std
df_std = df_std.melt(var_name='Column', value_name='Normalized')

plt.figure(figsize=(12, 6))
ax = sns.violinplot(x='Column', y='Normalized', data=df_std)

# One tick per feature, labelled with the column name and rotated for
# readability.
tick_positions = list(range(len(feature_names)))
ax.set_xticks(tick_positions)
ax.xaxis.set_major_locator(FixedLocator(tick_positions))
ax.xaxis.set_major_formatter(FixedFormatter(feature_names))
ax.tick_params(axis='x', rotation=90)

Tworzenie okien czasowych

In [None]:
class WindowGenerator():
  """Index bookkeeping for cutting a time series into (inputs, labels) windows.

  A window spans `input_width + shift` consecutive rows. The first
  `input_width` rows are the inputs; the last `label_width` rows are the
  labels.
  """

  def __init__(self, input_width, label_width, shift,
               train_df=train_df, val_df=val_df, test_df=test_df,
               label_columns=None):
    """Store the data splits and derive the window index ranges.

    Args:
      input_width: number of rows at the start of the window used as inputs.
      label_width: number of rows at the end of the window used as labels.
      shift: number of rows between the end of the inputs and the end of the
        window.
      train_df, val_df, test_df: the data splits. The defaults are the
        module-level frames as they exist when this class statement runs.
      label_columns: names of the columns used as labels, or None.
    """
    # Data splits, stored exactly as passed in.
    self.train_df, self.val_df, self.test_df = train_df, val_df, test_df

    # Window geometry.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift
    self.total_window_size = input_width + shift

    # Name -> position lookups. `label_columns_indices` only exists when
    # label columns were given.
    self.label_columns = label_columns
    if label_columns is not None:
      self.label_columns_indices = dict(
          (name, position) for position, name in enumerate(label_columns))
    self.column_indices = dict(
        (name, position) for position, name in enumerate(train_df.columns))

    # Row positions inside one window that belong to the inputs.
    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    # Row positions inside one window that belong to the labels.
    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

  def __repr__(self):
    """Return a four-line summary of the window layout."""
    summary = (
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',
        f'Label indices: {self.label_indices}',
        f'Label column name(s): {self.label_columns}',
    )
    return '\n'.join(summary)

In [None]:
# Example window: 24 input rows and a single label row 24 rows after the last
# input (total window of 48 rows); only 'temperature' is used as the label.
w1 = WindowGenerator(input_width=24, label_width=1, shift=24,
                     label_columns=['temperature'])
w1

In [None]:
# Example window: 6 input rows and a single label row directly after them
# (total window of 7 rows); only 'temperature' is used as the label.
w2 = WindowGenerator(input_width=6, label_width=1, shift=1,
                     label_columns=['temperature'])
w2

Funkcja dzieląca okna

In [None]:
def split_window(self, features):
  """Split a batch of complete windows into inputs and labels.

  Args:
    features: batch indexed as [batch, time, feature], where the time axis
      covers the whole window.

  Returns:
    (inputs, labels): the rows selected by `input_slice` with all features,
    and the rows selected by `labels_slice` restricted to `label_columns`
    when those are set.
  """
  inputs = features[:, self.input_slice, :]
  labels = features[:, self.labels_slice, :]

  if self.label_columns is not None:
    # Pick the label columns one by one and stack them back as the last axis.
    selected = []
    for name in self.label_columns:
      selected.append(labels[:, :, self.column_indices[name]])
    labels = tf.stack(selected, axis=-1)

  # Slicing does not keep static shape information, so the known time
  # dimensions are set by hand; this makes the `tf.data.Dataset` objects
  # easier to inspect.
  inputs.set_shape([None, self.input_width, None])
  labels.set_shape([None, self.label_width, None])

  return inputs, labels

# Attach the function to the class as a method.
WindowGenerator.split_window = split_window

In [None]:
# Stack three slices, the length of the total window.
example_window = tf.stack([np.array(train_df[:w2.total_window_size]),
                           np.array(train_df[100:100+w2.total_window_size]),
                           np.array(train_df[200:200+w2.total_window_size])])

example_inputs, example_labels = w2.split_window(example_window)

print('All shapes are: (batch, time, features)')
print(f'Window shape: {example_window.shape}')
print(f'Inputs shape: {example_inputs.shape}')
print(f'Labels shape: {example_labels.shape}')

Plot

In [None]:
# Store the hand-built example batch on w2 so that w2.plot() has data to draw.
# NOTE(review): this plain attribute assignment works only while
# WindowGenerator has no `example` property. A later cell attaches a read-only
# `example` property to the class; after that, re-running this line raises
# AttributeError and the stored value is no longer the one that is read -
# confirm the intended execution order.
w2.example = example_inputs, example_labels

In [None]:
def plot(self, model=None, plot_col='temperature', max_subplots=3):
  """Plot inputs, labels and optional predictions for the example batch.

  Args:
    model: optional callable applied to the example inputs; its output is
      drawn as "Predictions". Nothing is predicted when None.
    plot_col: name of the column to draw.
    max_subplots: maximum number of batch entries to draw, one subplot each.
  """
  inputs, labels = self.example
  plt.figure(figsize=(12, 8))
  plot_col_index = self.column_indices[plot_col]
  max_n = min(max_subplots, len(inputs))

  # Position of `plot_col` among the labels. It does not depend on the
  # subplot, so it is resolved once. None means the column is not a label.
  if self.label_columns:
    label_col_index = self.label_columns_indices.get(plot_col, None)
  else:
    label_col_index = plot_col_index

  # Run the model once for the whole batch instead of once per subplot, and
  # only when its output is actually going to be drawn.
  predictions = None
  if model is not None and label_col_index is not None and max_n > 0:
    predictions = model(inputs)

  for n in range(max_n):
    plt.subplot(max_n, 1, n+1)
    plt.ylabel(f'{plot_col} [normed]')
    plt.plot(self.input_indices, inputs[n, :, plot_col_index],
             label='Inputs', marker='.', zorder=-10)

    if label_col_index is None:
      continue

    plt.scatter(self.label_indices, labels[n, :, label_col_index],
                edgecolors='k', label='Labels', c='#2ca02c', s=64)
    if predictions is not None:
      plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                  marker='X', edgecolors='k', label='Predictions',
                  c='#ff7f0e', s=64)

    if n == 0:
      plt.legend()

  plt.xlabel('Time [h]')

# Attach the function to the class as a method.
WindowGenerator.plot = plot

In [None]:
# Draw the example batch of w2 for the default column ('temperature').
w2.plot()

In [None]:
# Draw the same batch for 'pressure'. It is not one of w2's label columns, so
# plot() draws the inputs only.
w2.plot(plot_col='pressure')

Funkcja tworząca datasety

In [None]:
def make_dataset(self, data, shuffle=True, batch_size=32):
  """Turn a table into a batched dataset of (inputs, labels) windows.

  Args:
    data: table of features (rows are time steps); converted to float32.
    shuffle: whether the windows are shuffled. Defaults to True, the value
      that used to be hard-coded; pass False for a reproducible window order.
    batch_size: number of windows per batch. Defaults to 32, the value that
      used to be hard-coded.

  Returns:
    A dataset whose elements are the (inputs, labels) pairs produced by
    `split_window`.
  """
  data = np.array(data, dtype=np.float32)
  ds = tf.keras.utils.timeseries_dataset_from_array(
      data=data,
      targets=None,
      sequence_length=self.total_window_size,
      sequence_stride=1,
      shuffle=shuffle,
      batch_size=batch_size,)

  # Each batch of complete windows is split into inputs and labels.
  ds = ds.map(self.split_window)

  return ds

# Attach the function to the class as a method.
WindowGenerator.make_dataset = make_dataset

In [None]:
@property
def train(self):
  """Windowed dataset built from the training split."""
  return self.make_dataset(self.train_df)

@property
def val(self):
  """Windowed dataset built from the validation split."""
  return self.make_dataset(self.val_df)

@property
def test(self):
  """Windowed dataset built from the test split."""
  return self.make_dataset(self.test_df)

@property
def example(self):
  """Return a cached `inputs, labels` batch used for plotting.

  The first access takes one batch from the `.train` dataset and stores it in
  `_example`; later accesses return the stored batch.
  """
  cached = getattr(self, '_example', None)
  if cached is not None:
    return cached
  # Nothing cached yet: take one batch from the training dataset and keep it.
  cached = next(iter(self.train))
  self._example = cached
  return cached

# Attach the properties to the class.
WindowGenerator.train = train
WindowGenerator.val = val
WindowGenerator.test = test
WindowGenerator.example = example

In [None]:
# Structure of the elements yielded by w2's training dataset.
w2.train.element_spec

In [None]:
# Take a single batch from w2's training dataset and print its shapes.
for example_inputs, example_labels in w2.train.take(1):
  print(f'Inputs shape (batch, time, features): {example_inputs.shape}')
  print(f'Labels shape (batch, time, features): {example_labels.shape}')

Recurrent neural network

Multi-output model

Linear - regresja liniowa

In [None]:
# Linear baseline: a single Dense layer without activation, applied to each
# time step independently.
# The windows this model is trained and evaluated on are created without
# `label_columns`, so their labels contain every feature. The layer therefore
# needs one output unit per feature (`num_features`); with a single unit the
# loss would silently broadcast one prediction against all label columns.
linear = tf.keras.Sequential([
    tf.keras.layers.Dense(units=num_features)
])

In [None]:
# Upper bound on the number of training epochs.
MAX_EPOCHS = 20

def compile_and_fit(model, window, patience=2):
  """Compile `model` and train it on the datasets of `window`.

  The model is compiled with mean squared error as the loss, the Adam
  optimizer and mean absolute error as the metric. Training is stopped by
  EarlyStopping once `val_loss` has not improved for `patience` epochs, and
  runs for at most MAX_EPOCHS epochs.

  Args:
    model: the Keras model to train.
    window: object providing the `train` and `val` datasets.
    patience: number of epochs without improvement tolerated by EarlyStopping.

  Returns:
    The history object returned by `model.fit`.
  """
  model.compile(loss=tf.losses.MeanSquaredError(),
                optimizer=tf.optimizers.Adam(),
                metrics=[tf.metrics.MeanAbsoluteError()])

  stop_when_stalled = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       patience=patience,
                                                       mode='min')

  return model.fit(window.train, epochs=MAX_EPOCHS,
                   validation_data=window.val,
                   callbacks=[stop_when_stalled])

In [None]:
# Window with one input row and one label row, one step ahead.
single_step_window = WindowGenerator(
    # `WindowGenerator` returns all features as labels if you
    # don't set the `label_columns` argument.
    input_width=1, label_width=1, shift=1)

# Window with 24 input rows and 24 label rows, the labels being the inputs
# shifted by one step.
wide_window = WindowGenerator(
    input_width=24, label_width=24, shift=1)

# Print the shapes of one training batch of the wide window.
for example_inputs, example_labels in wide_window.train.take(1):
  print(f'Inputs shape (batch, time, features): {example_inputs.shape}')
  print(f'Labels shape (batch, time, features): {example_labels.shape}')

In [None]:
# Train the linear model on single-step windows.
history = compile_and_fit(linear, single_step_window)

# Result tables keyed by model name. They are created (and therefore reset)
# here, so this cell has to run before any other model adds its results.
val_performance = {}
performance = {}
val_performance['Linear'] = linear.evaluate(single_step_window.val)
performance['Linear'] = linear.evaluate(single_step_window.test, verbose=0)

In [None]:
# Draw the linear model's predictions on an example batch of the wide window.
wide_window.plot(linear)

In [None]:
# Bar chart of the Dense layer weights that feed output unit 0, one bar per
# input feature (labelled with the train_df column names).
plt.bar(x = range(len(train_df.columns)),
        height=linear.layers[0].kernel[:,0].numpy())
axis = plt.gca()
axis.set_xticks(range(len(train_df.columns)))
_ = axis.set_xticklabels(train_df.columns, rotation=90)

In [None]:
# Print the second value returned by evaluate() on the test split for every
# model (presumably the mean absolute error, the only metric registered in
# compile_and_fit - confirm against model.metrics_names).
for name, value in performance.items():
  print(f'{name:12s}: {value[1]:0.4f}')

Rekurencyjna sieć neuronowa (LSTM)

In [None]:
%%time
# (%%time has to stay on the first line of the cell.)
# Window with 24 input rows and 24 label rows shifted by one step; all
# features are labels because `label_columns` is not set.
wide_window = WindowGenerator(
    input_width=24, label_width=24, shift=1)

# Recurrent model: an LSTM layer that returns its output at every time step,
# followed by a Dense layer with one unit per feature.
lstm_model = tf.keras.models.Sequential([
    # Shape [batch, time, features] => [batch, time, lstm_units]
    tf.keras.layers.LSTM(32, return_sequences=True),
    # Shape => [batch, time, features]
    tf.keras.layers.Dense(units=num_features)
])

history = compile_and_fit(lstm_model, wide_window)



In [None]:
# Clear the output shown so far in this cell, then record the LSTM results on
# the validation and test datasets of the wide window.
IPython.display.clear_output()
val_performance['LSTM'] = lstm_model.evaluate( wide_window.val)
performance['LSTM'] = lstm_model.evaluate( wide_window.test, verbose=0)

In [None]:
# Draw the LSTM predictions on an example batch of the wide window.
wide_window.plot(lstm_model)

Wykres porównania średniego błędu bezwzględnego dla modelu regresji liniowej z modelem sieci rekurencyjnej

In [None]:
# Grouped bar chart: validation vs. test mean absolute error per model.
x = np.arange(len(performance))
width = 0.3
metric_name = 'mean_absolute_error'

# Position of the metric in the list returned by evaluate().
# NOTE(review): some Keras versions do not list the metric under this name in
# `metrics_names`; in that case fall back to position 1, which follows the loss
# because compile_and_fit registers exactly one metric - confirm for the
# installed Keras version.
try:
  metric_index = lstm_model.metrics_names.index(metric_name)
except ValueError:
  metric_index = 1

val_mae = [v[metric_index] for v in val_performance.values()]
test_mae = [v[metric_index] for v in performance.values()]

# The windows used for evaluation have no `label_columns`, so the error is
# averaged over every feature, not just the temperature.
plt.ylabel('mean_absolute_error [all features]')
plt.bar(x - 0.17, val_mae, width, label='Validation')
plt.bar(x + 0.17, test_mae, width, label='Test')
plt.xticks(ticks=x, labels=performance.keys(),
           rotation=45)
_ = plt.legend()

Wywołanie funkcji forecast_weather z wyuczonym modelem LSTM i oknem

In [None]:
def forecast_weather(model, window, day, hour):
    """Print the model's forecast for one row of the test split.

    The forecast target is row `(day - 1) * 24 + hour` of `window.test_df`.
    The model is fed the `window.input_width` consecutive rows that end
    `window.shift` rows before the target, and the prediction of the last time
    step is reported.

    Previously the day/hour offsets were applied to the time axis of a single
    shuffled batch (which is only `input_width` rows long), the first time
    step of the output was read, and values were de-normalised by position
    although `train_std` and `train_df` order their columns differently.

    Args:
        model: trained model with a `predict` method; its output is expected
            to be indexed as [batch, time, feature].
        window: WindowGenerator providing `test_df`, `input_width`, `shift`
            and `label_columns`.
        day: 1-based day offset, counted in blocks of 24 rows from the first
            row of the test split.
        hour: row offset within that block.
            NOTE(review): the test split does not necessarily start at
            midnight, so `hour` is an offset rather than a clock hour -
            confirm the intended meaning.

    Returns:
        None. The forecast is printed.
    """
    # Locate the input rows that precede the requested target row.
    target_row = (day - 1) * 24 + hour
    last_input_row = target_row - window.shift
    first_input_row = last_input_row - window.input_width + 1
    if first_input_row < 0 or last_input_row >= len(window.test_df):
        print("Brak dostępnych prognoz dla określonego dnia i godziny.")
        return

    # Build a batch with a single window: [1, input_width, features].
    history = window.test_df.iloc[first_input_row:last_input_row + 1]
    inputs_for_day_hour = np.asarray(history, dtype=np.float32)[np.newaxis, ...]
    print("inputs_for_day_hour:", inputs_for_day_hour.shape)

    predictions = np.asarray(model.predict(inputs_for_day_hour))
    print("Predictions Dimensions:", predictions.shape)
    if predictions.size == 0:
        print("Brak dostępnych prognoz dla określonego dnia i godziny.")
        return

    # The model outputs follow the label columns of the window, or every
    # column of the processed frame when no label columns were set.
    output_names = list(window.label_columns or window.test_df.columns)

    # Units of the physical quantities (temperature was shifted from Kelvin to
    # degrees Celsius earlier; pressure values of this dataset are in hPa).
    units = {'temperature': '°C', 'humidity': '%', 'pressure': 'hPa',
             'Wx': 'm/s', 'Wy': 'm/s'}
    reported = ['temperature', 'humidity', 'pressure',
                'rain_exists', 'cloud_exists', 'Wx', 'Wy']

    # The last time step is the forecast for the target row.
    latest = predictions[0, -1]
    for name in reported:
        if name not in output_names:
            continue
        value = float(latest[output_names.index(name)])
        # Undo the scaling only for columns that were normalised, looking the
        # statistics up by name. The rain/cloud flags were never normalised,
        # so their raw model output is printed as is.
        if name in columns_to_normalize:
            value = value * train_std[name] + train_mean[name]
        unit = units.get(name)
        if unit:
            print(f"Predicted {name}: {value:.2f} {unit}")
        else:
            print(f"Predicted {name}: {value:.2f}")

# Call forecast_weather with the trained LSTM model and the wide window.
forecast_weather(lstm_model, wide_window, day=2, hour=8)