<H2>Environment Setup</H2>

In [1]:
%pip install pandas
%pip install numpy
%pip install tensorflow
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/e2/7a/c7762c698fb1ac41a7e3afee51dc72aa3ec74ae8d2f57ce19a9cded3a4af/tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-py3-none-any.whl.metadata
  Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Obtaining depende

In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

2023-10-10 21:11:07.674225: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-10 21:11:07.704499: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-10 21:11:07.704551: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-10 21:11:07.704571: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-10 21:11:07.710082: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-10 21:11:07.710965: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [3]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [4]:
path_zip = tf.keras.utils.get_file(
    origin='https://zenodo.org/record/4964287/files/ECAD_DATA.zip',
    fname='DATA.zip',
    cache_subdir='ECAD',
    extract=True)

Downloading data from https://zenodo.org/record/4964287/files/ECAD_DATA.zip


In [5]:
path_ecad = os.path.dirname(path_zip)

path_oslo = os.path.join(path_ecad, 'OSLO')
path_oslo

'/home/codespace/.keras/ECAD/OSLO'

In [6]:
list_files_oslo = os.listdir(path_oslo)
list_files_oslo

['RR_STAID000193.txt',
 'stations.txt',
 'FX_STAID000193.txt',
 'TX_STAID000193.txt',
 'elements.txt',
 'TG_STAID000193.txt',
 'PP_STAID000193.txt',
 'SD_STAID000193.txt',
 'QQ_STAID000193.txt',
 'sources.txt',
 'HU_STAID000193.txt',
 'metadata.txt',
 'DD_STAID000193.txt',
 'SS_STAID000193.txt',
 'CC_STAID000193.txt',
 'TN_STAID000193.txt',
 'FG_STAID000193.txt']

<H2>Extract Transform Load</H2>

In [7]:
def read_data(file_name):
    """
    """
    header_line_number = None
    # open file
    with open(file_name, 'r') as file:
        # find header
        for line_number, line_text in enumerate(file):
            if 'SOUID' in line_text:
                header_line_number = line_number
    # read csv
    df = pd.DataFrame()
    if header_line_number:
        df = pd.read_csv(file_name, header=header_line_number, skipinitialspace=True, index_col='DATE', na_values='-9999', skip_blank_lines=False)
    return df

In [8]:
variables = {'CC' : 'CLOUD COVER',
             'DD' : 'WIND DIRECTION',
             'FG' : 'WIND SPEED',
             'FX' : 'WIND GUST',
             'HU' : 'HUMIDITY',
             'PP' : 'SEA LEVEL PRESSURE',
             'QQ' : 'GLOBAL RADIATION',
             'RR' : 'PRECIPITATION AMOUNT',
             'SS' : 'SUNSHINE',
             'TG' : 'MEAN TEMPERATURE',
             'TN' : 'MINIMUM TEMPERATURE',
             'TX' : 'MAXIMUM TEMPERATURE'}

In [9]:
data = None

for file_name in list_files_oslo:
    if file_name[0:2] in variables:
        df = read_data(os.path.join(path_oslo, file_name))
        df.drop('SOUID', inplace=True, axis=1)
        if data is None:
            data = df
        else:
            data = pd.merge(left=data, right=df, how='outer', on='DATE')

In [10]:
data.head().T

DATE,18991201,18991202,18991203,18991204,18991205
RR,,,,,
Q_RR,9.0,9.0,9.0,9.0,9.0
FX,,,,,
Q_FX,,,,,
TX,,,,,
Q_TX,,,,,
TG,,,,,
Q_TG,,,,,
PP,,,,,
Q_PP,,,,,


In [11]:
data.shape

(44591, 24)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44591 entries, 18991201 to 20211231
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RR      44285 non-null  float64
 1   Q_RR    44316 non-null  float64
 2   FX      14334 non-null  float64
 3   Q_FX    32963 non-null  float64
 4   TX      30712 non-null  float64
 5   Q_TX    30712 non-null  float64
 6   TG      30740 non-null  float64
 7   Q_TG    30740 non-null  float64
 8   PP      23466 non-null  float64
 9   Q_PP    30712 non-null  float64
 10  QQ      24172 non-null  float64
 11  Q_QQ    25568 non-null  float64
 12  HU      25658 non-null  float64
 13  Q_HU    30712 non-null  float64
 14  DD      30462 non-null  float64
 15  Q_DD    32963 non-null  float64
 16  SS      24062 non-null  float64
 17  Q_SS    30712 non-null  float64
 18  CC      25658 non-null  float64
 19  Q_CC    30712 non-null  float64
 20  TN      30771 non-null  float64
 21  Q_TN    30771 non-null  float6

In [13]:
data.index = pd.to_datetime(data.index.astype(str))

In [14]:
qi = [column for column in data.columns if column[0:2] == 'Q_']
qi

['Q_RR',
 'Q_FX',
 'Q_TX',
 'Q_TG',
 'Q_PP',
 'Q_QQ',
 'Q_HU',
 'Q_DD',
 'Q_SS',
 'Q_CC',
 'Q_TN',
 'Q_FG']

In [15]:
data[qi] =data[qi].astype('Int64')

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 44591 entries, 1899-12-01 to 2021-12-31
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RR      44285 non-null  float64
 1   Q_RR    44316 non-null  Int64  
 2   FX      14334 non-null  float64
 3   Q_FX    32963 non-null  Int64  
 4   TX      30712 non-null  float64
 5   Q_TX    30712 non-null  Int64  
 6   TG      30740 non-null  float64
 7   Q_TG    30740 non-null  Int64  
 8   PP      23466 non-null  float64
 9   Q_PP    30712 non-null  Int64  
 10  QQ      24172 non-null  float64
 11  Q_QQ    25568 non-null  Int64  
 12  HU      25658 non-null  float64
 13  Q_HU    30712 non-null  Int64  
 14  DD      30462 non-null  float64
 15  Q_DD    32963 non-null  Int64  
 16  SS      24062 non-null  float64
 17  Q_SS    30712 non-null  Int64  
 18  CC      25658 non-null  float64
 19  Q_CC    30712 non-null  Int64  
 20  TN      30771 non-null  float64
 21  Q_TN    30771 non-

In [17]:
data['FG'] = data['FG'] / 10
data['FX'] = data['FX'] / 10
data['PP'] = data['PP'] / 10
data['RR'] = data['RR'] / 10
data['SS'] = data['SS'] / 10 
data['TG'] = data['TG'] / 10
data['TN'] = data['TN'] / 10
data['TX'] = data['TX'] / 10

In [18]:
data.rename(columns={'CC' : 'CLOUD COVER (okta)',
                     'DD' : 'WIND DIRECTION (degrees)',
                     'FG' : 'WIND SPEED (m/s)',
                     'FX' : 'WIND GUST (m/s)',
                     'HU' : 'RELATIVE HUMIDITY',
                     'PP' : 'SEA LEVEL PRESSURE (hPa)',
                     'QQ' : 'GLOBAL RADIATION (W/m2)',
                     'RR' : 'PRECIPITATION AMOUNT (mm)',
                     'SS' : 'SUNSHINE (hours)',
                     'TG' : 'MEAN TEMPERATURE (C)',
                     'TN' : 'MINIMUM TEMPERATURE (C)',
                     'TX' : 'MAXIMUM TEMPERATURE (C)'},
            inplace=True)

In [19]:
data.sample(10).T

DATE,1916-06-03,1906-04-05,2014-10-21,2005-07-25,1944-10-19,1906-11-02,1934-02-11,1925-07-31,1992-10-14,1923-02-25
PRECIPITATION AMOUNT (mm),1.7,0.0,5.0,1.0,0.0,10.3,0.0,2.0,0.0,0.0
Q_RR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WIND GUST (m/s),,,9.3,8.6,,,,,10.8,
Q_FX,,,0.0,0.0,9.0,,9.0,,0.0,
MAXIMUM TEMPERATURE (C),,,11.0,17.0,11.6,,,,6.2,
Q_TX,,,0.0,0.0,0.0,,,,0.0,
MEAN TEMPERATURE (C),,,9.0,14.8,8.4,,,,3.8,
Q_TG,,,0.0,0.0,0.0,,,,0.0,
SEA LEVEL PRESSURE (hPa),,,999.6,1000.1,,,,,1000.1,
Q_PP,,,0.0,0.0,9.0,,,,0.0,


In [27]:
data.to_parquet('../data/owf.Extract.01.parquet', index=True)