In [2]:
import zipfile
import os
from pathlib import Path
import pprint
import pandas as pd
import glob

# Root folder of the project data. The cells in this notebook use its sub-folders
# `zipped_data`, `unzipped_data_folders` and `station_folders`.
BASE_DATA_PATH = r'C:\D\Whitireia\ARP\data'

In [65]:
def unzip_given_file_to_folder(file_path, folder):
    """Extract every member of a ZIP archive into a folder.

    Args:
        file_path: Path of the ZIP archive to open for reading.
        folder: Destination directory handed to ``ZipFile.extractall``.
    """
    with zipfile.ZipFile(file_path, mode='r') as archive:
        archive.extractall(path=folder)

In [66]:
# Unzip every archive found in <BASE_DATA_PATH>\zipped_data into its own folder under
# <BASE_DATA_PATH>\unzipped_data_folders, named after the archive without its extension.
# The directory is listed by explicit path: os.chdir() returns None (so it cannot be used
# as the directory name), and changing the kernel's working directory would be hidden
# state for every later cell.
zipped_data = os.path.join(BASE_DATA_PATH, r'zipped_data')
for file in os.listdir(zipped_data):
    if file.endswith('.zip'):
        file_path = os.path.join(zipped_data, file)
        # splitext removes only the final extension, so dots inside the name are kept.
        folder_to_unzip = os.path.join(BASE_DATA_PATH, r'unzipped_data_folders', os.path.splitext(file)[0])
        unzip_given_file_to_folder(file_path=file_path, folder=folder_to_unzip)
    else:
        print(f"{file} - Not a ZIP file!")

#### In the next three steps, every unzipped folder containing a weather parameter was checked. Where a folder held several data files, only the file with the largest number of values was kept. As a result, there is exactly one file per parameter per station. For example, station 41559 originally had 3 data files for the parameter Wind; after this selection there is a single Wind file for station 41559.
### The selection has been made for each of the 16 stations.

In [None]:
# Inspect one data file of one station/parameter: which data sources it contains,
# its most recent rows, and its size and dtypes.
# Alternative values for `parameter`:
#   Speed [m/s]
#   Mean Temperature [Deg C]
#   Rainfall [mm]
#   Relative humidity [%]
file = os.path.join(BASE_DATA_PATH, r'unzipped_data_folders\3145_Wind\3145__Wind__D.csv')
parameter = 'Speed [m/s]'
df = pd.read_csv(
    file,
    usecols=['Observation time UTC', parameter, 'Data Source'],
    parse_dates=['Observation time UTC'],
)
print(file)
print(df['Data Source'].value_counts())
display(df.sort_values(['Observation time UTC'], ascending=False))
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

C:\D\Whitireia\ARP\data\unzipped_data_folders\3145_Wind\3145__Wind__D.csv
Data Source
D    8571
Name: count, dtype: int64


Unnamed: 0,Observation time UTC,Data Source,Speed [m/s]
8462,2024-02-29 20:00:00+00:00,D,1.03
8570,2024-02-28 20:00:00+00:00,D,4.63
8569,2024-02-27 20:00:00+00:00,D,4.63
8568,2024-02-26 20:00:00+00:00,D,6.69
8567,2024-02-25 20:00:00+00:00,D,4.63
...,...,...,...
4,1995-06-14 21:00:00+00:00,D,2.57
3,1995-06-13 21:00:00+00:00,D,0.00
2,1995-06-02 21:00:00+00:00,D,6.69
1,1995-06-01 21:00:00+00:00,D,4.63


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8571 entries, 0 to 8570
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   Observation time UTC  8571 non-null   datetime64[ns, UTC]
 1   Data Source           8571 non-null   object             
 2   Speed [m/s]           8571 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1), object(1)
memory usage: 201.0+ KB
None


### Data were aggregated to daily mean values where the observations were not already daily

In [10]:
# Aggregate the selected parameter to one mean value per UTC calendar day, newest day first.
# - The value column is selected explicitly instead of passing its name to .mean(), whose
#   first positional argument is `numeric_only`, not a column name.
# - `df` itself is left untouched (no drop of 'Data Source'), so this cell can be re-run.
# - Days without any observation produce NaN and are removed by dropna().
daily_df = (
    df.groupby(pd.Grouper(key='Observation time UTC', freq='D'))[[parameter]]
    .mean()
    .dropna(axis=0)
    .reset_index()
    .sort_values('Observation time UTC', ascending=False)
)
display(daily_df)
# info() prints its own report and returns None; no print() wrapper needed.
daily_df.info()

Unnamed: 0,Observation time UTC,Speed [m/s]
8570,2024-02-29 00:00:00+00:00,1.03
8569,2024-02-28 00:00:00+00:00,4.63
8568,2024-02-27 00:00:00+00:00,4.63
8567,2024-02-26 00:00:00+00:00,6.69
8566,2024-02-25 00:00:00+00:00,4.63
...,...,...
4,1995-06-14 00:00:00+00:00,2.57
3,1995-06-13 00:00:00+00:00,0.00
2,1995-06-02 00:00:00+00:00,6.69
1,1995-06-01 00:00:00+00:00,4.63


<class 'pandas.core.frame.DataFrame'>
Index: 8571 entries, 8570 to 0
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   Observation time UTC  8571 non-null   datetime64[ns, UTC]
 1   Speed [m/s]           8571 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1)
memory usage: 200.9 KB
None


### Saving the file for each parameter of each station to a new folder

In [None]:
# Write the daily series to <BASE_DATA_PATH>\station_folders under the source file's name,
# then read it back to check what was stored.
file_name = os.path.basename(file)
file_path = os.path.join(BASE_DATA_PATH, r'station_folders', file_name)
daily_df.to_csv(file_path, index=False)
df1 = pd.read_csv(file_path)
print(file_path)
df1

C:\D\Whitireia\ARP\data\station_folders\3145__Wind__D.csv


Unnamed: 0,Observation time UTC,Speed [m/s]
0,2024-02-29 00:00:00+00:00,1.03
1,2024-02-28 00:00:00+00:00,4.63
2,2024-02-27 00:00:00+00:00,4.63
3,2024-02-26 00:00:00+00:00,6.69
4,2024-02-25 00:00:00+00:00,4.63
...,...,...
8566,1995-06-14 00:00:00+00:00,2.57
8567,1995-06-13 00:00:00+00:00,0.00
8568,1995-06-02 00:00:00+00:00,6.69
8569,1995-06-01 00:00:00+00:00,4.63


### In the next step, Wind data for station 12442 are calculated as the mean of its nearest neighbours, stations 8567 and 3145 (for the dates present in both files).

In [5]:
# Derive the wind series of station 12442 from the files of stations 8567 and 3145.
wind_8567_csv = os.path.join(BASE_DATA_PATH, r'station_folders\8567__Wind__D.csv')
wind_3145_csv = os.path.join(BASE_DATA_PATH, r'station_folders\3145__Wind__D.csv')
wind_12442_csv = os.path.join(BASE_DATA_PATH, r'station_folders\12442__Wind__D.csv')

df1 = pd.read_csv(wind_8567_csv)
df2 = pd.read_csv(wind_3145_csv)
display(df1, df2)

# Inner join: only timestamps present in both neighbour files are kept.
df3 = df1.merge(df2, how='inner', on='Observation time UTC', suffixes=('_df1', '_df2'))
neighbour_cols = ['Speed [m/s]_df1', 'Speed [m/s]_df2']
df3['Speed [m/s]'] = df3[neighbour_cols].mean(axis='columns')
display(df3)

# Keep only the timestamp and the averaged speed for the output file.
df3 = df3[['Observation time UTC', 'Speed [m/s]']]
display(df3)
df3.to_csv(wind_12442_csv, index=False)
df3.info()

Unnamed: 0,Observation time UTC,Speed [m/s]
0,2024-12-09 00:00:00+00:00,5.44
1,2024-12-08 00:00:00+00:00,7.72
2,2024-12-07 00:00:00+00:00,6.69
3,2024-12-06 00:00:00+00:00,3.15
4,2024-12-05 00:00:00+00:00,4.90
...,...,...
11507,1993-03-22 00:00:00+00:00,5.21
11508,1993-03-21 00:00:00+00:00,7.16
11509,1993-03-20 00:00:00+00:00,3.87
11510,1993-03-19 00:00:00+00:00,4.84


Unnamed: 0,Observation time UTC,Speed [m/s]
0,2024-02-29 00:00:00+00:00,1.03
1,2024-02-28 00:00:00+00:00,4.63
2,2024-02-27 00:00:00+00:00,4.63
3,2024-02-26 00:00:00+00:00,6.69
4,2024-02-25 00:00:00+00:00,4.63
...,...,...
8566,1995-06-14 00:00:00+00:00,2.57
8567,1995-06-13 00:00:00+00:00,0.00
8568,1995-06-02 00:00:00+00:00,6.69
8569,1995-06-01 00:00:00+00:00,4.63


Unnamed: 0,Observation time UTC,Speed [m/s]_df1,Speed [m/s]_df2,Speed [m/s]
0,2024-02-29 00:00:00+00:00,3.03,1.03,2.030
1,2024-02-28 00:00:00+00:00,4.74,4.63,4.685
2,2024-02-27 00:00:00+00:00,3.83,4.63,4.230
3,2024-02-26 00:00:00+00:00,4.32,6.69,5.505
4,2024-02-25 00:00:00+00:00,2.09,4.63,3.360
...,...,...,...,...
8520,1995-06-14 00:00:00+00:00,1.31,2.57,1.940
8521,1995-06-13 00:00:00+00:00,1.00,0.00,0.500
8522,1995-06-02 00:00:00+00:00,5.23,6.69,5.960
8523,1995-06-01 00:00:00+00:00,3.48,4.63,4.055


Unnamed: 0,Observation time UTC,Speed [m/s]
0,2024-02-29 00:00:00+00:00,2.030
1,2024-02-28 00:00:00+00:00,4.685
2,2024-02-27 00:00:00+00:00,4.230
3,2024-02-26 00:00:00+00:00,5.505
4,2024-02-25 00:00:00+00:00,3.360
...,...,...
8520,1995-06-14 00:00:00+00:00,1.940
8521,1995-06-13 00:00:00+00:00,0.500
8522,1995-06-02 00:00:00+00:00,5.960
8523,1995-06-01 00:00:00+00:00,4.055


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8525 entries, 0 to 8524
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Observation time UTC  8525 non-null   object 
 1   Speed [m/s]           8525 non-null   float64
dtypes: float64(1), object(1)
memory usage: 133.3+ KB


#### In the next 2 steps, the parameter Temperature for station 31857 is calculated as the mean of the maximum and minimum temperatures.

In [9]:
# Load the maximum/minimum temperature observations of station 31857, newest first.
file = os.path.join(BASE_DATA_PATH, r'unzipped_data_folders\31857_Temperature\31857__Temperature__all_data.csv')
df = pd.read_csv(
    file,
    usecols=['Observation time UTC', 'Maximum Temperature [Deg C]', 'Minimum Temperature [Deg C]'],
    parse_dates=['Observation time UTC'],
)
print(file)
df = df.sort_values(['Observation time UTC'], ascending=False)
display(df)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

C:\D\Whitireia\ARP\data\unzipped_data_folders\31857_Temperature\31857__Temperature__all_data.csv


Unnamed: 0,Observation time UTC,Maximum Temperature [Deg C],Minimum Temperature [Deg C]
6134,2024-10-31 20:00:00+00:00,19.2,
6133,2024-10-30 20:00:00+00:00,17.9,2.4
6132,2024-10-29 20:00:00+00:00,15.0,4.8
6131,2024-10-28 20:00:00+00:00,16.0,6.2
6130,2024-10-27 20:00:00+00:00,18.4,5.8
...,...,...,...
4,2007-09-05 21:00:00+00:00,9.5,
3,2007-09-04 21:00:00+00:00,14.0,
2,2007-09-03 21:00:00+00:00,16.2,
1,2007-09-02 21:00:00+00:00,15.1,


<class 'pandas.core.frame.DataFrame'>
Index: 6138 entries, 6134 to 0
Data columns (total 3 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   Observation time UTC         6138 non-null   datetime64[ns, UTC]
 1   Maximum Temperature [Deg C]  6134 non-null   float64            
 2   Minimum Temperature [Deg C]  6083 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(2)
memory usage: 191.8 KB
None


In [10]:
# Rows with a missing value are dropped first: mean(axis=1) skips NaN, so a row with only
# a maximum (or only a minimum) would otherwise yield that single value as the "mean".
df = df.dropna(axis=0, ignore_index=True)
df['Mean Temperature [Deg C]'] = df[['Maximum Temperature [Deg C]', 'Minimum Temperature [Deg C]']].mean(axis=1)

# One value per UTC calendar day, newest day first. The value column is selected explicitly
# instead of passing its name to .mean(), whose first positional argument is `numeric_only`.
# Days without any row produce NaN and are removed by dropna().
result_df = (
    df.groupby(pd.Grouper(key='Observation time UTC', freq='D'))[['Mean Temperature [Deg C]']]
    .mean()
    .dropna(axis=0)
    .reset_index()
    .sort_values('Observation time UTC', ascending=False)
)

display(result_df)
result_df.to_csv(os.path.join(BASE_DATA_PATH, r'station_folders\31857__Temperature__all_data.csv'), index=False)
result_df.info()

Unnamed: 0,Observation time UTC,Mean Temperature [Deg C]
6078,2024-10-30 00:00:00+00:00,10.15
6077,2024-10-29 00:00:00+00:00,9.90
6076,2024-10-28 00:00:00+00:00,11.10
6075,2024-10-27 00:00:00+00:00,12.10
6074,2024-10-26 00:00:00+00:00,15.95
...,...,...
4,2007-10-29 00:00:00+00:00,10.75
3,2007-10-28 00:00:00+00:00,5.25
2,2007-10-27 00:00:00+00:00,6.75
1,2007-10-26 00:00:00+00:00,12.05


<class 'pandas.core.frame.DataFrame'>
Index: 6079 entries, 6078 to 0
Data columns (total 2 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   Observation time UTC      6079 non-null   datetime64[ns, UTC]
 1   Mean Temperature [Deg C]  6079 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1)
memory usage: 142.5 KB


#### Station 18234, parameter Screen_Observations: renaming the column "Mean Relative Humidity [percent]" to the standard name "Relative humidity [%]"

In [None]:
# Rewrite the station 18234 file in place with the standard humidity column name.
humidity_csv = os.path.join(BASE_DATA_PATH, r'station_folders\18234__Screen_Observations__D.csv')
df = pd.read_csv(humidity_csv)
display(df)
# Rename by name rather than by position: assigning df.columns would relabel whatever
# happens to be in the second column. rename() ignores a missing key, so re-running the
# cell on the already-renamed file is a no-op.
df = df.rename(columns={'Mean Relative Humidity [percent]': 'Relative humidity [%]'})
assert 'Relative humidity [%]' in df.columns, f"unexpected columns: {list(df.columns)}"
df.to_csv(humidity_csv, index=False)
display(df.head())

Unnamed: 0,Observation time UTC,Mean Relative Humidity [percent]
0,2024-12-05 00:00:00+00:00,69.142857
1,2024-12-04 00:00:00+00:00,85.833333
2,2024-12-03 00:00:00+00:00,85.125000
3,2024-12-02 00:00:00+00:00,79.416667
4,2024-12-01 00:00:00+00:00,77.208333
...,...,...
8082,2000-06-10 00:00:00+00:00,77.541667
8083,2000-06-09 00:00:00+00:00,77.416667
8084,2000-06-08 00:00:00+00:00,80.000000
8085,2000-06-07 00:00:00+00:00,88.125000


Unnamed: 0,Observation time UTC,Relative humidity [%]
0,2024-12-05 00:00:00+00:00,69.142857
1,2024-12-04 00:00:00+00:00,85.833333
2,2024-12-03 00:00:00+00:00,85.125000
3,2024-12-02 00:00:00+00:00,79.416667
4,2024-12-01 00:00:00+00:00,77.208333
...,...,...
8082,2000-06-10 00:00:00+00:00,77.541667
8083,2000-06-09 00:00:00+00:00,77.416667
8084,2000-06-08 00:00:00+00:00,80.000000
8085,2000-06-07 00:00:00+00:00,88.125000


#### In the next step, the parameter Rain for station 41559 is calculated per day as the SUM of the hourly observations of that day.

In [None]:
# Load the hourly rainfall of station 41559 and aggregate it to daily totals.
file = os.path.join(BASE_DATA_PATH, r'unzipped_data_folders\41559_Rain\41559__Rain__H.csv')
parameter = 'Rainfall [mm]'
df = pd.read_csv(file, usecols=['Observation time UTC', parameter], parse_dates=['Observation time UTC'])
print(file)
display(df.sort_values(['Observation time UTC'], ascending=False).head())
# DataFrame.info() prints its report itself and returns None; no print() wrapper needed.
df.info()

# Daily total per UTC calendar day, newest day first.
# - The value column is selected explicitly instead of passing its name to .sum(), whose
#   first positional argument is `numeric_only`.
# - min_count=1 makes a day with no hourly rows sum to NaN instead of 0.0, so dropna()
#   removes it rather than saving a gap in the record as a dry day.
daily_df = (
    df.groupby(pd.Grouper(key='Observation time UTC', freq='D'))[[parameter]]
    .sum(min_count=1)
    .dropna(axis=0)
    .reset_index()
    .sort_values('Observation time UTC', ascending=False)
)
display(daily_df)

daily_df.to_csv(os.path.join(BASE_DATA_PATH, r'station_folders\41559__Rain__H.csv'), index=False)
daily_df.info()

C:\D\Whitireia\ARP\data\unzipped_data_folders\41559_Rain\41559__Rain__H.csv


Unnamed: 0,Observation time UTC,Rainfall [mm]
64990,2024-12-08 06:00:00+00:00,1.4
64989,2024-12-08 05:00:00+00:00,0.0
64988,2024-12-08 04:00:00+00:00,0.0
64987,2024-12-08 03:00:00+00:00,0.0
64986,2024-12-08 02:00:00+00:00,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64991 entries, 0 to 64990
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   Observation time UTC  64991 non-null  datetime64[ns, UTC]
 1   Rainfall [mm]         64991 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(1)
memory usage: 1015.6 KB
None


Unnamed: 0,Observation time UTC,Rainfall [mm]
2783,2024-12-08 00:00:00+00:00,1.4
2782,2024-12-07 00:00:00+00:00,0.0
2781,2024-12-06 00:00:00+00:00,0.0
2780,2024-12-05 00:00:00+00:00,0.0
2779,2024-12-04 00:00:00+00:00,0.2
...,...,...
4,2017-04-30 00:00:00+00:00,9.8
3,2017-04-29 00:00:00+00:00,20.2
2,2017-04-28 00:00:00+00:00,1.4
1,2017-04-27 00:00:00+00:00,0.0


<class 'pandas.core.frame.DataFrame'>
Index: 2784 entries, 2783 to 0
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   Observation time UTC  2784 non-null   datetime64[ns, UTC]
 1   Rainfall [mm]         2784 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1)
memory usage: 65.2 KB
None
