Author: Ewa Szewczyk \
Development notebook for automated data downloading, first preparation of data & merging, based on *data_ingestion.ipynb*

## Imports

In [87]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from zipfile import BadZipFile

## Stations

In [88]:
def move_right(row):
    rzeka_index = 2
    if str(row["river"])[0].isdigit():
        for i in range(len(row) - 1, rzeka_index, -1):
            row.iloc[i] = row.iloc[i - 1]
        row.iloc[rzeka_index] = None
    return row


def dms_to_dd(coord):
    degrees, minutes, seconds = coord.split()
    dd = float(degrees) + float(minutes) / 60 + float(seconds) / (60 * 60)
    return dd


def create_gdf(df):
    # Creating column with Point geometry
    geometry = [
    Point(lon, lat)
    for lon, lat in zip(
        df["lon"], df["lat"]
    )]   

    # Creating GeoDataFrame
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

    return gdf

def save_gdf(gdf, name,iname): 
    gdf.to_file('../data/'+name, index=iname)

def get_column_names():
    return ['N','ID', 'name', 'river', 'lat', 'lon', 'altitude']


# main download function
    
def download_stations_data():
    col_names = get_column_names()

    stations = pd.read_csv('https://danepubliczne.imgw.pl/pl/datastore/getfiledown/Arch/Telemetria/Meteo/kody_stacji.csv', sep=";",
                        encoding="cp1250", index_col=0, header=0, names=col_names)
    
    

    stations = stations.apply(move_right, axis=1)

    stations["lon"] = stations["lon"].apply(dms_to_dd)
    stations["lat"] = stations["lat"].apply(dms_to_dd)

    stations_gdf = create_gdf(stations)

    save_gdf(stations_gdf, 'stations.shp', 'N')

    return stations_gdf

In [89]:
stations = download_stations_data()
stations.head()

Unnamed: 0_level_0,ID,name,river,lat,lon,altitude,geometry
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,250180590,RYBNIK-STODOŁY,Ruda,50.154444,18.483056,216,POINT (18.48306 50.15444)
2,254140010,DZIWNÓW,,54.022222,14.731667,7,POINT (14.73167 54.02222)
3,250160530,ZIELENIEC,Bystrzyca Dusznicka,50.33,16.394167,840,POINT (16.39417 50.33000)
4,250160630,MIĘDZYGÓRZE,Wilczka,50.218333,16.773056,800,POINT (16.77306 50.21833)
5,250160650,MIĘDZYLESIE,Nysa Kłodzka,50.153333,16.670833,453,POINT (16.67083 50.15333)


## Precipitation data

In [90]:
# Helpers
def save_df(df, name): 
    df.to_csv('../data/'+name)


def get_colnames():
    columns = [
    "station_code",
    "station_name",
    "year",
    "month",
    "day",
    "24h_precipitation_mm",
    "SMDB_status",
    "precip_type",
    "snow_cover_cm",
    "PKSN_status",
    "fresh_snow_cover_cm",
    "HSS_status",
    "snow_type_code",
    "GATS_status",
    "snow_cover_type_code",
    "RPSN_status"]

    return columns

def get_urls():
    base_url = 'https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/opad/'
    parent_dirs = ['1991_1995/', '1996_2000/'] + [str(i) + '/' for i in range(2001,2024)]
    child_dirs = [[str(i) for i in range(1991,1996)], [str(i) for i in range(1996,2001)]] + [[str(i) + '_' + str(j).zfill(2) for j in range(1,13)] for i in range(2001,2024)]
    ending = '_o.zip'

    return base_url, parent_dirs, child_dirs, ending

# Main download function

def download_precip_data():
    columns = get_colnames()

    base_url, parent_dirs, child_dirs, ending = get_urls()

    dfs = []

    for i in range(len(parent_dirs)):
        p = parent_dirs[i]
        cds = child_dirs[i]
        for cd in cds:
            url = base_url + p + cd + ending

            try:
                precip = pd.read_csv(url, header=None, names=columns, encoding="cp1250", compression={"method": "zip"})
            except BadZipFile:
                print(f'{cd} is corrupted, going to the next file')
                continue 

            dfs.append(precip)
        
    precipitation_data = pd.concat(dfs)

    save_df(precipitation_data, 'precipitation_data.csv')

    return precipitation_data
    

In [91]:
precipitation_data = download_precip_data()
precipitation_data.head()

2023_03 is corrupted, going to the next file
2023_04 is corrupted, going to the next file
2023_11 is corrupted, going to the next file


Unnamed: 0,station_code,station_name,year,month,day,24h_precipitation_mm,SMDB_status,precip_type,snow_cover_cm,PKSN_status,fresh_snow_cover_cm,HSS_status,snow_type_code,GATS_status,snow_cover_type_code,RPSN_status
0,249180020,WARSZOWICE,1991,1,2,5.5,,W,0,8.0,0,8.0,,8.0,,8.0
1,249180020,WARSZOWICE,1991,1,3,3.5,,W,0,8.0,0,8.0,,8.0,,8.0
2,249180020,WARSZOWICE,1991,1,4,1.9,,W,0,8.0,0,8.0,,8.0,,8.0
3,249180020,WARSZOWICE,1991,1,7,3.3,,W,0,8.0,0,8.0,,8.0,,8.0
4,249180020,WARSZOWICE,1991,1,10,0.0,,W,0,8.0,0,8.0,,8.0,,8.0


## Changes file -> dict

In [92]:
# Helper functions
def download_changes_file():
    changes = pd.read_table(
    "https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/Opis.txt",
    skiprows=72,
    header=None,
    skipinitialspace=True,
    names=["Zmiany"])

    return changes

def split_officials(changes):
    # Dataframe division to changes and official names (these one MAY be unuseful)
    ix = changes[changes["Zmiany"].str.contains("Oficjalna")].index[0]

    changes_not_ofc = changes.iloc[:ix]
    changes_ofc = changes.iloc[ix:]

    return changes_not_ofc, changes_ofc

def create_station_dict(station_names):
    station_dict = dict()
    for station_name in station_names:
        words = station_name.split()
        # Searching for first occurance of "Stacja"
        first_station_index = words.index("Stacja")
        # Second occurance of "stacja"
        second_station_index = words.index("stacja", first_station_index + 1)
        # Filling dictionary
        station_dict[words[second_station_index + 1].rstrip(",")] = words[
            first_station_index + 1
        ].rstrip(",")
    return station_dict

def download_changes_data():
    changes_df = download_changes_file()
    changes_not_ofc, changes_ofc = split_officials(changes_df)
    changes_not_ofc_dict = create_station_dict(changes_not_ofc["Zmiany"].values)

    return changes_not_ofc_dict

In [93]:
map_dict = download_changes_data()
map_dict

{'Katowice': 'Katowice-Muchowiec',
 'Łódź': 'Łódź-Lublinek',
 'Poznań': 'Poznań-Ławica',
 'Warszawa': 'Warszawa-Okęcie',
 'Wrocław': 'Wrocław-Strachowice',
 'Elbląg': 'Elbląg-Milejewo',
 'Resko': 'Resko-Smólsko',
 'Kołobrzeg': 'Kołobrzeg-Dźwirzyno'}

### Using map_dict to make changes

In [97]:
def implement_changes(precipitation, changes, col):
    precipitation[col] = (precipitation[col].map(changes).fillna(precipitation[col]))

    return precipitation

In [98]:
precipitation_data_wc = implement_changes(precipitation_data, map_dict, 'station_name')

## Data merging (stations & precipitation)

In [99]:
def merge_stations_precipitation(stations, precipitation):
    merged_gdf = precipitation.merge(stations, how="inner", left_on="Kod stacji", right_on="ID")
    return merged_gdf

## Reading data

In [100]:
def read_gdf(file):
    g = gpd.read_file('../data/' + file)
    gdf = gpd.GeoDataFrame(g, geometry="geometry", crs="EPSG:4326")
    return gdf

def read_pd(file, iname=None):
    df = pd.read_csv('../data/' + file,index_col=iname)
    return df

In [101]:
read_gdf('stations.shp')

read_pd('precipitation_data.csv', iname='Unnamed: 0')


  df = pd.read_csv('../data/' + file,index_col=iname)


Unnamed: 0,station_code,station_name,year,month,day,24h_precipitation_mm,SMDB_status,precip_type,snow_cover_cm,PKSN_status,fresh_snow_cover_cm,HSS_status,snow_type_code,GATS_status,snow_cover_type_code,RPSN_status
0,249180020,WARSZOWICE,1991,1,2,5.5,,W,0,8.0,0,8.0,,8.0,,8.0
1,249180020,WARSZOWICE,1991,1,3,3.5,,W,0,8.0,0,8.0,,8.0,,8.0
2,249180020,WARSZOWICE,1991,1,4,1.9,,W,0,8.0,0,8.0,,8.0,,8.0
3,249180020,WARSZOWICE,1991,1,7,3.3,,W,0,8.0,0,8.0,,8.0,,8.0
4,249180020,WARSZOWICE,1991,1,10,0.0,,W,0,8.0,0,8.0,,8.0,,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14122,254230020,SEJNY,2023,12,26,8.3,,S,0,9.0,0,9.0,,9.0,.,
14123,254230020,SEJNY,2023,12,27,0.5,,S,2,,2,,5.0,,*,
14124,254230020,SEJNY,2023,12,28,0.2,,W,0,9.0,0,9.0,,9.0,.,
14125,254230020,SEJNY,2023,12,29,8.9,,W,0,9.0,0,9.0,,9.0,.,
