# Weather Data Preprocessing

## TODO: decide whether report types are important; handle compound fields (see https://www.airppm.org/for-participants/training-sets); trim dateTime; handle missing and incorrect values (999999 etc.)

## TODO: final goal -> combine the weather files with the air quality measurements to start training a model

## Imports

In [1]:
import pandas as pd
from pathlib import Path

In [19]:
# Load every raw weather CSV for Krakow, stack them into one DataFrame, and
# drop the columns that carry no information for this single-station dataset.
weather_dir = Path("../data/raw/Weather_Krakow")

# Sorted so the concatenation order is deterministic across runs.
weather_files = sorted(weather_dir.glob("*.csv"))
if not weather_files:
    # A wrong working directory yields an empty glob; fail here with the
    # resolved path instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {weather_dir.resolve()}")

dfs = []

for f in weather_files:
    # Read everything as text so the raw field values are kept verbatim.
    df = pd.read_csv(f, dtype=str)
    df['year_file'] = f.stem  # remember which file each row came from
    # Normalize column names (strip stray whitespace around the headers)
    dfs.append(df.rename(columns=lambda x: x.strip()))

# sort=False keeps columns in order of first appearance across the files.
weather_data = pd.concat(dfs, ignore_index=True, sort=False)

# drop station related or constant columns
weather_data = weather_data.drop(columns=[
    "STATION", # only 1 station
    "SOURCE", # always 4
    "CALL_SIGN", # always 99999
    "QUALITY_CONTROL", # always V020
    "NAME", #always BALICE, PL
    "ED1", #either missing or 25,U,9999,9
    "LATITUDE", "LONGITUDE", "ELEVATION",
])

print(weather_data.shape)
print(weather_data.columns)

(130328, 36)
Index(['DATE', 'REPORT_TYPE', 'WND', 'CIG', 'VIS', 'TMP', 'DEW', 'SLP', 'AA1',
       'AA2', 'AJ1', 'AY1', 'AY2', 'AZ1', 'AZ2', 'GA1', 'GA2', 'GA3', 'GE1',
       'GF1', 'IA1', 'KA1', 'KA2', 'MA1', 'MD1', 'MW1', 'MW2', 'MW3', 'OC1',
       'OD1', 'OD2', 'OD3', 'REM', 'EQD', 'year_file', 'IA2'],
      dtype='object')


In [20]:
# Per-column summary of the combined frame. The columns were read as text
# (dtype=str), so the summary rows are count / unique / top / freq.
weather_data.describe()

Unnamed: 0,DATE,REPORT_TYPE,WND,CIG,VIS,TMP,DEW,SLP,AA1,AA2,...,MW2,MW3,OC1,OD1,OD2,OD3,REM,EQD,year_file,IA2
count,130328,130328,130328,130328,130328,130328,130328,130328,7254,1005,...,3419,35,668,13028,3076,596,130328,4989,130328,39
unique,87027,3,1526,250,80,493,390,627,118,202,...,32,8,48,40,40,18,129313,24,5,18
top,2023-12-31T23:00:00,FM-15,"999,9,V,0005,1","99999,9,9,N",9999199,101,1,999999,6000091,24000021,...,611,711,1601,59900301999,59900901999,59901001999,SYN004BUFR,Q01 003SCCGA1,2020,"999,+0050,9"
freq,2,86875,8283,41868,37791,3797,4758,86877,2462,195,...,998,12,47,2105,354,86,1011,4920,26230,6


In [11]:
# Share of missing (NaN) cells in each column, expressed as a percentage
# and printed from the emptiest column down to the fullest.
null_fraction = weather_data.isna().mean()
missing_percent = null_fraction * 100
ranked_missing = missing_percent.sort_values(ascending=False)
print(ranked_missing)

MW3                99.973145
IA2                99.970076
AJ1                99.841937
IA1                99.812013
OD3                99.542692
AZ1                99.525045
AZ2                99.525045
OC1                99.487447
AA2                99.228869
ED1                98.690228
OD2                97.639801
MW2                97.376619
KA1                97.218556
KA2                97.218556
GA3                96.700632
EQD                96.171966
AA1                94.434043
OD1                90.003683
AY1                87.645019
AY2                87.645019
GA2                74.032441
MW1                72.427260
MD1                66.660273
GA1                30.107882
GE1                30.107882
GF1                24.924805
MA1                 0.000767
STATION             0.000000
DATE                0.000000
VIS                 0.000000
CIG                 0.000000
WND                 0.000000
QUALITY_CONTROL     0.000000
CALL_SIGN           0.000000
REPORT_TYPE   

In [16]:
# Print the last rows of the combined frame to eyeball the most recent records.
print(weather_data.tail())

            STATION                 DATE SOURCE   LATITUDE  LONGITUDE  \
130323  12566099999  2023-12-31T22:00:00      4  50.077731  19.784836   
130324  12566099999  2023-12-31T22:30:00      4  50.077731  19.784836   
130325  12566099999  2023-12-31T23:00:00      4  50.077731  19.784836   
130326  12566099999  2023-12-31T23:00:00      4  50.077731  19.784836   
130327  12566099999  2023-12-31T23:30:00      4  50.077731  19.784836   

       ELEVATION        NAME REPORT_TYPE CALL_SIGN QUALITY_CONTROL  ...  MW2  \
130323    241.09  BALICE, PL       FM-15     99999            V020  ...  NaN   
130324    241.09  BALICE, PL       FM-15     99999            V020  ...  NaN   
130325    241.09  BALICE, PL       FM-12     99999            V020  ...  NaN   
130326    241.09  BALICE, PL       FM-15     99999            V020  ...  NaN   
130327    241.09  BALICE, PL       FM-15     99999            V020  ...  NaN   

        MW3  OC1              OD1  OD2  OD3  \
130323  NaN  NaN              NaN