In [100]:
import os
import glob
import pandas as pd
import numpy as np

In [101]:
# Raw CSV header -> name used throughout the rest of the notebook.
# Headers not listed here keep their raw name; 'tmin'/'tmax' are presumably
# already spelled that way in the input files -- TODO confirm against the data.
col_name_map = {
    'tmedia': 'tavg',
    'umedia': 'humidity_mean',
    'umin': 'humidity_min',
    'umax': 'humidity_max',
    'ptot': 'prcp',
    'vmedia': 'wspd',
    'date': 'time',
}

# Columns kept from each file (after renaming), in output order.
# A file that lacks one of these simply contributes fewer columns.
final_features = [
    'time',
    'tavg', 'tmin', 'tmax',
    'prcp',
    'wspd',
    'humidity_mean', 'humidity_min', 'humidity_max',
]

In [111]:
# Build the "large feature space" table: one row per calendar day and one set
# of prefixed feature columns per station file, then write it to
# processed_large_feat_space/climate_data_maur_susa.csv.

# Daily calendar spine (1990-01-01 .. today) that every station is left-joined
# onto. Dates are stored as plain `datetime.date` objects so they compare equal
# to the per-station keys produced below.
main_df = pd.DataFrame(
    pd.date_range(start='1990-01-01', end=pd.Timestamp.today(), freq='D').date,
    columns=['time'],
)

# glob.glob returns matches in arbitrary order; sorting keeps the column order
# of the merged table the same on every machine.
for filepath in sorted(glob.glob('data/*.csv')):
    df = pd.read_csv(filepath).rename(columns=col_name_map)
    # Keep only the known features this file actually provides. The .copy()
    # makes the assignments below act on an independent frame, not a slice.
    df = df[[col for col in final_features if col in df.columns]].copy()
    # Station tag = first four characters of the file name.
    # NOTE(review): two files sharing those four characters would produce
    # clashing column names in the merge -- confirm file names stay distinct.
    prefix = os.path.basename(filepath)[:4]
    # Prefix all columns except the 'time' column
    df.columns = [col if col == 'time' else prefix + '_' + col for col in df.columns]
    print(f"{filepath} null cells ratio:", df.isnull().sum().sum() / df.size)
    df['time'] = pd.to_datetime(df['time'])
    # Forward-fill is only meaningful in chronological order. Sorting the
    # default RangeIndex (as sort_index would) leaves rows in file order, so
    # sort on the timestamp itself; 'stable' keeps duplicate dates in file order.
    df = df.sort_values('time', kind='stable').ffill()
    df['time'] = df['time'].dt.date
    main_df = main_df.merge(df, on='time', how='left')

# Index by date and drop days for which no station reported anything.
main_df = main_df.set_index('time').dropna(how='all')
# to_csv does not create missing directories.
os.makedirs('processed_large_feat_space', exist_ok=True)
main_df.to_csv('processed_large_feat_space/climate_data_maur_susa.csv')

data\AVIGLIANA.csv null cells ratio: 0.028272410675117952
data\BORGONE.csv null cells ratio: 0.016921837228041903
data\MODANE.csv null cells ratio: 0.1111111111111111
data\PRERICHARD.csv null cells ratio: 0.025730994152046785
data\SALBELTRAND.csv null cells ratio: 0.027969410128883897
data\SOLLIERES.csv null cells ratio: 0.053892695436150176
data\ST_FRANCOIS.csv null cells ratio: 0.09921271879538332
data\ST_MICHEL.csv null cells ratio: 0.1111111111111111


In [110]:
# Build the "small feature space" table: station files are stacked vertically
# (shared feature columns) and each row is tagged with one indicator column
# per source file, then written to
# processed_small_feat_space/climate_data_maur_susa.csv.

dfs = []
# glob.glob returns matches in arbitrary order; sorting keeps the row order of
# the stacked table the same on every machine.
for filepath in sorted(glob.glob('data/*.csv')):
    df = pd.read_csv(filepath).rename(columns=col_name_map)
    # Keep only the known features this file actually provides. The .copy()
    # makes the assignments below act on an independent frame, not a slice.
    df = df[[col for col in final_features if col in df.columns]].copy()
    print(f"{filepath} null cells ratio:", df.isnull().sum().sum() / df.size)
    df['time'] = pd.to_datetime(df['time'])
    # Forward-fill is only meaningful in chronological order. Sorting the
    # default RangeIndex (as sort_index would) leaves rows in file order, so
    # sort on the timestamp itself; 'stable' keeps duplicate dates in file order.
    df = df.sort_values('time', kind='stable').ffill()
    df['time'] = df['time'].dt.date
    # Station tag = first four characters of the file name.
    # NOTE(review): files sharing those four characters would be folded into
    # the same indicator column -- confirm file names stay distinct.
    file_id = os.path.basename(filepath)[:4]
    df['source'] = file_id
    dfs.append(df)

main_df = pd.concat(dfs, ignore_index=True)
# Create binary indicator columns for each CSV file
source_dummies = pd.get_dummies(main_df['source'])
main_df = pd.concat([main_df, source_dummies], axis=1)
main_df = main_df.drop(columns=['source']).set_index('time')
# to_csv does not create missing directories.
os.makedirs('processed_small_feat_space', exist_ok=True)
main_df.to_csv('processed_small_feat_space/climate_data_maur_susa.csv')

data\AVIGLIANA.csv null cells ratio: 0.028272410675117952
data\BORGONE.csv null cells ratio: 0.016921837228041903
data\MODANE.csv null cells ratio: 0.1111111111111111
data\PRERICHARD.csv null cells ratio: 0.025730994152046785
data\SALBELTRAND.csv null cells ratio: 0.027969410128883897
data\SOLLIERES.csv null cells ratio: 0.053892695436150176
data\ST_FRANCOIS.csv null cells ratio: 0.09921271879538332
data\ST_MICHEL.csv null cells ratio: 0.1111111111111111
