In [7]:
import pandas as pd
import numpy as np
import os

In [9]:
def get_all_weather_csvs(directory):
    """
    Get all CSV files in the specified directory, in sorted order.

    Only the directory itself is scanned (no recursion). An entry is
    selected when its name ends with ".csv" (case-sensitive).

    Args:
        directory (str): Path to the directory containing CSV files.

    Returns:
        List[str]: Paths (``directory`` joined with the file name) of all
            CSV files, sorted so the result is the same on every run.

    Raises:
        OSError: If ``directory`` cannot be listed (propagated from
            ``os.listdir``).
    """
    # os.listdir makes no ordering guarantee; sorting keeps the processing
    # order, and therefore the row order of the combined output, stable.
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".csv")
    )

In [11]:
def clean_month_weather_data_hourly(csv_file):
    """
    Clean and process hourly weather data from a CSV file.

    Column names are stripped of surrounding whitespace, ``DATE`` is parsed
    to datetimes (unparseable entries become NaT), and the
    ``HourlyPrecipitation`` / ``HourlyWindSpeed`` columns are kept. The
    special value "T" is replaced by 0, any remaining text value that
    contains a character other than a digit or "." is replaced by NaN, and
    both measurement columns are converted to numeric.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Columns ``date``, ``precipitation`` and
            ``wind_speed``. If the file cannot be read or processed, the
            error is printed and an empty DataFrame with these same columns
            is returned, so callers always receive a consistent schema.
    """
    try:
        raw = pd.read_csv(csv_file, low_memory=False)
        raw = raw.rename(columns=lambda x: x.strip())
        raw["DATE"] = pd.to_datetime(raw["DATE"], errors="coerce")

        hourly_data = raw[["DATE", "HourlyPrecipitation", "HourlyWindSpeed"]]
        # "T" must be mapped to 0 before the regex pass, which would
        # otherwise turn it into NaN like any other non-numeric text.
        hourly_data = hourly_data.replace("T", 0)
        hourly_data = hourly_data.replace(regex=r"[^\d.]", value=np.nan)

        hourly_data.columns = ["date", "precipitation", "wind_speed"]
        for column in ("precipitation", "wind_speed"):
            hourly_data[column] = pd.to_numeric(hourly_data[column], errors="coerce")

        return hourly_data
    except Exception as e:
        # Best-effort: report the problem and hand back an empty frame that
        # still carries the expected columns and dtypes.
        print(f"Error processing file {csv_file}: {e}")
        return pd.DataFrame(
            {
                "date": pd.Series(dtype="datetime64[ns]"),
                "precipitation": pd.Series(dtype="float64"),
                "wind_speed": pd.Series(dtype="float64"),
            }
        )


In [29]:
def clean_month_weather_data_daily(csv_file):
    """
    Clean and process daily weather data from a CSV file.

    Column names are stripped of surrounding whitespace, ``DATE`` is parsed
    to datetimes (unparseable entries become NaT), and the
    ``DailyPrecipitation``, ``DailyAverageWindSpeed`` and ``DailySnowfall``
    columns are kept. The special value "T" is replaced by 0, any remaining
    text value that contains a character other than a digit or "." is
    replaced by NaN, and the measurement columns are converted to numeric.
    Rows in which all three measurements are missing are dropped, and the
    date column is reduced to its calendar-date part.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Columns ``date``, ``precipitation``,
            ``average_wind_speed`` and ``snowfall``. If the file cannot be
            read or processed, the error is printed and an empty DataFrame
            with these same columns is returned, so callers always receive
            a consistent schema.
    """
    value_columns = ["precipitation", "average_wind_speed", "snowfall"]
    try:
        # Read the file.
        raw = pd.read_csv(csv_file, low_memory=False)

        # Tidy the column names and parse the dates.
        raw = raw.rename(columns=lambda x: x.strip())
        raw["DATE"] = pd.to_datetime(raw["DATE"], errors="coerce")

        # Pick the required columns; this assumes the Daily* columns exist
        # in the file (a missing column falls through to the except branch).
        daily_data = raw[["DATE", "DailyPrecipitation", "DailyAverageWindSpeed", "DailySnowfall"]]

        # Handle the special value "T" first, then blank out any other
        # text that contains non-numeric characters.
        daily_data = daily_data.replace("T", 0)
        daily_data = daily_data.replace(regex=r"[^\d.]", value=np.nan)

        # Rename the columns.
        daily_data.columns = ["date"] + value_columns

        # Convert the measurements to numeric types.
        for column in value_columns:
            daily_data[column] = pd.to_numeric(daily_data[column], errors="coerce")

        # Keep only rows that carry at least one measurement.
        daily_data = daily_data.dropna(subset=value_columns, how="all")

        # Keep only the date part of the timestamp.
        return daily_data.assign(date=daily_data["date"].dt.date)
    except Exception as e:
        # Best-effort: report the problem and hand back an empty frame that
        # still carries the expected columns.
        print(f"Error processing file {csv_file}: {e}")
        return pd.DataFrame(
            {
                "date": pd.Series(dtype="object"),
                "precipitation": pd.Series(dtype="float64"),
                "average_wind_speed": pd.Series(dtype="float64"),
                "snowfall": pd.Series(dtype="float64"),
            }
        )

In [31]:
def load_and_clean_weather_data(directory):
    """
    Load and clean weather data from all CSV files in the directory.

    Every CSV file found by ``get_all_weather_csvs`` is passed through both
    the hourly and the daily cleaner, and the per-file results are
    concatenated with a fresh 0..n-1 index.

    Args:
        directory (str): Path to the directory containing CSV files.

    Returns:
        tuple: Two DataFrames - hourly and daily weather data. Both are
            empty DataFrames when the directory contains no CSV files.
    """
    weather_csv_files = get_all_weather_csvs(directory)

    # pd.concat raises ValueError on an empty list, so handle the
    # "nothing to load" case explicitly instead of crashing.
    if not weather_csv_files:
        print(f"No CSV files found in {directory}")
        return pd.DataFrame(), pd.DataFrame()

    hourly_dataframes = []
    daily_dataframes = []

    for csv_file in weather_csv_files:
        print(f"Processing {csv_file}...")
        hourly_dataframes.append(clean_month_weather_data_hourly(csv_file))
        daily_dataframes.append(clean_month_weather_data_daily(csv_file))

    # Concatenate the per-file frames into one frame per granularity.
    hourly_data = pd.concat(hourly_dataframes, ignore_index=True)
    daily_data = pd.concat(daily_dataframes, ignore_index=True)

    return hourly_data, daily_data

In [27]:
if __name__ == "__main__":

    WEATHER_CSV_DIR = "weather/"  # Path to your weather data directory

    # Process weather data
    hourly_weather_data, daily_weather_data = load_and_clean_weather_data(WEATHER_CSV_DIR)

    # Replace missing measurements with 0. Only the measurement columns are
    # filled: a frame-wide fillna(0) would also overwrite dates that failed
    # to parse (NaT) with the integer 0. Keys that are absent from a frame
    # (e.g. when nothing was loaded) are simply ignored by fillna.
    hourly_weather_data = hourly_weather_data.fillna(
        {"precipitation": 0, "wind_speed": 0}
    )
    daily_weather_data = daily_weather_data.fillna(
        {"precipitation": 0, "average_wind_speed": 0, "snowfall": 0}
    )

    # Save cleaned data to CSV
    hourly_weather_data.to_csv("cleaned_hourly_weather_data.csv", index=False)
    daily_weather_data.to_csv("cleaned_daily_weather_data.csv", index=False)
    print("Cleaned data saved.")

Processing weather/2020_weather.csv...
Processing weather/2023_weather.csv...
Processing weather/2021_weather.csv...
Processing weather/2024_weather.csv...
Processing weather/2022_weather.csv...
Cleaned data saved.
