In [8]:
import pandas as pd
from datetime import datetime
from pathlib import Path

def _hourly_timestamps(df: pd.DataFrame) -> pd.Series:
    """Build one hour-resolution timestamp per row from Year/Month/Day/Hour.

    Rows whose parts are missing, non-numeric, or do not form a valid
    calendar date and clock hour get ``NaT``. The result is aligned with
    ``df.index``.

    Raises:
        ValueError: if any of the four time columns is absent.
    """
    time_cols = ["Year", "Month", "Day", "Hour"]
    missing = [c for c in time_cols if c not in df.columns]
    if missing:
        raise ValueError(f"缺少时间列: {missing}")

    parts = pd.DataFrame(
        {c: pd.to_numeric(df[c], errors="coerce") for c in time_cols},
        index=df.index,
    )
    # Infinities cannot be cast to integers; treat them like missing values.
    parts = parts.replace([float("inf"), float("-inf")], float("nan")).dropna()
    # Integer cast truncates toward zero, the same as int() on each field.
    parts = parts.astype("int64")
    # Reject out-of-range fields up front so that only genuine calendar
    # values are handed to to_datetime (e.g. hour 24 must become NaT).
    parts = parts[
        parts["Month"].between(1, 12)
        & parts["Day"].between(1, 31)
        & parts["Hour"].between(0, 23)
    ]
    # errors="coerce" turns remaining impossible dates (e.g. Feb 30) into NaT.
    stamps = pd.to_datetime(parts.rename(columns=str.lower), errors="coerce")
    # Rows dropped above come back as NaT.
    return stamps.reindex(df.index)


def clean_icos_file(filename: str, datadir: str = "../data") -> pd.DataFrame:
    """Clean one ICOS ObsPack Europe L2 file (CO2 or CH4).

    Reads ``datadir/filename``, takes the column names from the header line
    starting with ``#Site;``, keeps only rows whose ``Flag`` is ``"O"`` and
    whose gas value is present, and writes the result to
    ``datadir/cleaned_<stem>_<GAS>.csv``.

    Args:
        filename: Name of the raw file inside ``datadir``.
        datadir: Directory holding the raw file; the cleaned CSV is written
            to the same directory.

    Returns:
        DataFrame with two columns: ``timestamp`` (hourly, ``NaT`` where the
        date parts are invalid) and ``value`` (the gas measurement).

    Raises:
        FileNotFoundError: if the input file does not exist.
        ValueError: if the ``#Site;`` header line, the time columns, or a
            ``co2``/``ch4`` column cannot be found.
    """
    datadir = Path(datadir)
    infile = datadir / filename
    if not infile.exists():
        raise FileNotFoundError(f"找不到文件: {infile}")

    # 1) Locate the column names: the header is itself a comment line.
    colnames = None
    with open(infile, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            if line.startswith("#Site;"):
                colnames = [name.strip() for name in line[1:].strip().split(";")]
                break
    if colnames is None:
        # Without names every later column lookup would fail with a
        # misleading message, so stop here with the real cause.
        raise ValueError(f"未找到以 '#Site;' 开头的列名行: {infile}")

    # 2) Read the data rows; all '#' lines (including the header) are skipped.
    df = pd.read_csv(
        infile,
        sep=";",
        comment="#",
        header=None,
        names=colnames,
        na_values=[-999.99, -999.990, -9.99],  # sentinels treated as missing
        engine="python",
    )

    # 3) Hourly timestamp (minutes/seconds are intentionally not used).
    df["timestamp"] = _hourly_timestamps(df)

    # 4) Pick the gas column: the first of co2 / ch4 that is present.
    gas_col = next((g for g in ("co2", "ch4") if g in df.columns), None)
    if gas_col is None:
        raise ValueError("未找到 co2 或 ch4 列")

    # 5) Keep rows flagged "O" that carry a gas value.
    df = df[(df["Flag"] == "O") & df[gas_col].notna()]

    # 6) Reduce to the two output columns.
    df_out = df[["timestamp", gas_col]].rename(columns={gas_col: "value"})

    # 7) The gas suffix keeps CO2 and CH4 outputs from overwriting each other.
    gas_name = gas_col.upper()  # 'CO2' or 'CH4'
    outfile = datadir / f"cleaned_{infile.stem}_{gas_name}.csv"

    df_out.to_csv(outfile, index=False, date_format="%Y-%m-%d %H:00")
    print(f"已保存: {outfile}, 共 {len(df_out)} 行")
    print("时间范围:", df_out['timestamp'].min(), "→", df_out['timestamp'].max())

    return df_out

# Usage example (run from the code folder so the default datadir "../data" resolves):
clean_icos_file("ICOS_ATC_OBSPACK-Europe-L2-2025_ZSF_3.0_CTS.CO2")


已保存: ../data/cleaned_ICOS_ATC_OBSPACK-Europe-L2-2025_ZSF_3.0_CTS_CO2.csv, 共 182949 行
时间范围: 2002-01-01 00:00:00 → 2025-03-31 23:00:00
已保存: ../data/cleaned_ICOS_ATC_OBSPACK-Europe-L2-2025_ZSF_3.0_CTS_CH4.csv, 共 182836 行
时间范围: 2002-01-01 00:00:00 → 2025-03-31 23:00:00


Unnamed: 0,timestamp,value
0,2002-01-01 00:00:00,1849.73
1,2002-01-01 01:00:00,1841.23
2,2002-01-01 02:00:00,1841.17
3,2002-01-01 03:00:00,1840.72
4,2002-01-01 04:00:00,1841.00
...,...,...
203748,2025-03-31 07:00:00,2028.47
203761,2025-03-31 20:00:00,2038.23
203762,2025-03-31 21:00:00,2041.23
203763,2025-03-31 22:00:00,2037.20
