# Check for missing or duplicate dates

# In [5]:
import csv
from collections import defaultdict
from datetime import datetime, timedelta

def check_csv_dates(file_path, date_field_index,
                    start_date='20210101', end_date='20240731'):
    """Report malformed, duplicated and missing dates in a CSV date column.

    The first row of the file is treated as a header and skipped. Every
    following row must carry a ``YYYYMMDD`` date at ``date_field_index``.
    Findings are printed to standard output; nothing is returned.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to inspect.
        date_field_index: Zero-based column index of the date field.
        start_date: First day (inclusive, ``YYYYMMDD``) of the range that
            must be fully covered by the file.
        end_date: Last day (inclusive, ``YYYYMMDD``) of that range.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not ``YYYYMMDD``.
        OSError: If the file cannot be opened.
    """
    date_format = '%Y%m%d'
    dates = defaultdict(int)

    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # A default keeps a completely empty file from raising StopIteration.
        next(reader, None)

        # Data rows start at 2 because row 1 is the header.
        for row_num, row in enumerate(reader, start=2):
            try:
                date = row[date_field_index]
                # Parse only to validate; the raw string stays the key so it
                # can be compared with the generated YYYYMMDD strings below.
                datetime.strptime(date, date_format)
            except (IndexError, ValueError):
                print(f"行号 {row_num} 的日期字段有问题")
                continue
            dates[date] += 1

    # Report dates that occur more than once.
    for date, count in dates.items():
        if count > 1:
            print(f"日期 {date} 重复了 {count} 次")

    # Walk the expected range day by day and collect the absent ones.
    missing_dates = []
    current_date = datetime.strptime(start_date, date_format)
    last_date = datetime.strptime(end_date, date_format)
    while current_date <= last_date:
        key = current_date.strftime(date_format)
        if key not in dates:
            missing_dates.append(key)
        current_date += timedelta(days=1)

    if missing_dates:
        print("遗漏的日期:")
        for date in missing_dates:
            print(date)

# Check the CSV file, using column 0 as the date field
check_csv_dates('/root/Download/AlgaeBloomForecast/weather_data.csv', date_field_index=0)

# Remove duplicates


# In [2]:
import csv

def remove_duplicate_dates(file_path, date_field_index):
    """Rewrite a CSV file in place, keeping the first row seen for each date.

    The first row is treated as the header and is written back unchanged.
    For every later row the value at ``date_field_index`` is the
    de-duplication key; only the first row with a given key is kept and the
    original row order is preserved. Rows too short to contain the date
    column (for example blank lines) are kept untouched rather than
    aborting the run. A completely empty file is left as it is.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to de-duplicate.
        date_field_index: Zero-based column index of the date field.

    Raises:
        OSError: If the file cannot be read or written.
    """
    unique_rows = []
    seen_dates = set()

    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # A default avoids StopIteration when the file has no header at all.
        header = next(reader, None)
        if header is None:
            return

        for row in reader:
            try:
                date = row[date_field_index]
            except IndexError:
                # No date to compare on: keep the row instead of losing data.
                unique_rows.append(row)
                continue
            if date not in seen_dates:
                seen_dates.add(date)
                unique_rows.append(row)

    # Everything was read into memory above, so the same path can be reopened.
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(unique_rows)

# De-duplicate the CSV file in place, using column 0 as the date field
remove_duplicate_dates('/root/Download/AlgaeBloomForecast/weather_data.csv', date_field_index=0)

# Sort by date

# In [6]:
import csv
from datetime import datetime

def sort_csv_by_date(input_file, output_file, date_field_index):
    """Write a copy of a CSV file with its data rows sorted by date.

    The first row of ``input_file`` is treated as the header and copied
    first. Remaining rows are ordered ascending by the ``YYYYMMDD`` value at
    ``date_field_index``. The date cells are written back exactly as they
    were read. Rows whose date is missing or malformed are reported on
    standard output and appended after the sorted rows in their original
    order. An empty input file produces an empty output file.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the CSV file to create or overwrite.
        date_field_index: Zero-based column index of the date field.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # A default avoids StopIteration when the file has no header at all.
        header = next(reader, None)
        data = list(reader)

    # Pair each row with its parsed date instead of overwriting the cell, so
    # the output keeps the original YYYYMMDD text.
    dated_rows = []
    invalid_rows = []
    for row in data:
        try:
            parsed = datetime.strptime(row[date_field_index], '%Y%m%d')
        except (IndexError, ValueError):
            print(f"行中的日期字段有问题: {row}")
            # Kept apart: mixing str and datetime keys would break the sort.
            invalid_rows.append(row)
        else:
            dated_rows.append((parsed, row))

    # list.sort is stable, so rows sharing a date keep their input order.
    dated_rows.sort(key=lambda pair: pair[0])
    sorted_data = [row for _, row in dated_rows] + invalid_rows

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if header is not None:
            writer.writerow(header)
        writer.writerows(sorted_data)

# Sort by date (column 0) and write the result to a new CSV file
sort_csv_by_date('/root/Download/AlgaeBloomForecast/weather_data.csv', '/root/Download/AlgaeBloomForecast/sorted_weather_data.csv', date_field_index=0)