In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import csv
import os

# Data Cleaning and File Conversions

In [3]:
# Convert countries text file to csv

import csv
import os


def parse_countries_file(input_path, output_path):
    """
    Parse the GHCND countries file and convert it to CSV format.

    Args:
        input_path (str): Path to the input ghcnd-countries.txt file
        output_path (str): Path to save the output CSV file
    """
    # Create the output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    countries_data = []

    # Read and parse the input file
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # Skip empty lines
                code = line[0:2].strip()
                name = line[3:].strip()
                countries_data.append([code, name])

    # Write to CSV file
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["code", "name"])  # Write header
        writer.writerows(countries_data)


input_file = "data/climate/ghcnd-countries.txt"
output_file = "data/climate/ghcnd-countries.csv"


parse_countries_file(input_file, output_file)
print(f"Successfully converted {input_file} to {output_file}")

Successfully converted data/climate/ghcnd-countries.txt to data/climate/ghcnd-countries.csv


In [4]:
# Convert states text file to csv
def parse_states_file(input_path, output_path):
    """
    Parse the GHCND states file and convert it to CSV format.

    Args:
        input_path (str): Path to the input ghcnd-states.txt file
        output_path (str): Path to save the output CSV file
    """
    # Create the output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    states_data = []

    # Read and parse the input file
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # Skip empty lines
                code = line[0:2].strip()
                name = line[3:].strip()
                states_data.append([code, name])

    # Write to CSV file
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["code", "name"])  # Write header
        writer.writerows(states_data)
input_file = "data/climate/ghcnd-states.txt"
output_file = "data/climate/ghcnd-states.csv"
parse_states_file(input_file, output_file)
print(f"Successfully converted {input_file} to {output_file}")

Successfully converted data/climate/ghcnd-states.txt to data/climate/ghcnd-states.csv


In [7]:
def parse_inventory_file(input_path, output_path):
    """
    Parse the GHCND inventory file and convert it to CSV format.

    Args:
        input_path (str): Path to the input ghcnd-inventory.txt file
        output_path (str): Path to save the output CSV file
    """
    # Create the output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    inventory_data = []

    # Read and parse the input file
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # Skip empty lines
                # Parse fixed-width fields
                station_id = line[0:11].strip()
                latitude = float(line[12:20].strip())
                longitude = float(line[21:30].strip())
                element = line[31:35].strip()
                first_year = int(line[36:40].strip())
                last_year = int(line[41:45].strip())
                
                inventory_data.append([station_id, latitude, longitude, element, first_year, last_year])

    # Write to CSV file
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "latitude", "longitude", "element", "first_year", "last_year"])  # Write header
        writer.writerows(inventory_data)

parse_inventory_file(input_file, output_file)
print(f"Successfully converted {input_file} to {output_file}")

Successfully converted data/climate/ghcnd-inventory.txt to data/climate/ghcnd-inventory.csv


In [None]:
import pandas as pd


def parse_dly_line(line):
    """
    Parse one line from a GHCN-Daily .dly file.

    Each line has fixed-width columns:
      - Columns 1-11: Station ID
      - Columns 12-15: Year
      - Columns 16-17: Month
      - Columns 18-21: Element
      - Then for each day (1 to 31):
          * Columns 22-26, 30-34, ...: Value (integer, -9999 if missing)
          * Next column: Measurement flag (MFLAG)
          * Next column: Quality flag (QFLAG)
          * Next column: Source flag (SFLAG)

    Returns a dictionary with parsed data.
    """
    record = {}
    record["id"] = line[0:11]
    record["year"] = int(line[11:15])
    record["month"] = int(line[15:17])
    record["element"] = line[17:21]

    daily_values = []
    # Each day occupies 8 characters starting at column 22 (index 21)
    for day in range(31):
        start = 21 + day * 8
        value_str = line[start : start + 5]
        try:
            value = int(value_str)
        except ValueError:
            value = None

        mflag = line[start + 5]
        qflag = line[start + 6]
        sflag = line[start + 7]

        # A value of -9999 is considered missing
        if value == -9999:
            value = None

        daily_values.append(
            {
                "day": day + 1,
                "value": value,
                "mflag": mflag.strip() or None,
                "qflag": qflag.strip() or None,
                "sflag": sflag.strip() or None,
            }
        )

    record["daily_values"] = daily_values
    return record


def parse_dly_file(filename):
    """
    Parse a GHCN-Daily .dly file.

    Returns a list of dictionaries, one per record (month) in the file.
    """
    records = []
    with open(filename, "r") as f:
        for line in f:
            # Ensure the line has enough characters to be a valid record
            if len(line) < 269:
                continue
            record = parse_dly_line(line)
            records.append(record)
    return records


def dly_to_dataframe(filename):
    """
    Convert the parsed .dly file into a pandas DataFrame.

    Each row represents a single day's observation.
    """
    records = parse_dly_file(filename)
    rows = []
    for rec in records:
        station_id = rec["id"]
        year = rec["year"]
        month = rec["month"]
        element = rec["element"]
        for day_rec in rec["daily_values"]:
            row = {
                "station_id": station_id,
                "year": year,
                "month": month,
                "day": day_rec["day"],
                "element": element,
                "value": day_rec["value"],
                "mflag": day_rec["mflag"],
                "qflag": day_rec["qflag"],
                "sflag": day_rec["sflag"],
            }
            rows.append(row)
    df = pd.DataFrame(rows)
    return df


input_file = "data/climate/ghcnd_all/ASN00086135.dly"
output_csv = "data/climate/ghcnd_all_csv/ASN00086135.csv"

# Create DataFrame from the .dly file.
df = dly_to_dataframe(input_file)

# Export DataFrame to CSV.
df.to_csv(output_csv, index=False)
print(f"Exported DataFrame to {output_csv}")


In [12]:
# Convert all .dly files in a directory to CSV
def convert_all_dly_to_csv(input_dir, output_dir):
    """
    Convert all .dly files in the input directory to CSV format.

    Args:
        input_dir (str): Path to the input directory containing .dly files
        output_dir (str): Path to the output directory for CSV files
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if filename.endswith(".dly"):
            dly_path = os.path.join(input_dir, filename)
            csv_path = os.path.join(output_dir, filename.replace(".dly", ".csv"))
            df = dly_to_dataframe(dly_path)
            df.to_csv(csv_path, index=False)
            print(f"Converted {filename} to {csv_path}")

input_dir = "data/climate/ghcnd_all"
output_dir = "data/climate/ghcnd_all_csv"
convert_all_dly_to_csv(input_dir, output_dir)

Converted AM000037959.dly to data/climate/ghcnd_all_csv/AM000037959.csv
Converted IN012230800.dly to data/climate/ghcnd_all_csv/IN012230800.csv
Converted IN011070700.dly to data/climate/ghcnd_all_csv/IN011070700.csv
Converted CA004011580.dly to data/climate/ghcnd_all_csv/CA004011580.csv
Converted CA004077605.dly to data/climate/ghcnd_all_csv/CA004077605.csv
Converted ASN00080010.dly to data/climate/ghcnd_all_csv/ASN00080010.csv
Converted MXN00008057.dly to data/climate/ghcnd_all_csv/MXN00008057.csv
Converted AM000037781.dly to data/climate/ghcnd_all_csv/AM000037781.csv
Converted IN020100301.dly to data/climate/ghcnd_all_csv/IN020100301.csv
Converted KZ000038333.dly to data/climate/ghcnd_all_csv/KZ000038333.csv
Converted ASN00014103.dly to data/climate/ghcnd_all_csv/ASN00014103.csv
Converted BR028815040.dly to data/climate/ghcnd_all_csv/BR028815040.csv
Converted ASN00085168.dly to data/climate/ghcnd_all_csv/ASN00085168.csv
Converted CA007063320.dly to data/climate/ghcnd_all_csv/CA007063