In [None]:
# Standard library imports
import sys
import os
import requests
from io import StringIO

# 3rd-party library imports
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

In [None]:
# GHCN Land Data
file_url = 'https://data.giss.nasa.gov/pub/gistemp/ghcnm.tavg.qcf.dat'

# Send an HTTP GET request to the URL. Network errors are allowed to propagate
# (and a timeout is set) so that later cells never run against a missing or
# stale `df_GHCN` left over from a previous kernel state.
response = requests.get(file_url, timeout=60)

# Stop here on an unsuccessful download instead of printing and carrying on
if response.status_code != 200:
    raise RuntimeError(f"Failed to download the file. Status code: {response.status_code}")

# Get the content of the response
file_data = response.content.decode("utf-8")

# Parse every non-empty record (slicing taken from GHCNV4Reader()):
#   chars 0-10  -> station ID
#   chars 11-14 -> year
#   from char 19 on -> twelve 5-character integer values, one every 8 characters
formatted_data = []
for line in file_data.split('\n'):
    if line.strip():
        station_id = line[:11]
        year = int(line[11:15])
        values = [int(line[i:i+5]) for i in range(19, 115, 8)]
        formatted_data.append([station_id, year] + values)

# Create DataFrame from formatted data
column_names = ['Station_ID', 'Year'] + [f'Month_{i}' for i in range(1, 13)]
month_columns = column_names[2:]
df_GHCN = pd.DataFrame(formatted_data, columns=column_names)

# Replace -9999 with NaN, then convert to degrees C (stored values are divided by 100).
# Done without `inplace=True` so the cell is a plain, re-runnable assignment.
df_GHCN[month_columns] = df_GHCN[month_columns].replace(-9999, np.nan).divide(100)

In [None]:
# Display the parsed temperature table built in the previous cell
df_GHCN

In [None]:
# GHCN station metadata (fixed-width text file)
# NOTE(review): this URL points at the GHCN-Daily station list, whereas the
# temperature file above is named `ghcnm...`; stations missing from this list
# will have NaN coordinates after the left merge below - confirm this is the
# intended inventory.
file_url = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'

# Character widths of the six leading fields, in file order
column_widths = [11, 9, 10, 7, 3, 31]

# Read the file without a header row and label the fields explicitly
df_meta = pd.read_fwf(
    file_url,
    widths=column_widths,
    header=None,
    names=['Station_ID', 'Latitude', 'Longitude', 'Elevation', 'State', 'Name'],
)
df_meta

In [None]:
# Row counts: temperature records first, station metadata entries second
print(len(df_GHCN), len(df_meta), sep='\n')

In [None]:
# Attach coordinates and station name to every temperature row (left join on
# the station ID, so unmatched stations keep NaN metadata), then index by ID
df = (
    df_GHCN
    .merge(
        df_meta[['Station_ID', 'Latitude', 'Longitude', 'Name']],
        on='Station_ID',
        how='left',
    )
    .set_index('Station_ID')
)
df

In [None]:
# Overall share of missing cells: mean of the per-column NaN shares,
# expressed as a percentage rounded to three decimals
nan_fraction = round(100 * df.isna().mean().mean(), 3)
print(f'Fraction of NaN values in the DataFrame: {nan_fraction}%')

In [None]:
# Share of NaN values in each column (0.0 = none missing, 1.0 = all missing)
df.isna().mean()

In [None]:
def filter_coordinates(df):
    """
    Keeps only the rows whose coordinates lie inside the valid geographic range.

    Args:
    df (pd.DataFrame): Input DataFrame; must contain 'Latitude' and 'Longitude' columns.

    Returns:
    pd.DataFrame: The rows of `df` with -90 <= Latitude <= 90 and
    -180 <= Longitude <= 180 (bounds inclusive). Rows with a NaN coordinate
    fail the comparison and are dropped as well.

    Prints how many rows were removed.
    """

    # Inclusive range checks; NaN values evaluate to False
    lat_ok = df['Latitude'].between(-90, 90)
    lon_ok = df['Longitude'].between(-180, 180)

    df_filtered = df[lat_ok & lon_ok]

    # Report how many rows did not pass the checks
    num_filtered = len(df) - len(df_filtered)
    print(f'Number of rows with invalid coordinates (removed): {num_filtered}')

    return df_filtered

In [None]:
# Drop rows whose latitude/longitude are out of range (or NaN); prints the count removed
df_filtered = filter_coordinates(df)

# Xarray Conversion (incomplete)

In [None]:
# Convert the merged table to an xarray Dataset.
# NOTE(review): this converts `df`, not the `df_filtered` produced in the
# previous cell, and `df` is indexed by Station_ID, which repeats once per
# year - confirm which frame/index is intended before building on this.
dataset = xr.Dataset.from_dataframe(df)
dataset

# Step 0

In [3]:
# Standard library imports
import requests
import sys
import os

# 3rd-party library imports
import pandas as pd
import numpy as np

# Data source URLs (module-level constants used by step0):
# the temperature data file and the station metadata file
GHCN_temp_url = 'https://data.giss.nasa.gov/pub/gistemp/ghcnm.tavg.qcf.dat'
GHCN_meta_url = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'

def _parse_ghcn_monthly(file_data):
    '''
    Parses the text of a GHCN temperature data file into a DataFrame.

    Args:
    file_data (str): Full text of the data file, one fixed-width record per line.

    Returns:
    pd.DataFrame: Columns 'Station_ID', 'Year' and 'Month_1' ... 'Month_12'. Values of
    -9999 are replaced with NaN and the monthly values are divided by 100 (degrees C).

    Raises:
    ValueError: If a non-empty line does not contain integers at the expected positions.
    '''
    formatted_data = []
    for line in file_data.split('\n'):
        # Skip blank lines (e.g. the trailing one after the final newline)
        if not line.strip():
            continue

        # Slicing taken from GHCNV4Reader(): 11-char station ID, 4-char year,
        # then twelve 5-char integers starting at offset 19, one every 8 chars
        station_id = line[:11]
        year = int(line[11:15])
        values = [int(line[i:i + 5]) for i in range(19, 115, 8)]
        formatted_data.append([station_id, year] + values)

    month_columns = [f'Month_{i}' for i in range(1, 13)]
    df_GHCN = pd.DataFrame(formatted_data, columns=['Station_ID', 'Year'] + month_columns)

    # Replace -9999 with NaN and convert to degrees C, without mutating in place
    df_GHCN[month_columns] = df_GHCN[month_columns].replace(-9999, np.nan).divide(100)
    return df_GHCN


def get_GHCN_data(temp_url, meta_url, timeout=60):

    '''
    Retrieves and formats temperature data from the Global Historical Climatology Network (GHCN) dataset.

    Args:
    temp_url (str): The URL to the temperature data file in GHCN format.
    meta_url (str): The URL (or any source accepted by pandas.read_fwf) of the fixed-width
        metadata file containing station information.
    timeout (float, optional): Timeout in seconds passed to requests.get. Defaults to 60.

    Returns:
    pd.DataFrame: A Pandas DataFrame containing temperature data with station metadata.

    Raises:
    RuntimeError: If the temperature file request does not return HTTP status 200.
    requests.RequestException: If the request itself fails (connection error, timeout, ...).
    ValueError: If a data line cannot be parsed.

    This function sends an HTTP GET request to the temperature data URL, processes the data to create
    a formatted DataFrame, replaces -9999 values with NaN, converts temperature values to degrees Celsius,
    and merges the data with station metadata based on station IDs. The resulting DataFrame includes
    columns for station latitude, longitude, and name, and is indexed by station IDs. The merge is a
    left join, so stations absent from the metadata file are kept with NaN latitude, longitude and name.
    '''

    # Download the temperature file. Failures are raised rather than printed:
    # previously a failed download fell through to the merge below and surfaced
    # as an unrelated "df_GHCN referenced before assignment" error.
    response = requests.get(temp_url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to download the file. Status code: {response.status_code}")

    df_GHCN = _parse_ghcn_monthly(response.content.decode("utf-8"))

    # Define the column widths, create meta data dataframe
    column_widths = [11, 9, 10, 7, 3, 31]
    df_meta = pd.read_fwf(meta_url, widths=column_widths, header=None,
                          names=['Station_ID', 'Latitude', 'Longitude', 'Elevation', 'State', 'Name'])

    # Merge on station ID, set index
    df = pd.merge(df_GHCN, df_meta[['Station_ID', 'Latitude', 'Longitude', 'Name']], on='Station_ID', how='left')
    df = df.set_index('Station_ID')

    return df


def step0():
    '''
    Runs the first pipeline step: load the GHCN temperature table.

    Returns:
    pd.DataFrame: The DataFrame produced by get_GHCN_data for the module-level
    GHCN_temp_url and GHCN_meta_url.

    No additional processing is applied here; the result of get_GHCN_data is
    returned unchanged for the following steps.
    '''
    return get_GHCN_data(GHCN_temp_url, GHCN_meta_url)

In [4]:
# Run step 0 (downloads both GHCN files) and keep the result for the later steps
step0_output = step0()

# Step 1

In [24]:
import pandas as pd
import numpy as np
import os

def filter_coordinates(df):
    """
    Removes rows whose latitude or longitude falls outside the valid range.

    Args:
    df (pd.DataFrame): Input DataFrame; must contain 'Latitude' and 'Longitude' columns.

    Returns:
    pd.DataFrame: Only the rows with latitude in [-90, 90] and longitude in
    [-180, 180]. Rows where either coordinate is NaN do not satisfy the
    comparisons and are removed too.

    The number of removed rows is printed.
    """
    lat, lon = df['Latitude'], df['Longitude']

    # A row is kept only when all four bound checks hold
    keep = (lat >= -90) & (lat <= 90) & (lon >= -180) & (lon <= 180)
    df_filtered = df[keep]

    print(f'Number of rows with invalid coordinates (removed): {len(df) - len(df_filtered)}')

    return df_filtered

import pandas as pd
import re


# Station exclusion rules passed to filter_stations_by_rules in step1.
# One rule per line: an 11-character station ID followed by `omit:` and either
# a `start-end` range or a single `YYYY/MM` entry.
# NOTE(review): presumably the ranges are years (with 0 / 9999 acting as open
# ends) and `YYYY/MM` is a single month to drop - the parser is not shown in
# this cell, so verify against filter_stations_by_rules.
rules_text = '''
CHM00052836  omit: 0-1948
CHXLT909860  omit: 0-1950
BL000085365  omit: 0-1930
MXXLT948335  omit: 0-1952
ASN00058012  omit: 0-1899
ASN00084016  omit: 0-1899
ASN00069018  omit: 0-1898
NIXLT013080  omit: 0-1930
NIXLT751359  omit: 0-9999
CHXLT063941  omit: 0-1937
CHM00054843  omit: 0-1937
MXM00076373  omit: 0-9999
USC00044022  omit: 0-9999
USC00044025  omit: 0-9999
CA002402332  omit: 2011-9999
RSM00024266  omit: 2021/09
'''

def step1(step0_output):
    '''
    Runs the second pipeline step: coordinate filtering followed by the station rules.

    Args:
    step0_output (pd.DataFrame): The DataFrame returned by step0.

    Returns:
    The result of filter_stations_by_rules applied to the coordinate-filtered
    data and the module-level rules_text.

    NOTE(review): filter_stations_by_rules is not defined in this cell; it must
    already exist in the kernel namespace when this function is called.
    '''
    valid_coords = filter_coordinates(step0_output)
    rules_applied = filter_stations_by_rules(valid_coords, rules_text)
    return rules_applied

In [22]:
# Run step 1 on the step 0 result (prints the number of rows dropped for bad coordinates)
step1_output = step1(step0_output)

Number of rows with invalid coordinates (removed): 194947


In [23]:
# Row counts before and after step 1
# NOTE(review): the recorded output below shows 0 rows after step 1, i.e. the
# rules filter appears to drop every row - verify filter_stations_by_rules.
print(len(step0_output), len(step1_output), sep='\n')

1453166
0
