In [1]:
# Standard library imports
import sys
import os
import requests
import itertools
import math
from io import StringIO
from tqdm import tqdm

# 3rd-party library imports
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

# Step 0

In [2]:
# Standard library imports
import requests
import sys
import os

# 3rd-party library imports
import pandas as pd
import numpy as np

# Add the parent folder to sys.path
# parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# sys.path.insert(0, parent_dir)

# Local imports
# from parameters.data import GHCN_temp_url, GHCN_meta_url
GHCN_temp_url = 'https://data.giss.nasa.gov/pub/gistemp/ghcnm.tavg.qcf.dat'
GHCN_meta_url = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'


def get_GHCN_data(temp_url, meta_url):

    '''
    Retrieves and formats temperature data from the Global Historical Climatology Network (GHCN) dataset.

    Args:
    temp_url (str): The URL to the temperature data file in GHCN format.
    meta_url (str): The URL to the metadata file containing station information.

    Returns:
    pd.DataFrame: A Pandas DataFrame containing temperature data with station metadata.
    
    This function sends an HTTP GET request to the temperature data URL, processes the data to create
    a formatted DataFrame, replaces missing values with NaN, converts temperature values to degrees Celsius,
    and merges the data with station metadata based on station IDs. The resulting DataFrame includes
    columns for station latitude, longitude, and name, and is indexed by station IDs.
    '''

    try:
        # Send an HTTP GET request to the URL
        response = requests.get(temp_url)

        # Check if the request was successful
        if response.status_code == 200:
            
            # Get the content of the response
            file_data = response.content.decode("utf-8")

            # Create a list to store formatted data
            formatted_data = []

            # Loop through file data
            for line in file_data.split('\n'):
                
                # Check if line is not empty
                if line.strip():
                    
                    # Extract relevant data
                    # (Using code from GHCNV4Reader())
                    station_id = line[:11]
                    year = int(line[11:15])
                    values = [int(line[i:i+5]) for i in range(19, 115, 8)]
                    
                    # Append data to list
                    formatted_data.append([station_id, year] + values)

            # Create DataFrame from formatted data
            column_names = ['Station_ID', 'Year'] + [f'Month_{i}' for i in range(1, 13)]
            df_GHCN = pd.DataFrame(formatted_data, columns=column_names)
            
            # Replace -9999 with NaN
            df_GHCN.replace(-9999, np.nan, inplace=True)
            
            # Format data - convert to degrees C
            month_columns = [f'Month_{i}' for i in range(1, 13)]
            df_GHCN[month_columns] = df_GHCN[month_columns].divide(100)

        else:
            print("Failed to download the file. Status code:", response.status_code)

    except Exception as e:
        print("An error occurred:", str(e))

    # Define the column widths, create meta data dataframe
    column_widths = [11, 9, 10, 7, 3, 31]
    df_meta = pd.read_fwf(meta_url, widths=column_widths, header=None,
                          names=['Station_ID', 'Latitude', 'Longitude', 'Elevation', 'State', 'Name'])
    # Merge on station ID, set index
    df = pd.merge(df_GHCN, df_meta[['Station_ID', 'Latitude', 'Longitude', 'Name']], on='Station_ID', how='left')
    df = df.set_index('Station_ID')

    return df


def step0():
    '''
    Performs the initial data processing steps for the GHCN temperature dataset.

    Returns:
    pd.DataFrame: A Pandas DataFrame containing filtered and formatted temperature data.
    
    This function retrieves temperature data from the Global Historical Climatology Network (GHCN) dataset,
    processes and formats the data, and returns a DataFrame. The data is first fetched using specified URLs,
    and is returned for further analysis.
    '''
    df_GHCN = get_GHCN_data(GHCN_temp_url, GHCN_meta_url)
    return df_GHCN

In [3]:
step0_output = step0()

# Step 1

In [4]:
import pandas as pd
import os
import re
import sys

# Add the parent folder to sys.path
# parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# sys.path.insert(0, parent_dir)

# Local imports
# from parameters.data import drop_rules
drop_rules = '''
CHM00052836  omit: 0-1948
CHXLT909860  omit: 0-1950
BL000085365  omit: 0-1930
MXXLT948335  omit: 0-1952
ASN00058012  omit: 0-1899
ASN00084016  omit: 0-1899
ASN00069018  omit: 0-1898
NIXLT013080  omit: 0-1930
NIXLT751359  omit: 0-9999
CHXLT063941  omit: 0-1937
CHM00054843  omit: 0-1937
MXM00076373  omit: 0-9999
USC00044022  omit: 0-9999
USC00044025  omit: 0-9999
CA002402332  omit: 2011-9999
RSM00024266  omit: 2021/09
'''


def filter_coordinates(df):
    """
    Filters a DataFrame based on latitude and longitude conditions.

    Args:
    df (pd.DataFrame): The input DataFrame with 'Latitude' and 'Longitude' columns.

    Returns:
    pd.DataFrame: The filtered DataFrame with rows where latitude is between -90 and 90,
    and longitude is between -180 and 180.
    """
    
    # Define latitude and longitude range conditions
    lat_condition = (df['Latitude'] >= -90) & (df['Latitude'] <= 90)
    lon_condition = (df['Longitude'] >= -180) & (df['Longitude'] <= 180)

    # Apply the conditions to filter the DataFrame
    df_filtered = df[lat_condition & lon_condition]
    
    # Calculate number of rows filtered
    num_filtered = len(df) - len(df_filtered)
    print(f'Number of rows with invalid coordinates (removed): {num_filtered}')

    return df_filtered


def filter_stations_by_rules(dataframe, rules_text):
    """
    Filters a DataFrame of climate station data based on exclusion rules specified in a text format.

    Parameters:
        dataframe (pd.DataFrame): The input DataFrame containing climate station data.
        rules_text (str): A string containing exclusion rules for specific stations and years.

    Returns:
        pd.DataFrame: A filtered DataFrame with stations omitted based on the provided rules.

    Rules Format:
        The 'rules_text' should be formatted as follows:
        - Each rule is represented as a single line in the text.
        - Each line should start with the station ID followed by exclusion rules.
        - Exclusion rules consist of 'omit:' followed by the years to exclude, e.g., 'omit: 2000-2010'.
        - Years can be specified as a single year (e.g., 'omit: 2000') or as a range (e.g., 'omit: 2000-2010').
        - Year ranges can also be specified using '/' (e.g., 'omit: 2000/2002').

    Example:
        rules_text = '''
            CHM00052836  omit: 0-1948
            CHXLT909860  omit: 0-1950
            BL000085365  omit: 0-1930
            ...
        '''

    This function takes the provided rules and applies them to the input DataFrame,
    resulting in a new DataFrame with stations excluded based on the specified rules.
    """

    # Parse the rules from the provided text
    rules = {}
    for line in rules_text.split('\n'):
        if line.strip():
            match = re.match(r'([A-Z0-9]+)\s+omit:\s+(\S+)', line)
            if match:
                station_id, year_rule = match.groups()
                rules[station_id] = year_rule

    # Create a mask to identify rows to omit
    mask = pd.Series(True, index=dataframe.index)

    for station_id, year_rule in rules.items():
        try:
            # Split the year_rule into start and end years
            start_year, end_year = map(int, year_rule.split('-'))
        except ValueError:
            # Handle cases like '2011/12' or '2012-9999'
            if '/' in year_rule:
                start_year = int(year_rule.split('/')[0])
                end_year = start_year
            elif '-' in year_rule:
                start_year = int(year_rule.split('-')[0])
                end_year = int(year_rule.split('-')[1])
            else:
                continue

        # Update the mask to False for the specified range of years for the station_id
        mask &= ~((dataframe['Year'] >= start_year) & (dataframe['Year'] <= end_year) & (dataframe.index == station_id))

    # Apply the mask to filter the DataFrame
    filtered_dataframe = dataframe[mask]

    # Calculate number of rows filtered
    num_filtered = len(dataframe) - len(filtered_dataframe)
    print(f'Number of rows removed according to station exclusion rules: {num_filtered}')

    return filtered_dataframe


def step1(step0_output):
    """
    Applies data filtering and cleaning operations to the input DataFrame.

    Parameters:
        step0_output (pd.DataFrame): The initial DataFrame containing climate station data.

    Returns:
        pd.DataFrame: A cleaned and filtered DataFrame ready for further analysis.

    This function serves as a data processing step by applying two essential filtering operations:
    1. `filter_coordinates`: Filters the DataFrame based on geographical coordinates, retaining relevant stations.
    2. `filter_stations_by_rules`: Filters the DataFrame based on exclusion rules, omitting specified stations and years.

    The resulting DataFrame is cleaned of irrelevant stations and years according to specified rules
    and is ready for subsequent data analysis or visualization.
    """
        
    df_filtered = filter_coordinates(step0_output)
    df_clean = filter_stations_by_rules(df_filtered, drop_rules)
    return df_clean

In [5]:
step1_output = step1(step0_output)

Number of rows with invalid coordinates (removed): 194947
Number of rows removed according to station exclusion rules: 344


# Step 2

In [6]:
# Skip for now

# Step 3

In [7]:
def generate_80_cell_grid():
    n_bands = 8  # Number of latitude bands
    n_boxes_per_band = 10  # Number of boxes per band

    data = []

    for band in range(n_bands):
        lat_south = -90 + band * (180 / n_bands)
        lat_north = -90 + (band + 1) * (180 / n_bands)

        for i in range(n_boxes_per_band):
            lon_west = -180 + i * (360 / n_boxes_per_band)
            lon_east = -180 + (i + 1) * (360 / n_boxes_per_band)

            data.append((lat_south, lat_north, lon_west, lon_east))

    df = pd.DataFrame(data, columns=['Southern', 'Northern', 'Western', 'Eastern'])
    

    # Calculate the center latitude and longitude
    df['Center_Latitude'] = (df['Southern'] + df['Northern']) / 2
    df['Center_Longitude'] = (df['Western'] + df['Eastern']) / 2
    
    return df

def lerp(x, y, p):
    return y * p + (1 - p) * x

def generate_8000_cell_grid():
    def subgen(lat_s, lat_n, lon_w, lon_e):
        alts = math.sin(lat_s * math.pi / 180)
        altn = math.sin(lat_n * math.pi / 180)
        for y in range(10):
            s = 180 * math.asin(lerp(alts, altn, y * 0.1)) / math.pi
            n = 180 * math.asin(lerp(alts, altn, (y + 1) * 0.1)) / math.pi
            for x in range(10):
                w = lerp(lon_w, lon_e, x * 0.1)
                e = lerp(lon_w, lon_e, (x + 1) * 0.1)
                yield (s, n, w, e)

    initial_regions_df = generate_80_cell_grid()
    data = []

    for index, row in initial_regions_df.iterrows():
        for subcell in subgen(row['Southern'], row['Northern'], row['Western'], row['Eastern']):
            data.append(subcell)

    grid_df = pd.DataFrame(data, columns=['Southern', 'Northern', 'Western', 'Eastern'])
    
    # Calculate the center latitude and longitude
    grid_df['Center_Latitude'] = (grid_df['Southern'] + grid_df['Northern']) / 2
    grid_df['Center_Longitude'] = (grid_df['Western'] + grid_df['Eastern']) / 2
    
    return grid_df

def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the spherical distance (in kilometers) between two pairs of
    latitude and longitude coordinates using the Haversine formula.

    Args:
        lat1 (float): Latitude of the first point in degrees.
        lon1 (float): Longitude of the first point in degrees.
        lat2 (float): Latitude of the second point in degrees.
        lon2 (float): Longitude of the second point in degrees.

    Returns:
        float: Spherical distance in kilometers.
    """
    # Convert latitude and longitude from degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Radius of the Earth in kilometers
    radius = 6371.0  # Earth's mean radius

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = radius * c

    return distance


def linearly_decreasing_weight(distance, max_distance):
    """
    Calculate a linearly decreasing weight based on the given distance
    and maximum distance.

    Args:
        distance (float): The distance at which you want to calculate the weight.
        max_distance (float): The maximum distance at which the weight becomes 0.

    Returns:
        float: The linearly decreasing weight, ranging from 1 to 0.
    """
    # Ensure that distance is within the valid range [0, max_distance]
    distance = max(0, min(distance, max_distance))

    # Calculate the weight as a linear interpolation
    weight = 1.0 - (distance / max_distance)
    
    return weight

In [8]:
GHCN_meta_url = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'
column_widths = [11, 9, 10, 7, 3, 31]
df_meta = pd.read_fwf(GHCN_meta_url, widths=column_widths, header=None,
                      names=['Station_ID', 'Latitude', 'Longitude', 'Elevation', 'State', 'Name'])

grid_8000_df = generate_8000_cell_grid()
grid_80_df = generate_80_cell_grid()
station_df = df_meta

In [14]:
df_meta

Unnamed: 0,Station_ID,Latitude,Longitude,Elevation,State,Name
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS
2,AE000041196,25.3330,55.5170,34.0,,SHARJAH INTER. AIRP
3,AEM00041194,25.2550,55.3640,10.4,,DUBAI INTL
4,AEM00041217,24.4330,54.6510,26.8,,ABU DHABI INTL
...,...,...,...,...,...,...
124949,ZI000067969,-21.0500,29.3670,861.0,,WEST NICHOLSON
124950,ZI000067975,-20.0670,30.8670,1095.0,,MASVINGO
124951,ZI000067977,-21.0170,31.5830,430.0,,BUFFALO RANGE
124952,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE


In [16]:
# Initialize an empty list to store station IDs and weights as dictionaries
station_weights_within_radius = []

# Maximum distance for the weight calculation (e.g., 1200.0 km)
max_distance = 1200.0

# Use tqdm to track progress
for index, row in tqdm(grid_80_df.iterrows(), total=len(grid_80_df), desc="Processing"):
    center_lat = row['Center_Latitude']
    center_lon = row['Center_Longitude']
    
    # Calculate distances for each station in station_df
    distances = station_df.apply(lambda x: haversine_distance(center_lat, center_lon, x['Latitude'], x['Longitude']), axis=1)
    
    # Find station IDs within the specified radius
    nearby_stations = station_df[distances <= max_distance]
    
    # Calculate weights for each nearby station
    weights = nearby_stations.apply(lambda x: linearly_decreasing_weight(distances[x.name], max_distance), axis=1)
    
    # Create a dictionary of station IDs and weights
    station_weights = dict(zip(nearby_stations['Station_ID'], weights))
    
    # Append the dictionary to the result list
    station_weights_within_radius.append(station_weights)

# Add the list of station IDs and weights as a new column
grid_80_df['Nearby_Stations'] = station_weights_within_radius

# Print grid_80_df with the new column
print(grid_80_df)


Processing: 100%|███████████████████████████████| 80/80 [00:29<00:00,  2.73it/s]

    Southern  Northern  Western  Eastern  Center_Latitude  Center_Longitude  \
0      -90.0     -67.5   -180.0   -144.0           -78.75            -162.0   
1      -90.0     -67.5   -144.0   -108.0           -78.75            -126.0   
2      -90.0     -67.5   -108.0    -72.0           -78.75             -90.0   
3      -90.0     -67.5    -72.0    -36.0           -78.75             -54.0   
4      -90.0     -67.5    -36.0      0.0           -78.75             -18.0   
..       ...       ...      ...      ...              ...               ...   
75      67.5      90.0      0.0     36.0            78.75              18.0   
76      67.5      90.0     36.0     72.0            78.75              54.0   
77      67.5      90.0     72.0    108.0            78.75              90.0   
78      67.5      90.0    108.0    144.0            78.75             126.0   
79      67.5      90.0    144.0    180.0            78.75             162.0   

                                      Nearby_Statio




# Xarray Conversion (incomplete)

In [13]:
# df = step1_output

# # Transpose the DataFrame to have months as columns
# df_copy = df.set_index(['Year', 'Name', 'Latitude', 'Longitude']).transpose()

# # Create a MultiIndex with separate levels for Station_ID, Year, and Month
# df_copy.columns = pd.MultiIndex.from_tuples(
#     [(station_id, year, f"Month_{month}") for station_id in df_copy.columns.get_level_values(0) for year in df_copy.index.get_level_values(0) for month in range(1, 13)],
#     names=['Station_ID', 'Year', 'Month']
# )

# # Drop Year column from the copied DataFrame
# df_copy = df_copy.drop('Year', axis=0)

# # Convert the copied DataFrame to an xarray Dataset
# ds = xr.Dataset.from_dataframe(df_copy)

# # Rename latitude and longitude variables (if needed)
# ds = ds.rename({'Longitude': 'Lon', 'Latitude': 'Lat'})

# # Print the resulting xarray Dataset
# print(ds)