In [1]:
# Standard library imports
import sys
import os
import requests
import itertools
import math
from io import StringIO
from tqdm import tqdm
from typing import Tuple

# 3rd-party library imports
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

# Step 0

In [2]:
'''
Step 0: Downloading Data

Combining diverse inputs into a single dataset

Inputs include:
    - GHCN v4 data
    - ERSST v5 data (later on?)
'''

# Standard library imports
import requests
import sys
import os
from typing import List

# 3rd-party library imports
import pandas as pd
import numpy as np

# Add the parent folder to sys.path
# parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# sys.path.insert(0, parent_dir)

# Local imports
# from parameters.data import GHCN_temp_url, GHCN_meta_url
GHCN_temp_url = 'https://data.giss.nasa.gov/pub/gistemp/ghcnm.tavg.qcf.dat'
GHCN_meta_url = 'https://data.giss.nasa.gov/pub/gistemp/v4.inv'

# Local imports
from parameters.data import GHCN_temp_url, GHCN_meta_url

def _parse_GHCN_records(file_data: str) -> List[list]:
    '''Parse GHCN v4 fixed-width text into ``[station_id, year, v1, ..., v12]`` rows.'''
    records = []
    for line in file_data.split('\n'):
        # Skip blank lines (e.g. the one produced by a trailing newline)
        if not line.strip():
            continue

        # (Using code from GHCNV4Reader())
        station_id: str = line[:11]
        year: int = int(line[11:15])
        # Twelve monthly values: 5 characters each, one every 8 characters from offset 19
        values: List[int] = [int(line[i:i+5]) for i in range(19, 115, 8)]
        records.append([station_id, year] + values)
    return records


def get_GHCN_data(temp_url: str, meta_url: str, start_year: int) -> pd.DataFrame:
    '''
    Retrieves and formats temperature data from the Global Historical Climatology Network (GHCN) dataset.

    Args:
    temp_url (str): The URL to the temperature data file in GHCN format.
    meta_url (str): The URL (or any source accepted by ``pd.read_fwf``) of the station metadata file.
    start_year (int): First year to keep; records from earlier years are dropped.

    Returns:
    pd.DataFrame: One row per station, indexed by 'Station_ID'. Data columns are named
    '<month>_<year>' (month 1-12), ordered by year and then by month, followed by
    'Latitude' and 'Longitude' taken from the metadata file.

    Raises:
    requests.HTTPError: If the temperature file request returns an error status.
    Connection, decoding and parsing errors are propagated rather than printed and ignored,
    because nothing useful can be returned without the temperature data.

    The -9999 missing-value marker is replaced with NaN and the remaining values are
    divided by 100 to convert them to degrees Celsius.
    '''

    # Download the temperature file; fail loudly on HTTP errors
    response = requests.get(temp_url, timeout=120)
    response.raise_for_status()
    file_data: str = response.content.decode("utf-8")

    # Create DataFrame from the parsed fixed-width records
    month_columns: List[str] = [f'{i}' for i in range(1, 13)]
    df_GHCN: pd.DataFrame = pd.DataFrame(
        _parse_GHCN_records(file_data),
        columns=['Station_ID', 'Year'] + month_columns,
    )

    # Replace -9999 with NaN, then convert to degrees C
    df_GHCN[month_columns] = df_GHCN[month_columns].replace(-9999, np.nan).divide(100)

    # Drop all years before start year
    df_GHCN = df_GHCN.loc[df_GHCN['Year'] >= start_year]

    # Pivot to one row per station with (month, year) columns
    pivoted_df = df_GHCN.pivot(index='Station_ID', columns='Year')

    # Flatten the multi-level columns to '<month>_<year>'
    pivoted_df.columns = [f"{col[0]}_{col[1]}" for col in pivoted_df.columns]

    # Sort the columns by year; the sort is stable, so months keep their 1..12 order within a year
    sorted_columns = sorted(pivoted_df.columns, key=lambda x: int(x.split('_')[1]))
    pivoted_df = pivoted_df[sorted_columns].reset_index()

    # Define the column widths, create meta data dataframe
    column_widths: List[int] = [11, 9, 10, 7, 3, 31]
    df_meta: pd.DataFrame = pd.read_fwf(meta_url, widths=column_widths, header=None,
                          names=['Station_ID', 'Latitude', 'Longitude', 'Elevation', 'State', 'Name'])

    # Attach coordinates by station ID (left join keeps stations without metadata) and index by station
    df: pd.DataFrame = pd.merge(pivoted_df, df_meta[['Station_ID', 'Latitude', 'Longitude']], on='Station_ID', how='left')
    return df.set_index('Station_ID')

def step0() -> pd.DataFrame:
    '''
    Run step 0: download and format the GHCN station temperature data.

    Returns:
    pd.DataFrame: The frame produced by `get_GHCN_data` for the module-level
    `GHCN_temp_url` / `GHCN_meta_url`, with 1850 passed as the start year.
    '''
    return get_GHCN_data(GHCN_temp_url, GHCN_meta_url, 1850)

In [3]:
step0_output = step0()

In [4]:
step0_output

Unnamed: 0_level_0,1_1850,2_1850,3_1850,4_1850,5_1850,6_1850,7_1850,8_1850,9_1850,10_1850,...,5_2023,6_2023,7_2023,8_2023,9_2023,10_2023,11_2023,12_2023,Latitude,Longitude
Station_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACW00011604,,,,,,,,,,,...,,,,,,,,,57.7667,11.8667
AE000041196,,,,,,,,,,,...,31.89,34.68,36.65,36.67,,,,,25.3330,55.5170
AEM00041184,,,,,,,,,,,...,,,,,,,,,25.6170,55.9330
AEM00041194,,,,,,,,,,,...,32.70,35.18,37.38,37.79,,,,,25.2550,55.3640
AEM00041216,,,,,,,,,,,...,,,,,,,,,24.4300,54.4700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZI000067983,,,,,,,,,,,...,,,,,,,,,-20.2000,32.6160
ZI000067991,,,,,,,,,,,...,,,,,,,,,-22.2170,30.0000
ZIXLT371333,,,,,,,,,,,...,,,,,,,,,-17.8300,31.0200
ZIXLT443557,,,,,,,,,,,...,,,,,,,,,-18.9800,32.4500


# Step 1

In [5]:
'''
Step 1: Removal of bad data

Drop or adjust certain records (or parts of records).
This includes outliers / out of range reports.
Determined using configuration file.
    <TO-DO> Figure out if this method is ideal.
'''

import pandas as pd
import os
import re
import sys

# Add the parent folder to sys.path
# parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# sys.path.insert(0, parent_dir)

# Local imports
# from parameters.data import drop_rules
# Station omission rules, one per line, in the form '<station id> omit: <period>'.
# <period> is written either as 'first-last' (years) or as 'year/month'.
# NOTE(review): 0 and 9999 look like open-ended sentinels for "from the start" /
# "to the end" of the record — confirm against the original configuration file.
drop_rules = '''
CHM00052836 omit: 0-1948
CHXLT909860 omit: 0-1950
BL000085365 omit: 0-1930
MXXLT948335 omit: 0-1952
ASN00058012 omit: 0-1899
ASN00084016 omit: 0-1899
ASN00069018 omit: 0-1898
NIXLT013080 omit: 0-1930
NIXLT751359 omit: 0-9999
CHXLT063941 omit: 0-1937
CHM00054843 omit: 0-1937
MXM00076373 omit: 0-9999
USC00044022 omit: 0-9999
USC00044025 omit: 0-9999
CA002402332 omit: 2011-9999
RSM00024266 omit: 2021/09
'''


def filter_coordinates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the rows whose coordinates are physically valid.

    Args:
    df (pd.DataFrame): Frame with 'Latitude' and 'Longitude' columns.

    Returns:
    pd.DataFrame: The rows of `df` with -90 <= Latitude <= 90 and
    -180 <= Longitude <= 180 (bounds inclusive). Rows with a missing
    coordinate fail the test and are removed.

    The number of removed rows is printed.
    """
    # Inclusive range checks; NaN never satisfies them
    lat_ok = df['Latitude'].between(-90, 90)
    lon_ok = df['Longitude'].between(-180, 180)

    valid_rows = df.loc[lat_ok & lon_ok]

    # Report how many stations were discarded
    removed = len(df) - len(valid_rows)
    print(f'Number of stations with invalid coordinates (removed): {removed}')

    return valid_rows


def step1(step0_output: pd.DataFrame) -> pd.DataFrame:
    """
    Run step 1: remove bad station data from the step 0 output.

    Parameters:
        step0_output (pd.DataFrame): Station frame with 'Latitude' and 'Longitude' columns.

    Returns:
        pd.DataFrame: The result of `filter_coordinates(step0_output)`.

    Only the coordinate check is applied here. The station/year exclusion rules in
    `drop_rules` are not applied by this function.
    """
    return filter_coordinates(step0_output)

In [6]:
step1_output = step1(step0_output)
step1_output

Number of stations with invalid coordinates (removed): 0


Unnamed: 0_level_0,1_1850,2_1850,3_1850,4_1850,5_1850,6_1850,7_1850,8_1850,9_1850,10_1850,...,5_2023,6_2023,7_2023,8_2023,9_2023,10_2023,11_2023,12_2023,Latitude,Longitude
Station_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACW00011604,,,,,,,,,,,...,,,,,,,,,57.7667,11.8667
AE000041196,,,,,,,,,,,...,31.89,34.68,36.65,36.67,,,,,25.3330,55.5170
AEM00041184,,,,,,,,,,,...,,,,,,,,,25.6170,55.9330
AEM00041194,,,,,,,,,,,...,32.70,35.18,37.38,37.79,,,,,25.2550,55.3640
AEM00041216,,,,,,,,,,,...,,,,,,,,,24.4300,54.4700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZI000067983,,,,,,,,,,,...,,,,,,,,,-20.2000,32.6160
ZI000067991,,,,,,,,,,,...,,,,,,,,,-22.2170,30.0000
ZIXLT371333,,,,,,,,,,,...,,,,,,,,,-17.8300,31.0200
ZIXLT443557,,,,,,,,,,,...,,,,,,,,,-18.9800,32.4500


In [7]:
# Redefines drop_rules with the same 16 rules as the Step 1 cell, regrouped and
# separated by blank lines: '0-9999' rules first, then '0-<year>' rules, then the
# '2011-9999' range and the single 'year/month' rule.
drop_rules = '''
NIXLT751359 omit: 0-9999
MXM00076373 omit: 0-9999
USC00044022 omit: 0-9999
USC00044025 omit: 0-9999

CHM00052836 omit: 0-1948
CHXLT909860 omit: 0-1950
BL000085365 omit: 0-1930
MXXLT948335 omit: 0-1952
ASN00058012 omit: 0-1899
ASN00084016 omit: 0-1899
ASN00069018 omit: 0-1898
NIXLT013080 omit: 0-1930
CHXLT063941 omit: 0-1937
CHM00054843 omit: 0-1937

CA002402332 omit: 2011-9999
RSM00024266 omit: 2021/09
'''

In [8]:
df = step0_output.copy()

In [9]:
start_year = 1900

In [10]:
keep_cols = df.columns

In [11]:
# Data columns are named '<month>_<year>'; everything except Latitude/Longitude is a data column.
time_cols = [col for col in df.columns if not (col.startswith('Latitude') or col.startswith('Longitude'))]
# Strict '>' comparison: columns for start_year itself are NOT kept, they end up in cols_to_remove.
cols_to_keep = [col for col in time_cols if int(col.split('_')[1]) > int(start_year)]
# Complement of cols_to_keep within the data columns
cols_to_remove = [col for col in time_cols if col not in cols_to_keep]

In [12]:
cols_to_remove

['1_1850',
 '2_1850',
 '3_1850',
 '4_1850',
 '5_1850',
 '6_1850',
 '7_1850',
 '8_1850',
 '9_1850',
 '10_1850',
 '11_1850',
 '12_1850',
 '1_1851',
 '2_1851',
 '3_1851',
 '4_1851',
 '5_1851',
 '6_1851',
 '7_1851',
 '8_1851',
 '9_1851',
 '10_1851',
 '11_1851',
 '12_1851',
 '1_1852',
 '2_1852',
 '3_1852',
 '4_1852',
 '5_1852',
 '6_1852',
 '7_1852',
 '8_1852',
 '9_1852',
 '10_1852',
 '11_1852',
 '12_1852',
 '1_1853',
 '2_1853',
 '3_1853',
 '4_1853',
 '5_1853',
 '6_1853',
 '7_1853',
 '8_1853',
 '9_1853',
 '10_1853',
 '11_1853',
 '12_1853',
 '1_1854',
 '2_1854',
 '3_1854',
 '4_1854',
 '5_1854',
 '6_1854',
 '7_1854',
 '8_1854',
 '9_1854',
 '10_1854',
 '11_1854',
 '12_1854',
 '1_1855',
 '2_1855',
 '3_1855',
 '4_1855',
 '5_1855',
 '6_1855',
 '7_1855',
 '8_1855',
 '9_1855',
 '10_1855',
 '11_1855',
 '12_1855',
 '1_1856',
 '2_1856',
 '3_1856',
 '4_1856',
 '5_1856',
 '6_1856',
 '7_1856',
 '8_1856',
 '9_1856',
 '10_1856',
 '11_1856',
 '12_1856',
 '1_1857',
 '2_1857',
 '3_1857',
 '4_1857',
 '5_1857',


In [14]:
def apply_drop_rules(frame: pd.DataFrame, rules: str) -> None:
    """
    Blank out (set to NaN) the station data selected by the omission rules, in place.

    Args:
        frame (pd.DataFrame): Station frame indexed by station ID, with data columns named
            '<month>_<year>' plus 'Latitude' / 'Longitude' columns, which are never modified.
        rules (str): Newline-separated rules of the form '<station id> omit: <period>',
            where <period> is 'first-last' (years, both ends inclusive) or 'year/month'.

    For every rule the station ID and the parsed period are printed, followed by a blank
    separator. Unlike a bare `.loc` assignment, a rule naming a station that is absent from
    `frame` is reported and skipped instead of silently appending a new all-NaN row.
    """
    # Everything except the coordinate columns is a '<month>_<year>' data column
    time_cols = [col for col in frame.columns
                 if not (col.startswith('Latitude') or col.startswith('Longitude'))]
    col_year = {col: int(col.split('_')[1]) for col in time_cols}

    for rule in rules.split('\n'):
        rule = rule.strip()
        # Blank lines only separate groups of rules
        if not rule:
            continue

        station, period = rule.split(' omit: ')
        print(station)

        if '-' in period:
            # Year range: covers '0-9999', '0-<year>' and '<year>-9999' alike
            first_year, last_year = (int(part) for part in period.split('-'))
            print(first_year, last_year)
            cols_to_replace = [col for col in time_cols
                               if first_year <= col_year[col] <= last_year]
        else:
            # Single month, written 'year/month' (month may be zero-padded, columns are not)
            year, month = period.split('/')
            print(year, month)
            target = f'{int(month)}_{int(year)}'
            cols_to_replace = [target] if target in col_year else []

        if station not in frame.index:
            print(f'Station {station} not found; rule skipped')
        elif cols_to_replace:
            frame.loc[station, cols_to_replace] = np.nan

        print('\n')


apply_drop_rules(df, drop_rules)

NIXLT751359
0 9999


MXM00076373
0 9999


USC00044022
0 9999


USC00044025
0 9999


CHM00052836
0 1948


CHXLT909860
0 1950


BL000085365
0 1930


MXXLT948335
0 1952


ASN00058012
0 1899


ASN00084016
0 1899


ASN00069018
0 1898


NIXLT013080
0 1930


CHXLT063941
0 1937


CHM00054843
0 1937


CA002402332
2011 9999


RSM00024266
2021 09




In [15]:
# TESTING
# For every rule, print the years in which the station has data before and after filtering.

def _years_with_data(frame, station_id):
    """Return the sorted distinct years (as strings) with at least one non-NaN value for the station."""
    record = frame.drop(columns=['Latitude', 'Longitude']).loc[station_id]
    return sorted({label.split('_')[1] for label, value in record.items() if not np.isnan(value)})


before = step0_output
after = df

station_rules = [x for x in drop_rules.split('\n') if x]

for rule in station_rules:
    parts = rule.split(' omit: ')
    station, years = parts[0], parts[1]
    print(station)
    print(years)

    # Test unfiltered years
    print(_years_with_data(before, station))

    # Test filtered years
    print(_years_with_data(after, station))
    print('\n')

NIXLT751359
0-9999
['1900', '1901', '1902', '1903', '1904', '1907', '1910', '1911', '1912', '1913', '1914', '1915', '1917', '1918', '1919', '1920', '1921', '1922', '1923', '1924', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1947', '1948', '1949', '1950']
[]


MXM00076373
0-9999
['1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1993', '1994', '1995', '1996', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '20

['1933', '1934', '1935', '1936', '1937', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988']
['1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988']


CHM00054843
0-1937
['1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', 

# Step 2

In [None]:
# Skip for now

# Step 3

In [None]:
'''
Step 3: Gridding of cells

There are 8000 cells across the globe.
Each cell's values are computed using station records within a 1200km radius.
    - Contributions are weighted according to distance to cell center
    (linearly decreasing to 0 at distance 1200km)
'''

import math
from typing import Tuple

import numpy as np
import pandas as pd
from pandas import Series


def calculate_area(row: Series) -> float:
    """
    Surface area of a latitude/longitude box on a sphere of radius 6371 km.

    Args:
        row (Series): Box bounds in degrees under the keys 'Southern', 'Northern',
            'Western' and 'Eastern'.

    Returns:
        float: R^2 * (longitude span in radians) * (sin(north) - sin(south)), in km^2.
    """
    radius_km = 6371.0
    lon_span = np.radians(row['Eastern'] - row['Western'])
    sin_north = np.sin(np.radians(row['Northern']))
    sin_south = np.sin(np.radians(row['Southern']))
    return (radius_km ** 2) * lon_span * (sin_north - sin_south)


def calculate_center_coordinates(row: pd.Series) -> Tuple[float, float]:
    """Return the centre (latitude, longitude) of a box, in degrees.

    The centre latitude is the latitude whose sine is the mean of the sines of the
    southern and northern edges; the centre longitude is the plain midpoint of the
    western and eastern edges.

    Args:
        row (pd.Series): Box bounds in degrees under the keys 'Southern', 'Northern',
            'Western' and 'Eastern'.

    Returns:
        Tuple[float, float]: (center_latitude, center_longitude).
    """
    sin_south = math.sin(row['Southern'] * math.pi / 180)
    sin_north = math.sin(row['Northern'] * math.pi / 180)
    mean_sine = 0.5 * (sin_south + sin_north)

    mid_longitude = 0.5 * (row['Western'] + row['Eastern'])
    return math.asin(mean_sine) * 180 / math.pi, mid_longitude


def generate_80_cell_grid() -> pd.DataFrame:
    """Build the 80-box grid.

    The northern hemisphere is cut into four latitude bands whose edges have sines
    1, 0.9, 0.7, 0.4 and 0, holding 4, 8, 12 and 16 equal-width longitude boxes
    respectively (40 boxes, listed from the pole towards the equator, west to east).
    The southern hemisphere is the mirror image, appended in reverse order.

    Returns:
        pd.DataFrame: 80 rows with columns 'Southern', 'Northern', 'Western', 'Eastern'
        (degrees) plus 'Center_Latitude' and 'Center_Longitude'.
    """
    # Number of boxes per band and the sines of the band edges (north to south)
    boxes_per_band = [4, 8, 12, 16]
    sine_edges = [1, 0.9, 0.7, 0.4, 0]

    # Northern hemisphere: 40 boxes
    northern_boxes = []
    for count, sin_top, sin_bottom in zip(boxes_per_band, sine_edges, sine_edges[1:]):
        south = 180 / math.pi * math.asin(sin_bottom)
        north = 180 / math.pi * math.asin(sin_top)
        for k in range(count):
            west = -180 + 360 * float(k) / count
            east = -180 + 360 * float(k + 1) / count
            northern_boxes.append((south, north, west, east))

    # Southern hemisphere: mirror each northern box, walking the list backwards
    southern_boxes = [
        (-north, -south, west, east)
        for south, north, west, east in reversed(northern_boxes)
    ]

    grid = pd.DataFrame(
        northern_boxes + southern_boxes,
        columns=['Southern', 'Northern', 'Western', 'Eastern'],
    )

    # Attach the centre of every box
    centers = grid.apply(calculate_center_coordinates, axis=1).tolist()
    grid[['Center_Latitude', 'Center_Longitude']] = pd.DataFrame(centers, index=grid.index)

    return grid
    

def interpolate(x: float, y: float, p: float) -> float:
    """Linear interpolation between x (at p = 0) and y (at p = 1)."""
    x_share = 1 - p
    return y * p + x_share * x


def generate_8000_cell_grid(grid_80):
    """Subdivide every box of the coarse grid into a 10 x 10 grid of subboxes.

    Each box is split into ten bands that are equally spaced in sin(latitude) and ten
    equally wide longitude strips, giving 100 subboxes per input row (ordered south to
    north, and west to east within each band).

    Args:
        grid_80 (pd.DataFrame): Boxes with 'Southern', 'Northern', 'Western' and
            'Eastern' columns, in degrees.

    Returns:
        pd.DataFrame: One row per subbox with columns 'Southern', 'Northern', 'Western',
        'Eastern', 'Center_Latitude', 'Center_Longitude' and 'Area', on a fresh 0..N-1 index.
    """
    # Collect plain tuples and build the frame once; creating and concatenating
    # thousands of one-row DataFrames is far slower and yields the same table.
    subboxes = []

    for _, box in grid_80.iterrows():
        sin_south = math.sin(box['Southern'] * math.pi / 180)
        sin_north = math.sin(box['Northern'] * math.pi / 180)

        for y in range(10):
            # Band edges are interpolated in sine space, then converted back to degrees
            s = 180 * math.asin(interpolate(sin_south, sin_north, y * 0.1)) / math.pi
            n = 180 * math.asin(interpolate(sin_south, sin_north, (y + 1) * 0.1)) / math.pi
            for x in range(10):
                w = interpolate(box['Western'], box['Eastern'], x * 0.1)
                e = interpolate(box['Western'], box['Eastern'], (x + 1) * 0.1)
                subboxes.append((s, n, w, e))

    grid_8000 = pd.DataFrame(subboxes, columns=['Southern', 'Northern', 'Western', 'Eastern'])

    # Calculate center coordinates for each box and add them as new columns
    center_coords = grid_8000.apply(calculate_center_coordinates, axis=1)
    grid_8000[['Center_Latitude', 'Center_Longitude']] = pd.DataFrame(center_coords.tolist(), index=grid_8000.index)

    # Calculate area of every subbox
    grid_8000['Area'] = grid_8000.apply(calculate_area, axis=1)

    return grid_8000

def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Great-circle distance between two points on a sphere of radius 6371 km,
    computed with the Haversine formula.

    Args:
        lat1 (float): Latitude of the first point in degrees.
        lon1 (float): Longitude of the first point in degrees.
        lat2 (float): Latitude of the second point in degrees.
        lon2 (float): Longitude of the second point in degrees.

    Returns:
        float: Distance in kilometers.
    """
    earth_radius_km: float = 6371.0  # Earth's mean radius

    # Work in radians
    phi1, lam1, phi2, lam2 = map(math.radians, (lat1, lon1, lat2, lon2))
    delta_phi: float = phi2 - phi1
    delta_lam: float = lam2 - lam1

    # Haversine of the central angle
    hav: float = (
        math.sin(delta_phi/2)**2
        + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lam/2)**2
    )
    central_angle: float = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    return earth_radius_km * central_angle


def linearly_decreasing_weight(distance: float, max_distance: float) -> float:
    """
    Weight that falls linearly from 1 at distance 0 to 0 at `max_distance`.

    Args:
        distance (float): Distance for which the weight is wanted; values outside
            [0, max_distance] are clamped to the nearest end of that range first.
        max_distance (float): Distance at which the weight reaches 0.

    Returns:
        float: 1.0 - clamped_distance / max_distance.
    """
    # Cap at max_distance, then floor at zero
    capped = max_distance if max_distance < distance else distance
    clamped = capped if capped > 0 else 0

    return 1.0 - (clamped / max_distance)

def nearby_stations(grid_df, station_df):
    """
    For every grid cell, find the stations within 1200 km of the cell centre and their weights.

    Args:
        grid_df (pd.DataFrame): Grid cells with 'Center_Latitude' and 'Center_Longitude'
            columns (degrees).
        station_df (pd.DataFrame): Stations with 'Latitude' and 'Longitude' columns (degrees).
            Station IDs are read from a 'Station_ID' column when present, otherwise from the index.

    Returns:
        pd.DataFrame: `grid_df` itself, modified in place: a 'Nearby_Stations' column is added
        holding, per cell, a dict {station id: weight}, and the index is named 'Box_Number'.
        The weight is 1 - distance / 1200, i.e. 1 at the cell centre and 0 at 1200 km.

    Distances are great-circle (Haversine) distances on a sphere of radius 6371 km, computed
    for all stations at once with NumPy instead of one Python call per station and cell.
    Stations with a missing coordinate get a NaN distance and are never selected.
    """
    # Maximum distance for the weight calculation (km) and Earth's mean radius (km)
    max_distance = 1200.0
    earth_radius_km = 6371.0

    # Station IDs may live in a column (raw inventory) or in the index
    if 'Station_ID' in station_df.columns:
        station_ids = station_df['Station_ID'].to_numpy()
    else:
        station_ids = station_df.index.to_numpy()

    # Station coordinates in radians, converted once for all cells
    station_lat = np.radians(station_df['Latitude'].to_numpy(dtype=float))
    station_lon = np.radians(station_df['Longitude'].to_numpy(dtype=float))
    cos_station_lat = np.cos(station_lat)

    station_weights_within_radius = []

    # Use tqdm to track progress
    for _, cell in tqdm(grid_df.iterrows(), total=len(grid_df), desc="Processing"):
        center_lat = math.radians(cell['Center_Latitude'])
        center_lon = math.radians(cell['Center_Longitude'])

        # Haversine formula, vectorised over stations
        hav = (np.sin((station_lat - center_lat) / 2) ** 2
               + math.cos(center_lat) * cos_station_lat * np.sin((station_lon - center_lon) / 2) ** 2)
        # Rounding can push the value marginally outside [0, 1]; keep the square roots real
        hav = np.clip(hav, 0.0, 1.0)
        distances = earth_radius_km * 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

        # Stations inside the radius and their linearly decreasing weights
        within = distances <= max_distance
        weights = 1.0 - distances[within] / max_distance

        station_weights_within_radius.append(
            dict(zip(station_ids[within].tolist(), weights.tolist()))
        )

    # Add the per-cell dictionaries as a new column
    grid_df['Nearby_Stations'] = station_weights_within_radius

    # Set index name
    grid_df.index.name = 'Box_Number'

    return grid_df

In [None]:
# Build the coarse grid and attach the area of each box
grid_80 = generate_80_cell_grid()
grid_80['Area'] = grid_80.apply(calculate_area, axis=1)

# Subdivide the coarse grid into subboxes
grid_8000 = generate_8000_cell_grid(grid_80)
# generate_8000_cell_grid already adds an 'Area' column computed with calculate_area,
# so this line reassigns the same values.
grid_8000['Area'] = grid_8000.apply(calculate_area, axis=1)

In [None]:
# Read the fixed-width station inventory straight from the URL (no header row in the file).
# Same URL, column widths and column names as the metadata read inside get_GHCN_data.
meta_url = 'https://data.giss.nasa.gov/pub/gistemp/v4.inv'
column_widths: List[int] = [11, 9, 10, 7, 3, 31]
station_df: pd.DataFrame = pd.read_fwf(meta_url, widths=column_widths, header=None,
                          names=['Station_ID', 'Latitude', 'Longitude', 'Elevation', 'State', 'Name'])

In [None]:
grid_80 = nearby_stations(grid_80, station_df)

In [None]:
grid_8000 = nearby_stations(grid_8000, station_df)

In [None]:
grid_8000

In [None]:
def find_box_number(station_df, grid_80_df):
    """
    Assign each station to the first grid box that contains it.

    Args:
        station_df (pd.DataFrame): Stations with 'Latitude' and 'Longitude' columns.
        grid_80_df (pd.DataFrame): Boxes with 'Southern', 'Northern', 'Western' and
            'Eastern' columns; the index label of a box is its box number.

    Returns:
        list: One entry per station, in station order: the index label of the first box
        (in grid order) whose bounds contain the station, edges included, or None when
        no box matches.
    """
    # Box bounds are the same for every station; read them out of the frame once
    boxes = [
        (label, box['Southern'], box['Northern'], box['Western'], box['Eastern'])
        for label, box in grid_80_df.iterrows()
    ]

    coordinates = zip(station_df['Latitude'], station_df['Longitude'])

    return [
        next(
            (
                label
                for label, south, north, west, east in boxes
                if south <= lat <= north and west <= lon <= east
            ),
            None,
        )
        for lat, lon in tqdm(coordinates, total=len(station_df))
    ]

In [None]:
# Find box numbers for each station, add to station_df
# (find_box_number returns None for a station that falls in no box)
box_numbers = find_box_number(station_df, grid_80)
station_df['Box_Number'] = box_numbers

In [None]:
station_df

# Step 4: SST Data

In [None]:
# Skipping for now
# Should be consolidated into step 0 / 1

# Step 5: Anomalyzing Data

In [None]:
def anomalize_temperature_data(data, reference_period=(1951, 1980)):
    """
    Convert monthly temperatures to anomalies relative to a reference period.

    Args:
        data (pd.DataFrame): Long-format frame with a 'Year' column and twelve columns
            named 'Month_1' ... 'Month_12'.
        reference_period (tuple): (first_year, last_year), both inclusive.

    Returns:
        pd.DataFrame: A copy of `data` in which each 'Month_N' column has had that month's
        reference-period mean subtracted. `data` itself is not modified.

    The baseline for a month is the mean of that column over ALL rows whose 'Year' lies in
    the reference period, i.e. it is pooled over whatever rows are passed in.
    NOTE(review): if per-station baselines are intended, call this once per station — confirm.
    """
    # Select the month columns by name so the baseline and the subtraction always
    # refer to the same columns, whatever the column order of `data` is
    month_columns = [f'Month_{month}' for month in range(1, 13)]

    # Monthly means over the reference period (inclusive on both ends)
    first_year, last_year = reference_period
    in_reference = (data['Year'] >= first_year) & (data['Year'] <= last_year)
    monthly_means = data.loc[in_reference, month_columns].mean()

    # Subtract each month's baseline from its column in one vectorised step
    anomalized_data = data.copy()
    anomalized_data[month_columns] = data[month_columns] - monthly_means

    return anomalized_data

In [None]:
# NOTE(review): step1_output is displayed above in wide format (columns such as '1_1850',
# 'Latitude', 'Longitude'), whereas anomalize_temperature_data reads a 'Year' column and
# 'Month_N' columns — the frame presumably needs reshaping before this call; TODO confirm.
df = step1_output
df_anom = anomalize_temperature_data(df, reference_period=(1951, 1980))
df_anom

# MISC

In [None]:
# Scatter plot of longitude/latitude pairs with density contours on top.
# NOTE(review): `lons` and `lats` are not assigned in any cell shown above — presumably
# station longitudes/latitudes; they must be defined before this cell can run.
plt.scatter(lons, lats, s=5, c='blue', alpha=0.5, marker='o')
plt.grid()

# Add contour lines based on the point density
x_bins = np.linspace(-180, 180, 50)
y_bins = np.linspace(-90, 90, 50)
H, xedges, yedges = np.histogram2d(lons, lats, bins=(x_bins, y_bins))
# H is indexed (x bin, y bin), hence the transpose; contours are positioned at the
# lower edge of each bin (xedges[:-1], yedges[:-1]) rather than at the bin centres.
plt.contour(xedges[:-1], yedges[:-1], H.T, levels=10, colors='red', linewidths=1)

# Customize labels and title
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Contour Map of Lat/Lon Pair Frequency')

# Show the plot
plt.show()

In [None]:
# Create a density heatmap using plt.hist2d
# (250 x 250 bins; the colour scale is limited to the range 0-10 via vmin/vmax)
# NOTE(review): `lons` and `lats` are not assigned in any cell shown above — define them first.
plt.hist2d(lons, lats, bins=250, cmap='viridis', vmin=0, vmax=10)

# Add a colorbar
cbar = plt.colorbar()
cbar.set_label('Density')

# Customize labels and title
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Density Heatmap of Lat/Lon Pairs')

# Show the plot
plt.grid()
plt.show()

In [None]:
# Create a figure with a Robinson projection
# NOTE(review): `ccrs` is not imported in the notebook's import cell — presumably
# `cartopy.crs`; TODO confirm and add the import. `lons` / `lats` are likewise undefined above.
fig, ax = plt.subplots(subplot_kw={'projection': ccrs.Robinson(central_longitude=0)})

# Plot the latitude and longitude data in the Robinson projection
# (the points are given in plain lon/lat degrees, hence the PlateCarree transform)
ax.scatter(lons, lats, s=10, c='blue', alpha=0.5, transform=ccrs.PlateCarree())

# Add coastlines for reference
ax.coastlines()

# Customize labels and title
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Lat/Lon Points in Robinson Projection')

# Show the plot
plt.show()