In [20]:
# Standard library imports
import sys
import os
import requests
from io import StringIO

# 3rd-party library imports
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

In [21]:
# GHCN Land Data
file_url = 'https://data.giss.nasa.gov/pub/gistemp/ghcnm.tavg.qcf.dat'

# Download the raw fixed-width text file.
# A timeout keeps the kernel from hanging on a stalled connection, and any
# failure raises instead of being printed: every later cell needs df_GHCN, so
# continuing after a failed download would either break further down or
# silently reuse a df_GHCN left over from an earlier run.
response = requests.get(file_url, timeout=120)
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to download the file. Status code: {response.status_code}"
    )

# Get the content of the response
file_data = response.content.decode("utf-8")

# Parse one record per non-empty line
# (Using code from GHCNV4Reader())
formatted_data = []
for line in file_data.splitlines():

    # Skip blank lines (e.g. a trailing newline at the end of the file)
    if not line.strip():
        continue

    # Characters 0-10 hold the station ID and characters 11-14 the year
    station_id = line[:11]
    year = int(line[11:15])

    # Twelve monthly values: 5 characters each, one every 8 characters,
    # starting at offset 19 (the 3 characters after each value are skipped)
    values = [int(line[i:i+5]) for i in range(19, 115, 8)]

    formatted_data.append([station_id, year] + values)

# Create DataFrame from formatted data
month_columns = [f'Month_{i}' for i in range(1, 13)]
column_names = ['Station_ID', 'Year'] + month_columns
df_GHCN = pd.DataFrame(formatted_data, columns=column_names)

# Replace the -9999 missing-value sentinel with NaN and convert the stored
# integers to degrees C (division by 100). Only the month columns are touched,
# so Station_ID and Year keep their original dtypes.
df_GHCN[month_columns] = (
    df_GHCN[month_columns].replace(-9999, np.nan).divide(100)
)

In [22]:
# Preview the parsed monthly temperature table (the notebook display truncates large frames)
df_GHCN

Unnamed: 0,Station_ID,Year,Month_1,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,ACW00011604,1961,-0.93,2.32,4.68,7.69,11.24,15.95,15.66,14.77,14.09,11.70,5.06,-0.43
1,ACW00011604,1962,1.09,0.81,-1.58,6.31,9.04,13.77,15.06,13.89,11.59,9.90,3.19,-1.30
2,ACW00011604,1963,-7.17,-5.57,-1.03,5.37,12.20,16.23,16.16,15.92,13.28,9.36,5.62,-1.12
3,ACW00011604,1964,0.58,-0.89,0.51,7.34,12.15,14.38,15.02,15.53,12.17,7.84,5.42,1.08
4,ACW00011604,1965,0.40,-1.09,0.34,5.86,9.83,14.96,14.83,14.73,13.73,9.70,0.27,-1.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453161,ZIXLT622116,1966,21.80,20.40,18.40,16.90,14.30,12.80,12.10,14.60,17.70,19.80,20.90,21.10
1453162,ZIXLT622116,1967,21.10,19.90,18.90,19.20,15.10,13.50,11.00,13.80,16.60,20.80,19.90,19.10
1453163,ZIXLT622116,1968,21.80,20.00,19.30,18.20,15.60,10.80,13.70,16.30,17.60,21.80,18.40,20.70
1453164,ZIXLT622116,1969,20.90,21.50,19.50,18.30,14.10,13.10,11.60,14.60,17.80,21.00,20.40,19.10


In [23]:
# GHCN station metadata (fixed-width text file, read directly from the URL)
# NOTE(review): the path points at the GHCN *daily* station list, while the
# temperature file loaded earlier is named "ghcnm"; roughly 13% of temperature
# rows end up without coordinates after the merge on Station_ID - confirm that
# this is the intended station inventory.
file_url = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'

# Width in characters of each field, in the same order as the column names below
column_widths = [11, 9, 10, 7, 3, 31]

# The file has no header row, so the column names are supplied explicitly
df_meta = pd.read_fwf(
    file_url,
    widths=column_widths,
    header=None,
    names=['Station_ID', 'Latitude', 'Longitude', 'Elevation', 'State', 'Name'],
)
df_meta

Unnamed: 0,Station_ID,Latitude,Longitude,Elevation,State,Name
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS
2,AE000041196,25.3330,55.5170,34.0,,SHARJAH INTER. AIRP
3,AEM00041194,25.2550,55.3640,10.4,,DUBAI INTL
4,AEM00041217,24.4330,54.6510,26.8,,ABU DHABI INTL
...,...,...,...,...,...,...
124949,ZI000067969,-21.0500,29.3670,861.0,,WEST NICHOLSON
124950,ZI000067975,-20.0670,30.8670,1095.0,,MASVINGO
124951,ZI000067977,-21.0170,31.5830,430.0,,BUFFALO RANGE
124952,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE


In [24]:
# Row counts: temperature records first, station metadata second (one per line)
print(len(df_GHCN), len(df_meta), sep='\n')

1453166
124954


In [25]:
# Attach station coordinates and names to every temperature record.
# A left join keeps all rows of df_GHCN; stations absent from df_meta get NaN
# in Latitude, Longitude and Name. The result is then indexed by station ID.
df = (
    df_GHCN
    .merge(
        df_meta[['Station_ID', 'Latitude', 'Longitude', 'Name']],
        on='Station_ID',
        how='left',
    )
    .set_index('Station_ID')
)
df

Unnamed: 0_level_0,Year,Month_1,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,Latitude,Longitude,Name
Station_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ACW00011604,1961,-0.93,2.32,4.68,7.69,11.24,15.95,15.66,14.77,14.09,11.70,5.06,-0.43,17.1167,-61.7833,ST JOHNS COOLIDGE FLD
ACW00011604,1962,1.09,0.81,-1.58,6.31,9.04,13.77,15.06,13.89,11.59,9.90,3.19,-1.30,17.1167,-61.7833,ST JOHNS COOLIDGE FLD
ACW00011604,1963,-7.17,-5.57,-1.03,5.37,12.20,16.23,16.16,15.92,13.28,9.36,5.62,-1.12,17.1167,-61.7833,ST JOHNS COOLIDGE FLD
ACW00011604,1964,0.58,-0.89,0.51,7.34,12.15,14.38,15.02,15.53,12.17,7.84,5.42,1.08,17.1167,-61.7833,ST JOHNS COOLIDGE FLD
ACW00011604,1965,0.40,-1.09,0.34,5.86,9.83,14.96,14.83,14.73,13.73,9.70,0.27,-1.82,17.1167,-61.7833,ST JOHNS COOLIDGE FLD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZIXLT622116,1966,21.80,20.40,18.40,16.90,14.30,12.80,12.10,14.60,17.70,19.80,20.90,21.10,,,
ZIXLT622116,1967,21.10,19.90,18.90,19.20,15.10,13.50,11.00,13.80,16.60,20.80,19.90,19.10,,,
ZIXLT622116,1968,21.80,20.00,19.30,18.20,15.60,10.80,13.70,16.30,17.60,21.80,18.40,20.70,,,
ZIXLT622116,1969,20.90,21.50,19.50,18.30,14.10,13.10,11.60,14.60,17.80,21.00,20.40,19.10,,,


In [26]:
# Average the per-column NaN shares over all columns, express the result as a
# percentage and round it to three decimals
nan_fraction = round(100 * df.isna().mean().mean(), 3)
print(f'Fraction of NaN values in the DataFrame: {nan_fraction}%')

Fraction of NaN values in the DataFrame: 13.286%


In [27]:
# Share of NaN values in each column (0 = none missing, 1 = all missing)
df.isna().mean(axis=0)

Year         0.000000
Month_1      0.147064
Month_2      0.138692
Month_3      0.140215
Month_4      0.139595
Month_5      0.138804
Month_6      0.138558
Month_7      0.141294
Month_8      0.146738
Month_9      0.145064
Month_10     0.146840
Month_11     0.147269
Month_12     0.153137
Latitude     0.134153
Longitude    0.134153
Name         0.134153
dtype: float64

In [41]:
def filter_coordinates(df):
    """
    Keep only the rows whose coordinates fall inside the valid ranges.

    Args:
    df (pd.DataFrame): Frame with 'Latitude' and 'Longitude' columns.

    Returns:
    pd.DataFrame: The rows of ``df`` with -90 <= Latitude <= 90 and
    -180 <= Longitude <= 180 (bounds inclusive). Rows whose latitude or
    longitude is NaN do not satisfy the range test and are dropped as well.

    Side effects:
    Prints the number of rows that were dropped.
    """

    # Inclusive range test on both coordinates; NaN never passes
    in_range = (
        df['Latitude'].between(-90, 90)
        & df['Longitude'].between(-180, 180)
    )
    kept = df[in_range]

    # Report how many rows did not survive the filter
    dropped = len(df) - len(kept)
    print(f'Number of rows with invalid coordinates (removed): {dropped}')

    return kept

In [42]:
# Keep only rows whose coordinates pass the range check; the function prints how many rows were dropped.
# Rows that received no coordinates in the merge (NaN Latitude/Longitude) are dropped here too.
df_filtered = filter_coordinates(df)

Number of rows with invalid coordinates (removed): 194947


# Xarray Conversion (incomplete)

In [13]:
# Convert the merged table to an xarray Dataset (work in progress).
# NOTE(review): this converts the unfiltered `df`, not `df_filtered` - confirm which one is intended.
# NOTE(review): `df` is indexed by Station_ID alone, and that label repeats for every year of a
# station; presumably a (Station_ID, Year) index is wanted before conversion - TODO confirm.
dataset = xr.Dataset.from_dataframe(df)
dataset