In [1]:
import os
import re
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
def get_year_urls(start_year: int = 2000, end_year: int = 2023) -> list: 
  """
  Retrieves the URLs for every year's page in the USCRN index.

  Args:
  start_year (int): First year to include. Defaults to 2000.
  end_year (int): Last year to include (inclusive). Defaults to 2023.
  
  Returns:
  year_urls (list): A list of URLs for every year's page.

  Raises:
  requests.HTTPError: If the index page responds with an error status.
  """

  url = "https://www.ncei.noaa.gov/pub/data/uscrn/products/subhourly01/"
  # a timeout keeps a stalled connection from hanging the notebook indefinitely
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.content, "html.parser")

  # href=True skips anchors without an href attribute, which would otherwise raise KeyError
  links = soup.find_all("a", href=True)
  years = {str(year) for year in range(start_year, end_year + 1)}
  # directory links look like "2023/"; drop the trailing slash before comparing
  year_urls = [url + link['href'] for link in links if link['href'].rstrip('/') in years]
  return year_urls

def get_file_urls() -> list: 
  """
  Retrieves the URLs for every file contained on each year's page.

  Only links whose href matches the pattern AK.*\\.txt are kept.

  Returns: 
  file_urls (list): A list of file URLs.

  Raises:
  requests.HTTPError: If a year page responds with an error status.
  """
  file_pattern = re.compile(r'AK.*\.txt')

  file_urls = []
  for year_url in get_year_urls(): 
    # a timeout keeps a stalled connection from hanging the notebook indefinitely
    response = requests.get(year_url, timeout=30) 
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Build each URL from the href that was matched, not from the link's display
    # text, which is not guaranteed to equal the file name.
    file_urls.extend(
      year_url + link['href'] for link in soup.find_all('a', href=file_pattern)
    )
  return file_urls


def get_station_location(url: str) -> str: 
  """
  Extracts the name of the station from a given URL.

  The name is taken from the file-name part of the URL: it starts at the first
  capitalised word (an uppercase letter followed by lowercase letters) and ends
  before a "_formerly_Barrow" marker or before the first "_<digit>" suffix.
  
  Args:
  url (str): The URL to extract the station name from.
  
  Returns:
  station_location (str): The name of the station.

  Raises:
  ValueError: If no station name can be found in the URL's file name.
  """
  # Search only the last path segment so a capitalised directory name earlier
  # in the URL cannot be mistaken for the start of the station name.
  file_name = url.rsplit("/", 1)[-1]
  regex = r"[St.]*[A-Z][a-z]+_*[A-Za-z]*.*\.txt"
  match = re.search(regex, file_name)
  if match is None:
    raise ValueError(f"Could not find a station name in URL: {url!r}")
  station_location = re.sub(r"(_formerly_Barrow.*|_[0-9].*)", "", match.group(0))
  return station_location
  

# Download a single station-year file to develop the parsing steps against
url = "https://www.ncei.noaa.gov/pub/data/uscrn/products/subhourly01/2023/CRNS0101-05-2023-AK_Aleknagik_1_NNE.txt"
# a timeout keeps a stalled connection from hanging the notebook indefinitely
response = requests.get(url, timeout=30)
# fail here rather than parsing an HTTP error page as if it were data
response.raise_for_status()
# the following cell reads the file contents back via str(soup)
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:

# Get location from url
station_location = get_station_location(url)
print(station_location)
# Get new rows: one list of whitespace-separated fields per line of the file.
# Stored as `new_rows`, the name this cell prints and the next cell reads.
# str.split() with no argument also ignores leading/trailing whitespace, so a
# stray "\r" or padding cannot add empty fields to a row.
new_rows = [row.split() for row in str(soup).strip().split("\n")]
# preview only: printing every row would flood the notebook output
print(new_rows[:5])

In [8]:

# For every parsed row keep its first five fields and its last two fields,
# prefixed with the station name.
wind_cols = [
  [station_location, *fields[:5], *fields[-2:]]
  for fields in new_rows
]
print(wind_cols)


[['Aleknagik', '23583', '20230101', '0005', '20221231', '1505', '3.09', '0'], ['Aleknagik', '23583', '20230101', '0010', '20221231', '1510', '2.96', '0'], ['Aleknagik', '23583', '20230101', '0015', '20221231', '1515', '2.70', '0'], ['Aleknagik', '23583', '20230101', '0020', '20221231', '1520', '2.84', '0'], ['Aleknagik', '23583', '20230101', '0025', '20221231', '1525', '2.76', '0'], ['Aleknagik', '23583', '20230101', '0030', '20221231', '1530', '2.80', '0'], ['Aleknagik', '23583', '20230101', '0035', '20221231', '1535', '3.04', '0'], ['Aleknagik', '23583', '20230101', '0040', '20221231', '1540', '2.95', '0'], ['Aleknagik', '23583', '20230101', '0045', '20221231', '1545', '2.50', '0'], ['Aleknagik', '23583', '20230101', '0050', '20221231', '1550', '2.85', '0'], ['Aleknagik', '23583', '20230101', '0055', '20221231', '1555', '2.82', '0'], ['Aleknagik', '23583', '20230101', '0100', '20221231', '1600', '3.17', '0'], ['Aleknagik', '23583', '20230101', '0105', '20221231', '1605', '2.85', '0']

In [9]:
# Column labels, listed in the same order as the fields of each wind_cols record
columns = [
  'station_location',
  'wbanno',
  'utc_date',
  'utc_time',
  'lst_date',
  'lst_time',
  'wind_1_5',
  'wind_flag',
]

# One DataFrame row per record in wind_cols
df = pd.DataFrame(data=wind_cols, columns=columns)

In [19]:
# Display the column labels of df
df.columns

Index(['station_location', 'wbanno', 'utc_date', 'utc_time', 'lst_date',
       'lst_time', 'wind_1_5', 'wind_flag'],
      dtype='object')

In [None]:
def _combine_date_time(dates: pd.Series, times: pd.Series) -> pd.Series:
  """
  Merges a YYYYMMDD date column and an HHMM time column into datetimes.

  Args:
  dates (pd.Series): Dates as YYYYMMDD strings or integers.
  times (pd.Series): Times as HHMM strings or integers.

  Returns:
  (pd.Series): The combined timestamps.
  """
  # int -> str -> zfill(4) guarantees a four-digit time even if leading zeros were lost
  stamps = dates.astype(int).astype(str) + times.astype(int).astype(str).str.zfill(4)
  return pd.to_datetime(stamps, format='%Y%m%d%H%M')


# column order of the cleaned frame; date_added_utc is appended after these
cols = ['station_location','wbanno','utc_datetime','lst_datetime', 'wind_1_5', 'wind_flag'] 

# Build the cleaned frame in one chain. Each step returns a new DataFrame, so
# nothing is mutated in place and no column is assigned onto a slice.
df = (
  df
  .assign(
    # convert wind_1_5 to float
    wind_1_5=df['wind_1_5'].astype(float),
    # convert to datetimes
    utc_datetime=_combine_date_time(df['utc_date'], df['utc_time']),
    lst_datetime=_combine_date_time(df['lst_date'], df['lst_time']),
  )
  # selecting `cols` reorders the columns and drops the old date and time columns
  .loc[:, cols]
  # add date-added column: naive UTC timestamp, the same kind of value as the
  # deprecated datetime.utcnow() produced
  .assign(date_added_utc=datetime.now(timezone.utc).replace(tzinfo=None))
)

