* This file downloads and extracts the NOAA Global Summary of the Day data files for the years 1950 through 2000.
* Data files for stations outside the contiguous USA are ignored.


In [1]:
import requests
import gzip
import tarfile
import os
import numpy as np
import pandas as pd
from pyquery import PyQuery as pq
from urllib import request as urlreq
import reverse_geocoder as rg

In [2]:
endpt = 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/archive/'
ARCHIVE_SUFFIX = '.tar.gz'


def _archive_year(href):
    """Return the year encoded in a '<year>.tar.gz' href, or None for any other link."""
    if not href or not href.endswith(ARCHIVE_SUFFIX):
        return None
    try:
        return int(href[:-len(ARCHIVE_SUFFIX)])
    except ValueError:
        return None


# A timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status stops an HTTP error page from being parsed as the listing.
req = requests.get(endpt, timeout=60)
req.raise_for_status()
html = pq(req.text)     #turn page into html for parsing using pyquery
html_lst = list(html.items('a'))       #every anchor on the page; non-archive links are filtered out below

#only get 1950 - 2000
# Read the href attribute directly and select by the year in the filename, so the
# result does not depend on how many header links precede the archives or on the
# attribute order inside each anchor tag.
links = []
for a in html_lst:
    href = a.attr('href')
    year = _archive_year(href)
    if year is not None and 1950 <= year <= 2000:
        links.append(href)

# Fail loudly if the listing layout changed and nothing matched.
if not links:
    raise RuntimeError(f"no 1950-2000 {ARCHIVE_SUFFIX} archives found at {endpt}")


In [4]:
#PARSE ALL DATA IN LINKS
TEMP_DIR = "./temp"        # scratch directory each yearly archive is unpacked into
RAW_DIR = "./raw_data"     # where the per-year CSV exports are written
KEEP_COLUMNS = ["NAME", "DATE", "YEAR", "MONTH", "TEMP", "DEWP", "MIN", "MAX", "PRCP", "SNDP"]


def _is_contiguous_us(station):
    """Return True when a station's data frame passes the contiguous-USA filter.

    A station is rejected when it has no rows, when any LATITUDE/LONGITUDE value
    is missing, when its first row lies outside a rough lat/long bounding box,
    or when the NAME in its first row does not end in "US".
    """
    if station.empty:
        return False
    if station["LATITUDE"].isnull().any() or station["LONGITUDE"].isnull().any():
        return False

    #get location info from the first row
    lat = float(station["LATITUDE"].iloc[0])
    long = float(station["LONGITUDE"].iloc[0])
    #rough estimate of US location -- rough filter
    if lat > 49.4 or lat < 24.4 or long < -125 or long > -66.9:
        return False

    #edge cases -- check station name; str() makes a missing NAME fail the check
    #instead of raising on the slice
    # NOTE: a reverse_geocoder lookup (country code / Hawaii / Alaska) was sketched
    # here as commented-out code; the NAME suffix check is what is actually applied.
    return str(station["NAME"].iloc[0])[-2:] == "US"


# os.listdir and to_csv both fail on a missing directory, so create them up front.
os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(RAW_DIR, exist_ok=True)

for l in links:
    #open gzip to temp dir
    temp_end = f"{endpt}{l}"
    # Context managers close the HTTP connection and the tar stream even on error.
    with requests.get(temp_end, stream=True, timeout=60) as response:
        response.raise_for_status()  # do not hand an HTTP error page to tarfile
        with tarfile.open(fileobj=response.raw, mode="r|gz") as archive:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters are available: refuse members that would
                # escape TEMP_DIR or create special files.
                archive.extractall(path=TEMP_DIR, filter="data")
            else:
                # Older Python without extraction filters; the archive is trusted as-is.
                archive.extractall(path=TEMP_DIR)

    #move data from each csv to a dataframe then delete the csv
    frames = []
    for filename in os.listdir(TEMP_DIR):
        csv_path = f"{TEMP_DIR}/{filename}"
        try:
            temp = pd.read_csv(csv_path)
        finally:
            # Always remove the file so a failed read cannot leak into the next year.
            os.remove(csv_path)
        if _is_contiguous_us(temp):
            frames.append(temp)

    if not frames:
        # Nothing survived the filter; there is no DATE column to parse.
        print(f"{l}: no contiguous-USA stations found, skipping export")
        continue

    # Concatenate once per year instead of growing the frame inside the loop.
    df = pd.concat(frames)

    #parse date and filter data
    df["DATE"] = pd.to_datetime(df["DATE"])
    df["MONTH"] = df["DATE"].dt.month
    df["YEAR"] = df["DATE"].dt.year
    df = df[KEEP_COLUMNS]

    #export raw data as csv
    df.to_csv(f"{RAW_DIR}/{l[:4]}_raw.csv")
