# Weather file - raw data process
- Main task: reading in raw weather data from two sources and combine all output into .csv format <br/>
- Data source: 
    1. NOAA Integrated Surface Data (ISH): https://www.ncdc.noaa.gov/isd/data-access
    2. National Solar Radiation Database (NSRD): https://rredc.nrel.gov/solar/old_data/nsrdb/
- Main output: 
    1. met data: df_temp_all, df_rh_all, df_precip_all
    2. solarad data: df_solrad_all

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import time 
import datetime

### Step 1. Hourly Meteorology Data - ISH
- Main task: 
    - Read in and process raw met data
    - Store output as dataframe and save as .csv
- Main input: 
    - /data/**ISH/***
- Main output: 
    - **df_temp_all**
    - **df_rh_all**
    - **df_precip_all**

#### 1.1: function calculating RH with temp and dew point temp input
Following the Clausius-Clapeyron relationship

In [6]:
# Creating Clausius-Clapeyron function

def CC(temp, temp_dew):
    """
    function that calculates relative humidity with temperature and dew point temperature
    temperautre input units: ˚C
    """
    # constant parameters
    Tref = 273.15  # reference temperature
    Es_Tref = 6.11 # saturation vapor pressure at reference temperature
    Lv = 2.5e+06   # latent heat of vaporation (J/kg)
    Rv = 461       # gas constant for moist air (J/kg)
    
    # transformed temperature inputs
    Tair = temp + Tref
    Tdew = temp_dew + Tref
    
    # Clausius-Clapeyron relation
    es = Es_Tref*np.exp((Lv/Rv)*(1/Tref - 1/Tair))
    e = Es_Tref*np.exp((Lv/Rv)*(1/Tref - 1/Tdew))
    rh = round(e/es,4)
    
    return(rh)

#### 1.2: read in individual met files and parse out data needed

In [7]:
%%time # run time ~30 mins

# timing related settings
years = np.arange(1961, 1991) # timeframe in which we have weather data
dateparse = lambda dates: [datetime.datetime.strptime(d, "%Y%m%d%H") for d in dates] # dateparsing method to be used in pd.read_fwf
season_start, season_end = '03-01-', '11-30-' # setting a pretty borad range for growing season

# setting up np.read_fwf arguments
colnames = ['time', 'temp', 'dew_temp', 'precip', 'precip_time', 'precip_depth', 'precip_condition', 'precip_quality', 'rh']
colspecs = [(15,25), (87,92), (93,98), (105,8193)]

# empty dataframes to store data from all site-years
df_temp_all = pd.DataFrame()
df_rh_all = pd.DataFrame()
df_precip_all = pd.DataFrame()

# reading in all weather data and storing as dataframe
for year in years:
    print(year) # output to track code progress
    times = pd.date_range(season_start + str(year), season_end + str(year), freq='1H')
    fnames = glob.glob('/home/disk/eos8/ach315/data/ISH/' + str(year) + '/*')
    
    # creating dataframes to store all site data for an individual year
    df_temp_sites = pd.DataFrame(index=times)
    df_rh_sites = pd.DataFrame(index=times)
    df_precip_sites = pd.DataFrame(index=times)
    
    for name in fnames:
        # WBAN site name 
        site_id = name.split('/')[-1].split('-')[-2]
        
        # read in individual files
        df = pd.read_fwf(name, names=colnames, colspecs=colspecs, header=None, index_col='time',
                         encoding='latin_1', dtype={'temp':int, 'precip':str}, 
                         parse_dates=True, date_parser=dateparse)
    
        # remove duplicated hours, keeping only the first measurement per hour
        df = df[df.index.duplicated(keep='first') == False]
        
        # add in missing time values (corrects for leap years) and keeps only growing season
        df = df.reindex(times, fill_value=np.nan)
        
        # finding precip data
        try:
            df.precip_time = df[df['precip'].str.find('AA1')!=-1]['precip'].str.split('AA1').str.get(1).str.slice(0,2).astype(float)
            df.precip_depth = df[df['precip'].str.find('AA1')!=-1]['precip'].str.split('AA1').str.get(1).str.slice(2, 6).astype(float)
            df.precip_condition = df[df['precip'].str.find('AA1')!=-1]['precip'].str.split('AA1').str.get(1).str.slice(6,7).astype(float)
            df.precip_quality = df[df['precip'].str.find('AA1')!=-1]['precip'].str.split('AA1').str.get(1).str.slice(7,8).astype(float)
        except: 
            print(year, name)
                
        # replacing missing values (9999) with NANs 
        df.temp = df.temp.replace({9999: np.nan})
        
        # converting units 
        df.temp = df.temp/10
        df.dew_temp = df.dew_temp/10
        df.precip_depth = df.precip_depth/10
        
        # calculating RH through Clausius Clapeyron
        df.rh = CC(df.temp, df.dew_temp)*100

        # combining weather data into individual dataframes
        df_temp = pd.DataFrame({site_id: df.temp}, index= times)
        df_rh = pd.DataFrame({site_id: df.rh}, index=times)
        df_precip = pd.DataFrame({site_id: df.precip_depth}, index=times)
        
        df_temp_sites = pd.concat([df_temp_sites, df_temp], axis= 1, sort=True)
        df_rh_sites = pd.concat([df_rh_sites, df_rh], axis=1, sort=True)
        df_precip_sites = pd.concat([df_precip_sites, df_precip], axis=1, sort=True)

    # combining all site-years data together
    df_temp_all = pd.concat([df_temp_all, df_temp_sites], sort=True)
    df_rh_all = pd.concat([df_rh_all, df_rh_sites], sort=True)
    df_precip_all = pd.concat([df_precip_all, df_precip_sites], sort=True)

1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1973 /home/disk/eos8/ach315/data/ISH/1973/725330-14827-1973
1973 /home/disk/eos8/ach315/data/ISH/1973/723530-13967-1973
1974
1974 /home/disk/eos8/ach315/data/ISH/1974/723721-23184-1974
1975
1975 /home/disk/eos8/ach315/data/ISH/1975/723676-23048-1975
1976
1976 /home/disk/eos8/ach315/data/ISH/1976/722446-93987-1976
1976 /home/disk/eos8/ach315/data/ISH/1976/724280-14821-1976
1976 /home/disk/eos8/ach315/data/ISH/1976/912120-41415-1976
1977
1977 /home/disk/eos8/ach315/data/ISH/1977/724280-14821-1977
1977 /home/disk/eos8/ach315/data/ISH/1977/726835-24230-1977
1978
1978 /home/disk/eos8/ach315/data/ISH/1978/723815-23161-1978
1979
1979 /home/disk/eos8/ach315/data/ISH/1979/723401-13963-1979
1980
1980 /home/disk/eos8/ach315/data/ISH/1980/723815-23161-1980
1981
1981 /home/disk/eos8/ach315/data/ISH/1981/723815-23161-1981
1982
1983
1983 /home/disk/eos8/ach315/data/ISH/1983/726430-14920-1983
1984
1985
1986
1987
1988
1989
1990
run time: 

#### 1.3: Output the processed weather data into individual .csv files
- Main output: 
    - /weadata/**temp_all.csv**
    - /weadata/**precip_all.csv**
    - /weadata/**rh_all.csv**

In [9]:
#df_temp_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/temp_all.csv')
#df_precip_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/precip_all.csv')
#df_rh_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/rh_all.csv')

### Step 2. Hourly Solar Radiation Data - NSRD
- Main task: 
    - Read in and process raw hourly solar radiation data
    - Store output as dataframe and save as .csv
- Main input:
    - /data/**ISH_NSRD/***
- Main output:
    - **df_solrad_all**

#### 2.1: read in solar radiation data

In [51]:
%%time

years = np.arange(1961,1963) # test one year first
colnames = ['year', 'month', 'day', 'hour', 'solrad']
colspecs = [(1,3), (4,6), (7,9), (10,12), (23,27)]

df_solrad_all = pd.DataFrame()

for y in years:
    print(y)
    
    fnames = glob.glob('/home/disk/eos8/ach315/data/ISH_NSRD/' + str(y) + '/*')    
    df_solrad_sites = pd.DataFrame()
    
    for f in fnames:
        WBAN_id = str(f.split('/')[-1].split('_')[0])
        df = pd.read_fwf(f, skiprows=[0], header=None, 
                         names=colnames, colspecs=colspecs)
        time = df.apply(lambda row: datetime.datetime(y, row['month'], row['day'], row['hour']-1), axis=1)
        solrad = df['solrad'] # Global Horizontal Radiation (Wh/m2)
        df_solrad = pd.DataFrame(solrad)
        df_solrad.columns = [WBAN_id]
        df_solrad.index = time        
        df_solrad_sites = pd.concat([df_solrad_sites, df_solrad], axis=1, sort=True)
            
    df_solrad_all = pd.concat([df_solrad_all, df_solrad_sites], sort=True)

# convert 9999 values into NANs
df_solrad_all = df_solrad_all.replace({9999: np.nan})

1961
1962
CPU times: user 5min 40s, sys: 886 ms, total: 5min 40s
Wall time: 5min 44s


#### 2.2: Output the processed weather data into individual .csv files
- Main output: 
    - /weadata/**solrad_all.csv**

In [None]:
#df_solrad_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/solrad_all.csv')