In [9]:
#dependencies for all functions
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import geopandas as gpd
import pickle
from netCDF4 import Dataset

In [10]:
def plot_AB(prov='AB'):

    """
    plot the border of a province (Alberta by default) on the current matplotlib axes

    prov: 'AB' or 'BC' -- province whose border is drawn; the shapefile row used for each
          province is the same one used by get_ProvinceDailyTemp / get_ProvinceDailyPrec

    raises ValueError if prov is not a known province code

    example:
    import geopandas as gpd
    import matplotlib.pyplot as plt
    plot_AB()
    plt.show()
    """

    provIndices = {'AB':0, 'BC':11} #row of each province in PROVINCE.SHP
    if prov not in provIndices: #validate before touching the disk so bad input fails fast
        raise ValueError('ERROR: UNKNOWN PROVINCE: ' + str(prov))
    provIndex = provIndices[prov]

    provshapes_filename = '/Users/samanderson/Desktop/MATLAB/streamflow/Canada_Borders/PROVINCE.SHP'
    provshapes = gpd.read_file(provshapes_filename)
    provPoly = provshapes['geometry'][provIndex]

    #a province can be stored as one Polygon or as a MultiPolygon (e.g. mainland plus islands);
    #only single Polygons have an .exterior, so draw each part separately
    if provPoly.geom_type == 'MultiPolygon':
        borderParts = list(provPoly.geoms)
    else:
        borderParts = [provPoly]

    for part in borderParts:
        lonBorder,latBorder = part.exterior.coords.xy
        plt.plot(lonBorder,latBorder,'k')

In [20]:
def plot_rivers():

    """
    plots rivers in Canada on the current matplotlib axes; takes a hot minute to run

    every feature in the shapefile is drawn (the previous version skipped the last one);
    missing/empty geometries are skipped and multi-part features are drawn part by part

    example:
    import geopandas as gpd
    import matplotlib.pyplot as plt
    plot_rivers()
    """

    filename = '/Users/samanderson/Desktop/MATLAB/streamflow/Rivers_Lakes/RiversAndLakes_7.5m.shp'
    rivshapes = gpd.read_file(filename)

    for feature in rivshapes['geometry']: #for each water feature in the file
        if feature is None or feature.is_empty: #rows without a usable geometry cannot be plotted
            continue

        #multi-part geometries do not expose .coords directly -- split into their parts
        if feature.geom_type.startswith('Multi'):
            parts = list(feature.geoms)
        else:
            parts = [feature]

        for part in parts:
            if part.geom_type == 'Polygon': #closed water body: draw its outline
                lon,lat = part.exterior.coords.xy
            else: #linestring: draw the line itself
                lon,lat = part.coords.xy
            plt.plot(lon,lat,'k')

In [11]:
def get_ProvinceDailyTemp(prov, applyMask=False):
    
    """
    get daily-mean 2-metre temperature fields from the ERA-Interim AB/BC file, 1979 to 2010

    prov: 'AB' or 'BC'
    applyMask: if True, grid points outside the province border are set to nan in every daily field.
               default False returns the unmasked fields (the mask used to be computed on every call
               and then thrown away, so the default output is unchanged)

    returns a dictionary with keys:
        'T': list with one 2D (lat x lon) array per day -- mean of the 4 time steps of that day
             (Kelvin according to the original note on the 't2m' variable)
        'lonERA', 'latERA': grid coordinates; longitudes are shifted to negative degrees
        'hoursERA': raw 'time' variable of the file (4 entries per day)
        'yearsERA', 'monthsERA', 'daysERA': calendar date of each day, generated by
             get_betweenDates for 1979-01-01 to 2010-12-31 (NOT read from the file's time axis)

    raises ValueError if prov is not a known province code

    example:
    import geopandas as gpd
    import numpy as np
    from netCDF4 import Dataset
    tempDict = get_ProvinceDailyTemp('AB')
    T = tempDict['T']
    """

    #index in shapefile of province of interest
    provIndices = {'AB':0, 'BC':11}
    if prov not in provIndices: #fail loudly instead of printing and crashing later on an undefined index
        raise ValueError('ERROR: UNKNOWN PROVINCE: ' + str(prov))
    provIndex = provIndices[prov]

    #open netcdf file
    fileDirERA = '/Users/samanderson/Desktop/MATLAB/streamflow/'
    filenameERA = 'interim_1979-01-01to2010-12-31_AB_BC.nc'
    filePathERA = fileDirERA + filenameERA

    #extract data from file; the with-block closes the file even if a variable is missing
    with Dataset(filePathERA) as ERA:
        lonERA = ERA.variables['longitude'][:] #longitude as stored in the file (assumed 0-360 degrees east -- confirm)
        lonERA = -np.abs(lonERA-360) #shift to negative degrees; equals lonERA-360 for values in [0,360]
        latERA = ERA.variables['latitude'][:]
        hoursERA = ERA.variables['time'][:] #hours since Jan 1, 1900 (per original note)
        T_hourly = ERA.variables['t2m'][:] #2-metre temperature, in Kelvin (per original note)

    #convert temp data at 00:00, 06:00, 12:00, 18:00 to daily averages
    stepsPerDay = 4
    T = []
    for daynum in range(len(hoursERA)//stepsPerDay): #for each complete day
        firstStep = daynum*stepsPerDay
        T.append(np.mean(T_hourly[firstStep:firstStep+stepsPerDay,:,:],axis=0))

    if applyMask:
        #find which ERA lat/lon are within the province -- set others to nan
        from descartes import PolygonPatch

        provshapes_filename = '/Users/samanderson/Desktop/MATLAB/streamflow/Canada_Borders/PROVINCE.SHP'
        provshapes = gpd.read_file(provshapes_filename)
        provPoly = provshapes['geometry'][provIndex]
        borderPatch = PolygonPatch(provPoly)

        inProv = np.full((len(latERA),len(lonERA)),np.nan) #nan outside the province, 1 inside
        for ilat,latTest in enumerate(latERA):
            for ilon,lonTest in enumerate(lonERA):
                if borderPatch.contains_point([lonTest,latTest],radius=0):
                    inProv[ilat,ilon] = 1

        T = [dayField*inProv for dayField in T] #for each day, apply mask

    initDate = [1979,1,1]
    finDate = [2010,12,31]
    years,months,days,dayInds = get_betweenDates(initDate,finDate)
        
    tempDict = {
        'T':T,
        'lonERA':lonERA,
        'latERA':latERA,
        'hoursERA':hoursERA,
        'yearsERA':years,
        'monthsERA':months,
        'daysERA':days        
    }

    return tempDict

In [12]:
def get_ProvinceDailyPrec(prov, applyMask=False):

    """
    get daily precipitation fields from the ERA-Interim AB/BC 12-hour-step file, 1979 to 2010

    prov: 'AB' or 'BC'
    applyMask: if True, grid points outside the province border are set to nan in every daily field.
               default False returns the unmasked fields (the mask used to be computed on every call
               and then thrown away, so the default output is unchanged)

    returns a dictionary with keys:
        'P': list with one 2D (lat x lon) array per day -- sum of the 2 twelve-hour steps of that day
             (units are whatever the file's 'tp' variable uses; the original note says mm -- confirm)
        'lonERA', 'latERA': grid coordinates; longitudes are shifted to negative degrees
        'hoursERA': raw 'time' variable of the file (2 entries per day)
        'yearsERA', 'monthsERA', 'daysERA': calendar date of each day, generated by
             get_betweenDates for 1979-01-01 to 2010-12-31 (NOT read from the file's time axis)

    raises ValueError if prov is not a known province code

    example:
    import geopandas as gpd
    import numpy as np
    from netCDF4 import Dataset
    precDict = get_ProvinceDailyPrec('AB')
    P = precDict['P']
    """
    
    #index in shapefile of province of interest
    provIndices = {'AB':0, 'BC':11}
    if prov not in provIndices: #fail loudly instead of printing and crashing later on an undefined index
        raise ValueError('ERROR: UNKNOWN PROVINCE: ' + str(prov))
    provIndex = provIndices[prov]

    #open netcdf file
    fileDirERA = '/Users/samanderson/Desktop/MATLAB/streamflow/'
    filenameERA = 'interim_1979-01-01to2010-12-31_AB_BC_Prec_12hr_step.nc'
    filePathERA = fileDirERA + filenameERA

    #extract data from file; the with-block closes the file even if a variable is missing
    with Dataset(filePathERA) as ERA:
        lonERA = ERA.variables['longitude'][:] #longitude as stored in the file (assumed 0-360 degrees east -- confirm)
        lonERA = -np.abs(lonERA-360) #shift to negative degrees; equals lonERA-360 for values in [0,360]
        latERA = ERA.variables['latitude'][:]
        hoursERA = ERA.variables['time'][:] #hours since Jan 1, 1900 (per original note)
        P_12hourly = ERA.variables['tp'][:] #accumulated precipitation per 12-hour step

    #convert the two 12-hour accumulations of each day to a daily total
    stepsPerDay = 2
    P = []
    for daynum in range(len(hoursERA)//stepsPerDay): #for each complete day
        firstStep = daynum*stepsPerDay
        P.append(np.sum(P_12hourly[firstStep:firstStep+stepsPerDay,:,:],axis=0))

    if applyMask:
        #find which ERA lat/lon are within the province -- set others to nan
        from descartes import PolygonPatch

        provshapes_filename = '/Users/samanderson/Desktop/MATLAB/streamflow/Canada_Borders/PROVINCE.SHP'
        provshapes = gpd.read_file(provshapes_filename)
        provPoly = provshapes['geometry'][provIndex]
        borderPatch = PolygonPatch(provPoly)

        inProv = np.full((len(latERA),len(lonERA)),np.nan) #nan outside the province, 1 inside
        for ilat,latTest in enumerate(latERA):
            for ilon,lonTest in enumerate(lonERA):
                if borderPatch.contains_point([lonTest,latTest],radius=0):
                    inProv[ilat,ilon] = 1

        P = [dayField*inProv for dayField in P] #for each day, apply mask
        
    initDate = [1979,1,1]
    finDate = [2010,12,31]
    years,months,days,dayInds = get_betweenDates(initDate,finDate)
        
    precDict = {
        'P':P,
        'lonERA':lonERA,
        'latERA':latERA,
        'hoursERA':hoursERA,
        'yearsERA':years,
        'monthsERA':months,
        'daysERA':days        
    }

    return precDict

In [13]:
def get_betweenDates(initDate,finDate):
    
    """
    out: years, months, days, dayInds -- np arrays with one entry per calendar day between
         (inclusive) initDate and finDate
            years:   year of each day (float array, as before)
            months:  month of each day, 1-12 (float array, as before)
            days:    day of month of each day, 1-31 (int array)
            dayInds: day of year of each day, 1-365 (1-366 in leap years) (int array)
    in: initDate: [yyyy,mm,dd]
        finDate: [yyyy,mm,dd]

    the month and day of initDate/finDate are honoured (previously whole years were always
    returned), and leap years follow the full Gregorian rule, so e.g. 1900 has 365 days

    raises ValueError if finDate is earlier than initDate

    example:
    
    import numpy as np
    initDate = [1979,1,1]
    finDate = [2010,12,31]
    years,months,days,dayInds = get_betweenDates(initDate,finDate)
    """

    #proleptic Gregorian ordinals let the standard library handle month lengths and leap years
    firstOrdinal = datetime(int(initDate[0]),int(initDate[1]),int(initDate[2])).toordinal()
    lastOrdinal = datetime(int(finDate[0]),int(finDate[1]),int(finDate[2])).toordinal()
    if lastOrdinal < firstOrdinal:
        raise ValueError('finDate must not be earlier than initDate')

    allDates = [datetime.fromordinal(ordinal) for ordinal in range(firstOrdinal,lastOrdinal+1)]

    #years and months stay float arrays so existing callers see the same dtypes as before
    years = np.array([d.year for d in allDates],dtype=float)
    months = np.array([d.month for d in allDates],dtype=float)
    days = np.array([d.day for d in allDates])
    dayInds = np.array([d.timetuple().tm_yday for d in allDates])
    
    return years, months, days, dayInds

In [14]:
def get_ProvinceDailyStreamflow(prov='AB', yearType=3, computeFlow = 0, saveFlowVars = 0):
    
    """
    loads streamflow data for province of Alberta.
    
    prov: 'AB' ('BC' hopefully in future will work later)
          NOTE: currently unused -- the data folder and pickle name are hard-coded to the Alberta ActNat30 set
    yearType: 1==ActNat50; 2==ActNat40; 3==ActNat30; 4==ActReg40; 5==ActNatReg40; 6==ActLakes40; 7==ActNat30_2014
          NOTE: currently unused, see above
    computeFlow: do you want to compute the flow from raw data (1) or do you want to just load saved data (0)?
    saveFlowVars: if you are computing the flow, do you want to save the flow variables (1) or not (0)?

    returns flowDict; when computed from raw data it has the keys
        station metadata:  'stationID', 'stationName', 'stationLat', 'stationLon', 'stationDrainageArea'
        seasonal flow (stations x 365, mean over the 1987-2010 window of the first 365 days of each year):
                           'all_flowseason', 'all_flowseason_NF', 'all_flowseason_norm', 'all_flowseason_norm_NF',
                           'all_flowseason_norm_smooth', 'all_flowseason_norm_smooth_NF' (30-day rolling means)
        full window flow (list with one column vector per station):
                           'all_flowwindow', 'all_flowwindow_NF', 'all_flowwindow_norm', 'all_flowwindow_norm_NF'
        window calendar:   'windowDates', 'windowYears', 'windowMonths', 'windowDays'
        'all_flow':        always an empty list (kept for backward compatibility)
    '_NF' means 'nans filled'; '_norm' means (flow - seasonal mean)/seasonal std.
    when loaded from the pickle, the dictionary is returned exactly as it was saved.
    
    example:
    
    import os
    import pandas as pd
    import matplotlib.pyplot as plt
    from datetime import datetime
    import numpy as np
    import geopandas as gpd
    import pickle
    flowDict = get_ProvinceDailyStreamflow()
    
    """

    folderWithFiles = '/Users/samanderson/Desktop/MATLAB/streamflow/ActNat30/ABActNat30' #folder that contains all of the flow files
    flowVarsFilename = 'ABActNat30_flowvars.pickle' #cache of previously computed flow variables (in the working directory)
    yearRange = [1987,2010]

    if not computeFlow:

        #NOTE: pickle.load can execute arbitrary code -- only load a pickle file that you created yourself
        with open(flowVarsFilename,'rb') as pickle_in:
            flowDict = pickle.load(pickle_in)

        return flowDict

    #get names of all files in 'folderWithFiles'
    flowFiles = []
    infofilename = None
    for root, dirs, files in os.walk(folderWithFiles):
        for name in sorted(files):
            if name[0]=='0' or name[0]=='1': #if the current file is one that contains flow data (all flow files have a 0 or 1 at the start)
                flowFiles.append(os.path.join(root,name)) #full path name of this file
            elif name[0]!='.': #if it is not .DS_store, ie: if it is the 'ABActNat30.csv' (or equivalent) that is a table of data for each file
                infofilename = os.path.join(root,name)

    if infofilename is None: #missing folder or missing summary table -- nothing sensible can be computed
        raise FileNotFoundError('no station summary table found in ' + folderWithFiles)

    stationInfo = pd.read_csv(infofilename,encoding = "ISO-8859-1") #read in summary data of all streamflow files
    stationInfo.columns = [col.strip() for col in stationInfo.columns] #some columns have white space in name -- remove this
    stationInfo = stationInfo.drop([0,len(stationInfo)-2,len(stationInfo)-1]) #remove empty rows
    stationInfo.index = range(len(stationInfo)) #redo row indices (otherwise will start at 1)
    totalStations = len(stationInfo)

    #extract the most often used columns
    stationID = stationInfo['Station']
    stationName = stationInfo['StationName']
    stationLat = stationInfo['Latitude'].astype(float)
    stationLon = stationInfo['Longitude'].astype(float)
    stationDrainageArea = stationInfo['DrainageArea']

    #initialize
    all_flowseason = np.zeros([totalStations,365])
    all_flowseason_NF = np.zeros_like(all_flowseason)
    all_flowseason_norm = np.zeros_like(all_flowseason)
    all_flowseason_norm_NF = np.zeros_like(all_flowseason)
    all_flowseason_norm_smooth = np.zeros_like(all_flowseason)
    all_flowseason_norm_smooth_NF = np.zeros_like(all_flowseason)
    all_flow = [] #never filled; kept so the returned dictionary has the same keys as before
    all_flowwindow = []
    all_flowwindow_NF = []
    all_flowwindow_norm = []
    all_flowwindow_norm_NF = []

    #get dates within window of interest
    initDate = str(yearRange[0]) + '-01-01'
    finDate = str(yearRange[1]) + '-12-31'
    windowDatesTimestamp = pd.date_range(initDate,finDate)  
    windowDates = [datetime.strftime(ii,'%Y-%m-%d') for ii in windowDatesTimestamp]
    windowYears = np.asarray([int(d[0:4]) for d in windowDates])
    windowMonths = np.asarray([int(d[5:7]) for d in windowDates])
    windowDays = np.asarray([int(d[8:10]) for d in windowDates])

    for ind in range(totalStations): #for each station/flowfile

        print('Computing: Station ' + str(ind+1) + '/' + str(totalStations))

        filename = flowFiles[ind]

        flowTable = pd.read_csv(filename)
        flowTable = flowTable.drop([len(flowTable)-2,len(flowTable)-1]) #last two rows are not data

        dates = flowTable['Date'] #these are the dates that are in the data -- missing dates are omitted -- want to fill

        #reformat dates from yyyy/mm/dd to yyyy-mm-dd
        objdates = [datetime.strptime(date,'%Y/%m/%d') for date in dates] 
        newdate = [datetime.strftime(date,'%Y-%m-%d') for date in objdates]
        flowTable.index = newdate

        idx = pd.date_range(dates[0],dates[len(dates)-1]) #this is all of the dates
        idxdate = [datetime.strftime(ii,'%Y-%m-%d') for ii in idx]
        flowTable = flowTable.reindex(idxdate,fill_value=np.nan) #missing dates are filled with nan
        flowTable['Dates'] = idxdate #dates column is now filled

        flow = np.asarray(flowTable['Flow']) #filled with nans
        years = np.asarray([int(d[0:4]) for d in idxdate])

        #one column vector of daily flow per year of the window (365 or 366 rows each)
        currFlow = [flow[years==year].reshape(-1,1) for year in range(yearRange[0],yearRange[1]+1)]
        currFlowMat = [yearFlow[:365] for yearFlow in currFlow] #drop day 366 so all years line up

        all_flowseason[ind] = np.squeeze(np.nanmean(currFlowMat,0))

        #statistics are taken before the remaining nans are filled
        flowseason_mean = np.nanmean(all_flowseason[ind])
        flowseason_std = np.nanstd(all_flowseason[ind])

        #NOTE(review): this fills the nans of the non-'_NF' seasonal flow too, so 'all_flowseason' and
        #'all_flowseason_NF' end up identical -- kept as in the original, confirm that this is intended
        all_flowseason[ind][np.isnan(all_flowseason[ind])] = np.nanmin(all_flowseason[ind])
        all_flowseason_norm[ind] = (all_flowseason[ind] - flowseason_mean)/flowseason_std

        all_flowwindow.append(np.vstack(currFlow))
        all_flowwindow_norm.append((all_flowwindow[ind] - flowseason_mean)/flowseason_std)

        all_flowseason_norm_smooth[ind] = pd.Series(all_flowseason_norm[ind]).rolling(30).mean()

        #fill nans: fill seasonal with min values; fill flow with seasonal
        all_flowseason_NF[ind] = all_flowseason[ind]
        all_flowseason_NF[ind][np.isnan(all_flowseason_NF[ind])] = np.nanmin(all_flowseason[ind])
        flowseason_mean_NF = np.nanmean(all_flowseason_NF[ind])
        flowseason_std_NF = np.nanstd(all_flowseason_NF[ind])
        all_flowseason_norm_NF[ind] = (all_flowseason_NF[ind] - flowseason_mean_NF)/flowseason_std_NF

        currFlow_NF = []
        for yearFlow in currFlow:
            dummy = np.squeeze(yearFlow).copy() #work on a copy so the unfilled flow stays untouched
            nanInds = np.argwhere(np.isnan(dummy))
            if len(nanInds)>0: #if there are nans to fill
                if nanInds[-1]<365: #no nan on day 366: every gap has a seasonal value
                    dummy[nanInds] = all_flowseason[ind][nanInds]
                else: #day 366 of a leap year is missing: the seasonal flow has no such day, so repeat day 365
                    dummy[nanInds[0:-1]] = all_flowseason[ind][nanInds[0:-1]]
                    dummy[-1] = dummy[-2]
            #append every year, filled or not (previously a year without nans replaced the whole
            #list by the unfilled flow, discarding the years that had already been filled)
            currFlow_NF.append(np.expand_dims(dummy,1))

        all_flowwindow_NF.append(np.vstack(currFlow_NF))
        all_flowwindow_norm_NF.append((all_flowwindow_NF[ind] - flowseason_mean_NF)/flowseason_std_NF)

        all_flowseason_norm_smooth_NF[ind] = pd.Series(all_flowseason_norm_NF[ind]).rolling(30).mean()

    flowDict = {
        'stationID':stationID,
        'stationName':stationName,
        'stationLat':stationLat,
        'stationLon':stationLon,
        'stationDrainageArea':stationDrainageArea,
        'all_flowseason':all_flowseason,
        'all_flowseason_NF':all_flowseason_NF,
        'all_flowseason_norm':all_flowseason_norm,
        'all_flowseason_norm_NF':all_flowseason_norm_NF,
        'all_flowseason_norm_smooth':all_flowseason_norm_smooth,
        'all_flowseason_norm_smooth_NF':all_flowseason_norm_smooth_NF,
        'all_flow':all_flow,
        'all_flowwindow':all_flowwindow,
        'all_flowwindow_NF':all_flowwindow_NF,
        'all_flowwindow_norm':all_flowwindow_norm,
        'all_flowwindow_norm_NF':all_flowwindow_norm_NF,
        'windowDates':windowDates,
        'windowYears':windowYears,
        'windowMonths':windowMonths,
        'windowDays':windowDays
    }

    if saveFlowVars:

        with open(flowVarsFilename,'wb') as pickle_out:
            pickle.dump(flowDict,pickle_out)

    return flowDict