In [1]:
from netCDF4 import Dataset
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import pickle
from scipy import stats
%matplotlib inline


In [2]:
# File names of the soil moisture products, one CSV file per product.
# Later cells refer to products by position, so this order is kept throughout.
_PRODUCT_STEMS = (
    'AWRAL',
    'SMOS_LMEB_A',
    'AMSR2_LPRM_D',
    'ASCAT_TUW_D',
    'SMOS_LPRM_A',
    'WaterDyn',
    'MSDI',
    'API',
    'KBDI',
    'AMSR2_JAXA_A',
    'CABLE',
)

products = [stem + '.csv' for stem in _PRODUCT_STEMS]

# Read the site description rows from the header of the first product file.
# Every row starting with 'Site name', 'Lat' or 'Lon' is split on commas and
# stored in aux without its first cell (the row label), in file order.
# NOTE(review): the next cell reads aux[0] as latitudes, aux[1] as longitudes
# and aux[2] as site names, so the file is presumably laid out Lat, Lon,
# Site name -- confirm against the SWEEP files.
aux = []
# The context manager closes the file even if reading fails part-way.
with open('multi_products/SWEEP.v1.0/' + products[0], 'r', encoding="ISO-8859-1") as f:
    for line in f:
        if line.startswith(('Site name', 'Lat', 'Lon')):
            aux.append(line.split(',')[1:])
        
def _parse_coordinate(cell):
    """Convert a coordinate cell to float after dropping its last character.

    When that fails with ValueError, the last two characters are dropped instead.
    """
    try:
        return float(cell[0:-1])
    except ValueError:
        return float(cell[0:-2])


# Build one [name, latitude, longitude] entry per site.
sites = []
for idx in range(len(aux[0])):
    latitude = _parse_coordinate(aux[0][idx])
    longitude = _parse_coordinate(aux[1][idx])
    # Names starting with 'Wea' lose their last character (aux is updated too).
    if aux[2][idx].startswith('Wea'):
        aux[2][idx] = aux[2][idx][0:-1]
    # The latitude is stored with its sign flipped.
    sites.append([aux[2][idx], -1 * latitude, longitude])

# sites is a nested list holding [name, latitude, longitude] for each site

In [61]:
# Loop through the products to get the data for each day for each site.
# `data` ends up with one DataFrame per product, in the order of `products`:
# the row labels are days since 1900-01-01 and the columns are the values of
# one data row, in file order.

data = []

for name in products:
    dse = []  # days since 1900-01-01, one entry per parsed data row
    sm = []   # values of the matching data row

    # The context manager closes the file even if parsing raises.
    with open('multi_products/SWEEP.v1.0/' + name, 'r', encoding="ISO-8859-1") as f:
        for line in f:
            cells = line.split(',')
            try:
                y, m, d = int(cells[0][0:4]), int(cells[0][4:6]), int(cells[0][6:8])
                days = (datetime(y, m, d) - datetime(1900, 1, 1)).days
                values = list(map(float, cells[1:]))
            except ValueError:
                # Not a data row (header text, invalid date or non-numeric value).
                continue
            # Append only once the whole row has parsed, so that dse and sm
            # can never get out of step with each other.
            dse.append(days)
            sm.append(values)

    aux = pd.DataFrame(sm, index=dse)

    # Values below -9000 mark missing data (the -9999.0 sentinel); replace them
    # with NaN in one vectorised step instead of chained element assignment.
    aux = aux.mask(aux < -9000)

    data.append(aux)

    print(name)

with open('multi_products/products_data.pkl', 'wb') as f:
    pickle.dump(data, f)
    
    
# Missing data: the -9999.0 sentinel values are replaced with NaN.

# To locate data by label, use .loc:
# df.loc[row_label, column_label]

# With chained [ ] on a pandas DataFrame, the first [ ] selects the column and the second the row,
# as opposed to a nested Python list, where the first [ ] is the row and the second the column.

AWRAL.csv
SMOS_LMEB_A.csv
AMSR2_LPRM_D.csv
ASCAT_TUW_D.csv
SMOS_LPRM_A.csv
WaterDyn.csv
MSDI.csv
API.csv
KBDI.csv
AMSR2_JAXA_A.csv
CABLE.csv


In [19]:
# It works!

In [22]:
# Want to find corresponding FMC values for those dates and sites.
# fmct collects the sample dates as days since 1900-01-01; fmc collects, for
# each date, one row with the mean FMC around every site.

fmct, fmc = [], [] # for time and data respectively

for year in range(2001,2015):
    url_fmc = 'http://dapds00.nci.org.au/thredds/dodsC/ub8/au/FMC/c6/mosaics/fmc_c6_' + '%i' % year + '.nc'

    # The context manager closes the remote dataset once the year is processed.
    with Dataset(url_fmc, 'r') as fmc_data:
        lat_fmc = fmc_data['latitude'][:]
        lon_fmc = fmc_data['longitude'][:]
        time = fmc_data['time'][:]
        fmc_mean = fmc_data['fmc_mean']

        # Locate the grid cell of each site and the 3 x 3 window around it.
        # NOTE(review): 0.00251 is presumably just over half the grid spacing -- confirm.
        aa, bb, windows = [], [], []
        for s in sites:
            a = np.where((lat_fmc < s[1]+0.00251) & (lat_fmc > s[1]-0.00251))
            b = np.where((lon_fmc < s[2]+0.00251) & (lon_fmc > s[2]-0.00251))
            aa.append(a[0][0])
            bb.append(b[0][0])
            # Clamp the lower bounds at 0: a negative start would be read as
            # "from the end" and turn the window into an empty slice.
            windows.append((slice(max(aa[-1]-1, 0), aa[-1]+2),
                            slice(max(bb[-1]-1, 0), bb[-1]+2)))

        for i in range(time.size):
            # time is treated as seconds since 1970-01-01 and converted to
            # days since 1900-01-01, the time axis used for the products.
            x = datetime(1970,1,1) + timedelta(seconds = int(time[i]))
            x = x-datetime(1900,1,1)
            fmct.append(x.days)

            row = np.full(len(sites), np.nan)
            for s in range(len(sites)):
                rows, cols = windows[s]
                row[s] = np.mean(fmc_mean[i, rows, cols])
            fmc.append(row)

    print(year)

fmc = pd.DataFrame(fmc)
print(fmc[0][0])

with open('multi_products/fmc_data2.pkl', 'wb') as f:
    pickle.dump([fmct, fmc], f)
            



2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
57.25722122192383


In [11]:
# Reload the cached FMC dates/values and the per-product frames from disk.
# NOTE: pickle.load can execute code stored in the file; only load pickles
# that this notebook wrote itself.
with open('multi_products/fmc_data2.pkl', 'rb') as fmc_pickle:
    fmct, fmc = pickle.load(fmc_pickle)

with open('multi_products/products_data.pkl', 'rb') as products_pickle:
    data = pickle.load(products_pickle)
    
# from best import fit_best

import numpy as np
from scipy import stats
from scipy.optimize import curve_fit

def linear(x, a, b):
    """Straight line with slope ``a`` and intercept ``b``, evaluated at ``x``."""
    slope_term = a * x
    return slope_term + b

# Full six-parameter form, kept for reference:
#def glf(x, A, K, C, Q, B, ni):
#    return A + (K - A) / ((C + Q * np.exp(-B * x)) ** (1./ni))

def glf(x, A, K, B, ni):
    """Generalised logistic function with C = Q = 1.

    Evaluates ``A + (K - A) / (1 + exp(-B * x)) ** (1 / ni)``.

    The previous four-parameter version dropped the ``C +`` term altogether,
    which reduces the expression to ``A + (K - A) * exp(B * x / ni)``: a plain
    exponential in which ``B`` and ``ni`` cannot be told apart. Keeping the
    constant 1 in the denominator restores the sigmoid shape.

    Parameters
    ----------
    x : float or array-like
        Points at which to evaluate the curve.
    A : float
        Value approached when ``B * x`` is very negative (for ``ni > 0``).
    K : float
        Value approached when ``B * x`` is very large.
    B : float
        Multiplies ``x`` inside the exponential.
    ni : float
        The denominator is raised to ``1 / ni``; must not be zero.

    Returns
    -------
    float or numpy.ndarray
        Curve values, with the same shape as ``x``.
    """
    return A + (K - A) / ((1.0 + np.exp(-B * x)) ** (1.0 / ni))

def exponencial(x, a, b, c):
    """Exponential curve: offset ``a`` plus ``b`` times e raised to ``c * x``."""
    growth = np.exp(c * x)
    return a + b * growth

def powerlaw(x, a, b, c):
    """Power law: offset ``a`` plus ``b`` times ``x`` raised to the power ``c``."""
    scaled = b * (x ** c)
    return a + scaled

def fit_best(xdata, ydata):
    """Fit several curve families to the data and keep the one with the highest r².

    The candidate models, tried in this order, are:
        (1) linear regression;
        (2) generalised logistic function;
        (3) an exponential function and;
        (4) power-law function.

    Each model is fitted with ``curve_fit`` (``maxfev=10000``) and scored with
    the squared Pearson correlation between ``ydata`` and the fitted values.
    A later model replaces the current best only when its r² is strictly
    larger, or when the current best r² is NaN and the new one is not.
    A model whose fit or scoring raises RuntimeError, TypeError or ValueError
    is skipped and leaves the current best untouched.

    Parameters
    ----------
    xdata, ydata : array-like
        Paired observations; both are converted to float arrays.

    Returns
    -------
    xi : numpy.ndarray
        100 evenly spaced values from min(xdata) to max(xdata); empty when
        xdata is empty.
    yi : numpy.ndarray
        The best model evaluated at ``xi``, or all NaN when no model could
        be fitted.
    r2max : float
        r² of the best model, or 0 when no model could be fitted.
    """
    xdata = np.asarray(xdata, dtype=float)
    ydata = np.asarray(ydata, dtype=float)

    # Built up front so that the "no fit" return below always has an xi.
    if xdata.size:
        xi = np.linspace(np.min(xdata), np.max(xdata), 100)
    else:
        xi = np.array([])

    best_model, best_params, r2max = None, None, 0

    for model in (linear, glf, exponencial, powerlaw):
        try:
            params, pcov = curve_fit(model, xdata, ydata, maxfev=10000)
            r, p = stats.pearsonr(ydata, model(xdata, *params))
        except (RuntimeError, TypeError, ValueError):
            # No convergence, too few points or unusable values: skip this
            # model without discarding the best r² found so far.
            continue

        r2 = r ** 2
        if best_model is None or r2 > r2max or (np.isnan(r2max) and not np.isnan(r2)):
            best_model, best_params, r2max = model, params, r2

    if best_model is None:
        return xi, xi * np.nan, r2max

    # Sample the best fit at the 100 evenly spaced x values.
    return xi, best_model(xi, *best_params), r2max

In [9]:
# Creating one figure per site with one scatter subplot per product
# (product value on the x axis, FMC on the y axis) plus the best fitted curve.
NS = range(len(sites))
NP = range(len(products))
# r² of the best model per (site, product); -1 marks entries never filled in.
table = np.full((len(sites), len(products)), -1.0)

for si in NS:
    fig, axes = plt.subplots(nrows=2,ncols=6, figsize=(20,7))
    panels = axes.flatten()

    # FMC series of this site; it is the same for every product panel.
    Ylr = np.asarray(fmc[si], dtype=float)

    for j, ax in zip(NP, panels):

        # Product values on the FMC dates. reindex() gives NaN for dates the
        # product does not have, whereas a plain [list of labels] lookup
        # raises KeyError for missing labels in current pandas.
        Xlr = np.asarray(data[j][si].reindex(fmct), dtype=float)

        # scatter plot
        ax.scatter(Xlr, Ylr, label='_nolegend_')
        ax.set_title(products[j][:-4])

        # gathering only real values (removing nan)
        mask = np.isfinite(Xlr) & np.isfinite(Ylr)

        # fitting the best model
        xm, ym, r2 = fit_best(Xlr[mask], Ylr[mask])
        table[si,j] = r2
        strlin = str('r$^2$ = %f' % r2)
        ax.plot(xm, ym, 'k-', lw = 2, label = strlin)

        ax.legend()
        ax.grid()

    # The grid has more panels than products; hide the unused ones.
    for ax in panels[len(products):]:
        ax.set_visible(False)

    fig.suptitle(sites[si][0])
    plt.tight_layout()
    plt.subplots_adjust(top=0.90)
    plt.savefig(str('images/multproducts_%2.2i.png' % si))
    plt.close(fig)

# Rows are labelled with the site names, columns with the product names.
table = pd.DataFrame(table,
                     index = [sites[i][0] for i in NS],
                     columns = [x[:-4] for x in products])
table.to_csv('multi_products/multi_daily_best_r2.csv')
    

  r = r_num / r_den


In [50]:
# Export each product to categ/<product file name>: a header row with the
# site names, then one row per FMC date holding the product value at each site.
for p in NP:
    frame = data[p]
    # The context manager closes the file even if a lookup or write fails.
    with open(str('categ/%s' % products[p]), 'w') as out:
        out.write(''.join(',%s' % sites[si][0] for si in NS))
        out.write('\n')
        for t in fmct:
            out.write('%i' % t)
            # frame[si][t]: column si of the product frame, row labelled t.
            out.write(''.join(',%f' % frame[si][t] for si in NS))
            out.write('\n')
    print(products[p])


AWRAL.csv
SMOS_LMEB_A.csv
AMSR2_LPRM_D.csv
ASCAT_TUW_D.csv
SMOS_LPRM_A.csv
WaterDyn.csv
MSDI.csv
API.csv
KBDI.csv
AMSR2_JAXA_A.csv
CABLE.csv


In [59]:
# Export the FMC series to categ/FMC.csv: a header row with the site names,
# then one row per FMC date holding the FMC value at each site.
# The context manager closes the file even if a lookup or write fails.
with open('categ/FMC.csv','w') as out:
    out.write(''.join(',%s' % sites[si][0] for si in NS))
    out.write('\n')
    for t in range(len(fmct)):
        out.write('%i' % fmct[t])
        # fmc[i][t]: column i of the FMC frame, row t.
        out.write(''.join(',%f' % fmc[i][t] for i in NS))
        out.write('\n')

In [7]:
# Aggregate the daily soil moisture (SM) products and the FMC series into
# monthly means, for a monthly version of the daily correlation.
# mdata gets one DataFrame per site: the rows are 'YYYY-MM' months and the
# columns are the product names followed by 'FMC'.
mdata = []

# Column names: the product file names without their '.csv' extension.
names = [products[i][0:-4] for i in range(len(products))]

# fmct holds days since 1900-01-01; its first and last entries give the
# range of years to cover.
fmct = np.array(fmct, dtype=int)
start = datetime(1900, 1, 1) + timedelta(days = int(fmct[0]))
end =   datetime(1900, 1, 1) + timedelta(days = int(fmct[-1]))

# The month windows do not depend on the site, so they are built only once.
months, windows = [], []
for y in range(start.year, end.year+1):
    for m in range(1,13):
        # boundaries of the month, as days since 1900-01-01: [t0, tn)
        t0 = (datetime(y, m, 1) - datetime(1900, 1, 1)).days
        # The first of the month plus 40 days always falls in the next month,
        # whatever the month length; snapping to day 1 gives the upper bound.
        tn = datetime(y, m, 1) + timedelta(days=40)
        tn = (datetime(tn.year, tn.month, 1) - datetime(1900, 1, 1)).days

        g = np.arange(t0, tn)                         # every day of the month
        w = np.where((fmct >= t0) & (fmct < tn))[0]   # positions of the FMC samples in the month
        windows.append((g, w))
        months.append(datetime(y, m, 1).strftime('%Y-%m'))

for s in range(len(sites)):
    sdata = []
    for g, w in windows:
        # Monthly mean of each product. reindex() gives NaN for days the
        # product does not have, whereas a plain [list of labels] lookup
        # raises KeyError for missing labels in current pandas.
        aux = [np.nanmean(data[p][s].reindex(g)) for p in range(len(products))]
        # Monthly mean of FMC goes in the last column.
        aux.append(np.nanmean(fmc[s][w]))
        sdata.append(aux)

    # sdata is the dataframe for just one site
    sdata = pd.DataFrame(sdata, columns = names +['FMC'], index = months)
    mdata.append(sdata)

# mdata has all the data for all the sites

#mdata has all the data for all the sites
#print(mdata)



In [12]:
# plots of mdata

# Creating one figure per site with one scatter subplot per product
# (monthly product mean on the x axis, monthly FMC mean on the y axis) plus
# the best fitted curve.
NS = range(len(sites))
NP = range(len(products))
# r² of the best model per (site, product); -1 marks entries never filled in.
table = np.full((len(sites), len(products)), -1.0)

for si in NS:
    fig, axes = plt.subplots(nrows=2,ncols=6, figsize=(20,7))
    panels = axes.flatten()

    # Monthly FMC of this site; it is the same for every product panel.
    Ylr = np.asarray(mdata[si]['FMC'], dtype=float)

    for j, ax in zip(NP, panels):

        Xlr = np.asarray(mdata[si][names[j]], dtype=float)

        # scatter plot
        ax.scatter(Xlr, Ylr, label='_nolegend_')
        ax.set_title(names[j])

        # gathering only real values (removing nan)
        mask = np.isfinite(Xlr) & np.isfinite(Ylr)

        # fitting the best model
        xm, ym, r2 = fit_best(Xlr[mask], Ylr[mask])
        table[si,j] = r2
        strlin = str('r$^2$ = %f' % r2)
        ax.plot(xm, ym, 'k-', lw = 2, label = strlin)

        ax.legend()
        ax.grid()

    # The grid has more panels than products; hide the unused ones.
    for ax in panels[len(products):]:
        ax.set_visible(False)

    fig.suptitle(sites[si][0])
    plt.tight_layout()
    plt.subplots_adjust(top=0.90)
    plt.savefig(str('multi_products/monthly_best/monthlymean_%2.2i.png' % si))
    plt.close(fig)


# Rows are labelled with the site names, columns with the product names.
table = pd.DataFrame(table,
                     index = [sites[i][0] for i in NS],
                     columns = [x[:-4] for x in products])
table.to_csv('multi_products/multi_month_best_r2.csv')
    