<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preamble" data-toc-modified-id="Preamble-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preamble</a></span></li><li><span><a href="#Different-Regions" data-toc-modified-id="Different-Regions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Different Regions</a></span></li><li><span><a href="#Functions" data-toc-modified-id="Functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Functions</a></span></li><li><span><a href="#Indivual-Phases" data-toc-modified-id="Indivual-Phases-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Indivual Phases</a></span></li><li><span><a href="#Bootstrap" data-toc-modified-id="Bootstrap-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Bootstrap</a></span><ul class="toc-item"><li><span><a href="#Create-Count-Array" data-toc-modified-id="Create-Count-Array-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Create Count Array</a></span></li><li><span><a href="#Functions" data-toc-modified-id="Functions-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Functions</a></span></li><li><span><a href="#Running-and-Saving" data-toc-modified-id="Running-and-Saving-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Running and Saving</a></span></li></ul></li></ul></div>

# Preamble

In [1]:
##############################

import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dask.array
import cartopy.crs as ccrs
import pickle
import matplotlib.colors as colors
import datetime as dt
rb = plt.cm.RdBu
bm = plt.cm.Blues

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the RMM table (its Date / Phase / Amplitude columns are used further down).
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted file.
path = 'RMM.pickle'
# The context manager closes the file handle once the object has been read.
with open(path, 'rb') as pickle_in:
    RMM = pickle.load(pickle_in)

In [3]:
# %load /home/563/ab2313/MJO/get_awap.py
import sys

def get_platform():
    """Return a readable name for the operating system Python is running on.

    Returns
    -------
    str
        'Linux', 'OS X' or 'Windows' when ``sys.platform`` is a known value,
        otherwise the raw ``sys.platform`` string unchanged.
    """
    platforms = {
        'linux' : 'Linux',   # Python 3.3+ reports plain 'linux' (no version suffix)
        'linux1' : 'Linux',
        'linux2' : 'Linux',
        'darwin' : 'OS X',
        'win32' : 'Windows'
    }

    # Fall back to the raw identifier for anything not in the table.
    return platforms.get(sys.platform, sys.platform)


# Choose the directory that holds the large data files, depending on the host OS.
platform = get_platform()

# Earlier Linux location, kept for reference:
#     path = '/home/student.unimelb.edu.au/aborowiak/Desktop/Code/Scripts/big_files/'
path = (
    '/Users/alexborowiak/Desktop/large_files/'
    if platform == 'OS X'
    else '/home/563/ab2313/big_files/'
)


# precip = xr.open_dataset(path + 'AWAP_w.nc')


In [4]:

# Open the AWAP file as dask-backed chunks (whole time axis per chunk, 50 x 50
# lat/lon tiles) and keep only the 'precip' variable.
precip = xr.open_dataset(
    path + 'AWAP_W.nc',
    chunks=dict(time=-1, lat=50, lon=50),
)['precip']

In [5]:
# Move every RMM date stamp forward by 9 hours (the rainfall time axis shown
# further down is stamped at 09:00).
# NOTE: this rebinds RMM, so re-running the cell adds another 9 hours each time.
RMM = (
    RMM
    .reset_index()
    .assign(Date=lambda frame: frame['Date'] + pd.to_timedelta('9h'))
    .set_index('Date')
)

# Different Regions

The rainfall is divided into three longitude regions, each with its own set of MJO phases that count as enhanced.

In [6]:
# Longitude bands passed to precip.sel(lon=...), one slice per region.
regions = np.array([slice(110, 120),slice(120.25, 140),slice(140.25, 156.25)])

# MJO phases treated as "enhanced" for each region above (same order).
# The rows have different lengths, so dtype=object is required: it keeps this a
# 1-D array of lists. Without it NumPy >= 1.24 raises ValueError for ragged input.
mjo_enhanced_list = np.array([[4,5],[4,5,6],[4,5,6,7]], dtype=object)

In [7]:
# Build three rainfall arrays, one per MJO state (enhanced, suppressed, inactive).
# Each keeps the time axis of `precip`; days that do not belong to the state are NaN.
# The enhanced phases differ between the longitude regions, so every region is
# masked on its own and the pieces are then stitched back together.


def _mask_precip_by_mjo(select_days):
    """Mask `precip` region by region and recombine the regions.

    Parameters
    ----------
    select_days : callable
        Called with the list of enhanced phases of a region; must return a
        boolean mask over the rows of `RMM` marking the days to keep.

    Returns
    -------
    xarray.DataArray
        Rainfall for all regions, NaN on the days that were not selected.
    """
    region_data = []
    for region, enhanced_phases in zip(regions, mjo_enhanced_list):
        precip_region = precip.sel(lon = region)
        kept_dates = np.array(RMM[select_days(enhanced_phases)].index)
        region_data.append(precip_region.where(precip_region.time.isin(kept_dates)))

    # The regions cover separate longitude bands, so combine_first stitches them together.
    combined = region_data[0]
    for piece in region_data[1:]:
        combined = combined.combine_first(piece)
    return combined


########## Enhanced: amplitude >= 1 and the phase is one of the region's enhanced phases
enhanced_precip = _mask_precip_by_mjo(
    lambda phases: np.logical_and(RMM['Phase'].isin(phases), RMM['Amplitude'] >= 1)
)

########## Suppressed: amplitude >= 1 and the phase is NOT one of the enhanced phases
suppressed_precip = _mask_precip_by_mjo(
    lambda phases: np.logical_and(~RMM['Phase'].isin(phases), RMM['Amplitude'] >= 1)
)

########## Inactive: amplitude < 1, regardless of phase
inactive_precip = _mask_precip_by_mjo(lambda phases: RMM['Amplitude'] < 1)

In [8]:
# Monthly climatological thresholds: for every calendar month, the 90th and 95th
# percentile (NaNs ignored) of all rainfall in `precip` along the time axis.
# Values reaching these thresholds are what get counted as extreme later on.
climatology_90, climatology_95 = (
    precip.groupby('time.month').reduce(np.nanpercentile, q = level, dim = 'time')
    for level in (90, 95)
)

The values at or above the 90th percentile are then extracted for each phase of the MJO.

In [9]:
# This returns just the extremes. 

def return_extremes(precip, threshold):
    """Keep only the values that reach the threshold of their calendar month.

    Parameters
    ----------
    precip : xarray.DataArray
        Rainfall with a ``time`` dimension.
    threshold : xarray.DataArray
        Thresholds with a ``month`` coordinate (e.g. a monthly percentile
        climatology).

    Returns
    -------
    xarray.DataArray
        Data for October-March only. Values below the threshold of their month
        are NaN; time steps from other months are not included.
    """
    months = [10,11,12,1,2,3]
    extreme_xr = None

    for month in months:
        # Time steps of this month only, compared against this month's threshold.
        precip_month = precip.where(precip.time.dt.month == month, drop = True)
        threshold_month = threshold.sel(month = month)
        precip_month_ex = precip_month.where(precip_month >= threshold_month)

        # Merge the months back into one array along time.
        if extreme_xr is None:
            extreme_xr = precip_month_ex
        else:
            extreme_xr = extreme_xr.combine_first(precip_month_ex)

    return  extreme_xr
    

In [10]:
# Extremes relative to the 90th-percentile climatology, for each MJO state and
# for the unsplit rainfall.
enhanced_90, suppressed_90, inactive_90, precip_90 = (
    return_extremes(rainfall, climatology_90)
    for rainfall in (enhanced_precip, suppressed_precip, inactive_precip, precip)
)

In [17]:
# Stack the four 90th-percentile extreme arrays along a new labelled 'mjo' dimension.
data_90 = xr.concat(
    [enhanced_90, suppressed_90, inactive_90, precip_90],
    dim=pd.Index(['enhanced','suppressed','inactive','precip'], name='mjo'),
)

In [11]:
# Extremes relative to the 95th-percentile climatology, for each MJO state and
# for the unsplit rainfall.
enhanced_95, suppressed_95, inactive_95, precip_95 = (
    return_extremes(rainfall, climatology_95)
    for rainfall in (enhanced_precip, suppressed_precip, inactive_precip, precip)
)

In [60]:
# Stack the four 95th-percentile extreme arrays along a new labelled 'mjo' dimension.
data_95 = xr.concat(
    [enhanced_95, suppressed_95, inactive_95, precip_95],
    dim=pd.Index(['enhanced','suppressed','inactive','precip'], name='mjo'),
)

# Functions

In [27]:
def into_xr(data, orgininal, name = 'precip'):
    """Wrap ``data`` in a (lat, lon) DataArray that borrows its grid from ``orgininal``.

    Parameters
    ----------
    data : array-like
        Values laid out as (lat, lon).
    orgininal : object with ``lat`` and ``lon`` attributes
        Source of the coordinate values.
    name : str, optional
        Name given to the new DataArray.
    """
    grid_dims = ['lat', 'lon']
    grid_coords = {dim: getattr(orgininal, dim) for dim in grid_dims}
    return xr.DataArray(data, dims=grid_dims, coords=grid_coords, name=name)

In [28]:
def calculate_countfrac(data_sub,data_all,per):
    """Ratio of observed to expected frequency of extreme values.

    Parameters
    ----------
    data_sub : xarray.DataArray
        Extremes only (everything else NaN), with a ``time`` dimension.
    data_all : xarray.DataArray
        All rainfall values, with a ``time`` dimension.
    per : float
        Expected fraction of extremes, e.g. 0.1 for a 90th-percentile threshold.

    Returns
    -------
    xarray.DataArray
        (lat, lon) array named 'frac_count'. A value of 2 means extremes occurred
        twice as often as expected; 1 means exactly as often as expected.
    """
    # count() only counts the non-NaN entries along time.
    count_sub = data_sub.count(dim = 'time')
    count_all = data_all.count(dim = 'time')

    # Observed fraction of extremes, scaled by the expected fraction
    # (e.g. 0.2 / 0.1 = 2 for the 90th percentile).
    frac = (count_sub / count_all) / per

    # The result is deliberately not stored in a local called `xr`: that would
    # shadow the xarray module inside this function.
    return into_xr(frac, data_all, 'frac_count')

In [29]:
def count_all_months_and_regions(phase_precip, all_precip,per):
    """Run calculate_countfrac separately for each month from October to March.

    Parameters
    ----------
    phase_precip : xarray.DataArray
        Extremes only, with a ``time`` dimension.
    all_precip : xarray.DataArray
        All rainfall values, with a ``time`` dimension.
    per : float
        Expected fraction of extremes, passed through to calculate_countfrac.

    Returns
    -------
    xarray.DataArray
        The monthly results concatenated along a new 'month' dimension
        (10, 11, 12, 1, 2, 3).
    """
    months = [10,11,12,1,2,3]

    def _only_month(field, month):
        # Keep just the time steps that fall in the requested calendar month.
        return field.where(field.time.dt.month == month, drop = True)

    monthly_fracs = [
        calculate_countfrac(_only_month(phase_precip, month), _only_month(all_precip, month), per)
        for month in months
    ]

    return xr.concat(monthly_fracs, pd.Index(months, name = 'month'))
    
    
    

# Indivual Phases

In [30]:
# A 90th-percentile threshold is expected to be exceeded 10% of the time.
per = 0.1
enhanced_count90, suppressed_count90, inactive_count90 = (
    count_all_months_and_regions(extremes, rainfall, per)
    for extremes, rainfall in (
        (enhanced_90, enhanced_precip),
        (suppressed_90, suppressed_precip),
        (inactive_90, inactive_precip),
    )
)

In [31]:
# Stack the three MJO states along a labelled 'mjo' dimension.
count_90 = xr.concat(
    [enhanced_count90, suppressed_count90, inactive_count90],
    dim=pd.Index(['enhanced','suppressed','inactive'], name='mjo'),
)

In [32]:
# A 95th-percentile threshold is expected to be exceeded 5% of the time.
per = 0.05
enhanced_count95, suppressed_count95, inactive_count95 = (
    count_all_months_and_regions(extremes, rainfall, per)
    for extremes, rainfall in (
        (enhanced_95, enhanced_precip),
        (suppressed_95, suppressed_precip),
        (inactive_95, inactive_precip),
    )
)

In [33]:
# Stack the three MJO states along a labelled 'mjo' dimension.
count_95 = xr.concat(
    [enhanced_count95, suppressed_count95, inactive_count95],
    dim=pd.Index(['enhanced','suppressed','inactive'], name='mjo'),
)

In [34]:
# Write both count-fraction arrays to netCDF; set `save` to 0 to skip writing.
save = 1
if save:
    for out_name, out_counts in (('count_95.nc', count_95), ('count_90.nc', count_90)):
        out_counts.to_netcdf(out_name)

# Bootstrap

Steps:

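* 1: Creating the count array. This counts all of the rainfall events that occur, for both the normal and the extreme arrays, separately for every month of every year. This is useful because each bootstrap sample then only needs a sum across the selected years.
* 2: The bootstrap. For each month, years are drawn at random with replacement, the yearly counts of the drawn years are summed, and the count fraction is recalculated. The 5th and 95th percentiles across all runs are kept.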

In [35]:
enhanced_precip

<xarray.DataArray 'precip' (time: 7822, lat: 53, lon: 178)>
dask.array<shape=(7822, 53, 178), dtype=float32, chunksize=(7822, 50, 50)>
Coordinates:
  * lon      (lon) float64 112.0 112.2 112.5 112.8 ... 155.5 155.8 156.0 156.2
  * time     (time) datetime64[ns] 1974-10-01T09:00:00 ... 2017-12-31T09:00:00
  * lat      (lat) float64 -23.0 -22.75 -22.5 -22.25 ... -10.5 -10.25 -10.0
Attributes:
    long_name:  Daily Precipitation
    units:      mm

In [36]:
enhanced_90

<xarray.DataArray 'precip' (time: 7822, lat: 53, lon: 178)>
dask.array<shape=(7822, 53, 178), dtype=float32, chunksize=(7822, 50, 50)>
Coordinates:
  * time     (time) datetime64[ns] 1974-10-01T09:00:00 ... 2017-12-31T09:00:00
  * lon      (lon) float64 112.0 112.2 112.5 112.8 ... 155.5 155.8 156.0 156.2
  * lat      (lat) float64 -23.0 -22.75 -22.5 -22.25 ... -10.5 -10.25 -10.0
Attributes:
    long_name:  Daily Precipitation
    units:      mm

## Create Count Array

The count array is created to speed up the bootstrapping.
Since the final count is the total over every month,
the count for each month can be computed once in advance;
the final result is then just the sum of these.

In [37]:
def each_year_count(data):
    """Count the non-NaN values per calendar year, separately for each month.

    Parameters
    ----------
    data : xarray.DataArray
        Array with a ``time`` dimension.

    Returns
    -------
    xarray.DataArray
        Yearly counts concatenated along a new 'month' dimension
        (10, 11, 12, 1, 2, 3).
    """
    months = [10,11,12,1,2,3]

    def _yearly_count(month):
        # Restrict to one calendar month, then count within each year.
        in_month = data.where(data.time.dt.month == month, drop = True)
        return in_month.groupby('time.year').count(dim = 'time')

    per_month = [_yearly_count(month) for month in months]

    return xr.concat(per_month, pd.Index(months, name = 'month'))
        
        


In [38]:
# Per-year, per-month counts of the 90th-percentile extremes for each MJO state.
en_90_year_count, sup_90_year_count, inact_90_year_count = (
    each_year_count(extremes) for extremes in (enhanced_90, suppressed_90, inactive_90)
)

year_count_90 = xr.concat(
    [en_90_year_count, sup_90_year_count, inact_90_year_count],
    dim=pd.Index(['enhanced', 'suppressed','inactive'], name='mjo'),
)

In [39]:
# Per-year, per-month counts of the 95th-percentile extremes for each MJO state.
en_95_year_count, sup_95_year_count, inact_95_year_count = (
    each_year_count(extremes) for extremes in (enhanced_95, suppressed_95, inactive_95)
)

year_count_95 = xr.concat(
    [en_95_year_count, sup_95_year_count, inact_95_year_count],
    dim=pd.Index(['enhanced', 'suppressed','inactive'], name='mjo'),
)

In [40]:
# Per-year, per-month counts of all rainfall values for each MJO state.
en_year_count, sup_year_count, inact_year_count = (
    each_year_count(rainfall)
    for rainfall in (enhanced_precip, suppressed_precip, inactive_precip)
)

year_count = xr.concat(
    [en_year_count, sup_year_count, inact_year_count],
    dim=pd.Index(['enhanced', 'suppressed','inactive'], name='mjo'),
)

In [41]:
en_90_year_count

<xarray.DataArray 'precip' (month: 6, year: 44, lat: 53, lon: 178)>
dask.array<shape=(6, 44, 53, 178), dtype=float64, chunksize=(1, 1, 50, 50)>
Coordinates:
  * year     (year) int64 1974 1975 1976 1977 1978 ... 2013 2014 2015 2016 2017
  * lon      (lon) float64 112.0 112.2 112.5 112.8 ... 155.5 155.8 156.0 156.2
  * lat      (lat) float64 -23.0 -22.75 -22.5 -22.25 ... -10.5 -10.25 -10.0
  * month    (month) int64 10 11 12 1 2 3

## Functions

In [42]:
def select_additional(data_sub,data,uniqueValues, occurCount, number):
    """Add extra copies of the years that were drawn more than once.

    A year drawn k times must contribute k times to the bootstrap sample.
    ``data_sub`` already holds one copy of every drawn year; this adds the
    remaining k - 1 copies, relabelled to years that do not clash.

    Parameters
    ----------
    data_sub : xarray.DataArray
        The sample built so far, with a ``year`` coordinate.
    data : xarray.DataArray
        The complete data that the copies are taken from.
    uniqueValues : array-like of int
        The distinct years that were drawn.
    occurCount : array-like of int
        How often each entry of ``uniqueValues`` was drawn.
    number : int
        Index of the first extra copy; copy ``n`` is stored under ``year + n * 100``.

    Returns
    -------
    xarray.DataArray
        ``data_sub`` extended by the extra copies (unchanged if no year repeats).
    """
    years = np.asarray(uniqueValues)
    remaining = np.asarray(occurCount)

    # One pass per extra copy. This replaces the earlier recursion, which passed
    # `data` and `data_sub` in swapped order and so returned the full data set
    # instead of the sample whenever a year was repeated.
    while np.any(remaining >= 2):
        # Years that still need at least one more copy
        multi_sampled_years = years[remaining >= 2]

        # Take those years from the complete data ...
        additional = data.where(data.year.isin(multi_sampled_years), drop = True)
        # ... and shift them by n * 100 years so they don't overlap existing labels
        additional['year'] = additional.year +  number * 100

        data_sub = data_sub.combine_first(additional)

        # One copy has been added for each of these years
        remaining = remaining - 1
        number += 1

    return data_sub

In [48]:
def calculate_sum_of_count_frac(data_sub,data_all,per):
    """Observed-to-expected ratio of extremes, from counts summed over years.

    Parameters
    ----------
    data_sub : counts of extreme values, with a ``year`` dimension
    data_all : counts of all values, with a ``year`` dimension
    per : float
        Expected fraction of extremes (e.g. 0.1 for the 90th percentile).

    Returns
    -------
    The summed extreme count divided by the summed total count, divided by
    ``per``. A value of 2 means twice as many extremes as expected.
    """
    extreme_total = data_sub.sum(dim = 'year')
    overall_total = data_all.sum(dim = 'year')

    # Observed fraction first, then scale by the expected fraction.
    return (extreme_total / overall_total) / per

In [49]:
# Defunct

def bootstrap_phase_count_def(precip_ex ,precip, per, loops):
    """Defunct: bootstrap the count fraction for one month of one MJO state.

    Parameters
    ----------
    precip_ex : xarray.DataArray
        Yearly counts of extreme values, with a ``year`` coordinate.
    precip : xarray.DataArray
        Yearly counts of all values, with a ``year`` coordinate.
    per : float
        Expected fraction of extremes (e.g. 0.1 for the 90th percentile).
    loops : int
        Number of bootstrap runs.

    Returns
    -------
    xarray.DataArray
        One count fraction per run, concatenated along 'run_num'.
    """
    total_boot = []

    # Draw from the years actually present. np.random.randint(1974, 2017, 22)
    # excluded its upper bound, so the final year was never sampled.
    years = precip.year.values
    n_draws = len(years) // 2   # half as many draws as there are years

    for _ in range(loops):

        # Random years, drawn with replacement
        rand_years = np.random.choice(years, size = n_draws)

        # Subsetting both arrays to the random years
        precip_ex_rand = precip_ex.where(precip_ex.year.isin(rand_years))
        precip_rand = precip.where(precip.year.isin(rand_years))

        # Years that were drawn more than once need to be added again
        uniqueValues, occurCount = np.unique(rand_years, return_counts=True)
        number = 1
        # The extra copies of the extremes must come from precip_ex, not precip
        precip_ex_rand = select_additional(precip_ex_rand, precip_ex,
                                           uniqueValues, occurCount, number)

        precip_rand = select_additional(precip_rand, precip,
                                        uniqueValues, occurCount, number)

        # Count fraction for this run
        run_boot = calculate_sum_of_count_frac(precip_ex_rand, precip_rand, per)

        total_boot.append(run_boot)

    total_boot = xr.concat(total_boot, 'run_num')

    return total_boot

In [50]:
# Defunct
def bootstrap_each_month_count_def(precip_ex, precip, per, loops):
    """Defunct: run bootstrap_phase_count_def for each month from October to March.

    Parameters
    ----------
    precip_ex : xarray.DataArray
        Yearly counts of extreme values, with ``month`` and ``year`` coordinates.
    precip : xarray.DataArray
        Yearly counts of all values, with ``month`` and ``year`` coordinates.
    per : float
        Expected fraction of extremes.
    loops : int
        Number of bootstrap runs per month.

    Returns
    -------
    xarray.DataArray
        The monthly bootstrap results concatenated along 'month'.
    """
    months = [10,11,12,1,2,3]
    xr_month_storage = []

    for month in months:
        # Extracting just the single month
        precip_ex_month = precip_ex.sel(month = month)
        precip_month = precip.sel(month = month)

        # The helper is named bootstrap_phase_count_def; the previous call to
        # `bootstrap_phase_count` referred to a name that is not defined.
        count_month = bootstrap_phase_count_def(precip_ex_month , precip_month, per, loops)

        # Storing the results
        xr_month_storage.append(count_month)

    xr_file = xr.concat(xr_month_storage, pd.Index(months, name = 'month'))

    return xr_file

In [51]:
# This function takes in the count for a single phase of the MJO

def bootstrap_each_month_count(count_ex, count, per, loops, seed = None):
    """Bootstrap the count fraction of one MJO state, month by month.

    For every month from October to March, ``loops`` samples of years are drawn
    with replacement (half as many draws as there are years), the count
    fraction is recalculated for each sample, and the 95th and 5th percentiles
    across the runs are kept.

    Parameters
    ----------
    count_ex : xarray.DataArray
        Yearly counts of extreme values, with ``month`` and ``year`` coordinates.
    count : xarray.DataArray
        Yearly counts of all values, with ``month`` and ``year`` coordinates.
    per : float
        Expected fraction of extremes (e.g. 0.1 for the 90th percentile).
    loops : int
        Number of bootstrap runs per month.
    seed : int, optional
        Seed for a private random generator so that runs can be reproduced.
        With the default (None) the global ``np.random`` state is used.

    Returns
    -------
    xarray.DataArray
        Percentiles of the bootstrapped count fraction, with new dimensions
        'month' (10, 11, 12, 1, 2, 3) and 'percentile' (95, 5).
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    month_storage = []

    # Loop through all of the months. Extract the data for just one month.
    months = [10,11,12,1,2,3]
    for month in months:

        loop_storage = []

        count_month_ex = count_ex.sel(month = month)
        count_month = count.sel(month = month)

        # Draw from the years actually present. np.random.randint(1974, 2017, 22)
        # excluded its upper bound, so the final year was never sampled.
        years = count_month.year.values
        n_draws = len(years) // 2

        # Looping through all of the different runs
        for _ in range(loops):

            ########### Selecting Random Data
            # Random years, drawn with replacement
            rand_years = rng.choice(years, size = n_draws)

            # Subsetting both arrays to the random years
            count_ex_rand = count_month_ex.where(count_month_ex.year.isin(rand_years)) # Extremes
            count_rand = count_month.where(count_month.year.isin(rand_years)) # Normal

            # Years that were drawn more than once need to be added again
            uniqueValues, occurCount = np.unique(rand_years, return_counts=True)

            # (data_sub, data)
            count_ex_rand = select_additional(count_ex_rand,count_month_ex, uniqueValues, occurCount, number = 1)
            count_rand = select_additional(count_rand, count_month,uniqueValues, occurCount, number = 1)

            ########### Actual Calculation
            # (data_sub, data_all)
            boot_run = calculate_sum_of_count_frac(count_ex_rand, count_rand, per)

            loop_storage.append(boot_run)

        # Summarise the runs of this month by their 95th and 5th percentile
        month_data = xr.concat(loop_storage, 'run_num')
        month_95 = month_data.reduce(np.nanpercentile, q = 95, dim = 'run_num')
        month_5 = month_data.reduce(np.nanpercentile, q = 5, dim = 'run_num')

        month_boot = xr.concat([month_95, month_5], pd.Index([95,5], name = 'percentile'))
        month_storage.append(month_boot)

    total_boot = xr.concat(month_storage, pd.Index(months, name = 'month'))

    return total_boot
    
    
    

## Running and Saving

In [52]:
# Bootstrap the 90th-percentile count fraction for each MJO state.
loops = 1000
per = 0.1

boot_90_by_state = {}
for name in ('enhanced', 'suppressed', 'inactive'):
    boot_90_by_state[name] = bootstrap_each_month_count(
        year_count_90.sel(mjo = name), year_count.sel(mjo = name), per , loops
    )

enhanced_boot = boot_90_by_state['enhanced']
suppressed_boot = boot_90_by_state['suppressed']
inactive_boot = boot_90_by_state['inactive']

In [58]:
# Stack the three bootstrap results along 'mjo' and write them to disk.
# The *_boot names must still hold the 90th-percentile results at this point.
count_boot_90 = xr.concat(
    [enhanced_boot, suppressed_boot, inactive_boot],
    dim=pd.Index(['enhanced', 'suppressed','inactive'], name='mjo'),
)

count_boot_90.to_netcdf('count_boot_90.nc')

In [56]:
# Bootstrap the 95th-percentile count fraction for each MJO state.
# NOTE: this rebinds enhanced_boot / suppressed_boot / inactive_boot, so anything
# built from the 90th-percentile results has to be created before this cell runs.
loops = 1000
per = 0.05

boot_95_by_state = {}
for name in ('enhanced', 'suppressed', 'inactive'):
    boot_95_by_state[name] = bootstrap_each_month_count(
        year_count_95.sel(mjo = name), year_count.sel(mjo = name), per , loops
    )

enhanced_boot = boot_95_by_state['enhanced']
suppressed_boot = boot_95_by_state['suppressed']
inactive_boot = boot_95_by_state['inactive']

In [59]:
# Stack the three bootstrap results along 'mjo' and write them to disk.
# The *_boot names must hold the 95th-percentile results at this point.
count_boot_95 = xr.concat(
    [enhanced_boot, suppressed_boot, inactive_boot],
    dim=pd.Index(['enhanced', 'suppressed','inactive'], name='mjo'),
)

count_boot_95.to_netcdf('count_boot_95.nc')