### Notebook for logistic regression fit
#### Fits logistic regression models to data at various timescales, using spatially constant thresholds

In [4]:
# Debugging aid: list the names currently defined in the interactive namespace.
%who

Client	 LocalCluster	 check	 client	 cluster	 data_dir	 datetime	 ds_all	 ds_p	 
ds_p_subset	 file_object	 fname	 i	 iWeek	 main_dir	 myfuncs	 nSets	 new_path	 
np	 os	 out_file	 p	 pFiles	 pNames	 pd	 predSel	 pred_df	 
pred_df_sel	 pred_dict	 progress_file	 progress_text	 sst_dir	 sub_dir	 sys	 threshold	 time_slice	 
varname	 vname	 x_new	 xr	 


In [None]:
%%time

## Fitting the model to P-E-Q by season

import sys

# Register the project directory on the module search path (presumably where
# the local helper module imported below lives). Skip it when it is already
# registered so that re-running the cell does not add duplicates.
new_path = '/home/566/ad9701/drought_probability/'
if sys.path.count(new_path) == 0:
    sys.path.append(new_path)

import xarray as xr
import numpy as np
import pandas as pd
import datetime
import os
import my_glmfit_funcs as myfuncs

# Root directory under which the per-variable, per-timescale data are looked up
main_dir = '/g/data/w97/ad9701/p_prob_analysis/temp_files/'

# `varname` names the directory and files; `vname` names the variable stored
# inside those files. Both are the same for P-E-Q.
varname = vname = 'PminusEQ'
# Glob pattern for the input files: <varname>_*_*_*.nc
fname = '_'.join([varname, '*', '*', '*']) + '.nc'

# Timescale for the analysis -- select only one value.
# Values listed previously: [2, 6, 12] and [2, 6, 8, 12]
iWeek = 6

# Spatially uniform thresholds to use for the analysis
threshold = [20, 50, 100]

############################################
# GET THE SST PREDICTORS
############################################

# Each predictor is read from its own file; pNames[k] is the variable taken
# from pFiles[k], so the two lists must stay in the same order.
sst_dir = '/g/data/w97/ad9701/p_prob_analysis/sst_data/'
pNames = ['soi', 'sami', 'dmi', 'nino34_anom', 'nino4_anom']
pFiles = ['soi_monthly.nc', 'newsam.1957.2021.nc', 'dmi.had.long.data.nc', 'nino34.long.anom.data.nc', 'nino4.long.anom.data.nc']

# The dataset from the first file becomes the container for all predictors ...
ds_p = xr.open_dataset(sst_dir + pFiles[0])
# ... and the named variable from every remaining file is attached to it.
# NOTE(review): the assigned variables presumably get aligned to ds_p's
# existing coordinates by xarray -- confirm the time axes are compatible.
for p in range(1, len(pNames)):
    ds_temp = xr.open_dataset(sst_dir + pFiles[p])
    ds_p[pNames[p]] = ds_temp[pNames[p]]
    del ds_temp

# Predictors to include in the model. These are now also set as the default
# values of the corresponding arguments of myfuncs.fit_gridded_logistReg,
# which is called further below.
predSel = ['soi', 'dmi']
# formula = 'response ~ soi+dmi'
# parameter = ['Intercept']
# parameter.extend(predSel)

# Build a new dataframe of sample points at which 'predictions' will be made
# using the fitted model.
time_slice = slice('1911-01-01','2020-05-31')
ds_p_subset = ds_p.sel(time = time_slice)

# One column per SST predictor, plus the season label of each time step
pred_dict = {name: ds_p_subset[name].values for name in pNames}
pred_dict["season"] = ds_p_subset['time.season'].values

# Dataframe of all predictors indexed by time, then cut down to the selection
pred_df = pd.DataFrame(pred_dict, index = ds_p_subset['time'])
pred_df_sel = pred_df[predSel]
x_new = myfuncs.createSampleDf(pred_df_sel, list(pred_df_sel.columns))

############################################
# START A LOCAL CLUSTER
############################################

from dask.distributed import Client, LocalCluster

# Start a local cluster with default settings and connect a client to it.
# Both names are kept at top level because later cells refer to them.
# (A bare `client` expression here would not be displayed, since it is not
# the last statement of the cell, so it is omitted.)
cluster = LocalCluster()
client = Client(cluster)

#############################################
# PERFORM CALCULATIONS FOR EACH SET OF DATA
#############################################

def _append_progress(path, text):
    """Append `text` to the progress log at `path` (the file is created if missing)."""
    with open(path, "a") as file_object:
        file_object.write(text)


nSets = (7*iWeek)-1    # number of sets in addition to the original aggregation

# NOTE: this is a relative path, so it is resolved against the current working directory.
progress_file = "drought_probability/logistRegr_progress.txt"

# NOTE(review): the start index 22 is hard-coded (first directory handled is
# 'set24'); presumably earlier sets were processed in a previous run -- confirm.
for i in range(22, nSets):
    sub_dir = '/set' + str(i+2)
    data_dir = main_dir + varname + '_week' + str(iWeek) + '/' + sub_dir + '/'
    out_file = data_dir + 'GLM_results_' + '_'.join(predSel) + '_bySeason.nc'
    set_label = varname + '/week' + str(iWeek) + sub_dir

    # The existence of the output file is the "already done" marker.
    if os.path.isfile(out_file):
        _append_progress(progress_file, "\n" + set_label + ' is already done')
        continue

    # Log the start time, fit the model for this set, then log the end time.
    _append_progress(progress_file, "\n" + set_label + ' ' + str(datetime.datetime.now()))
    ds_all = myfuncs.fit_gridded_logistReg(main_dir = main_dir, varname = varname, iWeek = iWeek, threshold = threshold,
                                           sub_dir = sub_dir, ds_p = ds_p, x_new = x_new)

    # Write to a temporary name first and rename only after the write has
    # finished. Otherwise an interrupted write would leave a partial file at
    # `out_file`, which the check above would treat as a completed set.
    tmp_file = out_file + '.tmp'
    ds_all.to_netcdf(tmp_file)
    os.replace(tmp_file, out_file)

    _append_progress(progress_file, ' end time:' + str(datetime.datetime.now()))

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46785 instead


In [3]:
# Request that the cluster be scaled to zero cores.
# NOTE(review): relies on `cluster` from a previously executed cell; the
# execution count (In [3]) shows this cell was run out of notebook order.
cluster.scale(cores = 0)

In [5]:
# Alternative to the local cluster: create a SLURM-backed dask cluster.
# NOTE: this rebinds `cluster` and `client`, replacing the objects created by
# the local-cluster cell above.
from dask.distributed import Client,Scheduler
from dask_jobqueue import SLURMCluster
# Cluster configured with 4 cores and 31GB memory (presumably per SLURM job -- confirm)
cluster = SLURMCluster(cores=4,memory="31GB")
client = Client(cluster)
# Ask for 4 cores' worth of workers. The output recorded below still shows 0
# workers, presumably because the job had not started yet when it was rendered.
cluster.scale(cores=4)
# Last expression of the cell: displays the client summary
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.0.128.149:34533,Workers: 0
Dashboard: /proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B
