This repository has been archived by the owner on May 12, 2021. It is now read-only.
/
metrics.py
executable file
·2550 lines (2261 loc) · 108 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
'''
Module storing functions to calculate statistical metrics from numpy arrays
'''
import datetime
import subprocess
import sys
import os
import numpy as np
import numpy.ma as ma
from math import floor, log
from toolkit import process
from utils import misc
from storage import files
from pylab import *
import scipy.stats.mstats as mstats
import matplotlib as mpl
import matplotlib.dates
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from matplotlib.font_manager import FontProperties
from mpl_toolkits.basemap import Basemap
from utils.Taylor import TaylorDiagram
# 6/10/2012 JK: any Ngl dependence will be removed in later versions
#import Ngl
def calc_ann_mean(t2, time):
    '''
    Calculate the annual cycle as monthly means at every grid point.

    NOTE: despite its name, this routine computes the monthly climatology
    (it is identical to calc_annual_cycle_means); converting it to a true
    annual mean is an outstanding TODO inherited from the original code.

    Input:
        t2   - 3-D (time, lat, lon) or 1-D (time) data array
        time - sequence of datetime-like objects with a .month attribute
    Output:
        means - 12-element monthly climatology (3-D input gives a
                (12, lat, lon) masked array; 1-D input gives a length-12 ndarray)
    '''
    # Calendar month (1..12) of every time level
    months = np.array([step.month for step in time])
    if t2.ndim == 3:
        means = ma.empty((12, t2.shape[1], t2.shape[2]))
        for mon in range(1, 13):
            means[mon - 1, :, :] = t2[months == mon, :, :].mean(0)
    if t2.ndim == 1:
        means = np.empty((12))
        for mon in range(1, 13):
            means[mon - 1] = t2[months == mon].mean(0)
    return means
def calc_clim_month(t2, time):
    '''
    Calculate the monthly-mean climatology at every grid point.

    Handles both gridded (3-D: time, lat, lon) and single-point (1-D: time)
    series.

    Input:
        t2   - 3-D or 1-D data array with time on the first axis
        time - sequence of datetime-like objects with a .month attribute
    Output:
        means - 12-element monthly climatology
    '''
    nT = len(time)
    # Calendar month of each time level
    months = np.empty(nT)
    for idx in range(nT):
        months[idx] = time[idx].month
    if t2.ndim == 3:
        # (12, lat, lon) masked array of means, filled month by month
        means = ma.empty((12, t2.shape[1], t2.shape[2]))
        for mon in range(1, 13):
            means[mon - 1, :, :] = t2[months == mon, :, :].mean(0)
    if t2.ndim == 1:
        means = np.empty((12))
        for mon in range(1, 13):
            means[mon - 1] = t2[months == mon].mean(0)
    return means
def calc_clim_year(nYR, nT, ngrdY, ngrdX, t2, time):
    '''
    Calculate the annual-mean time series and climatology.

    Works for both gridded (3-D: time, lat, lon) and point (1-D: time) data.

    Input:
        nYR   - number of years covered by the time axis
        nT    - number of time levels (must equal len(time))
        ngrdY - number of grid rows (ignored for 1-D input)
        ngrdX - number of grid columns (ignored for 1-D input)
        t2    - data array with time on the first axis
        time  - sequence of datetime-like objects with a .year attribute
    Output:
        tSeries - annual-mean time series, shape (nYR[, ngrdY, ngrdX])
        means   - climatological mean of tSeries over all years
    '''
    # Extract the year of every time level (the original also built an
    # unused month array; dropped)
    yy = np.empty(nT)
    for t in np.arange(nT):
        yy[t] = time[t].year
    if t2.ndim == 3:
        tSeries = ma.zeros((nYR, ngrdY, ngrdX))
        for i, year in enumerate(np.unique(yy)):
            # average all time levels belonging to this year
            tSeries[i, :, :] = ma.average(t2[yy == year, :, :], axis = 0)
        means = ma.average(tSeries, axis = 0)
    elif t2.ndim == 1:
        tSeries = ma.zeros((nYR))
        for i, year in enumerate(np.unique(yy)):
            tSeries[i] = ma.average(t2[yy == year], axis = 0)
        # BUGFIX: the original allocated a 2-D (ngrdY, ngrdX) array here and
        # immediately overwrote it; the climatology of a point series is a scalar.
        means = ma.average(tSeries, axis = 0)
    return tSeries, means
def calc_clim_season(nYR, nT, mB, mE, ngrdY, ngrdX, t2, time):
    '''
    Calculate the seasonal-mean time series and climatology.

    The season is the span of calendar months mB..mE inclusive; mE >= mB is
    assumed, so seasons crossing the year boundary (e.g. DJF) are not
    supported.

    Input:
        nYR    - number of years covered by the time axis
        nT     - number of time levels (must equal len(time))
        mB, mE - first and last calendar month of the season (1..12)
        ngrdY  - number of grid rows (ignored for 1-D input)
        ngrdX  - number of grid columns (ignored for 1-D input)
        t2     - 3-D (time, lat, lon) or 1-D (time) data array
        time   - sequence of datetime-like objects with .year/.month
    Output:
        tSeries - seasonal-mean time series, shape (nYR[, ngrdY, ngrdX])
        means   - climatological mean of tSeries over all years
    '''
    # Year and month of every time level
    yy = np.empty(nT)
    mm = np.empty(nT)
    for t in np.arange(nT):
        yy[t] = time[t].year
        mm[t] = time[t].month
    if t2.ndim == 3:
        tSeries = ma.zeros((nYR, ngrdY, ngrdX))
        for i, year in enumerate(np.unique(yy)):
            # time levels in this year AND inside the requested season
            wh = (yy == year) & (mm >= mB) & (mm <= mE)
            tSeries[i, :, :] = ma.average(t2[wh, :, :], axis = 0)
        means = ma.average(tSeries, axis = 0)
    elif t2.ndim == 1:
        tSeries = ma.zeros((nYR))
        for i, year in enumerate(np.unique(yy)):
            wh = (yy == year) & (mm >= mB) & (mm <= mE)
            tSeries[i] = ma.average(t2[wh], axis = 0)
        # BUGFIX: dropped the misleading 2-D (ngrdY, ngrdX) allocation that
        # was immediately overwritten; a point climatology is a scalar.
        means = ma.average(tSeries, axis = 0)
    return tSeries, means
def calc_clim_mo(nYR, nT, ngrdY, ngrdX, t2, time):
    '''
    Calculate monthly means at every grid point and the year-by-month time
    series of a single model, including the single-point case (ndim=1).
    JK20: modified from 'calc_clim_month' with additional arguments & output,
    the annual time series of single model output (mData).

    Input:
        nYR   - number of years covered by the time axis
        nT    - number of time levels (must equal len(time))
        ngrdY - number of grid rows (ignored for 1-D input)
        ngrdX - number of grid columns (ignored for 1-D input)
        t2    - 3-D (time, lat, lon) or 1-D (time) data array
        time  - sequence of datetime-like objects with .year/.month
    Output:
        mData - data rearranged as (nYR, 12[, ngrdY, ngrdX])
        means - 12-element monthly climatology
    '''
    # Extract months; remember the first year as the origin of the year index
    months = np.empty(nT)
    for t in np.arange(nT):
        months[t] = time[t].month
        if t == 0:
            yy0 = time[t].year
    # for a 2-D time series data
    if t2.ndim == 3:
        mData = ma.empty((nYR, 12, ngrdY, ngrdX))
        for t in np.arange(nT):
            yr = time[t].year - yy0
            # 0-based month index; cast to int so it is a valid array index
            mm = int(months[t]) - 1
            mData[yr, mm, :, :] = t2[t, :, :]
        # Calculate means month by month
        means = ma.empty((12, ngrdY, ngrdX))
        for i in np.arange(12) + 1:
            means[i - 1, :, :] = t2[months == i, :, :].mean(0)
    # for a point time series data
    if t2.ndim == 1:
        mData = ma.empty((nYR, 12))
        for t in np.arange(nT):
            yr = time[t].year - yy0
            # BUGFIX: the original used months[t] (1..12) directly as the
            # index, overflowing the size-12 axis for December; the 6/8/2013
            # fix noted in the old docstring was only applied to the 3-D branch.
            mm = int(months[t]) - 1
            mData[yr, mm] = t2[t]
        means = np.empty((12))
        # Calculate means month by month
        for i in np.arange(12) + 1:
            means[i - 1] = t2[months == i].mean(0)
    return mData, means
def calc_clim_One_month(moID, nYR, nT, t2, time):
    '''
    Calculate the monthly mean at every grid point for one specified month.

    Input:
        moID - the calendar month to extract (1..12)
        nYR  - number of years covered by the time axis
        nT   - number of time levels (must equal len(time))
        t2   - 3-D (time, lat, lon) data array (only ndim == 3 is handled)
        time - sequence of datetime-like objects with a .month attribute
    Output:
        mData - (nYR, lat, lon) year-by-year series of the selected month
        means - (lat, lon) mean over all years of the selected month
    '''
    # Calendar month of every time level
    months = np.empty(nT)
    for t in np.arange(nT):
        months[t] = time[t].month
    if t2.ndim == 3:
        mData = ma.empty((nYR, t2.shape[1], t2.shape[2]))
        # BUGFIX: the original ran the month test OUTSIDE the time loop, so
        # only the final time level was ever examined; scan every level and
        # gather each occurrence of the requested month.
        n = 0
        for t in np.arange(nT):
            if months[t] == moID:
                mData[n, :, :] = t2[t, :, :]
                n += 1
        means = ma.empty((t2.shape[1], t2.shape[2]))
        # Mean over all time levels matching moID
        means[:, :] = t2[months == moID, :, :].mean(0)
    return mData, means
def calc_annual_cycle_means(data, time):
    '''
    Calculate monthly means for every grid point.

    Inputs::
        data - masked 3d array of the model data (time, lon, lat), or a 1d
               area-averaged time series
        time - an array of python datetime objects
    Output::
        means - 12-element monthly climatology
    '''
    # Calendar month of each time level
    months = np.array([d.month for d in time])
    # Data varying in time and space
    if data.ndim == 3:
        means = ma.empty((12, data.shape[1], data.shape[2]))
        for mon in range(1, 13):
            means[mon - 1, :, :] = data[months == mon, :, :].mean(0)
    # Time series of area-averaged values
    if data.ndim == 1:
        # TODO - Investigate using ma per KDW
        means = np.empty((12))
        for mon in range(1, 13):
            means[mon - 1] = data[months == mon].mean(0)
    return means
def calc_annual_cycle_std(data, time):
    '''
    Calculate the monthly standard deviation (ddof=1) at every grid point.

    Input:
        data - 3-D (time, lat, lon) data array
        time - sequence of datetime-like objects with a .month attribute
    Output:
        stds - (12, lat, lon) array of per-month standard deviations
    '''
    # Calendar month of each time level
    months = np.array([d.month for d in time])
    stds = np.empty((12, data.shape[1], data.shape[2]))
    for mon in range(1, 13):
        # Sample (ddof=1) standard deviation over all years for this month
        stds[mon - 1, :, :] = data[months == mon, :, :].std(axis = 0, ddof = 1)
    return stds
def calc_annual_cycle_domain_means(data, time):
    '''
    Calculate the domain-mean value for each calendar month.

    Input:
        data - 3-D (time, lat, lon) data array
        time - sequence of datetime-like objects with a .month attribute
    Output:
        means - length-12 array of whole-domain monthly means
    '''
    months = np.array([d.month for d in time])
    means = np.empty(12)
    for mon in range(1, 13):
        # Average over every year and every grid point for this month
        means[mon - 1] = data[months == mon, :, :].mean()
    return means
def calc_annual_cycle_domain_std(data, time):
    '''
    Calculate the domain standard deviation (ddof=1) for each calendar month.

    Input:
        data - 3-D (time, lat, lon) data array
        time - sequence of datetime-like objects with a .month attribute
    Output:
        stds - length-12 array of whole-domain monthly standard deviations
    '''
    months = np.array([d.month for d in time])
    stds = np.empty(12)
    for mon in range(1, 13):
        # Sample (ddof=1) std over every year and grid point for this month
        stds[mon - 1] = data[months == mon, :, :].std(ddof = 1)
    return stds
def calc_bias_annual(t1, t2, optn): # Mean Bias
    '''
    Calculate the mean difference between two fields over time at each grid
    point.

    Preprocessing of both obs and model data is assumed to have removed
    missing values.

    Input:
        t1, t2 - data arrays with time on the first axis
        optn   - 'abs' to average the absolute difference (mean absolute
                 error) instead of the signed difference
    Output:
        bias - time-mean (absolute) difference at each grid point
    '''
    diff = t1 - t2
    # BUGFIX: the original tested the BUILTIN 'open' against 'abs'
    # (if(open == 'abs')), so the option argument was silently ignored and
    # the absolute-value branch could never run; test the parameter instead.
    if optn == 'abs':
        diff = abs(diff)
    bias = diff.mean(axis = 0)
    return bias
def calc_bias(t1, t2):
    '''
    Calculate the mean difference between two fields over time for each grid
    point, masking locations with excessive missing data.

    When averaging multiple working time units (e.g. several months), a grid
    point is only kept if BOTH input series pass the missing-data threshold
    test (threshold=0.75 here, as applied by
    process.create_mask_using_threshold); otherwise the time-average bias at
    that location is classified as missing.
    '''
    # Flag grid points with too much missing data in either series
    obsMask = process.create_mask_using_threshold(t1, threshold = 0.75)
    mdlMask = process.create_mask_using_threshold(t2, threshold = 0.75)
    bias = (t1 - t2).mean(axis = 0)
    # Missing in obs OR model ==> missing in the bias metric
    bias = ma.masked_array(bias.data, np.logical_or(obsMask, mdlMask))
    return bias
def calc_bias_dom(t1, t2):
    '''
    Calculate the domain-mean difference between two fields over time.

    Input:
        t1, t2 - data arrays of identical shape
    Output:
        Scalar mean of (t1 - t2) over all elements.
    '''
    return (t1 - t2).mean()
def calc_difference(t1, t2):
    '''
    Calculate the difference between two fields (t1 - t2) at every time and
    grid point.

    Input:
        t1, t2 - data arrays of identical (or broadcastable) shape
    Output:
        Element-wise difference t1 - t2
    '''
    # PORTABILITY FIX: print as a function call; for a single string this
    # produces identical output under Python 2 and is valid Python 3.
    print('Calculating difference')
    diff = t1 - t2
    return diff
def calc_mae(t1, t2):
    '''
    Calculate the time-mean absolute difference between two fields at each
    grid point, masking locations with excessive missing data.

    As in calc_bias, a grid point is kept only if BOTH input series pass the
    missing-data threshold test (threshold=0.75, via
    process.create_mask_using_threshold); otherwise the time-average MAE at
    that location is classified as missing.
    '''
    # Flag grid points with too much missing data in either series
    obsMask = process.create_mask_using_threshold(t1, threshold = 0.75)
    mdlMask = process.create_mask_using_threshold(t2, threshold = 0.75)
    mae = abs(t1 - t2).mean(axis = 0)
    # Missing in obs OR model ==> missing in the MAE metric
    mae = ma.masked_array(mae.data, np.logical_or(obsMask, mdlMask))
    return mae
def calc_mae_dom(t1, t2):
    '''
    Calculate the domain-mean absolute difference between two fields over
    time.

    Input:
        t1, t2 - data arrays of identical shape
    Output:
        Scalar mean of |t1 - t2| over all elements.
    '''
    return abs(t1 - t2).mean()
def calc_rms(t1, t2):
    '''
    Calculate the root-mean-square difference over time at each grid point.

    Input:
        t1, t2 - data arrays with time on the first axis
    Output:
        RMS of (t1 - t2) along the time axis.
    '''
    # Mean squared difference along the time axis, then its square root
    msd = ((t1 - t2) ** 2).mean(axis = 0)
    return np.sqrt(msd)
def calc_rms_dom(t1, t2):
    '''
    Calculate the whole-domain RMS difference between two fields.

    Input:
        t1, t2 - data arrays of identical shape
    Output:
        Scalar root-mean-square of (t1 - t2) over all elements.
    '''
    msd = ((t1 - t2) ** 2).mean()
    return np.sqrt(msd)
def calc_temporal_stdv(t1):
    '''
    Calculate the temporal standard deviation.

    Input:
        t1 - data array of any shape; the first axis is assumed to be time
    Output:
        Array of (population, ddof=0) standard deviations along the time axis
    '''
    # TODO Make sure the first dimension of t1 is the time axis.
    return t1.std(axis = 0)
def calc_temporal_anom_cor(mD, oD):
    '''
    Calculate the temporal anomaly correlation at each grid point.

    Anomalies of BOTH model and obs are taken about the OBSERVED time mean.
    Assumption(s):
        The first dimension of mD and oD is the time axis.
    Input:
        mD - model data array of any shape
        oD - observation data array of any shape
    Output:
        Array of anomaly correlation coefficients at each grid point.
    REF: 277-281 in Stat methods in atmos sci by Wilks, 1995, Academic Press, 467pp.
    '''
    obsMean = oD.mean(axis = 0)
    mAnom = mD - obsMean
    oAnom = oD - obsMean
    # Normalise the anomaly covariance by the anomaly magnitudes
    denom = np.sqrt((mAnom * mAnom).sum(axis = 0) * (oAnom * oAnom).sum(axis = 0))
    return (mAnom * oAnom).sum(axis = 0) / denom
def calc_spatial_anom_cor(mD, oD):
    '''
    Calculate the anomaly correlation between two 2-D arrays.

    Anomalies of BOTH fields are taken about the OBSERVED spatial mean.
    Input:
        mD - 2-D array of model data
        oD - 2-D array of observation data
    Output:
        The anomaly correlation between the two input arrays.
    '''
    obsMean = oD.mean()
    mAnom = mD - obsMean
    oAnom = oD - obsMean
    # Normalise the anomaly covariance by the anomaly magnitudes
    denom = np.sqrt((mAnom * mAnom).sum() * (oAnom * oAnom).sum())
    return (mAnom * oAnom).sum() / denom
def calc_temporal_pat_cor(t1, t2):
    '''
    Calculate the Temporal Pattern Correlation.

    Input::
        t1 - 3d array of model data
        t2 - 3d array of obs data
    Output::
        2d array of time series pattern correlation coefficients at each grid point.
    **Note:** std_dev is standardized on 1 degree of freedom, while the
    covariance term divides by nt, so perfectly correlated series score
    (nt-1)/nt rather than exactly 1.
    '''
    nt = t1.shape[0]
    anom1 = t1 - t1.mean(axis = 0)
    anom2 = t2 - t2.mean(axis = 0)
    sigma1 = t1.std(axis = 0, ddof = 1)
    sigma2 = t2.std(axis = 0, ddof = 1)
    return ((anom1 * anom2).sum(axis = 0) / nt) / (sigma1 * sigma2)
def calc_spatial_pat_cor(t1, t2):
    '''
    Calculate the pattern correlation between 2-D arrays.
    6/10/2013: JK: Enforce both t1 & t2 have the identical mask before
    calculating std and corr.

    Input:
        t1 - 2-D (masked) array of model data
        t2 - 2-D (masked) array of observation data
    Output:
        Pattern correlation between the two input arrays.
    '''
    # Give both fields the UNION of the two masks so the statistics are
    # computed over exactly the same set of grid points.
    msk1 = ma.getmaskarray(t1)
    msk2 = ma.getmaskarray(t2)
    common = np.logical_or(msk1, msk2)
    t1 = ma.masked_array(ma.getdata(t1), common)
    t2 = ma.masked_array(ma.getdata(t2), common)
    # BUGFIX: the original bound the unmasked point count to the name 'np',
    # shadowing the numpy module (and carried a redundant local
    # 'import numpy as np' to paper over it); use a distinct name.
    nPts = ma.count(t1)
    mt1 = t1.mean()
    mt2 = t2.mean()
    st1 = t1.std()
    st2 = t2.std()
    patcor = ((t1 - mt1) * (t2 - mt2)).sum() / (nPts * st1 * st2)
    return patcor
def calc_pat_cor2D(t1, t2, nT):
    '''
    Calculate the pattern correlation time series between 3-D input arrays.

    Input:
        t1 - 3-D array of model data (time, lat, lon)
        t2 - 3-D array of observation data (time, lat, lon)
        nT - expected number of time levels; checked against t1.shape[0]
    Output:
        1-D array (time series) of pattern correlation coefficients,
        or -1 if the time-level consistency check fails.
    '''
    nt = t1.shape[0]
    if nt != nT:
        print('input time levels do not match: Exit %s %s' % (nT, nt))
        return -1
    # store results in list for convenience (then convert to numpy array at the end)
    patcor = []
    nCells = t1.shape[1] * t1.shape[2]
    for t in range(nt):
        mt1 = t1[t, :, :].mean()
        mt2 = t2[t, :, :].mean()
        sigma_t1 = t1[t, :, :].std()
        sigma_t2 = t2[t, :, :].std()
        # TODO: make means and standard deviations weighted by grid box area.
        patcor.append((((t1[t, :, :] - mt1) * (t2[t, :, :] - mt2)).sum() / nCells) /
                      (sigma_t1 * sigma_t2))
        print('%s %s %s %s %s %s' % (t, mt1.shape, mt2.shape, sigma_t1.shape, sigma_t2.shape, patcor[t]))
    # TODO: deal with missing data appropriately, i.e. mask out grid points with missing data above tolerence level
    # BUGFIX: the original called numpy.array(), but this file only reliably
    # binds the 'np' alias; use np.array as the rest of the module does.
    # (Also made Python-2-only print/xrange portable, same output.)
    patcor = np.array(patcor)
    return patcor
def calc_pat_cor(dataset_1, dataset_2):
    '''
    Purpose: Calculate the Pattern Correlation Timeseries.

    Assumption(s)::
        Both dataset_1 and dataset_2 are the same shape.
        * lat, lon must match up
        * time steps must align (i.e. months vs. months)
    Input::
        dataset_1 - 1d, 2d, or 3d (time, lat, lon) array of data
        dataset_2 - array of data with the same shape as dataset_1
    Output:
        patcor - a scalar (1-d/2-d input) or a 1d time series (3-d input)
        of pattern correlation coefficients.
    **Note:** For 3-d input the standard deviation uses 1 degree of freedom
    while the covariance divides by ny*nx, so identical fields do not score
    exactly 1.  http://docs.scipy.org/doc/numpy/reference/generated/numpy.std.html
    6/17/2013 JK: Add an option for a 1-d arrays
    (Python-2-only print/xrange made portable; output unchanged.)
    '''
    # TODO: Add in try block to ensure the shapes match
    nDim1 = dataset_1.ndim
    nDim2 = dataset_2.ndim
    if nDim1 != nDim2:
        print('dimension mismatch in calc_pat_cor: exit %s %s' % (nDim1, nDim2))
        sys.exit()
    if nDim1 == 1:
        mt1 = dataset_1.mean()
        mt2 = dataset_2.mean()
        nt = dataset_1.shape[0]
        sigma_t1 = dataset_1.std()
        sigma_t2 = dataset_2.std()
        patcor = ((dataset_1 - mt1) * (dataset_2 - mt2)).sum() / (nt * sigma_t1 * sigma_t2)
    elif nDim1 == 2:
        # find mean and std_dev over the whole 2-D field
        mt1 = dataset_1.mean()
        mt2 = dataset_2.mean()
        ny, nx = dataset_1.shape
        sigma_t1 = dataset_1.std()
        sigma_t2 = dataset_2.std()
        patcor = ((dataset_1 - mt1) * (dataset_2 - mt2)).sum() / ((ny * nx) * (sigma_t1 * sigma_t2))
    elif nDim1 == 3:
        nt, ny, nx = dataset_1.shape
        # store results in list for convenience (then convert to numpy array)
        patcor = []
        for t in range(nt):
            # find mean and std_dev for this time level
            mt1 = dataset_1[t, :, :].mean()
            mt2 = dataset_2[t, :, :].mean()
            sigma_t1 = dataset_1[t, :, :].std(ddof = 1)
            sigma_t2 = dataset_2[t, :, :].std(ddof = 1)
            # TODO: make means and standard deviations weighted by grid box area.
            # Equation from Santer_et_al 1995
            # patcor = (1/(N*M_std*O_std))*sum((M_i-M_bar)*(O_i-O_bar))
            patcor.append((((dataset_1[t, :, :] - mt1) * (dataset_2[t, :, :] - mt2)).sum() / (ny * nx)) /
                          (sigma_t1 * sigma_t2))
            print('%s %s %s %s %s %s' % (t, mt1.shape, mt2.shape, sigma_t1.shape, sigma_t2.shape, patcor[t]))
        # TODO: deal with missing data appropriately, i.e. mask out grid points
        # with missing data above tolerance level
        patcor = np.array(patcor)
    return patcor
def calc_anom_corn(dataset_1, dataset_2, climatology = None):
    '''
    Calculate the anomaly correlation time series.

    Input:
        dataset_1 - First input dataset (3-D: time, lat, lon)
        dataset_2 - Second input dataset (3-D: time, lat, lon)
        climatology - Optional climatology input array. Assumption is that it
                      is for the same time period by default. When supplied,
                      the user is asked interactively whether to use it as
                      the anomaly baseline.
    Output:
        1-D numpy array of anomaly correlations, one per time level.
    '''
    # store results in list for convenience (then convert to numpy array)
    anomcor = []
    nt = dataset_1.shape[0]
    # BUGFIX: 'climatology != None' compares ELEMENTWISE on numpy arrays;
    # use an identity test instead.
    if climatology is not None:
        # NOTE(review): raw_input is the Python 2 builtin (input() in Python 3).
        climoFileOption = raw_input('Would you like to use the full observation dataset as \
the climatology in this calculation? [y/n] \n>')
        if climoFileOption == 'y':
            base_dataset = climatology
        else:
            base_dataset = dataset_2
    else:
        # BUGFIX: the original never assigned base_dataset when no climatology
        # was supplied, raising UnboundLocalError below; default to the obs.
        base_dataset = dataset_2
    for t in range(nt):
        mean_base = base_dataset[t, :, :].mean()
        anomcor.append((((dataset_1[t, :, :] - mean_base) * (dataset_2[t, :, :] - mean_base)).sum()) /
                       np.sqrt(((dataset_1[t, :, :] - mean_base) ** 2).sum() *
                               ((dataset_2[t, :, :] - mean_base) ** 2).sum()))
        print('%s %s %s' % (t, mean_base.shape, anomcor[t]))
    # TODO: deal with missing data appropriately, i.e. mask out grid points
    # with missing data above tolerence level
    anomcor = np.array(anomcor)
    print('%s %s %s' % (anomcor.shape, anomcor.ndim, anomcor))
    return anomcor
def calc_anom_cor(t1, t2):
    '''
    Calculate the Anomaly Correlation (Deprecated).

    Anomalies of BOTH fields are taken about the obs (t2) spatial mean at
    each time level, and the product is normalized by ddof=1 standard
    deviations, so identical inputs do not score exactly 1.
    (Python-2-only print/xrange made portable; output unchanged.)

    Input:
        t1 - 3-D model data (time, lat, lon)
        t2 - 3-D obs data (time, lat, lon)
    Output:
        1-D array of anomaly correlations, one per time level.
    '''
    nt = t1.shape[0]
    # store results in list for convenience (then convert to numpy
    # array at the end)
    anomcor = []
    nCells = t1.shape[1] * t1.shape[2]
    for t in range(nt):
        mt2 = t2[t, :, :].mean()
        sigma_t1 = t1[t, :, :].std(ddof = 1)
        sigma_t2 = t2[t, :, :].std(ddof = 1)
        # TODO: make means and standard deviations weighted by grid box area.
        anomcor.append(((((t1[t, :, :] - mt2) * (t2[t, :, :] - mt2)).sum()) / nCells) /
                       (sigma_t1 * sigma_t2))
        print('%s %s %s %s %s' % (t, mt2.shape, sigma_t1.shape, sigma_t2.shape, anomcor[t]))
    # TODO: deal with missing data appropriately, i.e. mask out grid points with
    # missing data above tolerence level
    anomcor = np.array(anomcor)
    print('%s %s %s' % (anomcor.shape, anomcor.ndim, anomcor))
    return anomcor
def calc_nash_sutcliff(dataset_1, dataset_2):
    '''
    Routine to calculate the Nash-Sutcliff coefficient of efficiency (E).

    Assumption(s)::
        Both dataset_1 and dataset_2 are the same shape.
        * lat, lon must match up
        * time steps must align (i.e. months vs. months)
    Input::
        dataset_1 - 3d (time, lat, lon) array of model data
        dataset_2 - 3d (time, lat, lon) array of obs data
    Output:
        nashcor - 1d array aligned along the time dimension of the input
        datasets. Time Series of Nash-Sutcliff Coefficient of efficiency.
    (Python-2-only print/xrange made portable; output unchanged.)
    '''
    nt = dataset_1.shape[0]
    nashcor = []
    for t in range(nt):
        mean_dataset_2 = dataset_2[t, :, :].mean()
        # E = 1 - sum((obs - model)^2) / sum((obs - obs_mean)^2)
        nashcor.append(1 - ((((dataset_2[t, :, :] - dataset_1[t, :, :]) ** 2).sum()) /
                            ((dataset_2[t, :, :] - mean_dataset_2) ** 2).sum()))
        print('%s %s %s' % (t, mean_dataset_2.shape, nashcor[t]))
    nashcor = np.array(nashcor)
    print('%s %s %s' % (nashcor.shape, nashcor.ndim, nashcor))
    return nashcor
def calc_pdf(dataset_1, dataset_2):
    '''
    Routine to calculate a normalised Probability Distribution Function
    similarity score, with bins set interactively by the user.

    Equation from Perkins et al. 2007:
        PS = sum(min(Z_O_i, Z_M_i)) where Z is the normalised histogram of a dataset
    called in do_rcmes_processing_sub.py

    Inputs::
        dataset_1 - model data array (time, lat, lon)
        dataset_2 - obs data array (time, lat, lon); the number of time
            levels depends on the averaging option chosen upstream
        User inputs: number of bins to use and edges (min and max)
    Output:
        similarity_score - one float which represents the PDF overlap score
    '''
    # float to store the final PDF similarity score
    similarity_score = 0.0
    # BUGFIX: the original called dataset_1.amax()/.amin(), which do not
    # exist on ndarrays (np.amax is the function form) and raised
    # AttributeError on every call; the results were never used, so the
    # calls are simply dropped.
    # The min/max below guide the user's choice of bin edges.
    print('min modelData %s' % dataset_1[:, :, :].min())
    print('max modelData %s' % dataset_1[:, :, :].max())
    print('min obsData %s' % dataset_2[:, :, :].min())
    print('max obsData %s' % dataset_2[:, :, :].max())
    # Prompt the user for the min, max and number of bin values.
    # NOTE(review): raw_input is the Python 2 builtin (input() in Python 3).
    print('****PDF input values from user required **** \n')
    nbins = int(raw_input('Please enter the number of bins to use. \n'))
    minEdge = float(raw_input('Please enter the minimum value to use for the edge. \n'))
    maxEdge = float(raw_input('Please enter the maximum value to use for the edge. \n'))
    mybins = np.linspace(minEdge, maxEdge, nbins)
    print('nbins is %s mybins are %s' % (nbins, mybins))
    # BUGFIX: np.histogram no longer accepts the 'new' keyword and 'normed'
    # was replaced by 'density' (completing the TODO left in the original).
    pdf_mod, edges = np.histogram(dataset_1, bins = mybins, density = True)
    print('dataset_1 distribution and edges %s %s' % (pdf_mod, edges))
    pdf_obs, edges = np.histogram(dataset_2, bins = mybins, density = True)
    print('dataset_2 distribution and edges %s %s' % (pdf_obs, edges))
    # Accumulate the overlap: the smaller of the two normalised frequencies
    # in every bin.
    for i, model_value in enumerate(pdf_mod):
        print('model_value is %s pdf_obs[%s] is %s' % (model_value, i, pdf_obs[i]))
        similarity_score += min(model_value, pdf_obs[i])
    print('similarity_score is %s' % similarity_score)
    return similarity_score
def calc_stdev(t1):
    '''
    Calculate the spatial standard deviation (ddof=1) at each time level.

    Input:
        t1 - 3-D data array (time, lat, lon)
    Output:
        1-D array of standard deviations, one per time level.
    (Python-2-only print/xrange made portable; output unchanged.)
    '''
    nt = t1.shape[0]
    sigma_t1 = []
    for t in range(nt):
        sigma_t1.append(t1[t, :, :].std(ddof = 1))
    sigma_t1 = np.array(sigma_t1)
    print('%s %s' % (sigma_t1, sigma_t1.shape))
    return sigma_t1
# 6/10/2013 JK: plotting routines are added below
def pow_round(x):
    '''
    Round x to the nearest power of 10.

    The decision boundary sits half a decade up in log space (i.e. at
    10**k * 5), so e.g. 7 rounds to 10 while 3 rounds to 1.
    '''
    exponent = floor(log(x, 10) - log(0.5, 10))
    return 10 ** exponent
def calcNiceIntervals(data, nLevs):
    '''
    Purpose::
        Calculates nice intervals between each color level for colorbars
        and contour plots. The target minimum and maximum color levels are
        taken at the 5th and 95th percentiles of the distribution so that
        outliers do not stretch the scale.
    Input::
        data - an array of data to be plotted
        nLevs - an int giving the target number of intervals
    Output::
        cLevs - A list of floats for the resultant colorbar levels
    '''
    flat = data.ravel()
    # Cut off the distribution tails to mitigate the influence of outliers
    lowBound = mstats.scoreatpercentile(flat, 5)
    highBound = mstats.scoreatpercentile(flat, 95)
    cLevs = mpl.ticker.MaxNLocator(nLevs).tick_values(lowBound, highBound)
    # MaxNLocator sometimes returns ticks outside the domain of the input
    # data; keep only the in-range levels
    return cLevs[(cLevs >= lowBound) & (cLevs <= highBound)]
def calcBestGridShape(nPlots, oldShape):
    '''
    Purpose::
        Shrink a requested subplot grid shape so it holds a given number of
        subplots without whole rows (or trailing columns) left empty.
    Input::
        nPlots - an int giving the number of plots that will be made
        oldShape - a tuple denoting the desired grid shape (nRows, nCols) for arranging
                   the subplots originally requested by the user.
    Output::
        newShape - the smallest possible subplot grid shape needed to fit nPlots
    Raises::
        ValueError - if the requested shape cannot hold nPlots subplots
    '''
    nRows, nCols = oldShape
    if nRows * nCols < nPlots:
        raise ValueError('gridShape=(%d, %d): Cannot fit enough subplots for data' %(nRows, nCols))
    # Drop rows while an entire row would be left empty
    while nRows * nCols - nPlots >= nCols:
        nRows -= 1
    # With a single row remaining, trim columns to exactly the plot count
    if nRows == 1:
        nCols = nPlots
    return nRows, nCols
def drawPortraitDiagramSingle(data, rowLabels, colLabels, cLevs, fName, fType = 'png',
                              xLabel = '', yLabel = '', cLabel = '', pTitle = '', cMap = None):
    '''
    Purpose::
        Makes a portrait diagram plot and saves it to '<fName>.<fType>'.
    Input::
        data - 2d array of the field to be plotted
        rowLabels - list of strings denoting labels for each row
        colLabels - list of strings denoting labels for each column
        cLevs - a list of integers or floats specifying the colorbar levels
        fName - a string specifying the filename of the plot
        fType - an optional string specifying the filetype, default is .png
        xLabel - a string specifying the x-axis title
        yLabel - a string specifying the y-axis title
        cLabel - a string specifying the colorbar title
        pTitle - a string specifying the plot title
        cMap - an optional matplotlib.LinearSegmentedColormap object denoting the colormap,
    Side effects::
        Writes the figure to disk and opens an interactive window (plt.show()).
    '''
    # Set up the colormap if not specified
    if cMap is None:
        cMap = plt.cm.RdBu_r
    # Set up figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Make the portrait diagram; BoundaryNorm maps the data intervals in
    # cLevs onto discrete colormap bins
    norm = mpl.colors.BoundaryNorm(cLevs, cMap.N)
    cs = ax.matshow(data, cmap = cMap, aspect = 'auto', origin = 'lower', norm = norm)
    # Add colorbar with one drawn edge per level boundary
    cbar = fig.colorbar(cs, norm = norm, boundaries = cLevs, drawedges = True,
                        pad = .05)
    cbar.set_label(cLabel)
    cbar.set_ticks(cLevs)
    cbar.ax.xaxis.set_ticks_position("none")
    cbar.ax.yaxis.set_ticks_position("none")
    # Add grid lines: ticks are placed on cell boundaries (offset by .5)
    ax.xaxis.set_ticks(np.arange(data.shape[1] + 1))
    ax.yaxis.set_ticks(np.arange(data.shape[0] + 1))
    x = (ax.xaxis.get_majorticklocs() - .5)
    y = (ax.yaxis.get_majorticklocs() - .5)
    ax.vlines(x, y.min(), y.max())
    ax.hlines(y, x.min(), x.max())
    # Configure ticks: labels along the bottom, no tick marks drawn
    ax.xaxis.tick_bottom()
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    # NOTE(review): rowLabels are applied to the x axis and colLabels to the
    # y axis here — confirm this is the intended orientation.
    ax.set_xticklabels(rowLabels)
    ax.set_yticklabels(colLabels)
    # Add labels and title
    ax.set_xlabel(xLabel)
    ax.set_ylabel(yLabel)
    ax.set_title(pTitle)
    # Save the figure
    fig.savefig('%s.%s' %(fName, fType))
    plt.show()
def drawPortraitDiagram(dataset, rowLabels, colLabels, fName, fType = 'png',
gridShape = (1, 1), xLabel = '', yLabel = '', cLabel = '',
pTitle = '', subTitles = None, cMap = None, cLevs = None,