/
eia861.py
2395 lines (2019 loc) · 91 KB
/
eia861.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Module to perform data cleaning functions on EIA861 data tables.
Transformations applied to all tables include:

- Replace "." values with NA.
"""
import logging
from typing import Dict
import pandas as pd
import pudl
from pudl.helpers import convert_cols_dtypes
from pudl.metadata.enums import (
CUSTOMER_CLASSES,
FUEL_CLASSES,
NERC_REGIONS,
RELIABILITY_STANDARDS,
REVENUE_CLASSES,
RTO_CLASSES,
TECH_CLASSES,
)
from pudl.metadata.fields import apply_pudl_dtypes
from pudl.metadata.labels import ESTIMATED_OR_ACTUAL, MOMENTARY_INTERRUPTIONS
from pudl.settings import Eia861Settings
logger = logging.getLogger(__name__)
# Manually compiled fixes for Balancing Authority IDs reported in EIA 861.
# Each row associates a (report_date, utility_id_eia, balancing_authority_name_eia)
# combination with the balancing_authority_id_eia that should be used for it.
# Rows whose balancing_authority_id_eia is pd.NA record names for which no
# valid BA ID could be determined.
BA_ID_NAME_FIXES: pd.DataFrame = (
    pd.DataFrame(
        [
            # report_date, util_id, ba_id, ba_name
            ("2001-01-01", 40577, 99999, "Multiple Control Areas"),
            ("2002-01-01", 40577, 99999, "Multiple Control Areas"),
            ("2002-01-01", 2759, 13781, "Xcel Energy"),
            ("2002-01-01", 1004, 40604, "Heartland Consumer Power Dist."),
            ("2002-01-01", 5659, 20847, "Wisconsin Electric Power"),
            ("2002-01-01", 5588, 9417, "Interstate Power & Light"),
            ("2002-01-01", 6112, 9417, "INTERSTATE POWER & LIGHT"),
            ("2002-01-01", 6138, 13781, "Xcel Energy"),
            ("2002-01-01", 6276, pd.NA, "Vectren Energy Delivery"),
            ("2002-01-01", 6501, 9417, "Interstate Power and Light"),
            ("2002-01-01", 6579, 4716, "Dairyland Power Coop"),
            ("2002-01-01", 6848, pd.NA, pd.NA),
            ("2002-01-01", 7140, 18195, "Southern Co Services Inc"),
            ("2002-01-01", 7257, 22500, "Westar Energy"),
            ("2002-01-01", 7444, 14232, "Minnkota Power Cooperative"),
            ("2002-01-01", 8490, 22500, "Westar"),
            ("2002-01-01", 8632, 12825, "NorthWestern Energy"),
            ("2002-01-01", 8770, 22500, "Westar Energy"),
            ("2002-01-01", 8796, 13434, "ISO New England"),
            ("2002-01-01", 9699, pd.NA, "Tri-State G&T"),
            ("2002-01-01", 10040, 13781, "Xcel Energy"),
            ("2002-01-01", 10171, 56669, "Midwest Indep System Operator"),
            ("2002-01-01", 11053, 9417, "INTERSTATE POWER & LIGHT"),
            ("2002-01-01", 11148, 2775, "California ISO"),
            ("2002-01-01", 11522, 1, "Maritimes-Canada"),
            ("2002-01-01", 11731, 13781, "XCEL Energy"),
            ("2002-01-01", 11788, 9417, "Interstate Power & Light"),
            ("2002-01-01", 12301, 14232, "Minnkota Power Cooperative"),
            ("2002-01-01", 12698, 20391, "Aquila Networks - MPS"),
            ("2002-01-01", 12706, 18195, "Southern Co Services Inc"),
            ("2002-01-01", 3258, 9417, "Interstate Power & Light"),
            ("2002-01-01", 3273, 15473, "Public Regulatory Commission"),
            ("2002-01-01", 3722, 9417, "Interstate Power and Light"),
            ("2002-01-01", 1417, 12825, "NorthWestern Energy"),
            ("2002-01-01", 1683, 12825, "Northwestern Energy"),
            ("2002-01-01", 1890, 5416, "Duke Energy Corporation"),
            ("2002-01-01", 4319, 20447, "Okla. Municipal Pwr. Authority"),
            ("2002-01-01", 18446, 9417, "Interstate Power and Light"),
            ("2002-01-01", 19108, pd.NA, "NC Rural Electrification Auth."),
            ("2002-01-01", 19545, 28503, "Western Area Power Admin"),
            ("2002-01-01", 12803, 18195, "Southern Illinois Power"),
            ("2002-01-01", 13382, 8283, "Harrison County Rural Electric"),
            ("2002-01-01", 13423, 829, "Town of New Carlisle"),
            ("2002-01-01", 13815, 13781, "Xcel Energy"),
            ("2002-01-01", 14649, 18195, "GSOC (Georgia System Operation"),
            ("2002-01-01", 15672, 924, "Associated Electric Coop Inc"),
            ("2002-01-01", 16023, 9417, "Interstate Power and Light"),
            ("2002-01-01", 16463, pd.NA, "Central Louisiana Electric Co."),
            ("2002-01-01", 16922, 22500, "Westar Energy"),
            ("2002-01-01", 16992, 9417, "Interstate Power and Light"),
            ("2002-01-01", 17643, 924, "Associated Electric Coop Inc"),
            ("2002-01-01", 17706, 9417, "Interstate Power & Light"),
            ("2002-01-01", 20811, 19876, "Dominion NC Power"),
            ("2002-01-01", 3227, 15466, "Xcel Energy"),
            ("2002-01-01", 20227, 14063, "OG&E"),
            ("2002-01-01", 17787, 13337, "Mun. Energy Agcy of Nebraska"),
            ("2002-01-01", 19264, 17718, "Excel Energy"),
            ("2002-01-01", 11701, 19578, "We Energies"),
            ("2002-01-01", 28802, 14725, "PJM Interconnection"),
            ("2002-01-01", 20546, 1692, "Big Rivers Electric Corp."),
            ("2002-01-01", 6223, 1, "Maritimes-Canada"),
            ("2002-01-01", 14405, 19876, "VA Power"),
            ("2002-01-01", 14405, 14725, "PJM"),
            ("2002-01-01", 12698, 20391, "Aquila Networks - L&P"),
            ("2002-01-01", 16267, 12698, "Aquila"),
            ("2002-01-01", 15871, 5723, "ERC of Texas"),
            ("2002-01-01", 6753, 28503, "Regional Office"),
            ("2002-01-01", 5571, 14328, "Pacific Gas and Electric Co."),
            ("2002-01-01", 367, pd.NA, "Western Area Power Admin"),
            ("2002-01-01", 3247, 13501, "NYISO"),
            ("2002-01-01", 11014, 5723, "Ercot"),
            ("2002-01-01", 20845, 12427, "Michigan Power Pool 12427"),
            ("2002-01-01", 17267, pd.NA, "Watertown, SD"),
            ("2002-01-01", 12811, pd.NA, "First Energy Corp."),
            ("2002-01-01", 17368, 13501, "NYISO"),
            ("2002-01-01", 5877, 13501, "NYISO"),
            ("2002-01-01", 3240, pd.NA, "Pacific NW Generating Cooperat"),
            ("2002-01-01", 3037, pd.NA, "Trans Electric"),
            ("2002-01-01", 12199, 28503, "WAPA-Rocky Mountain"),
            ("2002-01-01", 8936, 14378, "Pacificorp"),
            ("2002-01-01", 40604, pd.NA, "Watertown, SD Office"),
            ("2002-01-01", 19108, pd.NA, "USDA- Rural Utility Service"),
            ("2002-01-01", 8199, 20391, "Aquila"),
            ("2002-01-01", 12698, 20391, "Aquila Networks - WPC"),
            ("2002-01-01", 12698, 20391, "Aquila Networks - WPK"),
            ("2002-01-01", 20387, 14725, "PJM West"),
            ("2002-01-01", 588, 20447, "Western Farmers Elec Coop Inc"),
            ("2002-01-01", 17561, 5723, "ERCOT ISO"),
            ("2002-01-01", 17320, 13781, "Xcel Energy"),
            ("2002-01-01", 13676, 17716, "Southwestern Power Admin."),
            ("2002-01-01", 5703, 13501, "NTISO"),
            ("2002-01-01", 113, 13501, "NYISO"),
            ("2002-01-01", 4486, pd.NA, "REMC of Western Indiana"),
            ("2002-01-01", 1039, 13501, "NYISO"),
            ("2002-01-01", 5609, pd.NA, "NMISA"),
            ("2002-01-01", 3989, pd.NA, "WAPA"),
            ("2002-01-01", 13539, 13501, "NY Independent System Operator"),
            ("2002-01-01", 15263, 14725, "PJM West"),
            ("2002-01-01", 12796, 14725, "PJM West"),
            ("2002-01-01", 3539, 13434, "ISO New England"),
            ("2002-01-01", 3575, 13434, "ISO New England"),
            ("2002-01-01", 3559, 13434, "ISO New England"),
            ("2002-01-01", 18193, pd.NA, pd.NA),
            ("2002-01-01", 838, 3413, "Chelan PUD"),
            ("2002-01-01", 1049, 1738, "Bonneville"),
            ("2002-01-01", 9248, 14725, "PJM"),
            ("2002-01-01", 15026, 803, "APS Control Area"),
            ("2002-01-01", 798, 16572, "Salt River Project"),
            ("2002-01-01", 5603, 13501, "ISO - NEW YORK"),
            ("2002-01-01", 12260, 19876, "Dominion Virginia Power"),
            ("2002-01-01", 14788, 17716, "Southwest Power Administration"),
            ("2002-01-01", 12909, 22500, "Westar Energy"),
            ("2002-01-01", 5605, 9417, "Interstate Power and Light"),
            ("2002-01-01", 10908, 9417, "Interstate Power and Light"),
            ("2003-01-01", 3258, 9417, "Interstate Power & Light"),
            ("2003-01-01", 6501, 9417, "Interstate Power & Light"),
            ("2003-01-01", 10650, 9417, "Interstate Power & Light"),
            ("2003-01-01", 16992, 9417, "Interstate Power & Light"),
            ("2003-01-01", 3722, 9417, "Interstate Power & Light"),
            ("2003-01-01", 11788, 9417, "Interstate Power & Light"),
            ("2003-01-01", 5588, 9417, "Interstate Power & Light"),
            ("2003-01-01", 11053, 9417, "Interstate Power & Light"),
            ("2003-01-01", 16023, 9417, "Interstate Power & Light"),
            ("2003-01-01", 17706, 9417, "Interstate Power & Light"),
            ("2003-01-01", 18446, 9417, "Interstate Power & Light"),
            ("2004-01-01", 5309, 18195, "Southern Company Services Inc"),
            ("2004-01-01", 192, 192, "Ryant T. Rose"),
            ("2004-01-01", 6501, 9417, "Interstate Power & Light"),
            ("2004-01-01", 16992, 9417, "Interstate Power & Light"),
            ("2004-01-01", 8192, 14725, "PJM-West"),
            ("2004-01-01", 192, 192, "Phillip K. Peter, Sr."),
            ("2004-01-01", 192, 192, "Nelson Kinegak"),
            ("2004-01-01", 1004, 40604, "Heartland Consumer Power Dist."),
            ("2004-01-01", 3258, 9417, "Interstate Power & Light"),
            ("2004-01-01", 3722, 9417, "Interstate Power & Light"),
            ("2004-01-01", 19879, pd.NA, "Kevin Smalls St Croix Districe"),
            ("2004-01-01", 11788, 9417, "Interstate Power & Light"),
            ("2004-01-01", 4191, 13434, "NEISO"),
            ("2004-01-01", 10650, 9417, "Interstate Power & Light"),
            ("2004-01-01", 11053, 9417, "Interstate Power & Light"),
            ("2004-01-01", 18446, 9417, "Interstate Power & Light"),
            ("2004-01-01", 27000, pd.NA, "Multiple Operators"),
            ("2004-01-01", 19879, pd.NA, "Corey Hodge - St Thomass/St Jo"),
            ("2004-01-01", 13382, 8283, "Harrison County Rural Electric"),
            ("2004-01-01", 10784, pd.NA, "Hawkeye Tri-county REC"),
            ("2004-01-01", 16922, pd.NA, "The Brown Atchison Electric Co"),
            ("2004-01-01", 15026, 803, "APS Control Area"),
            ("2005-01-01", 192, 192, "Ryant T. Rose"),
            ("2005-01-01", 192, 192, "Phillip K. Peter, Sr."),
            ("2005-01-01", 192, 182, "Nelson Kinegak"),
            ("2005-01-01", 3258, 9417, "Interstate Power & Light"),
            ("2005-01-01", 1004, 40604, "Heartland Consumer Power Dist."),
            ("2005-01-01", 5309, 18195, "Southern Company Services Inc"),
            ("2005-01-01", 6501, 9417, "Interstate Power & Light"),
            ("2005-01-01", 10623, 6455, "Florida Power Corp"),
            ("2005-01-01", 10650, 9417, "Interstate Power & Light"),
            ("2005-01-01", 13382, 8283, "Harrison County Rural Electric"),
            ("2005-01-01", 16922, pd.NA, "The Brown Atchison Electric Co"),
            ("2005-01-01", 3722, 9417, "Interstate Power & Light"),
            ("2005-01-01", 4191, 13434, "NEISO"),
            ("2005-01-01", 11788, 9417, "Interstate Power & Light"),
            ("2005-01-01", 8192, 14725, "PJM-West"),
            ("2005-01-01", 11053, 9417, "Interstate Power & Light"),
            ("2005-01-01", 13815, 13781, "Northern States Power Co"),
            ("2005-01-01", 15026, 803, "APS Control Area"),
            ("2005-01-01", 18446, 9417, "Interstate Power & Light"),
            ("2005-01-01", 19879, pd.NA, "Kevin Smalls St Croix Districe"),
            ("2005-01-01", 19879, pd.NA, "Corey Hodge - St Thomass/St Jo"),
            ("2005-01-01", 27000, pd.NA, "Multiple Operators"),
            ("2005-01-01", 10610, 13501, "ISO New York"),
            ("2006-01-01", 10610, 13501, "ISO New York"),
            ("2008-01-01", 10610, 13501, "ISO New York"),
            ("2009-01-01", 10610, 13501, "ISO New York"),
            ("2010-01-01", 6389, 3755, "Cleveland Electric Illum Co"),
            ("2010-01-01", 6389, 13998, "Ohio Edison Co"),
            ("2010-01-01", 6389, 18997, "Toledo Edison Co"),
            ("2010-01-01", 6949, 10000, "Kansas City Power & Light Co"),
            ("2010-01-01", 14127, 14127, "Omaha Public Power District"),
            ("2010-01-01", 11196, 13434, "ISO New England"),
            ("2010-01-01", 97, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 3258, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 3405, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 3755, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 7292, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 8847, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 11701, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 13032, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 13998, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 14716, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 17141, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 18997, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 21249, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 40582, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 54862, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 56162, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 56496, 56669, "Midwest Independent System Operator"),
            ("2010-01-01", 10610, 13501, "ISO New York"),
            ("2011-01-01", 1968, 56669, "Midwest Independent System Operator"),
            ("2011-01-01", 20806, 56669, "Midwest Independent System Operator"),
            ("2011-01-01", 29296, 56669, "Midwest Independent System Operator"),
            ("2012-01-01", 1968, 56669, "Midwest Independent System Operator"),
            ("2012-01-01", 20806, 56669, "Midwest Independent System Operator"),
            ("2012-01-01", 29296, 56669, "Midwest Independent System Operator"),
        ],
        columns=[
            "report_date",  # We have this
            "utility_id_eia",  # We have this
            "balancing_authority_id_eia",  # We need to set this
            "balancing_authority_name_eia",  # We have this
        ],
    )
    # Parse the ISO date strings into real timestamps:
    .assign(report_date=lambda x: pd.to_datetime(x.report_date))
    .pipe(apply_pudl_dtypes, group="eia")
    # Rows missing any index column can never be matched, so drop them.
    # (This removes the pd.NA-name placeholder rows above.)
    .dropna(subset=["report_date", "balancing_authority_name_eia", "utility_id_eia"])
    # Index by the columns present in the raw data, so the fixed BA ID can be
    # looked up directly from reported values.
    .set_index(["report_date", "balancing_authority_name_eia", "utility_id_eia"])
)
# Manually compiled mapping from mangled/misspelled county names reported in
# EIA data to the county names used in the FIPS tables, keyed by state.
# BUG FIX: the ("MS", "Clark", "Clarke") row previously appeared twice;
# the exact duplicate has been removed so merges against this table cannot
# fan out rows.
EIA_FIPS_COUNTY_FIXES: pd.DataFrame = pd.DataFrame(
    [
        ("AK", "Aleutians Ea", "Aleutians East"),
        ("AK", "Aleutian Islands", "Aleutians East"),
        ("AK", "Aleutians East Boro", "Aleutians East Borough"),
        ("AK", "Prince of Wales Ketchikan", "Prince of Wales-Hyder"),
        ("AK", "Prince Wales", "Prince of Wales-Hyder"),
        ("AK", "Ketchikan Gateway Bo", "Ketchikan Gateway Borough"),
        ("AK", "Prince of Wale", "Prince of Wales-Hyder"),
        ("AK", "Wrangell Petersburg", "Wrangell"),
        ("AK", "Wrangell Pet", "Wrangell"),
        ("AK", "Borough, Kodiak Island", "Kodiak Island Borough"),
        ("AK", "Matanuska Susitna Borough", "Matanuska-Susitna"),
        ("AK", "Matanuska Susitna", "Matanuska-Susitna"),
        ("AK", "Skagway-Yakutat", "Skagway"),
        ("AK", "Skagway Yaku", "Skagway"),
        ("AK", "Skagway Hoonah Angoon", "Hoonah-Angoon"),
        ("AK", "Angoon", "Hoonah-Angoon"),
        ("AK", "Hoonah", "Hoonah-Angoon"),
        ("AK", "Yukon Koyukuk", "Yukon-Koyukuk"),
        ("AK", "Yukon Koyuku", "Yukon-Koyukuk"),
        ("AK", "Yukon-Koyuku", "Yukon-Koyukuk"),
        ("AK", "Valdez Cordova", "Valdez-Cordova"),
        ("AK", "Cordova", "Valdez-Cordova"),
        ("AK", "Valdez Cordo", "Valdez-Cordova"),
        ("AK", "Lake and Pen", "Lake and Peninsula"),
        ("AK", "Lake & Peninsula Borough", "Lake and Peninsula"),
        ("AK", "Kodiak Islan", "Kodiak Island"),
        ("AK", "Kenai Penins", "Kenai Peninsula"),
        ("AK", "NW Arctic Borough", "Northwest Arctic"),
        ("AL", "De Kalb", "DeKalb"),
        ("AR", "Saint Franci", "St. Francis"),
        ("CA", "San Bernadino", "San Bernardino"),
        ("CA", "San Bernardi", "San Bernardino"),
        ("CT", "Shelton", "Fairfield"),
        ("FL", "De Soto", "DeSoto"),
        ("FL", "Miami Dade", "Miami-Dade"),
        ("FL", "Dade", "Miami-Dade"),
        ("FL", "St. Lucic", "St. Lucie"),
        ("FL", "St. Loucie", "St. Lucie"),
        ("GA", "De Kalb", "DeKalb"),
        ("GA", "Chattahooche", "Chattahoochee"),
        ("IA", "Pottawattami", "Pottawattamie"),
        ("IA", "Kossuh", "Kossuth"),
        ("IA", "Lousia", "Louisa"),
        ("IA", "Poweshick", "Poweshiek"),
        ("IA", "Humbolt", "Humboldt"),
        ("IA", "Harris", "Harrison"),
        ("IA", "O Brien", "O'Brien"),
        ("IL", "JoDavies", "Jo Daviess"),
        ("IL", "La Salle", "LaSalle"),
        ("IL", "Green", "Greene"),
        ("IL", "DeWitt", "De Witt"),
        ("IL", "Dewitt", "De Witt"),
        ("IL", "Du Page", "DuPage"),
        ("IL", "Burke", "Christian"),
        ("IL", "McCoupin", "Macoupin"),
        ("IN", "De Kalb County", "DeKalb County"),
        ("IN", "De Kalb", "DeKalb County"),
        ("IN", "La Porte", "LaPorte"),
        ("IN", "Putman", "Putnam"),
        ("IN", "Pyke", "Pike"),
        ("IN", "Sulliva", "Sullivan"),
        ("KS", "Leaveworth", "Leavenworth"),
        ("KY", "Spenser", "Spencer"),
        ("LA", "Jefferson Da", "Jefferson Davis"),
        ("LA", "Pointe Coupe", "Pointe Coupee"),
        ("LA", "West Baton R", "West Baton Rouge"),
        ("LA", "DeSoto", "De Soto"),
        ("LA", "Burke", "Iberia"),
        ("LA", "West Feleciana", "West Feliciana"),
        ("MA", "North Essex", "Essex"),
        ("MI", "Grand Traver", "Grand Traverse"),
        ("MI", "Antim", "Antrim"),
        ("MD", "Balto. City", "Baltimore City"),
        ("MD", "Prince Georg", "Prince George's County"),
        ("MD", "Worchester", "Worcester"),
        ("MN", "Fairbault", "Faribault"),
        ("MN", "Lac Qui Parl", "Lac Qui Parle"),
        ("MN", "Lake of The", "Lake of the Woods"),
        ("MN", "Ottertail", "Otter Tail"),
        ("MN", "Yellow Medic", "Yellow Medicine"),
        ("MO", "De Kalb", "DeKalb"),
        ("MO", "Cape Girarde", "Cape Girardeau"),
        ("MS", "Clark", "Clarke"),
        ("MS", "De Soto", "DeSoto"),
        ("MS", "Jefferson Da", "Jefferson Davis"),
        ("MS", "Homoshitto", "Amite"),
        ("MT", "Anaconda-Dee", "Deer Lodge"),
        ("MT", "Butte-Silver", "Silver Bow"),
        ("MT", "Golden Valle", "Golden Valley"),
        ("MT", "Lewis and Cl", "Lewis and Clark"),
        ("NC", "Hartford", "Hertford"),
        ("NC", "Gilford", "Guilford"),
        ("NC", "North Hampton", "Northampton"),
        ("ND", "La Moure", "LaMoure"),
        ("NH", "Plaquemines", "Coos"),
        ("NH", "New Hampshire", "Coos"),
        ("OK", "Cimmaron", "Cimarron"),
        ("NY", "Westcherster", "Westchester"),
        ("OR", "Unioin", "Union"),
        ("PA", "Northumberla", "Northumberland"),
        ("PR", "Aquadilla", "Aguadilla"),
        ("PR", "Sabana Grand", "Sabana Grande"),
        ("PR", "San Sebastia", "San Sebastian"),
        ("PR", "Trujillo Alt", "Trujillo Alto"),
        ("RI", "Portsmouth", "Newport"),
        ("TX", "Collingswort", "Collingsworth"),
        ("TX", "De Witt", "DeWitt"),
        ("TX", "Hayes", "Hays"),
        ("TX", "San Augustin", "San Augustine"),
        ("VA", "Alexandria C", "Alexandria City"),
        ("VA", "City of Suff", "Suffolk City"),
        ("VA", "City of Manassas", "Manassas City"),
        ("VA", "Charlottesvi", "Charlottesville City"),
        ("VA", "Chesapeake C", "Chesapeake City"),
        ("VA", "Clifton Forg", "Alleghany"),
        ("VA", "Colonial Hei", "Colonial Heights City"),
        ("VA", "Covington Ci", "Covington City"),
        ("VA", "Fredericksbu", "Fredericksburg City"),
        ("VA", "Hopewell Cit", "Hopewell City"),
        ("VA", "Isle of Wigh", "Isle of Wight"),
        ("VA", "King and Que", "King and Queen"),
        ("VA", "Lexington Ci", "Lexington City"),
        ("VA", "Manassas Cit", "Manassas City"),
        ("VA", "Manassas Par", "Manassas Park City"),
        ("VA", "Northumberla", "Northumberland"),
        ("VA", "Petersburg C", "Petersburg City"),
        ("VA", "Poquoson Cit", "Poquoson City"),
        ("VA", "Portsmouth C", "Portsmouth City"),
        ("VA", "Prince Edwar", "Prince Edward"),
        ("VA", "Prince Georg", "Prince George"),
        ("VA", "Prince Willi", "Prince William"),
        ("VA", "Richmond Cit", "Richmond City"),
        ("VA", "Staunton Cit", "Staunton City"),
        ("VA", "Virginia Bea", "Virginia Beach City"),
        ("VA", "Waynesboro C", "Waynesboro City"),
        ("VA", "Winchester C", "Winchester City"),
        ("WA", "Wahkiakurn", "Wahkiakum"),
    ],
    columns=["state", "eia_county", "fips_county"],
)
# Manually compiled balancing authority codes, keyed by BA name and EIA BA ID.
# Codes of pd.NA indicate that a BA is known by name/ID but has no code
# that should be associated with it.
BA_NAME_FIXES: pd.DataFrame = pd.DataFrame(
    [
        # balancing_authority_name_eia, balancing_authority_id_eia, balancing_authority_code_eia
        ("Omaha Public Power District", 14127, "OPPD"),
        ("Kansas City Power & Light Co", 10000, "KCPL"),
        ("Toledo Edison Co", 18997, pd.NA),
        ("Ohio Edison Co", 13998, pd.NA),
        ("Cleveland Electric Illum Co", 3755, pd.NA),
    ],
    columns=[
        "balancing_authority_name_eia",
        "balancing_authority_id_eia",
        "balancing_authority_code_eia",
    ],
)
# Mapping of misspelled or non-standard reported NERC region values to
# canonical NERC region acronyms. Used by _clean_nerc() to normalize
# reported regions before validating them against NERC_REGIONS.
NERC_SPELLCHECK: Dict[str, str] = {
    "GUSTAVUSAK": "ASCC",
    "AK": "ASCC",
    "HI": "HICC",
    "ERCTO": "ERCOT",
    "RFO": "RFC",
    "RF": "RFC",
    "SSP": "SPP",
    "VACAR": "SERC",  # VACAR is a subregion of SERC
    "GATEWAY": "SERC",  # GATEWAY is a subregion of SERC
    "TERR": "GU",
    # BUG FIX: this key was previously the bare integer 25470, which violated
    # the declared Dict[str, str] type and could never match a string token.
    "25470": "MRO",
    "TX": "TRE",
    "NY": "NPCC",
    "NEW": "NPCC",
    "YORK": "NPCC",
}
###############################################################################
# EIA Form 861 Transform Helper functions
###############################################################################
def _filter_class_cols(df, class_list):
regex = f"^({'_|'.join(class_list)}).*$"
return df.filter(regex=regex)
def _filter_non_class_cols(df, class_list):
regex = f"^(?!({'_|'.join(class_list)})).*$"
return df.filter(regex=regex)
def _ba_code_backfill(df):
    """Backfill Balancing Authority Codes based on codes in later years.

    For each BA ID, missing codes are filled backwards in time from the
    earliest subsequent year in which a code was reported.

    Note:
        The BA Code to ID mapping can change from year to year. If a Balancing
        Authority is bought by another entity, the code may change, but the old
        EIA BA ID will be retained.

    Args:
        df (pandas.DataFrame): The transformed EIA 861 Balancing Authority
            dataframe (balancing_authority_eia861).

    Returns:
        pandas.DataFrame: The balancing_authority_eia861 dataframe, but with
        many fewer NA values in the balancing_authority_code_eia column.
    """
    start_len = len(df)
    start_nas = int(df.balancing_authority_code_eia.isnull().sum())
    logger.info(
        f"Started with {start_nas} missing BA Codes out of {start_len} "
        f"records ({start_nas/start_len:.2%})"
    )
    # Build a per-(BA ID, report date) lookup of codes, backfilled in time:
    code_map = (
        df[
            [
                "balancing_authority_id_eia",
                "balancing_authority_code_eia",
                "report_date",
            ]
        ]
        .drop_duplicates()
        .sort_values(["balancing_authority_id_eia", "report_date"])
    )
    code_map["ba_code_filled"] = code_map.groupby("balancing_authority_id_eia")[
        "balancing_authority_code_eia"
    ].bfill()
    # Attach the backfilled codes and swap them in for the original column:
    filled = df.merge(code_map, how="left")
    filled["balancing_authority_code_eia"] = filled.ba_code_filled
    filled = filled.drop("ba_code_filled", axis="columns")
    end_len = len(filled)
    if start_len != end_len:
        raise AssertionError(
            f"Number of rows in the dataframe changed {start_len}!={end_len}!"
        )
    end_nas = int(filled.balancing_authority_code_eia.isnull().sum())
    logger.info(
        f"Ended with {end_nas} missing BA Codes out of {end_len} "
        f"records ({end_nas/end_len:.2%})"
    )
    return filled
def _tidy_class_dfs(df, df_name, idx_cols, class_list, class_type, keep_totals=False):
    """Stack wide per-class data columns into a tidy long format.

    Columns named ``<class>_<quantity>`` are split so that the class becomes a
    categorical column (named ``class_type``) and each quantity becomes its
    own data column, keyed by ``idx_cols`` plus the new class column.

    Args:
        df (pandas.DataFrame): The raw wide-format table.
        df_name (str): Table name, used only for log messages.
        idx_cols (list): Primary key columns for the wide table.
        class_list (list): The class prefixes to stack (e.g. customer classes).
        class_type (str): Name for the new class column (e.g. 'customer_class').
        keep_totals (bool): If False (default), drop the reported 'total'
            rows after comparing them against the sum of their components.

    Returns:
        tuple: (tidy dataframe, list of primary key columns for it —
        ``idx_cols + [class_type]``).
    """
    # Clean up values just enough to use primary key columns as a multi-index:
    logger.debug(f"Cleaning {df_name} table index columns so we can tidy data.")
    if "balancing_authority_code_eia" in idx_cols:
        # A primary key column may not contain NA, so fill with a sentinel:
        df = df.assign(
            balancing_authority_code_eia=(
                lambda x: x.balancing_authority_code_eia.fillna("UNK")
            )
        )
    raw_df = (
        df.dropna(subset=["utility_id_eia"])
        .astype({"utility_id_eia": "Int64"})
        .set_index(idx_cols)
    )
    # Split the table into index, data, and "denormalized" columns for processing:
    # Separate customer classes and reported data into a hierarchical index
    logger.debug(f"Stacking EIA861 {df_name} data columns by {class_type}.")
    data_cols = _filter_class_cols(raw_df, class_list)
    # Create a regex identifier that splits the column headers based on the strings
    # deliniated in the class_list not just an underscore. This enables prefixes with
    # underscores such as fuel_cell as opposed to single-word prefixes followed by
    # underscores. Final string looks like: '(?<=customer_test)_|(?<=unbundled)_'
    # This ensures that the underscore AFTER the desired string (that can also include
    # underscores) is where the column headers are split, not just the first underscore.
    class_list_regex = "|".join(["(?<=" + col + ")_" for col in class_list])
    data_cols.columns = data_cols.columns.str.split(
        rf"{class_list_regex}", n=1, expand=True
    ).set_names([class_type, None])
    # Now stack the customer classes into their own categorical column,
    data_cols = data_cols.stack(level=0, dropna=False).reset_index()
    denorm_cols = _filter_non_class_cols(raw_df, class_list).reset_index()
    # Merge the index, data, and denormalized columns back together
    tidy_df = pd.merge(denorm_cols, data_cols, on=idx_cols)
    # Compare reported totals with sum of component columns
    if "total" in class_list:
        _compare_totals(data_cols, idx_cols, class_type, df_name)
    if keep_totals is False:
        # Drop the reported 'total' rows; they are redundant with the
        # component classes (query is a no-op if no totals exist).
        tidy_df = tidy_df.query(f"{class_type}!='total'")
    return tidy_df, idx_cols + [class_type]
def _drop_dupes(df, df_name, subset):
    """Drop duplicate rows on ``subset`` and log how many were dropped.

    Args:
        df (pandas.DataFrame): The table to deduplicate.
        df_name (str): Table name, used only for the log message.
        subset (list): Columns on which to identify duplicates.

    Returns:
        pandas.DataFrame: ``df`` with duplicate rows removed.
    """
    tidy_nrows = len(df)
    deduped_df = df.drop_duplicates(subset=subset)
    # BUG FIX: previously measured len(df) again here, so the logged number
    # of dropped records was always zero.
    deduped_nrows = len(deduped_df)
    logger.info(
        f"Dropped {tidy_nrows-deduped_nrows} duplicate records from EIA 861 "
        f"{df_name} table, out of a total of {tidy_nrows} records "
        f"({(tidy_nrows-deduped_nrows)/tidy_nrows:.4%} of all records). "
    )
    return deduped_df
def _check_for_dupes(df, df_name, subset):
dupes = df.duplicated(subset=subset, keep=False)
if dupes.any():
raise AssertionError(
f"Found {len(df[dupes])} duplicate rows in the {df_name} table, "
f"when zero were expected!"
)
def _early_transform(df):
    """Fix EIA na values and convert year column to date.

    Args:
        df (pandas.DataFrame): A raw EIA 861 table.

    Returns:
        pandas.DataFrame: The table with EIA NA sentinels replaced and the
        report year converted to a report_date.
    """
    return pudl.helpers.convert_to_date(pudl.helpers.fix_eia_na(df))
def _compare_totals(data_cols, idx_cols, class_type, df_name):
    """Compare reported totals with sum of component columns.

    This is a diagnostic-only helper: it logs, for each numeric data column,
    what fraction of the non-null reported 'total' rows equal the sum of
    their component class rows. It returns nothing and modifies nothing.

    Args:
        data_cols (pd.DataFrame): A DataFrame containing only the columns with
            normalized information.
        idx_cols (list): A list of the primary keys for the given denormalized
            DataFrame.
        class_type (str): The name (either 'customer_class' or 'tech_class') of
            the column for which you'd like to compare totals to components.
        df_name (str): The name of the dataframe.
    """
    # Convert column dtypes so that numeric cols can be adequately summed
    data_cols = pudl.helpers.convert_cols_dtypes(data_cols, data_source="eia")
    # Drop data cols that are non numeric (preserve primary keys)
    logger.debug(f"{idx_cols}, {class_type}")
    data_cols = (
        data_cols.set_index(idx_cols + [class_type])
        .select_dtypes("number")
        .reset_index()
    )
    logger.debug(f"{data_cols.columns.tolist()}")
    # Create list of data columns to be summed
    # (may include non-numeric that will be excluded)
    data_col_list = set(data_cols.columns.tolist()) - set(idx_cols + [class_type])
    logger.debug(f"{data_col_list}")
    # Distinguish reported totals from segments
    data_totals_df = data_cols.loc[data_cols[class_type] == "total"]
    data_no_tots_df = data_cols.loc[data_cols[class_type] != "total"]
    # Calculate sum of segments for comparison against reported total.
    # BUG FIX: previously grouped by idx_cols + [class_type], which made every
    # group a single row (an identity "sum") and then fanned each total out
    # against every individual class on the merge. Grouping by idx_cols alone
    # actually sums the component classes.
    data_sums_df = (
        data_no_tots_df.drop(columns=class_type)
        .groupby(idx_cols, observed=True)
        .sum()
        .reset_index()
    )
    sum_total_df = pd.merge(
        data_totals_df,
        data_sums_df,
        on=idx_cols,
        how="outer",
        suffixes=("_total", "_sum"),
    )
    # Check each data column's calculated sum against the reported total
    for col in data_col_list:
        col_df = sum_total_df.loc[sum_total_df[col + "_total"].notnull()]
        if len(col_df) > 0:
            col_df = col_df.assign(
                compare_totals=lambda x: (x[col + "_total"] == x[col + "_sum"])
            )
            # Fraction of non-null totals that agree with the component sum.
            # (Renamed from the misleading "bad_math" — this is the MATCH rate.)
            match_ratio = (col_df["compare_totals"]).sum() / len(col_df)
            logger.debug(
                f"{df_name}: for column {col}, {match_ratio:.0%} "
                "of non-null reported totals = the sum of parts."
            )
        else:
            logger.debug(f"{df_name}: for column {col} all total values are NaN")
def _clean_nerc(df, idx_cols):
    """Clean NERC region entries and make new rows for multiple nercs.

    This function examines reported NERC regions and makes sure the output column of the
    same name has reliable, singular NERC region acronyms. To do so, this function
    identifies entries where there are two or more NERC regions specified in a single
    cell (such as SPP & ERCOT) and makes new, duplicate rows for each NERC region. It
    also converts non-recognized reported nerc regions to 'UNK'.

    Args:
        df (pandas.DataFrame): A DataFrame with the column 'nerc_region' to be cleaned.
        idx_cols (list): A list of the primary keys.

    Returns:
        pandas.DataFrame: A DataFrame with correct and clean nerc regions.
    """
    idx_no_nerc = idx_cols.copy()
    if "nerc_region" in idx_no_nerc:
        idx_no_nerc.remove("nerc_region")
    # Split raw df into primary keys plus nerc region and other value cols
    nerc_df = df[idx_cols].copy()
    other_df = df.drop("nerc_region", axis=1).set_index(idx_no_nerc)
    # Make all values upper-case
    # Replace all NA values with UNK
    # Make nerc values into lists to see how many separate values are stuffed
    # into one row (ex: 'SPP & ERCOT' --> ['SPP', 'ERCOT'])
    nerc_df = nerc_df.assign(
        nerc_region=(
            lambda x: (x.nerc_region.str.upper().fillna("UNK").str.findall(r"[A-Z]+"))
        )
    )
    # Record a list of the reported nerc regions not included in the
    # recognized regions list (these eventually become UNK)
    nerc_col = nerc_df["nerc_region"].tolist()
    nerc_list = list({item for sublist in nerc_col for item in sublist})
    non_nerc_list = [
        nerc_entity
        for nerc_entity in nerc_list
        if nerc_entity not in NERC_REGIONS + list(NERC_SPELLCHECK.keys())
    ]
    # BUG FIX: this message was previously emitted with print() (inconsistent
    # with the module's logger) and used a backslash line-continuation inside
    # the f-string, which embedded a long run of spaces in the output.
    logger.info(
        "The following reported NERC regions are not currently recognized and "
        f"become UNK values: {non_nerc_list}"
    )

    # Function to turn instances of 'SPP_UNK' or 'SPP_SPP' into 'SPP'
    def _remove_nerc_duplicates(entity_list):
        if len(entity_list) > 1:
            if "UNK" in entity_list:
                entity_list.remove("UNK")
            if all(x == entity_list[0] for x in entity_list):
                entity_list = [entity_list[0]]
        return entity_list

    # Go through the nerc regions, spellcheck errors, delete those that aren't
    # recognized, and piece them back together (with _ separator if more than one
    # recognized)
    nerc_df["nerc_region"] = (
        nerc_df["nerc_region"]
        .apply(lambda x: [NERC_SPELLCHECK.get(i, i) for i in x])
        .apply(lambda x: sorted([i if i in NERC_REGIONS else "UNK" for i in x]))
        .apply(lambda x: _remove_nerc_duplicates(x))
        .str.join("_")
    )
    # Merge all data back together
    full_df = pd.merge(nerc_df, other_df, on=idx_no_nerc)
    return full_df
def _compare_nerc_physical_w_nerc_operational(df: pd.DataFrame) -> pd.DataFrame:
"""Show df rows where physical NERC region does not match operational region.
In the Utility Data table, there is the 'nerc_region' index column, otherwise
interpreted as nerc region in which the utility is physically located and the
'nerc_regions_of_operation' column that depicts the nerc regions the utility
operates in. In most cases, these two columns are the same, however there are
certain instances where this is not true. There are also instances where a
utility operates in multiple nerc regions in which case one row will match and
another row will not. The output of this function in a table that shows only the
utilities where the physical nerc region does not match the operational region
ever, meaning there is no additional row for the same utlity during the same
year where there is a match between the cols.
Args:
df: The utility_data_nerc_eia861 table output from the
utility_data() function.
Returns:
A DataFrame with rows for utilities where NO listed operating
nerc region matches the "physical location" nerc region column that's a part of
the index.
"""
# Set NA states to UNK
df["state"] = df["state"].fillna("UNK")
# Create column indicating whether the nerc region matches the nerc region of
# operation (TRUE)
df["nerc_match"] = df["nerc_region"] == df["nerc_regions_of_operation"]
# Group by utility, state, and report date to see which groups have at least one
# TRUE value
grouped_nerc_match_bools = (
df.groupby(["utility_id_eia", "state", "report_date"])[["nerc_match"]]
.any()
.reset_index()
.rename(columns={"nerc_match": "nerc_group_match"})
)
# Merge back with original df to show cases where there are multiple non-matching
# nerc values per utility id, year, and state.
expanded_nerc_match_bools = pd.merge(
df,
grouped_nerc_match_bools,
on=["utility_id_eia", "state", "report_date"],
how="left",
)
# Keep only rows where there are no matches for the whole group.
expanded_nerc_match_bools_false = expanded_nerc_match_bools[
~expanded_nerc_match_bools["nerc_group_match"]
]
return expanded_nerc_match_bools_false
def _pct_to_mw(df, pct_col):
"""Turn pct col into mw capacity using total capacity col."""
mw_value = df["total_capacity_mw"] * df[pct_col] / 100
return mw_value
def _make_yn_bool(df_object):
"""Turn Y/N reporting into True or False boolean statements for df or series."""
return df_object.replace(
{
"Y": True,
"y": True,
"N": False,
"n": False,
}
)
def _thousand_to_one(df_object):
"""Turn reporting in thousands of dollars to regular dollars for df or series."""
return df_object * 1000
###############################################################################
# EIA Form 861 Table Transform Functions
###############################################################################
def service_territory(tfr_dfs):
    """Transform the EIA 861 utility service territory table.

    Transformations include:

    * Homogenize spelling of county names.
    * Add field for state/county FIPS code.

    Args:
        tfr_dfs (dict): A dictionary of DataFrame objects in which pages from EIA861
            form (keys) correspond to normalized DataFrames of values from that page
            (values).

    Returns:
        dict: a dictionary of pandas.DataFrame objects in which pages from EIA861 form
        (keys) correspond to normalized DataFrames of values from that page
        (values).
    """
    # No data tidying required.
    # NA values in the county column get read in as floats, which breaks the
    # county parsing done by addfips, so force a nullable string type first.
    raw_df = tfr_dfs["service_territory_eia861"].astype({"county": "string"})
    # Normalize county spellings to the canonical US Census names, then use
    # the county & state names to attach state and county FIPS IDs.
    out_df = pudl.helpers.clean_eia_counties(
        raw_df, fixes=EIA_FIPS_COUNTY_FIXES
    ).pipe(pudl.helpers.add_fips_ids)
    # Convert the Y/N short_form flag into a boolean.
    out_df["short_form"] = _make_yn_bool(out_df["short_form"])
    tfr_dfs["service_territory_eia861"] = out_df
    return tfr_dfs
def balancing_authority(tfr_dfs):
    """Transform the EIA 861 Balancing Authority table.

    Transformations include:

    * Fill in balancing authority IDs based on date, utility ID, and BA Name.
    * Backfill balancing authority codes based on BA ID.
    * Fix BA code and ID typos.

    Args:
        tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table
            name. It will be mutated by this function.

    Returns:
        dict: A dictionary of transformed EIA 861 dataframes, keyed by table name.
    """
    # No data tidying required
    # All columns are already type compatible.
    # Value transformations:
    # * Backfill BA codes on a per BA ID basis
    # * Fix data entry errors
    # Index on (report_date, BA name, utility ID) so that the BA_ID_NAME_FIXES
    # table, which shares this index, can be used to assign corrected BA IDs below.
    df = (
        tfr_dfs["balancing_authority_eia861"]
        .pipe(apply_pudl_dtypes, "eia")
        .set_index(["report_date", "balancing_authority_name_eia", "utility_id_eia"])
    )
    # Fill in BA IDs based on date, utility ID, and BA Name:
    # NOTE(review): assumes every index tuple in BA_ID_NAME_FIXES exists in df —
    # missing tuples would raise a KeyError here; confirm against the fixes table.
    df.loc[
        BA_ID_NAME_FIXES.index, "balancing_authority_id_eia"
    ] = BA_ID_NAME_FIXES.balancing_authority_id_eia
    # Backfill BA Codes based on BA IDs:
    df = df.reset_index().pipe(_ba_code_backfill)
    # Typo: NEVP, BA ID is 13407, but in 2014-2015 in UT, entered as 13047
    df.loc[
        (df.balancing_authority_code_eia == "NEVP")
        & (df.balancing_authority_id_eia == 13047),
        "balancing_authority_id_eia",
    ] = 13407
    # Typo: Turlock Irrigation District is TIDC, not TID.
    df.loc[
        (df.balancing_authority_code_eia == "TID")
        & (df.balancing_authority_id_eia == 19281),
        "balancing_authority_code_eia",
    ] = "TIDC"
    tfr_dfs["balancing_authority_eia861"] = df
    return tfr_dfs
def balancing_authority_assn(tfr_dfs):
    """Compile a balancing authority, utility, state association table.

    For the years up through 2012, the only BA-Util information that's available
    comes from the balancing_authority_eia861 table, and it does not include any
    state-level information. However, there is utility-state association
    information in the sales_eia861 and other data tables.

    For the years from 2013 onward, there's explicit BA-Util-State information in
    the data tables (e.g. sales_eia861). These observed associations can be
    compiled to give us a picture of which BA-Util-State associations exist.
    However, we need to merge in the balancing authority IDs since the data tables
    only contain the balancing authority codes.

    Args:
        tfr_dfs (dict): A dictionary of transformed EIA 861 dataframes. This must
            include any dataframes from which we want to compile BA-Util-State
            associations, which means this function has to be called after all the
            basic transform functions that depend on only a single raw table.

    Returns:
        dict: a dictionary of transformed dataframes, with the compiled
        balancing_authority_assn_eia861 association table added.
    """
    # Entity tables aren't really "data" tables, and should not be searched
    # for associations.
    non_data_tables = [
        "balancing_authority_eia861",
        "service_territory_eia861",
    ]
    # Every other transformed table is a source of observed associations.
    data_dfs = [tfr_dfs[name] for name in tfr_dfs if name not in non_data_tables]
    logger.info("Building an EIA 861 BA-Util-State association table.")
    # Shorthand query strings selecting the two reporting regimes:
    early_years = "report_date<='2012-12-31'"
    late_years = "report_date>='2013-01-01'"
    early_dfs = [data.query(early_years) for data in data_dfs]
    late_dfs = [data.query(late_years) for data in data_dfs]
    # The old BA table lists utilities directly, but has no state information.
    early_ba_util = _harvest_associations(
        dfs=[tfr_dfs["balancing_authority_eia861"].query(early_years)],
        cols=["report_date", "balancing_authority_id_eia", "utility_id_eia"],
    )
    # State-utility associations are brought in from observations in the data
    # tables.
    early_util_state = _harvest_associations(
        dfs=early_dfs,
        cols=["report_date", "utility_id_eia", "state"],
    )
    early_assn = early_ba_util.merge(early_util_state, how="outer").drop_duplicates()
    # The new BA table has no utility information, but has BA Codes...
    late_code_to_id = _harvest_associations(
        dfs=[tfr_dfs["balancing_authority_eia861"].query(late_years)],
        cols=[
            "report_date",
            "balancing_authority_code_eia",
            "balancing_authority_id_eia",
        ],
    )
    # ...and the BA Code allows us to bring in utility+state observations:
    late_code_util_state = _harvest_associations(
        dfs=late_dfs,
        cols=["report_date", "balancing_authority_code_eia", "utility_id_eia", "state"],
    )
    # Merge on BA code then drop it, because only the BA ID exists in all
    # years consistently:
    late_assn = (
        late_code_util_state.merge(late_code_to_id, how="outer")
        .drop("balancing_authority_code_eia", axis="columns")
        .drop_duplicates()
    )
    tfr_dfs["balancing_authority_assn_eia861"] = (
        pd.concat([early_assn, late_assn])
        .dropna(subset=["balancing_authority_id_eia"])
        .pipe(apply_pudl_dtypes, group="eia")
    )
    return tfr_dfs
def utility_assn(tfr_dfs):
    """Harvest a Utility-Date-State Association Table."""
    # Entity tables aren't really "data" tables; don't mine them for
    # associations.
    non_data_tables = [
        "balancing_authority_eia861",
        "service_territory_eia861",
    ]
    # All remaining transformed tables are sources of observed associations.
    data_dfs = [tfr_dfs[name] for name in tfr_dfs if name not in non_data_tables]
    logger.info("Building an EIA 861 Util-State-Date association table.")
    tfr_dfs["utility_assn_eia861"] = _harvest_associations(
        data_dfs, ["report_date", "utility_id_eia", "state"]
    )
    return tfr_dfs
def _harvest_associations(dfs, cols):
"""Compile all unique, non-null combinations of values ``cols`` within ``dfs``.
Find all unique, non-null combinations of the columns ``cols`` in the dataframes
``dfs`` within records that are selected by ``query``. All of ``cols`` must be
present in each of the ``dfs``.