# Add PUDL IDs to small gens

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import sys
import os
import pathlib

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa

# Local libraries
import pudl

In [3]:
# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [4]:
# Display settings
sns.set()
%matplotlib inline
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [5]:
frequency = 'AS' # annual

# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine, freq=frequency) #annual frequency

In [6]:
# Access a table by name as an object
ferc_small = pudl_out.plants_small_ferc1()
ferc_big = pudl_out.plants_steam_ferc1()
eia = pudl_out.gens_eia860()

In [7]:
aa = pudl.glue.ferc1_eia.glue(ferc1=True, eia=True)
glue_tab = aa['plants_pudl']
glue_tab['plant_name_lower'] = glue_tab.plant_name_pudl.str.lower()

In [8]:
glue_tab.query("plant_name_pudl == 'Harris Lake'")

Unnamed: 0,plant_id_pudl,plant_name_pudl,plant_name_lower
2989,2093,Harris Lake,harris lake
9874,8833,Harris Lake,harris lake


In [9]:
ferc_glue_mer = pd.merge(ferc_small, glue_tab, left_on='plant_name_original', right_on='plant_name_lower', how='left')
ferc_glue_mer['dup'] = ferc_glue_mer.duplicated(subset=['plant_name_original'], keep=False)

In [10]:
print(len(ferc_glue_mer[ferc_glue_mer['dup']]))
print(len(ferc_glue_mer))
print(len(ferc_small))

19563
19943
14878


In [23]:
# preliminary look at how many of the plants that have been mapped also have eia technology descriptions

has_tech = eia[eia['technology_description'].notna()].copy()
print(len(ferc_glue_mer.plant_name_pudl.unique()))
print(len([x for x in ferc_glue_mer.plant_id_pudl.unique() if x in has_tech.plant_id_pudl.unique()]))
#print(len([x for x in matching_utility.plant_id_pudl_ferc if x in has_tech.plant_id_pudl]))

# Remember to only use plant id pudls that have one tech per plant id pudl :) 

1077
228


In [12]:
mapping = pd.read_excel('/Users/aesharpe/Desktop/mapping_eia923_ferc1_copy().xlsx', sheet_name='plants_combined')

In [13]:
eia_map = (
    mapping[mapping['plant_name_eia'].notna()]
    [['plant_name_pudl', 'plant_id_pudl', 'utility_name_eia', 'plant_name_eia']].copy())

In [14]:
unmapped_eia = (
    mapping[(mapping['plant_name_eia'].notna()) & (mapping['plant_name_ferc1'].isna())]
    [['plant_name_pudl', 'plant_id_pudl', 'utility_name_eia', 'plant_name_eia']].copy()
    .rename(columns={'plant_id_pudl': 'plant_id_pudl_eia'}))
unmapped_ferc = (
    mapping[(mapping['plant_name_ferc1'].notna()) & (mapping['plant_name_eia'].isna())]
    [['plant_name_pudl', 'plant_id_pudl', 'utility_name_ferc1', 'plant_name_ferc1']].copy()
    .rename(columns={'plant_id_pudl': 'plant_id_pudl_ferc'}))

In [15]:
mer2 = pd.merge(unmapped_eia, unmapped_ferc, on='plant_name_pudl', how='inner')

In [16]:
print(len(unmapped_eia))
print(len(unmapped_ferc))
print(len(mer2))

12377
2669
880


In [17]:
mer2 = mer2.sort_values(['plant_name_pudl'])

In [18]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [19]:
mer3 = mer2.dropna(subset=['utility_name_eia', 'utility_name_ferc1']).copy()
mer3['ratio']= mer3.apply(lambda x: fuzz.partial_ratio(x['utility_name_ferc1'],x['utility_name_eia']),axis=1)
#matching_utility = mer3[(mer3['ratio'] > 70) & (mer3['plant_id_pudl_eia'] != mer3['plant_id_pudl_ferc'])].copy()

In [20]:
#matching_utility.to_pickle('/Users/aesharpe/Desktop/matching_utility.pkl')

In [21]:
# Used these as the first pudl ids to fix
len(matching_utility)

NameError: name 'matching_utility' is not defined

In [24]:
mer4 = mer3[mer3['plant_id_pudl_eia'] != mer3['plant_id_pudl_ferc']]

In [25]:
dd = ferc_small.drop_duplicates('plant_name_original')

In [27]:
len(dd.sort_values('plant_name_original'))#[0:10]

1930