In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import re
import time
import numpy as np
import pandas as pd
import sqlalchemy as sa

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics.pairwise import cosine_similarity

sys.path.append(os.path.abspath(os.path.join('..','..','..')))
from pudl import init, mcoe, analysis, constants, settings, outputs
import pudl.extract.ferc1
import pudl.transform.pudl

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
plt.style.use('ggplot')
mpl.rcParams['figure.figsize'] = (10,6)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 56

In [3]:
ferc1_engine = pudl.extract.ferc1.connect_db()
ferc1_steam = pd.read_sql('SELECT * FROM f1_steam;', ferc1_engine)
ferc1_steam = ferc1_steam.drop(['row_seq', 'row_prvlg','report_prd'], axis=1)

# If there's no generation, no fuel expenses, and no total expenses... probably this is bunk.
ferc1_steam['filter_1'] = ((ferc1_steam.net_generation==0) | (ferc1_steam.net_generation.isnull())) & \
                          ((ferc1_steam.expns_fuel==0) | (ferc1_steam.expns_fuel.isnull())) & \
                          ((ferc1_steam.tot_prdctn_expns==0) | (ferc1_steam.tot_prdctn_expns.isnull()))
        
# If we have values in any of the generation, fuel, or total expenses, but there's no plant name or kind,
# we need to check the record -- it might be a summary from other lines that we want to capture.
ferc1_steam['check_1'] = ~ferc1_steam.filter_1 & ((ferc1_steam.plant_name=='') | (ferc1_steam.plant_kind==''))
ferc1_steam['expns_nonfuel'] = ferc1_steam.tot_prdctn_expns - ferc1_steam.expns_fuel
print("FERC plants that need checking: {:.2%}".format(len(ferc1_steam[ferc1_steam['check_1']])/len(ferc1_steam)))

FERC plants that need checking: 1.03%


In [4]:
# Utility & Fuel Consumer Price Index from FRED. Should pull via their API in the long run.
util_inflation = pd.DataFrame({
    'report_year': np.arange(2003,2017),
    'util_cpi':  [228.86, 230.13, 234.59, 225.21, 218.99, 220.37, 214.19,
                  210.70, 220.02, 200.63, 194.70, 179.04, 161.89, 154.54]
})
util_inflation.set_index('report_year', inplace=True)
util_inflation['usd_2016_ratio'] = util_inflation.util_cpi / util_inflation.at[(2016,'util_cpi')]
util_inflation.reset_index(inplace=True)
ferc1_steam = pd.merge(ferc1_steam, util_inflation[['report_year','usd_2016_ratio']], on='report_year')
ferc1_steam['expns_nonfuel_usd2016'] = ferc1_steam.expns_nonfuel * ferc1_steam.usd_2016_ratio

In [5]:
diesel_strings = [
    'DIESEL',
    'Diesel Engine',
    'Diesel Turbine',
]

geothermal_strings = [
    'Steam - Geothermal',
]
natural_gas_strings = [
    'Combined Cycle',
    'Combustion Turbine',
    'GT',
    'GAS TURBINE',
    'Comb. Turbine',
    'Gas Turbine #1',
    'Combine Cycle Oper',
    'Combustion',
    'Combined',
    'Gas Turbine/Steam',
    'Gas Turbine Peaker',
    'Gas Turbine - Note 1',
    'Resp Share Gas Note3',
    'Gas Turbines',
    'Simple Cycle',
    'Gas / Steam',
    'GasTurbine',
    'Combine Cycle',
    'CTG/Steam-Gas',
    'GTG/Gas',
    'CTG/Steam -Gas',
    'Steam/Gas Turbine',
    'CombustionTurbine',
    'Gas Turbine-Simple',
    'STEAM & GAS TURBINE',
    'Gas & Steam Turbine',
    'Gas',
    'Gas Turbine (2)',
    'COMBUSTION AND GAS',
    'Com Turbine Peaking',
    'Gas Turbine Peaking',
    'Comb Turb Peaking',
    'JET ENGINE',
    'Comb. Cyc',
    'Com. Cyc',
    'Com. Cycle',
    'GAS TURB-COMBINED CY',
    'Gas Turb',
    'Combined Cycle - 40%',
    'IGCC/Gas Turbine',
    'CC',
    'Combined Cycle Oper',
    'Simple Cycle Turbine',
    'Steam and CC',
    'Com Cycle Gas Turb',
    'I.C.E/  Gas Turbine',
    'Combined Cycle CTG',
    'GAS-TURBINE',
    'Gas Expander Turbine',
    'Gas Turbine (Leased)',
    'Gas Turbine # 1',
    'Gas Turbine (Note 1)',
    'COMBUSTINE TURBINE',
    'Gas Turb, Int. Comb.',
    'Combined Turbine',
    'Comb Turb Peak Units',
    'Combustion Tubine',
    'Comb. Cycle',
    'COMB.TURB.PEAK.UNITS',
    'Steam  and  CC',
    'I.C.E. /Gas Turbine',
    'Conbustion Turbine',
    'Gas Turbine/Int Comb',
    'Steam & CC',
    'GAS TURB. & HEAT REC',
    'Gas Turb/Comb. Cyc',
    'Comb. Turine',
]
nuclear_strings = [
    'Nuclear',
    'Nuclear (3)',
]
other_strings = [
    'IC',
    'Internal Combustion',
    'Int Combust - Note 1',
    'Resp. Share - Note 2',
    'Int. Combust - Note1',
    'Resp. Share - Note 4',
    'Resp Share - Note 5',
    'Resp. Share - Note 7',
    'Internal Comb Recip',
    'Reciprocating Engine',
    'Internal Comb',
    'Resp. Share - Note 8',
    'Resp. Share - Note 9',
    'Resp Share - Note 11',
    'Resp. Share - Note 6',
    'INT.COMBUSTINE',
    'Steam (Incl I.C.)',
    'Other',
    'Int Combust (Note 1)',
    'Resp. Share (Note 2)',
    'Int. Combust (Note1)',
    'Resp. Share (Note 8)',
    'Resp. Share (Note 9)',
    'Resp Share (Note 11)',
    'Resp. Share (Note 4)',
    'Resp. Share (Note 6)',
    'Plant retired- 2013',
    'Retired - 2013',
]
steam_strings = [
    'Steam',
    'Steam Units 1, 2, 3',
    'Resp Share St Note 3',
    'Steam Turbine',
    'Steam-Internal Comb',
    'IGCC',
    'Steam- 72%',
    'Steam (1)',
    'Steam (1)',
    'Steam Units 1,2,3',
    'Steam/Fossil',
    'Steams',
    'Steam - 72%',
    'Steam - 100%',
    'Stream',
    'Steam Units 4, 5',
    'Steam - 64%',
    'Common',
    'Steam (A)',
    'Coal',
    'Steam;Retired - 2013',
    'Steam Units 4 & 6',
]
wind_strings = [
    'Wind',
    'Wind Turbine',
    'Wind - Turbine',
    'Wind Energy',
]
solar_strings = [
    'Solar Photovoltaic',
    'Solar Thermal',
    'SOLAR PROJECT',
    'Solar',
    'Photovoltaic',
]
cpi_plant_kind_map = {
    'natural_gas': natural_gas_strings,
    'diesel': diesel_strings,
    'geothermal': geothermal_strings,
    'nuclear': nuclear_strings,
    'steam': steam_strings,
    'wind': wind_strings,
    'solar': solar_strings,
    'other': other_strings,
}

ferc1_steam['plant_kind_cpi'] = pudl.transform.pudl.cleanstrings(ferc1_steam.plant_kind,
                                                                 cpi_plant_kind_map,
                                                                 unmapped='')
ferc1_tmp = ferc1_steam[~ferc1_steam.filter_1].copy()
ferc1_steam = pd.merge(ferc1_steam, ferc1_tmp, how='left')

In [6]:
matching_cols = [
    'report_year',
    'spplmnt_num',
    'row_number',
    'respondent_id',
    'plant_name',
    'plant_kind_cpi',
    'yr_const',
    'filter_1',
    'check_1',
]

In [7]:
ferc1_tomatch = ferc1_steam[matching_cols][(~ferc1_steam.filter_1) & (~ferc1_steam.check_1)]
ferc1_tomatch.drop(['filter_1','check_1'], axis=1, inplace=True)

In [8]:
#plant_names = ferc1_tomatch.plant_name.unique()
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(5,5))
tf_idf_matrix = vectorizer.fit_transform(ferc1_tomatch.plant_name)

## Outline
* TF-IDF matrix is L2 normalized. It's a vectorization of just the string feature (plant_name)
* Need to combine it with three other vectorized categorical features: plant kind, construction year, and respondent ID.
* Each of these components can be multiplied by a weighting factor as needed before being combined into a single composite vector.
* Once the vectors have been combined, we calculate the cosine similarity between each record, and all the other records to see which ones match best.
* For each record, we need to select up to one record from each year of data, to stitch together a continuous time series for each plant.
* We can't use any record in more than one time series.

In [9]:
def get_matches_df(sparse_matrix, name_vector):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    cossim = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        cossim[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'cossim': cossim})

In [10]:
matches = cosine_similarity(tf_idf_matrix, dense_output=False)
matches_df = get_matches_df(matches, ferc1_tomatch.plant_name)
#plt.hist(matches_df.cossim, bins=100, range=(0.4,1.0));

KeyError: 10120

In [None]:
good_matches = matches_df[(matches_df['cossim'] < 0.99999) &
                          (matches_df['cossim'] > 0.5)]
good_matches.sort_values(['cossim'], ascending=False).head(10)

In [None]:
good_matches.sample(10)

## A simple set of one-hot dictionary features...

In [None]:
plant_kind_onehot = pd.get_dummies(ferc1_steam, columns=['plant_kind_cpi']).filter(regex='^plant_kind_cpi_.*')
#respondent_onehot = pd.get_dummies(ferc1_steam, columns=['respondent_id'])
#yr_const_onehot = pd.get_dummies(ferc1_steam, columns=['yr_const'])

In [None]:
sklearn.preprocessing.normalize(plant_kind_onehot, norm='l2')*3

In [None]:
plant_kind_onehot[['plant_kind', 'plant_kind_cpi', 'plant_kind_cpi_natural_gas']]