In [1]:
import sys
import os
import numpy as np
import pandas as pd
import sqlalchemy as sa
sys.path.append(os.path.abspath(os.path.join('..','..','..')))
from pudl import pudl, ferc1, eia923, settings, constants
from pudl import models, models_ferc1, models_eia923
from pudl import clean_eia923, clean_ferc1, clean_pudl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pudl_engine  = pudl.db_connect_pudl()

In [287]:
#Pull in the generation table
plants_eia923_tbl = models.PUDLBase.metadata.tables['plants_eia923']
p_select = sa.sql.select([plants_eia923_tbl,]) #creates a sql Select object
p = pd.read_sql(p_select, pudl_engine) #converts sql object to pandas dataframe

In [289]:
p[p['plant_id'] == 527]

Unnamed: 0,plant_id,plant_name,plant_id_pudl
1415,527,Nucla,1516


In [3]:
#Pull in the generation table
g_tbl = models.PUDLBase.metadata.tables['generation_eia923']
g_select = sa.sql.select([g_tbl,]) #creates a sql Select object
g = pd.read_sql(g_select, pudl_engine) #converts sql object to pandas dataframe

In [290]:
g[(g['plant_id'] == 602)]# & (g['prime_mover']== 'CA') & (g['generator_id']== '2')]

Unnamed: 0,id,plant_id,prime_mover,generator_id,report_date,net_generation_mwh,gen_code,gen_boiler_code_2
2652,2653,602,ST,1,2009-01-01,436667.0,602_1_ST,602_ST
2653,2654,602,ST,1,2009-02-01,340491.0,602_1_ST,602_ST
2654,2655,602,ST,1,2009-03-01,348869.0,602_1_ST,602_ST
2655,2656,602,ST,1,2009-04-01,319313.0,602_1_ST,602_ST
2656,2657,602,ST,1,2009-05-01,116343.0,602_1_ST,602_ST
2657,2658,602,ST,1,2009-06-01,321103.0,602_1_ST,602_ST
2658,2659,602,ST,1,2009-07-01,329374.0,602_1_ST,602_ST
2659,2660,602,ST,1,2009-08-01,279907.0,602_1_ST,602_ST
2660,2661,602,ST,1,2009-09-01,6103.0,602_1_ST,602_ST
2661,2662,602,ST,1,2009-10-01,0.0,602_1_ST,602_ST


In [4]:
g['gen_code'] = g['plant_id'].map(str) + "_" + g['generator_id'] + "_" + g['prime_mover']
g['gen_boiler_code_2'] = g['plant_id'].map(str) + "_" + g['prime_mover']
gen_code = g[['plant_id', 'prime_mover', 'generator_id', 'gen_code', 'gen_boiler_code_2']].copy()
gen_code['gen_boiler_code'] = gen_code['gen_code']
gen_code = gen_code.drop_duplicates()
gen_code = gen_code.rename(columns={'plant_id': 'plant_id_gen',
                         'prime_mover': 'prime_mover_gen'})

In [283]:
#Pull in the generation table
b_tbl = models.PUDLBase.metadata.tables['boiler_fuel_eia923']
b_select = sa.sql.select([b_tbl,]) #creates a sql Select object
b = pd.read_sql(b_select, pudl_engine) #converts sql object to pandas dataframe

In [286]:
b[b['plant_id'] == 527].drop_duplicates(subset='boiler_id')

Unnamed: 0,id,plant_id,boiler_id,prime_mover,fuel_type,report_date,fuel_qty_consumed,fuel_mmbtu_per_unit,sulfur_content,ash_content
4069,4069,527,1,ST,BIT,2009-01-01,30374.0,21.11,0.71,21.2


In [120]:
b['boiler_code'] = b['plant_id'].map(str) + "_" + b['boiler_id'] + "_" + b['prime_mover']
b['gen_boiler_code_2'] = b['plant_id'].map(str) + "_" + b['prime_mover']
boiler_code = b.copy()
boiler_code['gen_boiler_code'] = boiler_code['boiler_code']
boiler_code = boiler_code.drop_duplicates()
boiler_code = boiler_code.rename(columns={'plant_id': 'plant_id_boiler',
                            'prime_mover': 'prime_mover_boiler'})
boiler_code

Unnamed: 0,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,gen_boiler_code_2,gen_boiler_code
0,3,1,ST,3_1_ST,3_ST,3_1_ST
1,3,2,ST,3_2_ST,3_ST,3_2_ST
2,3,3,ST,3_3_ST,3_ST,3_3_ST
3,3,4,ST,3_4_ST,3_ST,3_4_ST
4,3,5,ST,3_5_ST,3_ST,3_5_ST
5,3,6A,CA,3_6A_CA,3_CA,3_6A_CA
6,3,6B,CA,3_6B_CA,3_CA,3_6B_CA
7,3,7A,CA,3_7A_CA,3_CA,3_7A_CA
8,3,7B,CA,3_7B_CA,3_CA,3_7B_CA
9,7,1,ST,7_1_ST,7_ST,7_1_ST


In [8]:
gen_boiler_map = pd.merge(gen_code, boiler_code, how='outer', on=['gen_boiler_code'])
gen_boiler_map['mapped'] = (gen_boiler_map['gen_code'].notnull() & gen_boiler_map['boiler_code'].notnull())
gen_boiler_map.count()

plant_id_gen           4967
prime_mover_gen        4967
generator_id           4967
gen_code               4967
gen_boiler_code_2_x    4967
gen_boiler_code        8249
plant_id_boiler        5785
boiler_id              5785
prime_mover_boiler     5785
boiler_code            5785
gen_boiler_code_2_y    5785
mapped                 8249
dtype: int64

In [9]:
mapped_gen_boiler = gen_boiler_map[gen_boiler_map['mapped'] == True]
unmapped_gen_boiler = gen_boiler_map[gen_boiler_map['mapped'] == False]

In [10]:
mapped_gen_boiler.dtypes

plant_id_gen           float64
prime_mover_gen         object
generator_id            object
gen_code                object
gen_boiler_code_2_x     object
gen_boiler_code         object
plant_id_boiler        float64
boiler_id               object
prime_mover_boiler      object
boiler_code             object
gen_boiler_code_2_y     object
mapped                    bool
dtype: object

In [11]:
unmapped_gen = unmapped_gen_boiler[['plant_id_gen','prime_mover_gen','generator_id','gen_code','gen_boiler_code_2_x']].copy()
unmapped_gen.dropna(inplace=True)
unmapped_gen = unmapped_gen.rename(columns={'gen_boiler_code_2_x': 'gen_boiler_code_2'})
unmapped_gen

Unnamed: 0,plant_id_gen,prime_mover_gen,generator_id,gen_code,gen_boiler_code_2
5,3.0,CA,A1ST,3_A1ST_CA,3_CA
6,3.0,CA,A2ST,3_A2ST_CA,3_CA
20,26.0,ST,ST4,26_ST4_ST,26_ST
44,108.0,ST,1,108_1_ST,108_ST
51,117.0,CA,C4-2,117_C4-2_CA,117_CA
52,117.0,CA,C5-3,117_C5-3_CA,117_CA
55,120.0,ST,ST1,120_ST1_ST,120_ST
57,126.0,ST,ST1,126_ST1_ST,126_ST
58,126.0,ST,ST2,126_ST2_ST,126_ST
59,126.0,ST,ST3,126_ST3_ST,126_ST


In [12]:
unmapped_boiler = unmapped_gen_boiler[['plant_id_boiler','boiler_id','prime_mover_boiler','boiler_code','gen_boiler_code_2_y']].copy()
unmapped_boiler.dropna(inplace=True)
unmapped_boiler = unmapped_boiler.rename(columns={'gen_boiler_code_2_y': 'gen_boiler_code_2'})
unmapped_boiler

Unnamed: 0,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,gen_boiler_code_2
4967,3.0,6A,CA,3_6A_CA,3_CA
4968,3.0,6B,CA,3_6B_CA,3_CA
4969,3.0,7A,CA,3_7A_CA,3_CA
4970,3.0,7B,CA,3_7B_CA,3_CA
4971,26.0,4,ST,26_4_ST,26_ST
4972,79.0,3,ST,79_3_ST,79_ST
4973,108.0,SGU1,ST,108_SGU1_ST,108_ST
4974,117.0,CC4,CA,117_CC4_CA,117_CA
4975,117.0,CC5A,CA,117_CC5A_CA,117_CA
4976,117.0,CC5B,CA,117_CC5B_CA,117_CA


In [13]:
gen_boiler_map_2 = pd.merge(unmapped_gen, unmapped_boiler, how='outer', on=['gen_boiler_code_2'])
gen_boiler_map_2['mapped'] = (gen_boiler_map_2['plant_id_gen'].notnull() & 
                              gen_boiler_map_2['plant_id_boiler'].notnull() &
                              ~gen_boiler_map_2.duplicated(subset='gen_boiler_code_2',keep=False))
#gen_boiler_map_2
mapped_gen_boiler_2 = gen_boiler_map_2[gen_boiler_map_2['mapped'] == True]
unmapped_gen_boiler_2 = gen_boiler_map_2[gen_boiler_map_2['mapped'] == False]

In [14]:
mapped_gen_boiler = mapped_gen_boiler.append(mapped_gen_boiler_2)
mapped_gen_boiler

Unnamed: 0,boiler_code,boiler_id,gen_boiler_code,gen_boiler_code_2,gen_boiler_code_2_x,gen_boiler_code_2_y,gen_code,generator_id,mapped,plant_id_boiler,plant_id_gen,prime_mover_boiler,prime_mover_gen
0,3_1_ST,1,3_1_ST,,3_ST,3_ST,3_1_ST,1,True,3.0,3.0,ST,ST
1,3_2_ST,2,3_2_ST,,3_ST,3_ST,3_2_ST,2,True,3.0,3.0,ST,ST
2,3_3_ST,3,3_3_ST,,3_ST,3_ST,3_3_ST,3,True,3.0,3.0,ST,ST
3,3_4_ST,4,3_4_ST,,3_ST,3_ST,3_4_ST,4,True,3.0,3.0,ST,ST
4,3_5_ST,5,3_5_ST,,3_ST,3_ST,3_5_ST,5,True,3.0,3.0,ST,ST
7,7_1_ST,1,7_1_ST,,7_ST,7_ST,7_1_ST,1,True,7.0,7.0,ST,ST
8,7_2_ST,2,7_2_ST,,7_ST,7_ST,7_2_ST,2,True,7.0,7.0,ST,ST
9,8_10_ST,10,8_10_ST,,8_ST,8_ST,8_10_ST,10,True,8.0,8.0,ST,ST
10,8_6_ST,6,8_6_ST,,8_ST,8_ST,8_6_ST,6,True,8.0,8.0,ST,ST
11,8_7_ST,7,8_7_ST,,8_ST,8_ST,8_7_ST,7,True,8.0,8.0,ST,ST


In [79]:
unmapped_gen_2 = unmapped_gen.merge(gen_boiler_map_2[['plant_id_gen',
                                     'prime_mover_gen',
                                     'generator_id',
                                     'gen_code',
                                     'gen_boiler_code_2',
                                     'mapped']],
                   on=['plant_id_gen',
                       'prime_mover_gen',
                       'generator_id',
                       'gen_code',
                       'gen_boiler_code_2'])
unmapped_gen_2 = unmapped_gen_2[unmapped_gen_2['mapped'] == False].drop_duplicates()
unmapped_gen_2.dtypes

plant_id_gen         float64
prime_mover_gen       object
generator_id          object
gen_code              object
gen_boiler_code_2     object
mapped                  bool
dtype: object

In [36]:
unmapped_boiler_2 = unmapped_boiler.merge(gen_boiler_map_2[['plant_id_boiler',
                                                            'boiler_id',
                                                            'prime_mover_boiler',
                                                            'boiler_code',
                                                            'gen_boiler_code_2',
                                                            'mapped']],
                   on=['plant_id_boiler',
                        'boiler_id',
                        'prime_mover_boiler',
                        'boiler_code',
                        'gen_boiler_code_2'])
unmapped_boiler_2 = unmapped_boiler_2[unmapped_boiler_2['mapped'] == False].drop_duplicates()
unmapped_boiler_2

Unnamed: 0,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,gen_boiler_code_2
4967,3.0,6A,CA,3_6A_CA,3_CA
4968,3.0,6B,CA,3_6B_CA,3_CA
4969,3.0,7A,CA,3_7A_CA,3_CA
4970,3.0,7B,CA,3_7B_CA,3_CA
4971,26.0,4,ST,26_4_ST,26_ST
4972,79.0,3,ST,79_3_ST,79_ST
4973,108.0,SGU1,ST,108_SGU1_ST,108_ST
4974,117.0,CC4,CA,117_CC4_CA,117_CA
4975,117.0,CC5A,CA,117_CC5A_CA,117_CA
4976,117.0,CC5B,CA,117_CC5B_CA,117_CA


In [None]:
# either boiler codes or generator codes include ST/CC/ boiler: FB
# 

In [110]:
unmapped_gen_2['generator_id_stripped'] = unmapped_gen_2['generator_id']
unmapped_gen_2['generator_id_stripped'] = unmapped_gen_2['generator_id'].str.strip('STCTGENCCKABCLIMBLRHRSGUIPOWPBRF-')
unmapped_gen_2['plant_id_gen'] = unmapped_gen_2['plant_id_gen'].astype(np.int64)

unmapped_gen_2['gen_boiler_code_3'] = unmapped_gen_2['plant_id_gen'].map(str) + "_" +\
                                      unmapped_gen_2['generator_id_stripped'] + "_" +\
                                      unmapped_gen_2['prime_mover_gen']

In [111]:
unmapped_gen_2

Unnamed: 0,plant_id_gen,prime_mover_gen,generator_id,gen_code,gen_boiler_code_2,mapped,generator_id_stripped,gen_boiler_code_3
0,3,CA,A1ST,3_A1ST_CA,3_CA,False,1,3_1_CA
4,3,CA,A2ST,3_A2ST_CA,3_CA,False,2,3_2_CA
10,117,CA,C4-2,117_C4-2_CA,117_CA,False,4-2,117_4-2_CA
13,117,CA,C5-3,117_C5-3_CA,117_CA,False,5-3,117_5-3_CA
17,126,ST,ST1,126_ST1_ST,126_ST,False,1,126_1_ST
20,126,ST,ST2,126_ST2_ST,126_ST,False,2,126_2_ST
23,126,ST,ST3,126_ST3_ST,126_ST,False,3,126_3_ST
26,141,ST,AF1,141_AF1_ST,141_ST,False,1,141_1_ST
29,141,ST,AF2,141_AF2_ST,141_ST,False,2,141_2_ST
32,141,ST,AF3,141_AF3_ST,141_ST,False,3,141_3_ST


In [112]:
unmapped_boiler_2['boiler_id_stripped'] = unmapped_boiler_2['boiler_id']
unmapped_boiler_2['boiler_id_stripped'] = unmapped_boiler_2['boiler_id'].str.strip('STCTGENCCKABCLIMBLRHRSGUIPOWPBRF-')
unmapped_boiler_2['plant_id_boiler'] = unmapped_boiler_2['plant_id_boiler'].astype(np.int64)

unmapped_boiler_2['gen_boiler_code_3'] = unmapped_boiler_2['plant_id_boiler'].map(str) + "_" +\
                                         unmapped_boiler_2['boiler_id_stripped'] + "_" +\
                                         unmapped_boiler_2['prime_mover_boiler']

In [113]:
unmapped_boiler_2

Unnamed: 0,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,gen_boiler_code_2,mapped,boiler_id_stripped,gen_boiler_code_3
0,3,6A,CA,3_6A_CA,3_CA,False,6,3_6_CA
2,3,6B,CA,3_6B_CA,3_CA,False,6,3_6_CA
4,3,7A,CA,3_7A_CA,3_CA,False,7,3_7_CA
6,3,7B,CA,3_7B_CA,3_CA,False,7,3_7_CA
9,79,3,ST,79_3_ST,79_ST,False,3,79_3_ST
11,117,CC4,CA,117_CC4_CA,117_CA,False,4,117_4_CA
13,117,CC5A,CA,117_CC5A_CA,117_CA,False,5,117_5_CA
15,117,CC5B,CA,117_CC5B_CA,117_CA,False,5,117_5_CA
18,126,1,ST,126_1_ST,126_ST,False,1,126_1_ST
21,126,2,ST,126_2_ST,126_ST,False,2,126_2_ST


In [114]:
gen_boiler_map_3 = pd.merge(unmapped_gen_2, unmapped_boiler_2, how='outer', on=['gen_boiler_code_3'])
gen_boiler_map_3['mapped'] = (gen_boiler_map_3['plant_id_gen'].notnull() & 
                              gen_boiler_map_3['plant_id_boiler'].notnull() &
                              ~gen_boiler_map_3.duplicated(subset='gen_boiler_code_3',keep=False))

In [115]:
gen_boiler_map_3[gen_boiler_map_3['mapped'] == True].drop_duplicates(subset='gen_boiler_code_3')

Unnamed: 0,plant_id_gen,prime_mover_gen,generator_id,gen_code,gen_boiler_code_2_x,mapped_x,generator_id_stripped,gen_boiler_code_3,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,gen_boiler_code_2_y,mapped_y,boiler_id_stripped,mapped
4,126.0,ST,ST1,126_ST1_ST,126_ST,False,1,126_1_ST,126.0,1,ST,126_1_ST,126_ST,False,1,True
5,126.0,ST,ST2,126_ST2_ST,126_ST,False,2,126_2_ST,126.0,2,ST,126_2_ST,126_ST,False,2,True
6,126.0,ST,ST3,126_ST3_ST,126_ST,False,3,126_3_ST,126.0,3,ST,126_3_ST,126_ST,False,3,True
7,141.0,ST,AF1,141_AF1_ST,141_ST,False,1,141_1_ST,141.0,1,ST,141_1_ST,141_ST,False,1,True
8,141.0,ST,AF2,141_AF2_ST,141_ST,False,2,141_2_ST,141.0,2,ST,141_2_ST,141_ST,False,2,True
9,141.0,ST,AF3,141_AF3_ST,141_ST,False,3,141_3_ST,141.0,3,ST,141_3_ST,141_ST,False,3,True
12,160.0,ST,ST2,160_ST2_ST,160_ST,False,2,160_2_ST,160.0,2,ST,160_2_ST,160_ST,False,2,True
13,160.0,ST,ST3,160_ST3_ST,160_ST,False,3,160_3_ST,160.0,3,ST,160_3_ST,160_ST,False,3,True
17,246.0,ST,ST1,246_ST1_ST,246_ST,False,1,246_1_ST,246.0,1,ST,246_1_ST,246_ST,False,1,True
18,246.0,ST,ST2,246_ST2_ST,246_ST,False,2,246_2_ST,246.0,2,ST,246_2_ST,246_ST,False,2,True


In [15]:
unmapped_gen_boiler_2[(unmapped_gen_boiler_2['plant_id_gen'] == 1391.0) | (unmapped_gen_boiler_2['plant_id_boiler'] == 1391.0)]

Unnamed: 0,plant_id_gen,prime_mover_gen,generator_id,gen_code,gen_boiler_code_2,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,mapped
7176,1391.0,ST,1A,1391_1A_ST,1391_ST,,,,,False
7177,1391.0,ST,2A,1391_2A_ST,1391_ST,,,,,False
7178,1391.0,ST,3A,1391_3A_ST,1391_ST,,,,,False
7726,,,,,1391_CA,1391.0,9,CA,1391_9_CA,False
7727,,,,,1391_CA,1391.0,4A,CA,1391_4A_CA,False
7728,,,,,1391_CA,1391.0,5A,CA,1391_5A_CA,False


In [117]:
unmapped_gen_boiler_2[(unmapped_gen_boiler_2['plant_id_gen'] == 50392.0) | (unmapped_gen_boiler_2['plant_id_boiler'] == 50392.0)]

Unnamed: 0,plant_id_gen,prime_mover_gen,generator_id,gen_code,gen_boiler_code_2,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,mapped
3299,50392.0,ST,TG1,50392_TG1_ST,50392_ST,50392.0,1,ST,50392_1_ST,False
3300,50392.0,ST,TG1,50392_TG1_ST,50392_ST,50392.0,2,ST,50392_2_ST,False
3301,50392.0,ST,TG1,50392_TG1_ST,50392_ST,50392.0,3,ST,50392_3_ST,False
3302,50392.0,ST,TG1,50392_TG1_ST,50392_ST,50392.0,4,ST,50392_4_ST,False
3303,50392.0,ST,TG1,50392_TG1_ST,50392_ST,50392.0,5,ST,50392_5_ST,False
3304,50392.0,ST,TG1,50392_TG1_ST,50392_ST,50392.0,6,ST,50392_6_ST,False
3305,50392.0,ST,TG1,50392_TG1_ST,50392_ST,50392.0,6A,ST,50392_6A_ST,False
3306,50392.0,ST,TG2,50392_TG2_ST,50392_ST,50392.0,1,ST,50392_1_ST,False
3307,50392.0,ST,TG2,50392_TG2_ST,50392_ST,50392.0,2,ST,50392_2_ST,False
3308,50392.0,ST,TG2,50392_TG2_ST,50392_ST,50392.0,3,ST,50392_3_ST,False


In [None]:
unmapped_gen_boiler_2[(unmapped_gen_boiler_2['plant_id_gen'] == 55358.0) | (unmapped_gen_boiler_2['plant_id_boiler'] == 55358.0)]

In [None]:
unmapped_gen_boiler_2[unmapped_gen_boiler_2['plant_id_.*'] == 54512.0]

In [None]:
unmapped_gen_boiler_2[unmapped_gen_boiler_2['plant_id_gen'] == 54780].drop_duplicates('boiler_id')

In [None]:
unmapped_gen_boiler_2[unmapped_gen_boiler_2['plant_id_gen'] == 54780].drop_duplicates('generator_id')

In [None]:
unmapped_gen_boiler_2.sample(20)

In [105]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [217]:
boiler_code['prime_mover_boiler'].drop_duplicates()

0       ST
5       CA
4260    CT
4271    CS
Name: prime_mover_boiler, dtype: object

In [221]:
prime_movers = gen_code['prime_mover_gen'].drop_duplicates().values
prime_movers

array(['ST', 'CA', 'CT', 'CS'], dtype=object)

In [267]:
plant_ids = gen_code['plant_id_gen'].drop_duplicates().head(100).values
plant_ids

array([  3,   7,   8,  10,  26,  47,  50,  51,  56,  59,  60,  79,  87,
       108, 113, 116, 117, 118, 120, 126, 127, 130, 136, 141, 147, 160,
       165, 167, 168, 169, 170, 173, 201, 202, 203, 207, 228, 246, 259,
       260, 271, 273, 298, 302, 310, 315, 329, 330, 331, 335, 345, 350,
       356, 358, 377, 384, 389, 400, 404, 408, 420, 460, 462, 465, 468,
       469, 470, 477, 478, 492, 493, 525, 527, 533, 546, 548, 550, 562,
       564, 568, 589, 593, 594, 599, 602, 603, 609, 610, 617, 619, 620,
       621, 628, 634, 638, 641, 642, 643, 645, 663])

In [275]:
plant_prime_mover = gen_code[['plant_id_gen','prime_mover_gen']].drop_duplicates().head(100).values
plant_prime_mover

array([[3, 'ST'],
       [3, 'CA'],
       [7, 'ST'],
       [8, 'ST'],
       [10, 'ST'],
       [26, 'ST'],
       [47, 'ST'],
       [50, 'ST'],
       [51, 'ST'],
       [56, 'ST'],
       [59, 'ST'],
       [60, 'ST'],
       [79, 'ST'],
       [87, 'ST'],
       [108, 'ST'],
       [113, 'ST'],
       [116, 'ST'],
       [117, 'CA'],
       [118, 'ST'],
       [120, 'ST'],
       [126, 'ST'],
       [127, 'ST'],
       [130, 'ST'],
       [136, 'ST'],
       [141, 'ST'],
       [147, 'ST'],
       [147, 'CA'],
       [160, 'CA'],
       [160, 'ST'],
       [165, 'ST'],
       [167, 'ST'],
       [168, 'ST'],
       [169, 'ST'],
       [170, 'ST'],
       [173, 'ST'],
       [201, 'CA'],
       [202, 'ST'],
       [203, 'ST'],
       [207, 'ST'],
       [228, 'ST'],
       [246, 'ST'],
       [259, 'ST'],
       [260, 'ST'],
       [271, 'ST'],
       [273, 'ST'],
       [298, 'ST'],
       [302, 'ST'],
       [310, 'ST'],
       [315, 'ST'],
       [329, 'ST'],
       [329, 'CA']

In [276]:
plant_prime_mover = [ (x[0], x[1]) for x in plant_prime_mover]

In [270]:
gen_gb = gen_code.groupby(by=['plant_id_gen','prime_mover_gen'])
gen_gb.get_group((3,'CA'))

Unnamed: 0,plant_id_gen,prime_mover_gen,generator_id,gen_code,gen_boiler_code_2,gen_boiler_code
60,3,CA,A1ST,3_A1ST_CA,3_CA,3_A1ST_CA
72,3,CA,A2ST,3_A2ST_CA,3_CA,3_A2ST_CA


In [239]:
index = zip(gen_gb.first().index)
index

<zip at 0x116743548>

In [271]:
boiler_gb = boiler_code.groupby(by=['plant_id_boiler','prime_mover_boiler'])
boiler_gb.get_group((3,'CA'))

Unnamed: 0,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,gen_boiler_code_2,gen_boiler_code
5,3,6A,CA,3_6A_CA,3_CA,3_6A_CA
6,3,6B,CA,3_6B_CA,3_CA,3_6B_CA
7,3,7A,CA,3_7A_CA,3_CA,3_7A_CA
8,3,7B,CA,3_7B_CA,3_CA,3_7B_CA


In [282]:
for plant_pm in plant_prime_mover:
    boiler_gb = boiler_code.groupby(by=['plant_id_boiler','prime_mover_boiler'])
    boiler_ids = boiler_gb.get_group(plant_pm)['boiler_id'].values
    gen_gb = gen_code.groupby(by=['plant_id_gen','prime_mover_gen'])
    gen_ids = gen_gb.get_group(plant_pm)
    for g_id in gen_ids['generator_id']:
        print(plant_pm[0],plant_pm[1], g_id)
        print(process.extract(g_id, boiler_ids))

3 ST 1
[('1', 100), ('2', 0), ('3', 0), ('4', 0), ('5', 0)]
3 ST 2
[('2', 100), ('1', 0), ('3', 0), ('4', 0), ('5', 0)]
3 ST 3
[('3', 100), ('1', 0), ('2', 0), ('4', 0), ('5', 0)]
3 ST 4
[('4', 100), ('1', 0), ('2', 0), ('3', 0), ('5', 0)]
3 ST 5
[('5', 100), ('1', 0), ('2', 0), ('3', 0), ('4', 0)]
3 CA A1ST
[('6A', 45), ('7A', 45), ('6B', 0), ('7B', 0)]
3 CA A2ST
[('6A', 45), ('7A', 45), ('6B', 0), ('7B', 0)]
7 ST 1
[('1', 100), ('2', 0)]
7 ST 2
[('2', 100), ('1', 0)]
8 ST 10
[('10', 100), ('6', 0), ('7', 0), ('8', 0), ('9', 0)]
8 ST 6
[('6', 100), ('10', 0), ('7', 0), ('8', 0), ('9', 0)]
8 ST 7
[('7', 100), ('10', 0), ('6', 0), ('8', 0), ('9', 0)]
8 ST 8
[('8', 100), ('10', 0), ('6', 0), ('7', 0), ('9', 0)]
8 ST 9
[('9', 100), ('10', 0), ('6', 0), ('7', 0), ('8', 0)]
10 ST 1
[('1', 100), ('2', 0)]
10 ST 2
[('2', 100), ('1', 0)]
26 ST 1
[('1', 100), ('2', 0), ('3', 0), ('4', 0), ('5', 0)]
26 ST 2
[('2', 100), ('1', 0), ('3', 0), ('4', 0), ('5', 0)]
26 ST 3
[('3', 100), ('1', 0), ('2',

KeyError: (620, 'ST')

In [204]:
#gen_input = gen_gb['generator_id'].first()
gen_gb.get_group((3,'ST'))

Unnamed: 0,gen_boiler_code,gen_boiler_code_2,gen_code,generator_id
0,3_1_ST,3_ST,3_1_ST,1
12,3_2_ST,3_ST,3_2_ST,2
24,3_3_ST,3_ST,3_3_ST,3
36,3_4_ST,3_ST,3_4_ST,4
48,3_5_ST,3_ST,3_5_ST,5


In [181]:
gen_code_input = gen_code[(gen_code['plant_id_gen'] == 3) & (gen_code['prime_mover_gen'] == 'CA')]
boiler_code_input = boiler_code[(boiler_code['plant_id_boiler'] == 3) & (boiler_code['prime_mover_boiler'] == 'CA')]

In [182]:
gen_code_input

Unnamed: 0,plant_id_gen,prime_mover_gen,generator_id,gen_code,gen_boiler_code_2,gen_boiler_code
60,3,CA,A1ST,3_A1ST_CA,3_CA,3_A1ST_CA
72,3,CA,A2ST,3_A2ST_CA,3_CA,3_A2ST_CA


In [183]:
boiler_code_input

Unnamed: 0,plant_id_boiler,boiler_id,prime_mover_boiler,boiler_code,gen_boiler_code_2,gen_boiler_code
5,3,6A,CA,3_6A_CA,3_CA,3_6A_CA
6,3,6B,CA,3_6B_CA,3_CA,3_6B_CA
7,3,7A,CA,3_7A_CA,3_CA,3_7A_CA
8,3,7B,CA,3_7B_CA,3_CA,3_7B_CA


In [184]:
choices = boiler_code_input['boiler_id'].values

In [185]:
choices

array(['6A', '6B', '7A', '7B'], dtype=object)

In [186]:
for g_id in gen_code_input['generator_id']:
    print(g_id)
    print(process.extract(g_id, choices))

A1ST
[('6A', 45), ('7A', 45), ('6B', 0), ('7B', 0)]
A2ST
[('6A', 45), ('7A', 45), ('6B', 0), ('7B', 0)]


In [116]:
fuzz.ratio('1391_CA', '1391_5A_CA')

82

In [None]:
gen_boiler_map[gen_boiler_map['gen_boiler_code'].str.contains("^10784_")]