In [1]:
import pathlib
import numpy as np
import pandas as pd
import json
import re
from tqdm import tqdm_notebook
import pdb
import spacy
import pytextrank
import datetime
import zipfile


In [2]:
h = pathlib.Path.home()
pet_data = open(f"{h}/downloads/PET 4.txt",'r').readlines()
ng_data = open(f"{h}/downloads/NG.txt",'r').readlines()
total_data = open(f"{h}/downloads/TOTAL.txt",'r').readlines()


In [3]:
pet_data_json = [json.loads(d) for d in pet_data if 'series_id' in d]
pet_categories_json = [json.loads(d) for d in pet_data if 'category_id' in d]

In [4]:
total_data_json = [json.loads(d) for d in total_data if 'series_id' in d]
total_catergories_json = [json.loads(d) for d in total_data if 'category_id' in d]

In [5]:
ng_data_json = [json.loads(d) for d in ng_data if 'series_id' in d]
ng_categories_json = [json.loads(d) for d in ng_data if 'category_id' in d]

In [6]:
all_keys =[]
for d in pet_data_json:
    k = list(d.keys())
    all_keys = list(set(all_keys + k))
all_keys

['unitsshort',
 'units',
 'description',
 'data',
 'name',
 'source',
 'geography2',
 'series_id',
 'copyright',
 'geography',
 'start',
 'f',
 'last_updated',
 'iso3166',
 'end']

In [7]:
all_keys =[]
for d in pet_categories_json:
    k = list(d.keys())
    all_keys = list(set(all_keys + k))
all_keys

['parent_category_id', 'name', 'notes', 'childseries', 'category_id']

In [8]:
'yes' if re.search("c", "abcdef") else 'no'

'yes'

In [9]:
def petdata(d):
    if 'data' not in d:
        return None
    data_col = d['name'].replace(',','').replace('(','').replace(')','').lower()
    values = np.array(d['data'])
    dates = values[:,0].astype(int)
    try:
        vals = values[:,1].astype(float)
    except:
        vals = [np.nan if not re.search('^[-+]{0,1}[.0-9]{1,}$',str(v)) else float(v) for v in values[:,1]]#values[:,1].astype(float)
    freq = d['f']
    units = d['units'].lower()
    start = int(d['start'])
    end = int(d['end'])
    ret = {
        'name':data_col,
        'freq':freq,
        'units':units,
        'start':start,
        'end':end,
        'dates':dates,
        'vals':vals}
    return ret
#     return data_col,dates,vals


In [10]:
def normalize_date_array(darray,varray,all_dates):
    r = {ad:None for ad in all_dates}
    dv = {darray[i]:varray[i] for i in range(len(darray))}
    for k in dv.keys():
        r[k] = dv[k]
    
    ksorted = sorted(list(r.keys()),reverse=True)
    return [r[k] for k in ksorted]


In [13]:
def build_df_eia(json_data,freq='M'):
    tbeg = datetime.datetime.now()
    print(tbeg)
    freq_dict = {'M':'monthly','W':'weekly','D':'daily','A':'annual'}
    all_values = [petdata(v) for v  in json_data if v['f']==freq]
    most_dates = []
    for av in all_values:
        most_dates = list(set(most_dates + list(av['dates'])))
    all_dates = sorted(most_dates,reverse=True)
    cols = [av['name'] for av in all_values]
    darrays = [av['dates'] for av in all_values]
    varrays = [av['vals'] for av in all_values]
    vals = [
        normalize_date_array(darrays[i],varrays[i],all_dates)
        for i in range(len(cols))
    ]
    units = [av['units'] for av in all_values]
    freqs = [av['freq'] for av in all_values]
    df_cols = pd.DataFrame({'col':cols,'units':units,'freq':freqs})

    dict_df = {cols[i]:vals[i] for i in range(len(cols))}
    df_date = pd.DataFrame({'date':all_dates})
    
    df_vals = pd.DataFrame(dict_df)
    val_cols = list(df_vals.columns.values)
    df_vals['date'] = all_dates
    df_vals = df_vals[['date'] + val_cols]
    
    time_word_to_remove = freq_dict[freq]
    df_vals.columns = [c.lower().replace(time_word_to_remove,'') for c in df_vals.columns.values]
    tend = datetime.datetime.now()
    print(tend)
    print(tend-tbeg)
    return df_vals,df_cols


In [14]:
df_total_weekly,df_total_weekly_cols = build_df_eia(total_data_json,freq='W')
df_total_monthly,df_total_monthly_cols = build_df_eia(total_data_json,freq='M')
df_total_annual,df_total_annual_cols = build_df_eia(total_data_json,freq='A')


2020-08-30 19:02:01.923753
2020-08-30 19:02:01.927572
0:00:00.003819
2020-08-30 19:02:01.927915
2020-08-30 19:02:03.246796
0:00:01.318881
2020-08-30 19:02:03.266642
2020-08-30 19:02:03.521709
0:00:00.255067


In [15]:
df_ng_weekly,df_ng_weekly_cols = build_df_eia(ng_data_json,freq='W')
df_ng_monthly,df_ng_monthly_cols = build_df_eia(ng_data_json,freq='M')
df_ng_annual,df_ng_weekly_cols = build_df_eia(ng_data_json,freq='A')


2020-08-30 19:02:08.013323
2020-08-30 19:02:08.053544
0:00:00.040221
2020-08-30 19:02:08.054366
2020-08-30 19:02:09.482681
0:00:01.428315
2020-08-30 19:02:09.518238
2020-08-30 19:02:11.339005
0:00:01.820767


In [16]:
df_pet_weekly,df_pet_weekly_cols = build_df_eia(pet_data_json,freq='W')
df_pet_monthly,df_pet_weekly_cols = build_df_eia(pet_data_json,freq='M')
df_pet_annual,df_pet_weekly_cols = build_df_eia(pet_data_json,freq='A')

2020-08-30 19:02:13.147698
2020-08-30 19:02:16.384693
0:00:03.236995
2020-08-30 19:02:16.447980
2020-08-30 19:03:04.513979
0:00:48.065999
2020-08-30 19:03:06.134722
2020-08-30 19:03:18.541824
0:00:12.407102


In [17]:
import base64
import io
import zipfile

def df_to_zipiofile(df,filename):
    sio2 = io.StringIO()
    df.to_csv(sio2,index=False)
    sio2.seek(0)
    zoio2 = io.BytesIO()
    f = zipfile.ZipFile(zoio2,'a',zipfile.ZIP_DEFLATED,False)
    f.writestr(filename,sio2.read())
    f.close() 
    zoio2.seek(0)
    return zoio2

def df_to_zipfile(df,filename,fullpath):
    zz = df_to_zipiofile(df,filename)
    ff = open(fullpath,'wb')
    ff.write(zz.getbuffer())
    ff.close()

In [18]:
df_to_zipfile(df_pet_monthly,'df_pet_monthly.csv','./temp_folder/df_pet_monthly.csv.zip')
df_to_zipfile(df_pet_weekly,'df_pet_weekly.csv','./temp_folder/df_pet_weekly.csv.zip')
df_to_zipfile(df_ng_monthly,'df_ng_monthly.csv','./temp_folder/df_ng_monthly.csv.zip')
df_to_zipfile(df_ng_weekly,'df_ng_weekly.csv','./temp_folder/df_ng_weekly.csv.zip')


In [20]:
df_ng_weekly.date.dtype

dtype('int64')

## Begin NLP feature analysis here

In [434]:
def create_nlp_doc(df):
    
    reports_text = '\n'.join(df.apply(lambda r:' '.join(r.values.astype(str)),axis=1).values)[:999999]

    # example text
    text = reports_text
    # load a spaCy model, depending on language, scale, etc.
    nlp = spacy.load("en_core_web_sm")

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    doc = nlp(text)

    # examine the top-ranked phrases in the document
    df_docs = pd.DataFrame(
        {
            'prank':[p.rank for p in doc._.phrases],
            'pcount':[p.count for p in doc._.phrases],
            'ptext':[p.text for p in doc._.phrases],
            'pcomb':[p.rank*p.count for p in doc._.phrases]
        }
    )
    return df_docs

In [455]:
df_cols = pd.DataFrame({'col':[s.replace('weekly','') for s in df_pet_weekly.columns.values]})
df_pet_doc  = create_nlp_doc(df_cols)

In [456]:
df_pet_doc

Unnamed: 0,prank,pcount,ptext,pcomb
0,0.148426,31,conventional retail gasoline prices,4.601199
1,0.144988,126,retail gasoline prices,18.268511
2,0.143159,1,u.s. regular conventional retail gasoline prices,0.143159
3,0.143155,7,regular conventional retail gasoline prices,1.002082
4,0.142033,1,u.s. midgrade conventional retail gasoline prices,0.142033
...,...,...,...,...
520,0.008491,1,3 gross,0.008491
521,0.008359,5,1 oxygenate,0.041797
522,0.007998,6,2 rbob,0.047986
523,0.007136,6,all countries,0.042813


In [457]:
# df_pet_doc2 = create_nlp_doc(df_pet_doc[['ptext']])
df_pet_doc2 = df_pet_doc[df_pet_doc.ptext.str.split(' ').str.len()<=3]
df_pet_doc2[~df_pet_doc2.ptext.str.contains('^[0-9]')].sort_values('prank',ascending=False)


Unnamed: 0,prank,pcount,ptext,pcomb
1,0.144988,126,retail gasoline prices,18.268511
52,0.127827,45,conventional motor gasoline,5.752234
57,0.123374,36,reformulated motor gasoline,4.441457
65,0.118500,15,motor gasoline,1.777493
67,0.117521,1,oil residential price,0.117521
...,...,...,...,...
516,0.011313,1,colorado,0.011313
517,0.011313,16,cleveland,0.181008
518,0.010614,1,ngpls/lrgs,0.010614
523,0.007136,6,all countries,0.042813


## build column categories

In [843]:
PAD_REGEX = "(padd [0-9][a-z]{0,1}){1,2}"
PRODUCT_TYPE = "(kerosene-type jet fuel)|(gasoline)|(no 2[.]{0,1} diesel)|(propane( and propylene){0,1})|(crude)|(no. 2 heating oil)|(oxygenate)"
GRADE = "(regular)|(premium)|(midgrade)|(all grades)"
UNIT_TYPE = '(prices)|(inputs)|(net input)|(capacity)|(net production)|(production)|(imports)|(ending stocks)|(utilization)'
PROCESSOR_TYPE = '(refiner and blender)|(refiner)|(blender)|(oxygenate plant)|(gas plant)'
FORMULATION = "(all formulations)|(conventional)|(reformulated)"

def has_padd(col):
    return len(re.findall(PAD_REGEX,col.lower()))>0

def process_pad(cols,f,exclude_words=[]):
    vals = []
    for c in cols:
        if not has_padd(c):
            continue
        if any([ew in c for ew in exclude_words]):
            continue
        val = f(c)
        vals.append(val.strip().rstrip())
    return vals

def _padd(c):
    val = re.split(PAD_REGEX,c.lower())
    val = ''.join(val[1:-1])
    return val.strip().lstrip()

def get_padd(cols):
    return process_pad(cols,_padd)

def _padd_region(c):
    val = re.split(PAD_REGEX,c.lower())
    val = val[0].strip()
    g = GRADE
    val = re.split(g,val)[0]
    return val.strip().lstrip()

def get_padd_region(cols,**args):
    return process_pad(cols,_padd_region,**args)

def _pad_category(c):
    val = re.split(PAD_REGEX,c.lower())
    val = val[-1].strip().lower().split('of')[0]
    return val.strip().lstrip()

def get_padd_category(cols,**args):
    return process_pad(cols,_pad_category,**args)

def get_padd_category2(cols):
    vals = []
    for c in cols:
        if not has_padd(c):
            vals.append(None)
            continue
        val = re.split(PAD_REGEX,c.lower())
        val = val[-1].strip().lower().split('of')[0]
        if 'price' in val:
            vals.append(None)
            continue
        vals.append(val.strip().rstrip())
    return vals

def _pad_source(c):
    val = re.split(PAD_REGEX,c.lower())
    val = val[-1].strip().lower().split('of')[-1]
    return val.strip().lstrip()

def get_padd_source(cols,**args):
    return process_pad(cols,_pad_source,**args)


def get_padd_gasoline_type(cols,**args):
    vals = get_padd_source(cols,**args)
    vals = [re.split('gasoline',v)[0].strip().lstrip() for v in vals]
    return vals

def _padd_processor_type(c):
    val = re.split(PAD_REGEX,c.lower())
    val = ''.join(val[1:-1])
    return val.strip().lstrip()

def get_padd_processor_type(cols,**args):
    return process_pad(cols,_padd_processor_type,**args)

def get_ids(cols):
    def _findall(r,c):
        return ''.join([v for v in re.split(r,c.lower())[1:-1] if v is not None])
        l = re.findall(attribute_regexes[i],c)
        if l is None or len(l)<1:
            return ''
        if type(l[0]) == tuple:
            l = l[0]
        return ' '.join(l).strip().lstrip()
        
    all_attributes = {}
    attribute_regexes = [PAD_REGEX,PRODUCT_TYPE,GRADE,FORMULATION,UNIT_TYPE,PROCESSOR_TYPE]
    attribute_names = ['PAD_REGEX','PRODUCT_TYPE','GRADE','FORMULATION','UNIT_TYPE','PROCESSOR_TYPE']
    for i in range(len(attribute_regexes)):
        r = attribute_regexes[i]
        alist = [_findall(r,c) for c in cols]
        all_attributes[attribute_names[i]] = alist
    all_attributes['col'] = cols
    return pd.DataFrame(all_attributes)



In [786]:
set(get_padd_gasoline_type(df_pet_lower.columns.values))

{'',
 'all grades all formulations retail',
 'all grades conventional retail',
 'all grades reformulated retail',
 'commercial crude oil imports excluding spr weekly',
 'commercial kerosene-type jet fuel weekly',
 'conventional cbob',
 'conventional gtab',
 'conventional motor',
 'conventional other',
 'crude oil and petroleum products weekly',
 'crude oil weekly',
 'distillate fuel oil 0 to 15 ppm sulfur weekly',
 'distillate fuel oil greater than 15 to 500 ppm sulfur weekly',
 'distillate fuel oil greater than 2000 ppm sulfur weekly',
 'distillate fuel oil greater than 500 ppm sulfur weekly',
 'distillate fuel oil greater than 500 to 2000 ppm sulfur weekly',
 'distillate fuel oil weekly',
 'except california all grades all formulations retail',
 'except california all grades conventional retail',
 'except california all grades reformulated retail',
 'except california midgrade all formulations retail',
 'except california midgrade conventional retail',
 'except california midgrade re

In [838]:
df_cols = get_ids(df_pet_lower.columns.values)
df_cols

Unnamed: 0,PAD_REGEX,PRODUCT_TYPE,GRADE,FORMULATION,UNIT_TYPE,PROCESSOR_TYPE,col
0,,,,,,,date
1,,gasoline,midgrade,conventional,prices,,florida midgrade conventional retail gasoline prices weekly
2,,gasoline,premium,all formulations,prices,,boston ma premium all formulations retail gasoline prices weekly
3,,gasoline,premium,reformulated,prices,,san francisco ca premium reformulated retail gasoline prices weekly
4,,gasoline,premium,reformulated,prices,,new york harbor premium reformulated retail gasoline prices weekly
...,...,...,...,...,...,...,...
1128,padd 3,gasoline,,reformulated,ending stocks,,gulf coast padd 3 ending stocks of reformulated rbob with alcohol gasoline blending components weekly
1129,padd 4,gasoline,,reformulated,ending stocks,,rocky mountain padd 4 ending stocks of reformulated rbob with alcohol gasoline blending components weekly
1130,padd 5,gasoline,,reformulated,ending stocks,,west coast padd 5 ending stocks of reformulated rbob with alcohol gasoline blending components weekly
1131,,gasoline,,reformulated,ending stocks,,u.s. ending stocks of reformulated rbob with ether gasoline blending components weekly


In [839]:
dfc = create_unique_values(df_cols)
dfc

Unnamed: 0,FORMULATION,GRADE,PAD_REGEX,PROCESSOR_TYPE,PRODUCT_TYPE,UNIT_TYPE
0,conventional,midgrade,padd 5,refiner blender and gas plant,gasoline,prices
1,all formulations,premium,padd 1c,refiner and blender,no 2 diesel,imports
2,reformulated,all grades,padd 1b,blender,propane,net production
3,,regular,padd 1a,refiner,propane and propylene and propylene,net input
4,,,padd 2,oxygenate plant,crude,ending stocks
5,,,padd 3,,oxygenate,production
6,,,padd 4,,no. 2 heating oil,utilization of refinery operable capacity
7,,,padd 1,,kerosene-type jet fuel,capacity
8,,,padd 4 and west coast padd 5,,gasoline non-oxygenate,inputs
9,,,padd 4 and padd 5,,,


In [824]:
padd_list = ['padd 1','padd 2','padd 3','padd 4','padd 5']
prod_type_list = ['gasoline']
grade_list = []#['regular']
unit_type_list = ['imports']#['ending stocks']
processor_type_list = []

all_true = pd.Series([True for _ in range(len(df_cols))])
do_screen = lambda c,l:all_true if len(l)<1 else df_cols[c].isin(l)

pad_regex_screen = do_screen('PAD_REGEX',padd_list)
pad_prod_type_screen = do_screen('PRODUCT_TYPE',prod_type_list)
pad_unit_type_screen = do_screen('UNIT_TYPE',unit_type_list)
pad_grade_screen = do_screen('GRADE',grade_list)
pad_processor_type_screen = do_screen('PROCESSOR_TYPE',processor_type_list)

screens = pad_regex_screen & pad_prod_type_screen & pad_unit_type_screen & pad_grade_screen & pad_processor_type_screen
cols_to_view = list(df_cols[screens].col.values)
cols_to_view = ['date'] + cols_to_view
df_screen = df_pet_lower[cols_to_view].fillna(0)
valid_cols = [c for c in df_screen.columns.values if df_screen[c].iloc[0:2].sum()>0]
df_screen[valid_cols]


Unnamed: 0,date,east coast padd 1 imports from all countries of motor gasoline blending components rbob weekly,west coast padd 5 imports of finished motor gasoline weekly,east coast padd 1 imports of finished motor gasoline weekly,east coast padd 1 imports of other conventional motor gasoline weekly,east coast padd 1 imports of conventional gtab gasoline blending components weekly,west coast padd 5 imports of conventional motor gasoline weekly,east coast padd 1 imports of conventional other gasoline blending components weekly,gulf coast padd 3 imports of conventional other gasoline blending components weekly,rocky mountain padd 4 conventional other gasoline blending components imports weekly,...,east coast padd 1 imports of total gasoline weekly,gulf coast padd 3 imports of total gasoline weekly,rocky mountain padd 4 imports of total gasoline weekly,west coast padd 5 imports of total gasoline weekly,east coast padd 1 imports of conventional cbob gasoline blending components weekly,east coast padd 1 imports of gasoline blending components weekly,gulf coast padd 3 imports of gasoline blending components weekly,rocky mountain padd 4 imports of gasoline blending components weekly,west coast padd 5 imports of gasoline blending components weekly,east coast padd 1 imports of reformulated rbob with alcohol gasoline blending components weekly
0,20200817,219.0,3.0,143.0,143.0,136.0,3.0,424.0,52.0,6.0,...,962.0,52.0,6.0,3.0,41.0,820.0,52.0,6.0,0.0,248.0
1,20200810,168.0,3.0,68.0,68.0,198.0,3.0,159.0,51.0,4.0,...,599.0,51.0,4.0,4.0,6.0,531.0,51.0,4.0,1.0,338.0
2,20200803,147.0,2.0,120.0,120.0,31.0,2.0,470.0,47.0,3.0,...,800.0,47.0,3.0,74.0,32.0,680.0,47.0,3.0,72.0,172.0
3,20200727,165.0,9.0,20.0,20.0,21.0,9.0,281.0,12.0,6.0,...,515.0,12.0,6.0,10.0,27.0,494.0,12.0,6.0,0.0,183.0
4,20200720,229.0,39.0,128.0,128.0,14.0,39.0,43.0,17.0,4.0,...,432.0,17.0,4.0,39.0,18.0,304.0,17.0,4.0,1.0,199.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,20030623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
896,20030616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
897,20030609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
898,20030602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [811]:
pad_regex_screen
type(pad_unit_type_screen)

pandas.core.series.Series

In [795]:
from dashapp import dashapp2 as dashapp
from plotly.offline import iplot

ys_per_graph = 4
clen = len(df_screen.columns.values)
glen = int(clen/ys_per_graph) + (1 if clen % ys_per_graph > 0 else 0)
all_y_cols = [c for c in df_screen.columns.values if c!='date']
for i in range(glen):
    b = i*ys_per_graph
    e = b + ys_per_graph
    cols = ['date'] + all_y_cols[b:e]
    iplot(dashapp.plotly_plot(df_screen[cols],x_column='date'))

In [736]:
glen

0

In [710]:
set(get_padd(df_pet_lower.columns.values))

{'padd 1',
 'padd 1a',
 'padd 1b',
 'padd 1c',
 'padd 2',
 'padd 2r',
 'padd 3',
 'padd 4',
 'padd 4 and padd 5',
 'padd 4 and west coast padd 5',
 'padd 5'}

In [478]:
set(get_padd_region(df_pet_lower.columns.values))

{'',
 'central atlantic',
 'east coast',
 'gulf coast',
 'lower atlantic',
 'midwest',
 'new england',
 'rocky mountain',
 'rocky mountains',
 'weekly gulf coast',
 'weekly rocky mountain',
 'west coast'}

In [490]:
ss =set(get_padd_source(df_pet_lower.columns.values,exclude_words=['price']))
len(ss),ss

(54,
 {'commercial crude oil imports excluding spr weekly',
  'commercial kerosene-type jet fuel weekly',
  'conventional cbob gasoline blending components weekly',
  'conventional gtab gasoline blending components weekly',
  'conventional motor gasoline ed55 and lower weekly',
  'conventional motor gasoline greater than ed55 weekly',
  'conventional motor gasoline weekly',
  'conventional motor gasoline with fuel ethanol weekly',
  'conventional other gasoline blending components imports weekly',
  'conventional other gasoline blending components weekly',
  'crude oil and petroleum products weekly',
  'crude oil weekly',
  'distillate fuel oil 0 to 15 ppm sulfur weekly',
  'distillate fuel oil greater than 15 to 500 ppm sulfur weekly',
  'distillate fuel oil greater than 2000 ppm sulfur weekly',
  'distillate fuel oil greater than 500 ppm sulfur weekly',
  'distillate fuel oil greater than 500 to 2000 ppm sulfur weekly',
  'distillate fuel oil weekly',
  'finished conventional motor g

In [494]:
ss = set(get_padd_category(df_pet_lower.columns.values,exclude_words=['price']))
# ss = [s for s in ss if len(re.findall(unit_type,s))>0]#[s for s in ss if 'blender' in s]
len(ss),ss

(24,
 {'blender net production',
  'commercial crude oil imports excluding spr weekly',
  'conventional other gasoline blending components imports weekly',
  'efiner and blender net production',
  'ending stocks',
  'ending stocks excluding spr',
  'ending stocks excluding spr and including lease stock',
  'gross inputs into refineries weekly',
  'imports',
  'imports from',
  'imports from  all countries',
  'operable crude oil distillation capacity weekly',
  'oxygenate plant production',
  'percent utilization',
  'propane and propylene ending stocks excluding propylene at terminal weekly',
  'rbob with alcohol gasoline blending components imports weekly',
  'rbob with ether gasoline blending components ending stocks weekly',
  'rbob with ether gasoline blending components imports weekly',
  'refiner and blender net input',
  'refiner and blender net production',
  'refiner blender and gas plant net production',
  'refiner net input',
  'refiner net production',
  'reformulated gaso

In [501]:
ss = set(get_padd_gasoline_type(df_pet_lower.columns.values,exclude_words=['price']))
len(ss),ss

(42,
 {'',
  'commercial crude oil imports excluding spr weekly',
  'commercial kerosene-type jet fuel weekly',
  'conventional cbob',
  'conventional gtab',
  'conventional motor',
  'conventional other',
  'crude oil and petroleum products weekly',
  'crude oil weekly',
  'distillate fuel oil 0 to 15 ppm sulfur weekly',
  'distillate fuel oil greater than 15 to 500 ppm sulfur weekly',
  'distillate fuel oil greater than 2000 ppm sulfur weekly',
  'distillate fuel oil greater than 500 ppm sulfur weekly',
  'distillate fuel oil greater than 500 to 2000 ppm sulfur weekly',
  'distillate fuel oil weekly',
  'finished conventional motor',
  'finished motor',
  'finished reformulated motor',
  'fuel ethanol weekly',
  'gross inputs into refineries weekly',
  'kerosene-type jet fuel weekly',
  'military kerosene-type jet fuel weekly',
  'motor',
  'operable crude oil distillation capacity weekly',
  'other conventional motor',
  'other finished conventional motor',
  'other finished reformu

In [508]:
ss = set(get_padd_category(df_pet_lower.columns.values,exclude_words=['price']))
ss = [re.split(UNIT_TYPE,s)[0] for s in ss]
len(ss),set(ss)

(24,
 {'',
  'blender ',
  'commercial crude oil ',
  'conventional other gasoline blending components ',
  'efiner and blender ',
  'gross ',
  'operable crude oil distillation ',
  'oxygenate plant ',
  'percent ',
  'propane and propylene ',
  'rbob with alcohol gasoline blending components ',
  'rbob with ether gasoline blending components ',
  'refiner ',
  'refiner and blender ',
  'refiner blender and gas plant ',
  'reformulated gasoline non-oxygenated '})

In [511]:
ss = set([c for c in df_pet_lower.columns.values if len(re.findall(PROCESSOR_TYPE,c))>0])
len(ss),ss

(211,
 {'central atlantic padd 1b refiner blender and gas plant net production of propane and propylene weekly',
  'east coast padd 1  refiner and blender net production of commercial kerosene-type jet fuel weekly',
  'east coast padd 1  refiner and blender net production of distillate fuel oil greater than 500 ppm sulfur weekly',
  'east coast padd 1  refiner and blender net production of military kerosene-type jet fuel weekly',
  'east coast padd 1 blender net production of finished motor gasoline weekly',
  'east coast padd 1 gross inputs into refineries weekly',
  'east coast padd 1 oxygenate plant production of fuel ethanol weekly',
  'east coast padd 1 percent utilization of refinery operable capacity weekly',
  'east coast padd 1 refiner and blender net input of conventional cbob gasoline blending components weekly',
  'east coast padd 1 refiner and blender net input of conventional gtab gasoline blending components weekly',
  'east coast padd 1 refiner and blender net input of 