#### Set up the dataset, features, and targets ####

In [70]:
import pandas
import numpy

recs = pandas.read_csv('./Project-2/data/recs2015_public_v4.csv')
isna = recs.isna().sum()
isna[isna>0]

NGXBTU    2382
dtype: int64

In [71]:
import pandas
import numpy

recs = pandas.read_csv('./Project-2/data/recs2015_public_v4.csv')

# Strictly speaking, neither of these are necesarry
# but they might come in useful for future data or models
recs['NGXBTU'] = recs['NGXBTU'].fillna(0)
recs['CLIMATE_REGION_PUB'] = recs['CLIMATE_REGION_PUB'].fillna('Unknown')

# These are the initial targets we'll be predicting
potential_targets = ['TOTALBTU',
					'TOTALDOL',
					'KWH',
					'BTUEL',
					'DOLLAREL',
					'CUFEETNG',
					'BTUNG',
					'DOLLARNG',
					'GALLONLP',
					'BTULP',
					'DOLLARLP',
					'GALLONFO',
					'BTUFO',
					'DOLLARFO',
					'WOODAMT',
					'WOODBTU']

# These are the features in the data set with leakage
not_features = ["KWH",	# Numeric	8	Total site electricity usage, in kilowatthours, 2015
				"KWHSPH",	# Numeric	8	Electricity usage for space heating, main and secondary, in kilowatthours, 2015
				"KWHCOL",	# Numeric	8	Electricity usage for air conditioning (central systems and individual units), in kilowatthours, 2015
				"KWHWTH",	# Numeric	8	Electricity usage for water heating, main and secondary, in kilowatthours, 2015
				"KWHRFG",	# Numeric	8	Electricity usage for all refrigerators, in kilowatthours, 2015
				"KWHRFG1",	# Numeric	8	Electricity usage for first refrigerators, in kilowatthours, 2015
				"KWHRFG2",	# Numeric	8	Electricity usage for second refrigerators, in kilowatthours, 2015
				"KWHFRZ",	# Numeric	8	Electricity usage for freezers, in kilowatthours, 2015
				"KWHCOK",	# Numeric	8	Electricity usage for cooking (stoves, cooktops, and ovens), in kilowatthours, 2015
				"KWHMICRO",	# Numeric	8	Electricity usage for microwaves, in kilowatthours, 2015
				"KWHCW",	# Numeric	8	Electricity usage for clothes washers, in kilowatthours, 2015
				"KWHCDR",	# Numeric	8	Electricity usage for clothes dryers, in kilowatthours, 2015
				"KWHDWH",	# Numeric	8	Electricity usage for dishwashers, in kilowatthours, 2015
				"KWHLGT",	# Numeric	8	Electricity usage for indoor and outdoor lighting, in kilowatthours, 2015
				"KWHTVREL",	# Numeric	8	Electricity usage for all televisions and related peripherals, in kilowatthours, 2015
				"KWHTV1",	# Numeric	8	Electricity usage for first televisions, in kilowatthours, 2015
				"KWHTV2",	# Numeric	8	Electricity usage for second televisions, in kilowatthours, 2015
				"KWHAHUHEAT",	# Numeric	8	Electricity usage for air handlers and boiler pumps used for heating, in kilowatthours, 2015
				"KWHAHUCOL",	# Numeric	8	Electricity usage for air handlers used for cooling, in kilowatthours, 2015
				"KWHEVAPCOL",	# Numeric	8	Electricity usage for evaporative coolers, in kilowatthours, 2015
				"KWHCFAN",	# Numeric	8	Electricity usage for ceiling fans, in kilowatthours, 2015
				"KWHDHUM",	# Numeric	8	Electricity usage for dehumidifiers, in kilowatthours, 2015
				"KWHHUM",	# Numeric	8	Electricity usage for humidifiers, in kilowatthours, 2015
				"KWHPLPMP",	# Numeric	8	Electricity usage for swimming pool pumps, in kilowatthours, 2015
				"KWHHTBPMP",	# Numeric	8	Electricity usage for hot tub pumps, in kilowatthours, 2015
				"KWHHTBHEAT",	# Numeric	8	Electricity usage for hot tub heaters, in kilowatthours, 2015
				"KWHNEC",	# Numeric	8	Electricity usage for other devices and purposes not elsewhere classified, in kilowatthours, 2015
				"BTUEL",	# Numeric	8	Total site electricity usage, in thousand Btu, 2015
				"BTUELSPH",	# Numeric	8	Electricity usage for space heating, main and secondary, in thousand Btu, 2015
				"BTUELCOL",	# Numeric	8	Electricity usage for air conditioning (central systems and individual units), in thousand Btu, 2015
				"BTUELWTH",	# Numeric	8	Electricity usage for water heating, main and secondary, in thousand Btu, 2015
				"BTUELRFG",	# Numeric	8	Electricity usage for all refrigerators, in thousand Btu, 2015
				"BTUELRFG1",	# Numeric	8	Electricity usage for first refrigerators, in thousand Btu, 2015
				"BTUELRFG2",	# Numeric	8	Electricity usage for second refrigerators, in thousand Btu, 2015
				"BTUELFRZ",	# Numeric	8	Electricity usage for freezers, in thousand Btu, 2015
				"BTUELCOK",	# Numeric	8	Electricity usage for cooking (stoves, cooktops, and ovens), in thousand Btu, 2015
				"BTUELMICRO",	# Numeric	8	Electricity usage for microwaves, in thousand Btu, 2015
				"BTUELCW",	# Numeric	8	Electricity usage for clothes washers, in thousand Btu, 2015
				"BTUELCDR",	# Numeric	8	Electricity usage for clothes dryers, in thousand Btu, 2015
				"BTUELDWH",	# Numeric	8	Electricity usage for dishwashers, in thousand Btu, 2015
				"BTUELLGT",	# Numeric	8	Electricity usage for indoor and outdoor lighting, in thousand Btu, 2015
				"BTUELTVREL",	# Numeric	8	Electricity usage for all televisions and related peripherals, in thousand Btu, 2015
				"BTUELTV1",	# Numeric	8	Electricity usage for first televisions, in thousand Btu, 2015
				"BTUELTV2",	# Numeric	8	Electricity usage for second televisions, in thousand Btu, 2015
				"BTUELAHUHEAT",	# Numeric	8	Electricity usage for air handlers and boiler pumps used for heating, in thousand Btu, 2015
				"BTUELAHUCOL",	# Numeric	8	Electricity usage for air handlers used for cooling, in thousand Btu, 2015
				"BTUELEVAPCOL",	# Numeric	8	Electricity usage for evaporative coolers, in thousand Btu, 2015
				"BTUELCFAN",	# Numeric	8	Electricity usage for ceiling fans, in thousand Btu, 2015
				"BTUELDHUM",	# Numeric	8	Electricity usage for dehumidifiers, in thousand Btu, 2015
				"BTUELHUM",	# Numeric	8	Electricity usage for humidifiers, in thousand Btu, 2015
				"BTUELPLPMP",	# Numeric	8	Electricity usage for swimming pool pumps, in thousand Btu, 2015
				"BTUELHTBPMP",	# Numeric	8	Electricity usage for hot tub pumps, in thousand Btu, 2015
				"BTUELHTBHEAT",	# Numeric	8	Electricity usage for hot tub heaters, in thousand Btu, 2015
				"BTUELNEC",	# Numeric	8	Electricity usage for other devices and purposes not elsewhere classified, in thousand Btu, 2015
				"DOLLAREL",	# Numeric	8	Total electricity cost, in dollars, 2015
				"DOLELSPH",	# Numeric	8	Electricity cost for space heating, main and secondary, in dollars, 2015
				"DOLELCOL",	# Numeric	8	Electricity cost for air conditioning (central systems and individual units), in dollars, 2015
				"DOLELWTH",	# Numeric	8	Electricity cost for water heating, main and secondary, in dollars, 2015
				"DOLELRFG",	# Numeric	8	Electricity cost for all refrigerators, in dollars, 2015
				"DOLELRFG1",	# Numeric	8	Electricity cost for first refrigerators, in dollars, 2015
				"DOLELRFG2",	# Numeric	8	Electricity cost for second refrigerators, in dollars, 2015
				"DOLELFRZ",	# Numeric	8	Electricity cost for freezers, in dollars, 2015
				"DOLELCOK",	# Numeric	8	Electricity cost for cooking (stoves, cooktops, and ovens), in dollars, 2015
				"DOLELMICRO",	# Numeric	8	Electricity cost for microwaves, in dollars, 2015
				"DOLELCW",	# Numeric	8	Electricity cost for clothes washers, in dollars, 2015
				"DOLELCDR",	# Numeric	8	Electricity cost for clothes dryers, in dollars, 2015
				"DOLELDWH",	# Numeric	8	Electricity cost for dishwashers, in dollars, 2015
				"DOLELLGT",	# Numeric	8	Electricity cost for indoor and outdoor lighting, in dollars, 2015
				"DOLELTVREL",	# Numeric	8	Electricity cost for all televisions and related peripherals, in dollars, 2015
				"DOLELTV1",	# Numeric	8	Electricity cost for first televisions, in dollars, 2015
				"DOLELTV2",	# Numeric	8	Electricity cost for second televisions, in dollars, 2015
				"DOLELAHUHEAT",	# Numeric	8	Electricity cost for air handlers and boiler pumps used for heating, in dollars, 2015
				"DOLELAHUCOL",	# Numeric	8	Electricity cost for air handlers used for cooling, in dollars, 2015
				"DOLELEVAPCOL",	# Numeric	8	Electricity cost for evaporative coolers, in dollars, 2015
				"DOLELCFAN",	# Numeric	8	Electricity cost for ceiling fans, in dollars, 2015
				"DOLELDHUM",	# Numeric	8	Electricity cost for dehumidifiers, in dollars, 2015
				"DOLELHUM",	# Numeric	8	Electricity cost for humidifiers, in dollars, 2015
				"DOLELPLPMP",	# Numeric	8	Electricity cost for swimming pool pumps, in dollars, 2015
				"DOLELHTBPMP",	# Numeric	8	Electricity cost for hot tub pumps, in dollars, 2015
				"DOLELHTBHEAT",	# Numeric	8	Electricity cost for hot tub heaters, in dollars, 2015
				"DOLELNEC",	# Numeric	8	Electricity cost for other devices and purposes not elsewhere classified, in dollars, 2015
				"CUFEETNG",	# Numeric	8	Total natural gas usage, in hundred cubic feet, 2015
				"CUFEETNGSPH",	# Numeric	8	Natural gas usage for space heating, main and secondary, in hundred cubic feet, 2015
				"CUFEETNGWTH",	# Numeric	8	Natural gas usage for water heating , main and secondary, in hundred cubic feet, 2015
				"CUFEETNGCOK",	# Numeric	8	Natural gas usage for cooking (stoves, cooktops, and ovens), in hundred cubic feet, 2015
				"CUFEETNGCDR",	# Numeric	8	Natural gas usage for clothes dryers, in hundred cubic feet, 2015
				"CUFEETNGPLHEAT",	# Numeric	8	Natural gas usage for swimming pool heaters, in hundred cubic feet, 2015
				"CUFEETNGHTBHEAT",	# Numeric	8	Natural gas usage for hot tub heaters, in hundred cubic feet, 2015
				"CUFEETNGNEC",	# Numeric	8	Natural gas usage for other devices and purposes not elsewhere classified, in hundred cubic feet, 2015
				"BTUNG",	# Numeric	8	Total natural gas usage, in thousand Btu, 2015
				"BTUNGSPH",	# Numeric	8	Natural gas usage for space heating, main and secondary, in thousand Btu, 2015
				"BTUNGWTH",	# Numeric	8	Natural gas usage for water heating, main and secondary, in thousand Btu, 2015
				"BTUNGCOK",	# Numeric	8	Natural gas usage for cooking (stoves, cooktops, and ovens), in thousand Btu, 2015
				"BTUNGCDR",	# Numeric	8	Natural gas usage for clothes dryers, in thousand Btu, 2015
				"BTUNGPLHEAT",	# Numeric	8	Natural gas usage for swimming pool heaters, in thousand Btu, 2015
				"BTUNGHTBHEAT",	# Numeric	8	Natural gas usage for hot tub heaters, in thousand Btu, 2015
				"BTUNGNEC",	# Numeric	8	Natural gas usage for other devices and purposes not elsewhere classified, in thousand Btu, 2015
				"DOLLARNG",	# Numeric	8	Total natural gas cost, in dollars, 2015
				"DOLNGSPH",	# Numeric	8	Natural Gas cost for space heating, main and secondary, in dollars, 2015
				"DOLNGWTH",	# Numeric	8	Natural gas cost for water heating, main and secondary, in dollars, 2015
				"DOLNGCOK",	# Numeric	8	Natural gas cost for cooking (stoves, cooktops, and ovens), in dollars, 2015
				"DOLNGCDR",	# Numeric	8	Natural gas cost for clothes dryers, in dollars, 2015
				"DOLNGPLHEAT",	# Numeric	8	Natural gas cost for swimming pool heaters, in dollars, 2015
				"DOLNGHTBHEAT",	# Numeric	8	Natural gas cost for hot tub heaters, in dollars, 2015
				"DOLNGNEC",	# Numeric	8	Natural gas cost for other devices and purposes not elsewhere classified, in dollars, 2015
				"GALLONLP",	# Numeric	8	Total propane usage, in gallons, 2015
				"GALLONLPSPH",	# Numeric	8	Propane usage for space heating, main and secondary, in gallons, 2015
				"GALLONLPWTH",	# Numeric	8	Propane usage for water heating, main and secondary, in gallons, 2015
				"GALLONLPCOK",	# Numeric	8	Propane usage for cooking (stoves, cooktops, and ovens), in gallons, 2015
				"GALLONLPCDR",	# Numeric	8	Propane usage for clothes dryers, in gallons, 2015
				"GALLONLPNEC",	# Numeric	8	Propane usage for other devices and purposes not elsewhere classified, in gallons, 2015
				"BTULP",	# Numeric	8	Total propane usage, in thousand Btu, 2015
				"BTULPSPH",	# Numeric	8	Propane usage for space heating, main and secondary, in thousand Btu, 2015
				"BTULPWTH",	# Numeric	8	Propane usage for water heating, main and secondary, in thousand Btu, 2015
				"BTULPCOK",	# Numeric	8	Propane usage for cooking (stoves, cooktops, and ovens), in thousand Btu, 2015
				"BTULPCDR",	# Numeric	8	Propane usage for clothes dryers, in thousand Btu, 2015
				"BTULPNEC",	# Numeric	8	Propane usage for other devices and purposes not elsewhere classified, in thousand Btu, 2015
				"DOLLARLP",	# Numeric	8	Total cost of propane, in dollars, 2015
				"DOLLPSPH",	# Numeric	8	Propane cost for space heating, main and secondary, in dollars, 2015
				"DOLLPWTH",	# Numeric	8	Propane cost for water heating, main and secondary, in dollars, 2015
				"DOLLPCOK",	# Numeric	8	Propane cost for cooking (stoves, cooktops, and ovens), in dollars, 2015
				"DOLLPCDR",	# Numeric	8	Propane cost for clothes dryers, in dollars, 2015
				"DOLLPNEC",	# Numeric	8	Propane cost for other devices and purposes not elsewhere classified, in dollars, 2015
				"GALLONFO",	# Numeric	8	Total fuel oil/kerosene usage, in gallons, 2015
				"GALLONFOSPH",	# Numeric	8	Fuel oil/kerosene usage for space heating, main and secondary, in gallons, 2015
				"GALLONFOWTH",	# Numeric	8	Fuel oil/kerosene usage for water heating, main and secondary, in gallons, 2015
				"GALLONFONEC",	# Numeric	8	Fuel oil/kerosene usage for other devices and purposes not elsewhere classified, in gallons, 2015
				"BTUFO",	# Numeric	8	Total fuel oil/kerosene usage, in thousand Btu, 2015
				"BTUFOSPH",	# Numeric	8	Fuel oil/kerosene usage for space heating, main and secondary, in thousand Btu, 2015
				"BTUFOWTH",	# Numeric	8	Fuel oil/kerosene usage for water heating, main and secondary, in thousand Btu, 2015
				"BTUFONEC",	# Numeric	8	Fuel oil/kerosene usage for other devices and purposes not elsewhere classified, in thousand Btu, 2015
				"DOLLARFO",	# Numeric	8	Total cost of fuel oil/kerosene, in dollars, 2015
				"DOLFOSPH",	# Numeric	8	Fuel oil/kerosene cost for space heating, main and secondary, in dollars, 2015
				"DOLFOWTH",	# Numeric	8	Fuel oil/kerosene cost for water heating, main and secondary, in dollars, 2015
				"DOLFONEC",	# Numeric	8	Fuel oil/kerosene cost for other devices and purposes not elsewhere classified, in dollars, 2015
				"TOTALBTU",	# Numeric	8	Total usage, in thousand Btu, 2015
				"TOTALDOL",	# Numeric	8	Total cost, in dollars, 2015 
				"TOTALBTUSPH",	# Numeric	8	Total usage for space heating, main and secondary, in thousand Btu, 2015
				"TOTALDOLSPH",	# Numeric	8	Total cost for space heating, main and secondary, in dollars, 2015
				"TOTALBTUWTH",	# Numeric	8	Total usage for water heating, main and secondary, in thousand Btu, 2015
				"TOTALDOLWTH",	# Numeric	8	Total cost for water heating, main and secondary, in dollars, 2015
				"TOTALBTUCOK",	# Numeric	8	Total usage for cooking (stoves, cooktops, and ovens), in thousand Btu, 2015
				"TOTALDOLCOK",	# Numeric	8	Total cost for cooking (stoves, cooktops, and ovens), in dollars, 2015
				"TOTALBTUCDR",	# Numeric	8	Total usage for clothes dryers, in thousand Btu, 2015
				"TOTALDOLCDR",	# Numeric	8	Total cost for clothes dryers, in dollars, 2015
				"TOTALBTUPL",	# Numeric	8	Total usage for swimming pool pumps and heaters, in thousand Btu, 2015
				"TOTALDOLPL",	# Numeric	8	Total cost for swimming pool pumps and heaters, in dollars, 2015
				"TOTALBTUHTB",	# Numeric	8	Total usage for hot tub pumps and heaters, in thousand Btu, 2015
				"TOTALDOLHTB",	# Numeric	8	Total cost for hot tub pumps and heaters, in dollars, 2015
				"TOTALBTUNEC",	# Numeric	8	Total usage for other devices and purposes not elsewhere classified, in thousand Btu, 2015
				"TOTALDOLNEC",	# Numeric	8	Total cost for other devices and purposes not elsewhere classified, in thousand Btu, 2015
				"WOODAMT",	# Numeric	8	Cords of wood used in the last year
				"ZWOODAMT",	# Numeric	8	Imputation flag for WOODAMT
				"WOODBTU",	# Numeric	8	Total cordwood usage, in thousand Btu, 2015 (Wood consumption is not included in TOTALBTU or TOTALDOL)
				"PELLETAMT",	# Numeric	8	Number of 40-pound wood pellet bags used in the last year
				"ZPELLETAMT",	# Numeric	8	Imputation flag for PELLETAMT
				"PELLETBTU"	# Numeric	8	Total wood pellet usage, in thousand Btu, 2015 (Wood consumption is not included in TOTALBTU or TOTALDOL)
]

# Log-scale all of our targets
for target in potential_targets[:]:
    # We use log(3+n) for simplicity's sake
    # as Not Applicable is encoded as -2 in the dataset
	recs[target+'_LOG'] = numpy.log(3+recs[target])
	not_features.append(target+'_LOG')
	potential_targets.append(target+'_LOG')

# The features we'll be using for our predictions
# These are the most important features for our various targets
# features = recs.columns.drop(not_features)
features = ['TOTSQFT_EN',
			'TOTCSQFT',
			'TOTHSQFT',
			'FUELHEAT',
			'REGIONC',
			'NHSLDMEM',
			'TOTROOMS',
			'WASHLOAD',
			'NUMADULT',
			'FUELH2O',
			'DRYRUSE',
			'CLIMATE_REGION_PUB',
			'DBT99',
			'FUELAUX']
weights = ['NWEIGHT']
# targets = ['btu_log']
# features


In [72]:
# Get some info on the dataset, for scaling input sliders
recs[features].describe(percentiles=[0.025, 0.05,0.5,0.95, 0.975])

Unnamed: 0,TOTSQFT_EN,TOTCSQFT,TOTHSQFT,FUELHEAT,REGIONC,NHSLDMEM,TOTROOMS,WASHLOAD,NUMADULT,FUELH2O,DRYRUSE,DBT99,FUELAUX
count,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0
mean,2081.443546,1454.523391,1815.814281,2.590046,2.760816,2.577383,6.191347,3.609743,1.972212,2.93739,3.48892,20.361115,0.459022
std,1282.660286,1253.846163,1178.533895,2.302666,1.004187,1.432224,2.360918,4.019369,0.855589,1.98799,4.093,14.748237,3.404171
min,221.0,0.0,0.0,-2.0,1.0,1.0,1.0,-2.0,1.0,1.0,-2.0,-19.6,-2.0
2.5%,544.25,0.0,0.0,-2.0,1.0,1.0,2.0,-2.0,1.0,1.0,-2.0,-8.475,-2.0
5%,640.0,0.0,385.0,1.0,1.0,1.0,3.0,-2.0,1.0,1.0,-2.0,-1.8,-2.0
50%,1773.5,1218.5,1559.0,1.0,3.0,2.0,6.0,3.0,2.0,2.0,3.0,19.1,-2.0
95%,4504.0,3850.0,4072.0,5.0,4.0,5.0,10.0,10.0,4.0,5.0,10.0,42.9,7.0
97.5%,5366.875,4415.75,4820.0,7.0,4.0,6.0,11.0,14.0,4.0,5.0,14.0,46.0,7.0
max,8501.0,8066.0,8066.0,21.0,4.0,12.0,19.0,30.0,10.0,21.0,30.0,63.9,21.0


In [73]:
# And info for defaults for non-numeric dropdowns
recs[features].describe(exclude=numpy.number)

Unnamed: 0,CLIMATE_REGION_PUB
count,5686
unique,5
top,Cold/Very Cold
freq,2008


In [74]:
# Our full list of targets
potential_targets

['TOTALBTU',
 'TOTALDOL',
 'KWH',
 'BTUEL',
 'DOLLAREL',
 'CUFEETNG',
 'BTUNG',
 'DOLLARNG',
 'GALLONLP',
 'BTULP',
 'DOLLARLP',
 'GALLONFO',
 'BTUFO',
 'DOLLARFO',
 'WOODAMT',
 'WOODBTU',
 'TOTALBTU_LOG',
 'TOTALDOL_LOG',
 'KWH_LOG',
 'BTUEL_LOG',
 'DOLLAREL_LOG',
 'CUFEETNG_LOG',
 'BTUNG_LOG',
 'DOLLARNG_LOG',
 'GALLONLP_LOG',
 'BTULP_LOG',
 'DOLLARLP_LOG',
 'GALLONFO_LOG',
 'BTUFO_LOG',
 'DOLLARFO_LOG',
 'WOODAMT_LOG',
 'WOODBTU_LOG']

In [75]:
# We'll be one-hot encoding each column with <= 11 unique values
uniques = recs[features].nunique()
encode = list(uniques[uniques<=11].index)

In [76]:
from category_encoders import OneHotEncoder
from sklearn.pipeline import Pipeline

oneHot = OneHotEncoder(use_cat_names=True, cols=encode)

# features_transformed = oneHot.fit_transform(recs[features])


In [77]:
# Due to the limited hyperparameter tuning and data available, we do a two-way split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(recs[features].sort_index(axis=1), recs[potential_targets], recs[weights], random_state=3)

A multioutput linear regression model gets R^2 of 0.63

In [78]:
from sklearn.linear_model import LinearRegression

_lr = LinearRegression()

lr_pipeline = Pipeline([('OneHotEncoder', oneHot),
                        ('LinearRegression', _lr)])

lr_pipeline.fit(X_train, y_train)

lr_pipeline.score(X_val, y_val)



0.6291696692851972

And individual target scores:

In [79]:
from sklearn.linear_model import LinearRegression

_lr = LinearRegression()

lr_pipeline = Pipeline([('OneHotEncoder', oneHot),
                        ('LinearRegression', _lr)])
for i in range(len(potential_targets)):
    target = potential_targets[i]
    print('Training on', target)
    y_train_t = y_train.iloc[:,i]
    y_val_t = y_val.iloc[:,i]
    lr_pipeline.fit(X_train, y_train_t)

    print(target, 'score:', lr_pipeline.score(X_val, y_val_t))

Training on TOTALBTU
TOTALBTU score: 0.581088418882282
Training on TOTALDOL
TOTALDOL score: 0.5104241832316625
Training on KWH
KWH score: 0.5058279180679004
Training on BTUEL
BTUEL score: 0.5058278711604289
Training on DOLLAREL
DOLLAREL score: 0.43644164638738175
Training on CUFEETNG
CUFEETNG score: 0.6698378846585337
Training on BTUNG
BTUNG score: 0.6695750656497823
Training on DOLLARNG
DOLLARNG score: 0.685096654889336
Training on GALLONLP
GALLONLP score: 0.8041021376138633
Training on BTULP
BTULP score: 0.8041020665360166
Training on DOLLARLP
DOLLARLP score: 0.7820852115956208
Training on GALLONFO
GALLONFO score: 0.829222037442764
Training on BTUFO
BTUFO score: 0.8287814949697866
Training on DOLLARFO
DOLLARFO score: 0.823105415856515
Training on WOODAMT
WOODAMT score: 0.8746927841362786
Training on WOODBTU
WOODBTU score: 0.6424448095752731
Training on TOTALBTU_LOG
TOTALBTU_LOG score: 0.6282491132474428
Training on TOTALDOL_LOG
TOTALDOL_LOG score: 0.5289751167746957
Training on KWH_L

A multioutput RFR shows some improvement

In [80]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

rfr_model = RandomForestRegressor(n_estimators=170, min_samples_leaf=12, n_jobs=-1, verbose=0)

pipeline = Pipeline([('OneHotEncoder', oneHot),
                    ('RandomForestRegressor', rfr_model)])

searchCV = RandomizedSearchCV(	pipeline,
								param_distributions={},
								n_iter=1,
								cv=9,
								scoring='r2',
								verbose=0,
								return_train_score=True,
								n_jobs=-1)

searchCV.fit(X_train, y_train)

print('best_score_: ', searchCV.best_score_)#.score(X_val, y_val)
print('best_estimator_.score: ', searchCV.best_estimator_.score(X_val, y_val))

best_score_:  0.6525034240493588
best_estimator_.score:  0.6602883575740279


Individual output predictions

In [81]:
from sklearn.metrics import r2_score

y_pred = searchCV.best_estimator_.predict(X_val)
for i in range(len(potential_targets)):
	target = potential_targets[i]
	y_val_t = y_val.iloc[:,i]
	y_pred_t = y_pred[:,i]
	print(target, 'score:', r2_score(y_val_t, y_pred_t))

TOTALBTU score: 0.6009056228636007
TOTALDOL score: 0.47213855116836856
KWH score: 0.4899967891146956
BTUEL score: 0.4899969524537454
DOLLAREL score: 0.41515272831598415
CUFEETNG score: 0.7478671484686906
BTUNG score: 0.7439516028916531
DOLLARNG score: 0.7296682158962877
GALLONLP score: 0.742850926500848
BTULP score: 0.7428509315968315
DOLLARLP score: 0.6873702717516195
GALLONFO score: 0.8797776469062608
BTUFO score: 0.879525830718417
DOLLARFO score: 0.869804749697185
WOODAMT score: 0.5496895160984494
WOODBTU score: 0.6235845679194743
TOTALBTU_LOG score: 0.6506439872014367
TOTALDOL_LOG score: 0.5144608697279833
KWH_LOG score: 0.530429934475584
BTUEL_LOG score: 0.5303626240050128
DOLLAREL_LOG score: 0.4441253003613952
CUFEETNG_LOG score: 0.9161539839702866
BTUNG_LOG score: 0.9092959578937245
DOLLARNG_LOG score: 0.9083544983734795
GALLONLP_LOG score: 0.7136805974955145
BTULP_LOG score: 0.6594913385632899
DOLLARLP_LOG score: 0.684781534428611
GALLONFO_LOG score: 0.8939613228435527
BTUFO_LO

#### Training on individual targets, then save the pipelines ####

In [82]:
import joblib

models = {}

for i in range(len(potential_targets)):
    target = potential_targets[i]
    print('Training on', target)
    y_train_t = y_train.iloc[:,i]
    y_val_t = y_val.iloc[:,i]

    #n_estimators=120
    rfr_model_t = RandomForestRegressor(n_estimators=270, min_samples_leaf=12, n_jobs=-2, verbose=0)

    pipeline_t = Pipeline([ ('OneHotEncoder', oneHot),
                            ('RandomForestRegressor', rfr_model)])

    searchCV = RandomizedSearchCV(	pipeline,
                                    param_distributions={},
                                    n_iter=1,
                                    cv=3,
                                    scoring='r2',
                                    verbose=0,
                                    return_train_score=True,
                                    n_jobs=-2)

    searchCV.fit(X_train, y_train_t)
    searchCV.best_estimator_.verbose=0

    models[target] = searchCV.best_estimator_
    print(target, 'score:', models[target].score(X_val, y_val_t))

    joblib.dump(models[target], './Project-2/pipelines/'+target+'_pipeline.joblib', compress=True)



Training on TOTALBTU
TOTALBTU score: 0.5995394059869292
Training on TOTALDOL
TOTALDOL score: 0.5181794532749524
Training on KWH
KWH score: 0.5250271328584624
Training on BTUEL
BTUEL score: 0.5235438767966549
Training on DOLLAREL
DOLLAREL score: 0.4494952620156775
Training on CUFEETNG
CUFEETNG score: 0.7616862715540488
Training on BTUNG
BTUNG score: 0.7627776686813691
Training on DOLLARNG
DOLLARNG score: 0.774710953426889
Training on GALLONLP
GALLONLP score: 0.8232821684047951
Training on BTULP
BTULP score: 0.8239573010995185
Training on DOLLARLP
DOLLARLP score: 0.7862896796773782
Training on GALLONFO
GALLONFO score: 0.8937446955559482
Training on BTUFO
BTUFO score: 0.8948328534302867
Training on DOLLARFO
DOLLARFO score: 0.8853943705775262
Training on WOODAMT
WOODAMT score: 0.9078965558443368
Training on WOODBTU
WOODBTU score: 0.755085643647531
Training on TOTALBTU_LOG
TOTALBTU_LOG score: 0.6428810344669216
Training on TOTALDOL_LOG
TOTALDOL_LOG score: 0.5472195297346523
Training on KWH_

#### Train an XGBRegressor model on each target ####
Note that we do not use these models in the webapp, due to memory constraints

In [83]:
from xgboost import XGBRegressor

xgbr_models = {}

xgbr_preprocessor = Pipeline([('OneHotEncoder', oneHot)])
X_train_t = xgbr_preprocessor.fit_transform(X_train)
X_val_t = xgbr_preprocessor.transform(X_val)

for i in range(len(potential_targets)):
    target = potential_targets[i]
    print('Training on', target)
    y_train_t = y_train.iloc[:,i]
    y_val_t = y_val.iloc[:,i]

    xgbr_model = None
    xgbr_model = XGBRegressor(  random_state=1,
                                n_jobs=-2,
                                n_estimators=4000,
                                min_samples_leaf=35,
                                # max_depth=9,
                                learning_rate=.025,
                                verbosity=0)

    eval_set = [(X_train_t, y_train_t),
                (X_val_t, y_val_t)]
    xgbr_model.fit(X_train_t, y_train_t, eval_set=eval_set, early_stopping_rounds=50)

    xgbr_models[target] = xgbr_model
    print(target, 'score:', xgbr_models[target].score(X_val_t, y_val_t))



# 

# multioutputregressor = MultiOutputRegressor(xgbr_model)
# # multioutputregressor.set_params(eval_set=eval_set, early_stopping_rounds=50)
# 

# # xgbr_model.fit(X_train, y_train, eval_set=eval_set, early_stopping_rounds=50)

# # xgbr_model.score(X_val, y_val)

.696511	validation_1-rmse:0.617724
[197]	validation_0-rmse:0.695525	validation_1-rmse:0.617501
[198]	validation_0-rmse:0.695476	validation_1-rmse:0.617498
[199]	validation_0-rmse:0.695429	validation_1-rmse:0.617493
[200]	validation_0-rmse:0.695383	validation_1-rmse:0.617464
[201]	validation_0-rmse:0.694431	validation_1-rmse:0.61727
[202]	validation_0-rmse:0.693819	validation_1-rmse:0.617125
[203]	validation_0-rmse:0.693776	validation_1-rmse:0.617126
[204]	validation_0-rmse:0.693223	validation_1-rmse:0.617061
[205]	validation_0-rmse:0.693182	validation_1-rmse:0.617063
[206]	validation_0-rmse:0.693143	validation_1-rmse:0.617066
[207]	validation_0-rmse:0.692614	validation_1-rmse:0.617014
[208]	validation_0-rmse:0.692576	validation_1-rmse:0.616992
[209]	validation_0-rmse:0.692281	validation_1-rmse:0.616915
[210]	validation_0-rmse:0.692244	validation_1-rmse:0.61692
[211]	validation_0-rmse:0.692056	validation_1-rmse:0.616874
[212]	validation_0-rmse:0.692021	validation_1-rmse:0.616889
[213]	v

In [84]:
# Save each model
for i in range(len(potential_targets)):
    target = potential_targets[i]
    y_val_t = y_val.iloc[:,i]
    print(target, 'score:', xgbr_models[target].score(X_val_t, y_val_t))

    joblib.dump(xgbr_models[target], './Project-2/xgbr_models/'+target+'_model.joblib', compress=True)

joblib.dump(xgbr_preprocessor, './Project-2/xgbr_models/preprocessor_pipeline.joblib', compress=True)

TOTALBTU score: 0.6137520894217305
TOTALDOL score: 0.5398324314709099
KWH score: 0.5487202492864383
BTUEL score: 0.5484183369934006
DOLLAREL score: 0.47901158691385276
CUFEETNG score: 0.7640836777388428
BTUNG score: 0.765697741591308
DOLLARNG score: 0.7838371338662781
GALLONLP score: 0.8206387288250659
BTULP score: 0.820635685908099
DOLLARLP score: 0.8127246653392889
GALLONFO score: 0.9053669244215432
BTUFO score: 0.904555144611364
DOLLARFO score: 0.8814024898652105
WOODAMT score: 0.8891307430239677
WOODBTU score: 0.7194427516849117
TOTALBTU_LOG score: 0.6733350460697828
TOTALDOL_LOG score: 0.5815517282065408
KWH_LOG score: 0.58497215173913
BTUEL_LOG score: 0.5855407391651758
DOLLAREL_LOG score: 0.5099109193587947
CUFEETNG_LOG score: 0.9627945363927972
BTUNG_LOG score: 0.9629009102141136
DOLLARNG_LOG score: 0.9594143258170718
GALLONLP_LOG score: 0.9106651887977384
BTULP_LOG score: 0.8694739877833287
DOLLARLP_LOG score: 0.8861833539882978
GALLONFO_LOG score: 0.9797290309180081
BTUFO_LOG

['./Project-2/xgbr_models/preprocessor_pipeline.joblib']