In [1]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.metrics import mean_squared_error

In [2]:

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Only NumPy-backed columns are considered:

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the smallest integer type
      of the same signedness that holds the column's min and max (bounds
      inclusive);
    * float columns are cast to float16 (only if ``use_float16``), else
      float32 when the value range allows it, else float64;
    * every other kind (bool, datetime, timedelta, complex, ...) and every
      pandas extension dtype (categorical, tz-aware datetime, nullable types)
      is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default False
        Allow float16 as a target type (the feather format does not support it).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidates are ordered smallest first so the first fit is the tightest one.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        if not isinstance(col_type, np.dtype):
            # pandas extension dtype: nothing safe to downcast here
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column yields NaN for min/max: every comparison is
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    if np.dtype(candidate) != col_type:
                        df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        # bool is already one byte per value; datetime, timedelta and complex
        # have no smaller equivalent, so they are skipped.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not print nan.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [3]:
%%time
root = Path('../input/ashrae-feather-format-for-fast-loading')

train_df = pd.read_feather(root/'train.feather')
test_df = pd.read_feather(root/'test.feather')
#weather_train_df = pd.read_feather(root/'weather_train.feather')
#weather_test_df = pd.read_feather(root/'weather_test.feather')
building_meta_df = pd.read_feather(root/'building_metadata.feather')


`nthreads` argument is deprecated, pass `use_threads` instead


.labels was deprecated in version 0.24.0. Use .codes instead.



CPU times: user 2.52 s, sys: 1.39 s, total: 3.91 s
Wall time: 6.73 s


In [4]:
# Leaked readings are loaded pre-assembled from the leak data station kernel output.
# Missing values become 0 and negative readings are floored at 0 (removes large negative values).
leak_df = pd.read_feather('../input/ashrae-leak-data-station/leak.feather').fillna(0)
leak_df['meter_reading'] = leak_df['meter_reading'].clip(lower=0)

# Keep only readings timestamped in 2017-2018 and drop building 245.
leak_df = leak_df[
    leak_df.timestamp.dt.year.between(2017, 2018)
    & (leak_df.building_id != 245)
]

# Rows of the filtered leak data with meter == 0.
lead_site_id_0 = leak_df[leak_df['meter'] == 0]

In [5]:
# The three submissions to blend; the first CSV column becomes the index of each frame.
sample_submission1, sample_submission2, sample_submission3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/ashrae-final-submission/submission_final_day.csv",
        "../input/ashrae-final-submission/submission.csv",
        "../input/ashrae-final-submission/fe2_lgbm_tz.csv",
    )
)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [6]:
# Attach each submission's predictions to test_df. Series assignment aligns on index,
# and the submissions are indexed by their first CSV column (index_col=0).
# NOTE(review): assumes test_df's index carries the same labels (row ids) — confirm.
test_df['pred1'] = sample_submission1.meter_reading
test_df['pred2'] = sample_submission2.meter_reading
test_df['pred3'] = sample_submission3.meter_reading


# The submission frames are not needed past this point; release them before downcasting.
del  sample_submission1,  sample_submission2,  sample_submission3
gc.collect()

# Downcast both frames in place (float16 is not used: use_float16 defaults to False).
test_df = reduce_mem_usage(test_df)
leak_df = reduce_mem_usage(leak_df)

Memory usage of dataframe is 1550.87 MB
Memory usage after optimization is: 1073.68 MB
Decreased by 30.8%
Memory usage of dataframe is 460.05 MB
Memory usage after optimization is: 299.03 MB
Decreased by 35.0%


In [7]:
# Left-join the three predictions and row_id onto the leaked readings, then add each
# building's site_id. Leak rows without a matching test row keep NaN in the joined columns.
leak_df = (
    leak_df
    .merge(
        test_df[['building_id', 'meter', 'timestamp', 'pred1', 'pred2', 'pred3', 'row_id']],
        on=['building_id', 'meter', 'timestamp'],
        how='left',
    )
    .merge(building_meta_df[['building_id', 'site_id']], how='left', on='building_id')
)

In [8]:
# Add a log1p-transformed copy ("<name>_l1p") of each prediction and of the leaked reading.
for src_col in ('pred1', 'pred2', 'pred3', 'meter_reading'):
    leak_df[src_col + '_l1p'] = np.log1p(leak_df[src_col])

In [9]:
# Fixed-weight blend (0.1 / 0.6 / 0.2) of the three predictions on the leaked rows,
# scored as RMSE in log1p space against the leaked meter readings.
v = (
    0.1 * leak_df['pred1'].values
    + 0.6 * leak_df['pred2'].values
    + 0.2 * leak_df['pred3'].values
)
vl1p = np.log1p(v)
curr_score = np.sqrt(mean_squared_error(vl1p, leak_df['meter_reading_l1p']))
print(curr_score)

0.9719773


# Submit

In [10]:
sample_submission = pd.read_feather(root / 'sample_submission.feather')

# Blend weights for pred1 / pred2 / pred3 (hard-coded).
w1, w2, w3 = 0.1, 0.6, 0.2
print("The weights are: w1={}, w2={}, w3={}".format(w1, w2, w3))

# Weighted blend of the three predictions, with negative values floored at 0.
sample_submission['meter_reading'] = (
    w1 * test_df.pred1 + w2 * test_df.pred2 + w3 * test_df.pred3
).clip(lower=0)


`nthreads` argument is deprecated, pass `use_threads` instead


.labels was deprecated in version 0.24.0. Use .codes instead.



The weights are: w1=0.1, w2=0.6, w3=0.2


In [11]:
# Preview the first rows of the blended predictions.
sample_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,167.188847
1,1,77.627264
2,2,6.786628
3,3,264.870483
4,4,1275.658191


In [12]:
#sns.distplot(np.log1p(sample_submission.meter_reading))

In [13]:
# Overwrite the blended predictions with the leaked readings where they are available.
# Leak rows without a matching test row get a NaN row_id from the left merge; drop them
# while row_id is still a column, because dropna() does not inspect index labels.
leak_df = (
    leak_df[['meter_reading', 'row_id']]
    .dropna(subset=['row_id', 'meter_reading'])
    .astype({'row_id': 'int64'})  # NaNs force row_id to float; restore integer labels
    .set_index('row_id')
)
# NOTE(review): the label-based lookup assumes sample_submission's index equals row_id — confirm.
sample_submission.loc[leak_df.index, 'meter_reading'] = leak_df['meter_reading']

In [14]:
#sns.distplot(np.log1p(sample_submission.meter_reading))

In [15]:
# Preview the first rows again, after the leaked readings were written into the submission.
sample_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,173.3703
1,1,53.512718
2,2,6.143042
3,3,101.701469
4,4,1141.240723


In [16]:
# Write the submission without the DataFrame index, formatting floats with 4 decimal places.
sample_submission.to_csv('submission.csv', index=False, float_format='%.4f')

# Future Work

- Increase the range of weights
- Vary tolerance for sum of weights (currently tol = 0.95)