Based on https://www.kaggle.com/ragnar123/another-1-08-lb-no-leak


#HARDWARE & SOFTWARE :  
Cloud Computing Services: AWS    
Instance Type: p3.2xlarge (8 vCPU, 61MB)   
AMI ID: Deep Learning AMI (Ubuntu 16.04) Version 26.0 (ami-0e30cdd8359d89531)     
Python: Python 3.6.6 :: Anaconda, Inc. (python packages are detailed separately in `requirements.txt`)   


Note: this version was an early version, is overfitting and can be improved. Try removing the building id, changing the hyperameters (usage of 'per_float_feature_quantization' on building_id and meter, add regularization, decrease the depth), and decrease the number of features. 

Load competitions files

In [None]:
!kaggle competitions download -c ashrae-energy-prediction

In [1]:
# !python3 -m pip install meteocalc --user
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm_notebook as tqdm
import datetime
from meteocalc import feels_like, Temp
from sklearn import metrics
import gc
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
# !python3 -m pip install tqdm --user

# Load Data

In [3]:
train_df = pd.read_csv('train.csv')
building_df = pd.read_csv('building_metadata.csv')
weather_df = pd.read_csv('weather_train.csv')

In [4]:
# Original code from https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling by @aitude

def fill_weather_dataset(weather_df):
    
    # Find Missing Dates
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    missing_hours = []
    for site_id in range(16):
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
        new_rows['site_id'] = site_id
        weather_df = pd.concat([weather_df,new_rows])

        weather_df = weather_df.reset_index(drop=True)           

    # Add new Features
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["week"] = weather_df["datetime"].dt.week
    weather_df["month"] = weather_df["datetime"].dt.month
    
    # Reset Index for Fast Update
    weather_df = weather_df.set_index(['site_id','day','month'])

    air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
    weather_df.update(air_temperature_filler,overwrite=False)

    # Step 1
    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    # Step 2
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=["cloud_coverage"])

    weather_df.update(cloud_coverage_filler,overwrite=False)

    due_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
    weather_df.update(due_temperature_filler,overwrite=False)

    # Step 1
    sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
    # Step 2
    sea_level_filler = pd.DataFrame(sea_level_filler.fillna(method='ffill'),columns=['sea_level_pressure'])

    weather_df.update(sea_level_filler,overwrite=False)

    wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])
    weather_df.update(wind_direction_filler,overwrite=False)

    wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])
    weather_df.update(wind_speed_filler,overwrite=False)

    # Step 1
    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    # Step 2
    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])

    weather_df.update(precip_depth_filler,overwrite=False)

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
    
    def get_meteorological_features(data):
        def calculate_rh(df):
            df['relative_humidity'] = 100 * (np.exp((17.625 * df['dew_temperature']) / (243.04 + df['dew_temperature'])) / np.exp((17.625 * df['air_temperature'])/(243.04 + df['air_temperature'])))
        def calculate_fl(df):
            flike_final = []
            flike = []
            # calculate Feels Like temperature
            for i in range(len(df)):
                at = df['air_temperature'][i]
                rh = df['relative_humidity'][i]
                ws = df['wind_speed'][i]
                flike.append(feels_like(Temp(at, unit = 'C'), rh, ws))
            for i in range(len(flike)):
                flike_final.append(flike[i].f)
            df['feels_like'] = flike_final
            del flike_final, flike, at, rh, ws
        calculate_rh(data)
        calculate_fl(data)
        return data

    weather_df = get_meteorological_features(weather_df)
    
    return weather_df

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def features_engineering(df):
    
    # Sort by timestamp
    df.sort_values("timestamp")
    df.reset_index(drop=True)
    
    # Add more features
    df["timestamp"] = pd.to_datetime(df["timestamp"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["timestamp"].dt.hour
    df["dayofweek"] = df["timestamp"].dt.weekday
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                    "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                    "2017-01-02", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                    "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                    "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                    "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                    "2019-01-01"]
    df['group'] = df['timestamp'].dt.month
    df['group'].replace((1, 2, 3, 4), 1, inplace = True)
    df['group'].replace((5, 6, 7, 8), 2, inplace = True)
    df['group'].replace((9, 10, 11, 12), 3, inplace = True)
    
    #########################################################
    
    df['monthth'] = df['timestamp'].dt.month
    df['monthth'].replace((1, 2, 3, 12), 1, inplace = True)
    df['monthth'].replace((4,11), 2, inplace = True)
    df['monthth'].replace((5,10), 3, inplace = True)
    df['monthth'].replace((6,9), 4, inplace = True)
    df['monthth'].replace((7,8), 5, inplace = True)
    
    
    df['doy'] = df['timestamp'].dt.dayofyear
    
    
    ##########################################################
    df["is_holiday"] = (df.timestamp.isin(holidays)).astype(int)
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Remove Unused Columns
    drop = ["timestamp"]
    df = df.drop(drop, axis=1)
    gc.collect()
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [5]:
# weather manipulation
weather_df = fill_weather_dataset(weather_df)

# memory reduction
train_df = reduce_mem_usage(train_df,use_float16=True)
building_df = reduce_mem_usage(building_df,use_float16=True)
weather_df = reduce_mem_usage(weather_df,use_float16=True)

# merge data
train_df = train_df.merge(building_df, left_on='building_id',right_on='building_id',how='left')
train_df = train_df.merge(weather_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])
del weather_df
gc.collect()

# feature engineering
train_df = features_engineering(train_df)

# transform target variable
train_df['meter_reading'] = np.log1p(train_df["meter_reading"])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%
Memory usage of dataframe is 11.79 MB
Memory usage after optimization is: 3.19 MB
Decreased by 72.9%


In [6]:
# eliminate bad rows
bad_rows = pd.read_csv('rows_to_drop.csv')
train_df.drop(bad_rows.loc[:, '0'], inplace = True)
train_df.reset_index(drop = True, inplace = True)

In [7]:
# !python3 -m pip install catboost -U --user

In [8]:
train_df.columns

Index(['building_id', 'meter', 'meter_reading', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed',
       'relative_humidity', 'feels_like', 'hour', 'dayofweek', 'group',
       'monthth', 'doy', 'is_holiday'],
      dtype='object')

In [9]:
# train_df['square_feet'] = train_df['square_feet'].apply(lambda x: int(x/100))

train_df['ew'] = (1.0007+3.46*1e-6*train_df["sea_level_pressure"])*(6.1121+np.exp(17.502*train_df["air_temperature"]/(train_df["sea_level_pressure"]+240.97)))

latitude_dict = {0 :28.5383,
1 :50.9097,
2 :33.4255,
3 :38.9072,
4 :37.8715,
5 :50.9097,
6 :40.7128,
7 :45.4215,
8 :28.5383,
9 :30.2672,
10 :40.10677,
11 :45.4215,
12 :53.3498,
13 :44.9375,
14 :38.0293,
15: 40.7128,}
## the latitudes were set up using this post: https://www.kaggle.com/c/ashrae-energy-prediction/discussion/115698 

train_df['latitude'] = train_df['site_id'].map(latitude_dict)
train_df['solarHour'] = (train_df['hour']-12)*15
train_df['solarDec'] = -23.45*np.cos(np.deg2rad(360*(train_df['doy']+10)/365))
train_df['horizsolar'] = np.cos(np.deg2rad(train_df['solarHour']))*np.cos(np.deg2rad(train_df['solarDec']))*np.cos(np.deg2rad(train_df['latitude'])) + np.sin(np.deg2rad(train_df['solarDec']))*np.sin(np.deg2rad(train_df['latitude']))

train_df['horizsolar'] = train_df['horizsolar'].apply(lambda x: 0 if x <0 else x)

In [10]:
for col in [ 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'primary_use',]:

    temp = train_df[col].value_counts().to_dict()
    train_df[col+'_cnt'] = train_df[col].map(temp)

In [11]:
train_df["cloud_coverage"] = train_df["cloud_coverage"].apply(lambda x: int(x))

In [12]:
train_df.fillna(-1, inplace=True)

In [13]:
# declare target, categorical and numeric columns
target = 'meter_reading'
categorical = ['building_id', 'site_id', 'primary_use', 'meter', 'is_holiday', 'dayofweek', 'cloud_coverage']
numeric_cols = [col for col in train_df.columns if col not in categorical + [target, 'timestamp', 'group','cloud_coverage_cnt', 'wind_speed', 'precip_depth_1_hr',
       'precip_depth_1_hr_cnt', 'is_holiday', 'dew_temperature_cnt', 'air_temperature_cnt',
       'cloud_coverage_cnt','doy', 'monthth',

 'solarHour',
 'solarDec',]]
features = categorical + numeric_cols

In [14]:
features

['building_id',
 'site_id',
 'primary_use',
 'meter',
 'is_holiday',
 'dayofweek',
 'cloud_coverage',
 'square_feet',
 'year_built',
 'floor_count',
 'air_temperature',
 'dew_temperature',
 'sea_level_pressure',
 'wind_direction',
 'relative_humidity',
 'feels_like',
 'hour',
 'ew',
 'latitude',
 'horizsolar',
 'primary_use_cnt']

In [15]:
categorical

['building_id',
 'site_id',
 'primary_use',
 'meter',
 'is_holiday',
 'dayofweek',
 'cloud_coverage']

In [16]:
import catboost as cb
# https://www.kaggle.com/corochann/ashrae-training-lgbm-by-meter-type?scriptVersionId=22053887

def run_cat(train, cat_features = categorical, num_rounds = 20000, folds = 3):
    kf = GroupKFold(n_splits = folds)
    models = []
    metric = 'RMSE'
    SEED = 2019

    params = {
        'n_estimators': 2000,
        'learning_rate': 0.1,
        'eval_metric': 'RMSE',
        'loss_function': 'RMSE',
        'random_seed': SEED,
        'metric_period': 10,
        'task_type': 'GPU',
        'depth': 8,
    }
    for col in categorical:
        train[col] = train[col].astype('category')

    oof = np.zeros(len(train))
    for tr_idx, val_idx in tqdm(kf.split(train, groups = train['group']), total = folds):
        tr_x, tr_y = train[features].iloc[tr_idx], train[target].iloc[tr_idx]
        vl_x, vl_y = train[features].iloc[val_idx], train[target].iloc[val_idx]
#         tr_data = lgb.Dataset(tr_x, label = tr_y,  categorical_feature = categorical)
#         vl_data = lgb.Dataset(vl_x, label = vl_y,  categorical_feature = categorical)

        model = cb.CatBoostRegressor(**params)

        model.fit(tr_x, tr_y, 
              eval_set=(vl_x, vl_y),
              cat_features=categorical, 
              use_best_model=True, 
              verbose=100,  # show every 100 iteration
              early_stopping_rounds=100)
#         clf = lgb.train(param, tr_data, num_rounds, valid_sets = [tr_data, vl_data], verbose_eval = 25, 
#                         early_stopping_rounds = 50)
        models.append(model)
        oof[val_idx] = model.predict(vl_x)
        gc.collect()
    score = np.sqrt(metrics.mean_squared_error(train[target], np.clip(oof, a_min=0, a_max=None)))
    print('Our oof cv is :', score)
    return models
models = run_cat(train_df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

0:	learn: 1.8546270	test: 1.8733931	best: 1.8733931 (0)	total: 775ms	remaining: 25m 49s
100:	learn: 0.9191525	test: 1.0418195	best: 1.0418195 (100)	total: 1m 20s	remaining: 25m 20s
200:	learn: 0.8597472	test: 0.9825743	best: 0.9825743 (200)	total: 2m 45s	remaining: 24m 37s
300:	learn: 0.8268386	test: 0.9518883	best: 0.9518883 (300)	total: 4m 9s	remaining: 23m 28s
400:	learn: 0.8042454	test: 0.9324997	best: 0.9324997 (400)	total: 5m 35s	remaining: 22m 18s
500:	learn: 0.7857685	test: 0.9186286	best: 0.9186286 (500)	total: 7m	remaining: 20m 58s
600:	learn: 0.7717064	test: 0.9085100	best: 0.9085100 (600)	total: 8m 24s	remaining: 19m 34s
700:	learn: 0.7601561	test: 0.8997179	best: 0.8997179 (700)	total: 9m 53s	remaining: 18m 19s
800:	learn: 0.7495656	test: 0.8910615	best: 0.8910615 (800)	total: 11m 21s	remaining: 17m
900:	learn: 0.7414145	test: 0.8851393	best: 0.8851393 (900)	total: 12m 51s	remaining: 15m 41s
1000:	learn: 0.7344328	test: 0.8811104	best: 0.8811104 (1000)	total: 14m 15s	remai

In [17]:
df_fimp_1 = pd.DataFrame()
df_fimp_1["feature"] = train_df[features].columns
df_fimp_1["importance"] = models[0].get_feature_importance()
df_fimp_1["half"] = 1
df_fimp_1.sort_values(by='importance', ascending=False)['feature']#[-8:].values


3                  meter
0            building_id
7            square_feet
1                site_id
15            feels_like
18              latitude
11       dew_temperature
2            primary_use
16                  hour
8             year_built
5              dayofweek
10       air_temperature
19            horizsolar
6         cloud_coverage
12    sea_level_pressure
17                    ew
20       primary_use_cnt
9            floor_count
13        wind_direction
14     relative_humidity
4             is_holiday
Name: feature, dtype: object

In [18]:
# read test
test_df = pd.read_csv('test.csv')
row_ids = test_df["row_id"]
test_df.drop("row_id", axis=1, inplace=True)
test_df = reduce_mem_usage(test_df)

# merge with building info
test_df = test_df.merge(building_df,left_on='building_id',right_on='building_id',how='left')
del building_df
gc.collect()

# fill test weather data
weather_df = pd.read_csv('weather_test.csv')
weather_df = fill_weather_dataset(weather_df)
weather_df = reduce_mem_usage(weather_df)

# merge weather data
test_df = test_df.merge(weather_df,how='left',on=['timestamp','site_id'])
del weather_df
gc.collect()

# feature engineering
test_df = features_engineering(test_df)

Memory usage of dataframe is 954.38 MB
Memory usage after optimization is: 199.59 MB
Decreased by 79.1%


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Memory usage of dataframe is 23.53 MB
Memory usage after optimization is: 11.18 MB
Decreased by 52.5%


In [19]:
test_df['ew'] = (1.0007+3.46*1e-6*test_df["sea_level_pressure"])*(6.1121+np.exp(17.502*test_df["air_temperature"]/(test_df["sea_level_pressure"]+240.97)))

latitude_dict = {0 :28.5383,
1 :50.9097,
2 :33.4255,
3 :38.9072,
4 :37.8715,
5 :50.9097,
6 :40.7128,
7 :45.4215,
8 :28.5383,
9 :30.2672,
10 :40.10677,
11 :45.4215,
12 :53.3498,
13 :44.9375,
14 :38.0293,
15: 40.7128,}

test_df['latitude'] = test_df['site_id'].map(latitude_dict)
test_df['solarHour'] = (test_df['hour']-12)*15
test_df['solarDec'] = -23.45*np.cos(np.deg2rad(360*(test_df['doy']+10)/365))
test_df['horizsolar'] = np.cos(np.deg2rad(test_df['solarHour']))*np.cos(np.deg2rad(test_df['solarDec']))*np.cos(np.deg2rad(test_df['latitude'])) + np.sin(np.deg2rad(test_df['solarDec']))*np.sin(np.deg2rad(test_df['latitude']))

test_df['horizsolar'] = test_df['horizsolar'].apply(lambda x: 0 if x <0 else x)

In [20]:
for col in [ 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'primary_use',]:

    temp = test_df[col].value_counts().to_dict()
    test_df[col+'_cnt'] = test_df[col].map(temp)

In [21]:
test_df["cloud_coverage"] = test_df["cloud_coverage"].apply(lambda x: int(x))

In [22]:
test_df.fillna(-1, inplace=True)

In [26]:
for c in train_df[features].columns:
    if c not in test_df.columns:
        print(c)

In [27]:
# test_df[features]

In [28]:
for col in categorical:
    test_df[col] = test_df[col].astype('category')

In [29]:
test_df[features].dtypes

building_id           category
site_id               category
primary_use           category
meter                 category
is_holiday            category
dayofweek             category
cloud_coverage        category
square_feet            float64
year_built             float16
floor_count            float16
air_temperature        float32
dew_temperature        float32
sea_level_pressure     float32
wind_direction         float32
relative_humidity      float32
feels_like             float32
hour                     int64
ew                     float64
latitude               float64
horizsolar             float64
primary_use_cnt          int64
dtype: object

In [30]:
%%time
def predictions(models, iterations = 50):
    # split test data into batches
    set_size = len(test_df)
    batch_size = set_size // iterations
    meter_reading = []
    for i in tqdm(range(iterations)):
        pos = i*batch_size
        fold_preds = [np.expm1(model.predict(test_df[features].iloc[pos : pos+batch_size])) for model in models]
        meter_reading.extend(np.mean(fold_preds, axis=0))

    print(len(meter_reading))
    assert len(meter_reading) == set_size
    submission = pd.read_csv('sample_submission.csv')
    submission['meter_reading'] = np.clip(meter_reading, a_min=0, a_max=None) # clip min at zero
    submission.to_csv('submission_catboost002.csv.gz',
                         index=False,compression='gzip', 
                         float_format='%.4f', 
                         chunksize=25000)
    print('We are done!')
predictions(models)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


41697600
We are done!
CPU times: user 4h 15min 4s, sys: 7min 51s, total: 4h 22min 55s
Wall time: 43min 58s


In [31]:
!kaggle competitions submit -c ashrae-energy-prediction -f submission_catboost002.csv.gz -m "catboost"


100%|████████████████████████████████████████| 261M/261M [00:05<00:00, 51.8MB/s]
Successfully submitted to ASHRAE - Great Energy Predictor III