In [14]:
# General imports
import numpy as np
import pandas as pd
import os, gc, sys, warnings, random, math, psutil, pickle

warnings.filterwarnings('ignore')

In [15]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed Python's `random` module and NumPy's global RNG with the same value.

    Parameters
    ----------
    seed : int, default 0
        Value handed to both `random.seed` and `np.random.seed`.
    """
    # Same order as before: stdlib generator first, then NumPy's legacy global one.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return the current process's memory figure in GiB, rounded to two decimals.

    The figure is the first field of psutil's `memory_info()` result for this
    process (psutil documents that field as the resident set size), divided by 2**30.
    """
    current_process = psutil.Process(os.getpid())
    first_field_bytes = current_process.memory_info()[0]
    return np.round(first_field_bytes / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string using binary (1024-based) prefixes.

    Parameters
    ----------
    num : int or float
        Quantity to format (may be negative; the magnitude picks the prefix).
    suffix : str, default 'B'
        Unit text appended after the prefix.

    Returns
    -------
    str
        E.g. ``'807.0B'`` or ``'1.5KiB'``; values too large for 'Zi' are shown in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    position = 0
    while position < len(prefixes):
        # Stop at the first prefix for which the magnitude drops below 1024.
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, prefixes[position], suffix)
        value = value / 1024.0
        position += 1
    # Ran out of listed prefixes: report whatever is left in yobi-units.
    return "%.1f%s%s" % (value, 'Yi', suffix)

In [16]:
########################### Vars
#################################################################################
# Name of the target column.
TARGET = 'meter_reading'

# Switch checked by the prediction and export cells: they only run when this is False.
# NOTE: the name is spelled with a lowercase "l" (LOCAl_TEST) and is referenced
# under exactly that spelling elsewhere in the notebook.
LOCAl_TEST = False

# Seed shared by Python's `random` and NumPy, applied immediately.
SEED = 42
seed_everything(SEED)

In [17]:
########################### DATA LOAD
#################################################################################
print('Load Data')

# Only the test-side pickles are read, in this order: meter rows, building
# metadata, weather. The train-side frames (train.pkl / weather_train.pkl) are
# deliberately not loaded in this notebook.
test_df, building_df, test_weather_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../output/ashrae-data-minification/test.pkl',
        '../output/ashrae-data-minification/building_metadata.pkl',
        '../output/ashrae-data-minification/weather_test.pkl',
    )
)

Load Data


In [18]:
########################### Building DF merge through concat
#################################################################################
# Benefits of concat:
## Faster for huge datasets (columns number)
## No dtype change for dataset
## Consume less memory

# Look up the building metadata for every test row via a narrow key-only frame.
# validate='many_to_one' makes pandas raise if building_df has duplicate
# building_id values, which would otherwise silently add rows.
temp_df = test_df[['building_id']]
temp_df = temp_df.merge(building_df, on=['building_id'], how='left', validate='many_to_one')
del temp_df['building_id']

# merge() returns a fresh 0..n-1 index, while concat(axis=1) aligns on index
# labels. Re-use test_df's own index so rows are glued positionally even if
# test_df does not carry a default RangeIndex.
temp_df.index = test_df.index
test_df = pd.concat([test_df, temp_df], axis=1)

del building_df, temp_df

In [19]:
########################### Weather DF merge over concat (to not lose type)
#################################################################################
# Benefits of concat:
## Faster for huge datasets (columns number)
## No dtype change for dataset
## Consume less memory

# Look up the weather readings for every test row via a narrow key-only frame.
# validate='many_to_one' makes pandas raise if test_weather_df has duplicate
# (site_id, timestamp) pairs, which would otherwise silently add rows.
temp_df = test_df[['site_id','timestamp']]
temp_df = temp_df.merge(test_weather_df, on=['site_id','timestamp'], how='left', validate='many_to_one')
del temp_df['site_id'], temp_df['timestamp']

# merge() returns a fresh 0..n-1 index, while concat(axis=1) aligns on index
# labels. Re-use test_df's own index so rows are glued positionally even if
# test_df does not carry a default RangeIndex.
temp_df.index = test_df.index
test_df = pd.concat([test_df, temp_df], axis=1)

del test_weather_df, temp_df

In [20]:
########################### Trick to use kernel hdd to store results
#################################################################################

# Park the merged test frame on disk so it can be dropped from RAM here and
# re-read by the prediction cell.
test_df_path = '../output/ashrae-baseline-lgbm-predict/test_df.pkl'

# to_pickle does not create missing directories; make sure the target exists
# so a fresh run does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(test_df_path), exist_ok=True)
test_df.to_pickle(test_df_path)

del test_df
gc.collect()

117

In [21]:
########################### Check memory usage
#################################################################################
# Print the ten largest names in this cell's namespace, ranked by
# sys.getsizeof(value) in descending order (the sort key negates the size),
# each formatted with sizeof_fmt.
# NOTE: the listing depends on exactly which names exist when the cell runs;
# after the first execution the loop variables `name` and `size` are names too.
for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))
# Overall figure for the current process as reported by get_memory_usage().
print('Memory in Gb', get_memory_usage())

                           _ii:   807.0B
                          _i10:   807.0B
                          _i19:   807.0B
                           _i2:   715.0B
                           _i6:   715.0B
                          _i15:   715.0B
                          _iii:   712.0B
                           _i9:   712.0B
                          _i18:   712.0B
                           _i8:   604.0B
Memory in Gb 0.09


In [22]:
# Run one more explicit garbage-collection pass; as the cell's last expression,
# the value returned by gc.collect() is shown as the cell output.
gc.collect()

40

In [23]:
########################### Predict
#################################################################################
# Averages the predictions of the saved seed models (42..46) over the merged
# test frame and builds the `submission` frame. Skipped when LOCAl_TEST is set.
if not LOCAl_TEST:

    # Load test_df from hdd
    test_df_path = '../output/ashrae-baseline-lgbm-predict/test_df.pkl'
    test_df = pd.read_pickle(test_df_path)

    remove_columns = ['timestamp',TARGET,'site_id','row_id']
    features_columns = [col for col in list(test_df) if col not in remove_columns]

    # Remove unused columns (test_df now holds exactly the feature columns)
    test_df = test_df[features_columns]

    # Remove test_df from hdd. The file is deleted at the path it was read
    # from; the previous `rm test_df.pkl` pointed at the working directory
    # and therefore never removed the cached frame.
    if os.path.exists(test_df_path):
        os.remove(test_df_path)

    # Read submission file
    submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')

    # Remove row_id for a while
    del submission['row_id']

    n_rows = len(test_df)
    if len(submission) != n_rows:
        raise ValueError(
            f'sample_submission has {len(submission)} rows but test_df has {n_rows}'
        )

    # One model per seed; the averaging divisor below is derived from this
    # list so the two cannot drift apart.
    model_seeds = [f'4{i}' for i in range(2, 7)]
    batch_size = 2000000

    # Running sum of back-transformed predictions across all models. A single
    # float array replaces the per-model Python list of per-row scalars.
    prediction_sum = np.zeros(n_rows, dtype=np.float64)

    for seed in model_seeds:
        print(f'Predictions for seed {seed}')
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load model files produced by a trusted training run.
        with open(f'../output/test-lgb-model/lgbm__seed_{seed}.bin', 'rb') as model_file:
            estimator = pickle.load(model_file)

        # Step by batch_size so no empty trailing batch is sent to predict()
        # when n_rows is an exact multiple of batch_size.
        for batch, start in enumerate(range(0, n_rows, batch_size)):
            print('Predicting batch:', batch)
            stop = start + batch_size
            # expm1 undoes a log1p transform of the target
            # (presumably applied at training time - confirm against the training notebook).
            prediction_sum[start:stop] += np.expm1(estimator.predict(test_df.iloc[start:stop]))

        del estimator
        gc.collect()

    # Add on top of the sample file's values (as before), then average over models
    submission['meter_reading'] += prediction_sum
    submission['meter_reading'] /= len(model_seeds)

    # Delete test_df
    del test_df, prediction_sum

    # Fix negative values
    submission['meter_reading'] = submission['meter_reading'].clip(0,None)

    # Restore row_id
    submission['row_id'] = submission.index

    ########################### Check
    print(submission.iloc[:20])
    print(submission['meter_reading'].describe())

Predictions for seed 42
Predicting batch: 0
Predicting batch: 1
Predicting batch: 2
Predicting batch: 3
Predicting batch: 4
Predicting batch: 5
Predicting batch: 6
Predicting batch: 7
Predicting batch: 8
Predicting batch: 9
Predicting batch: 10
Predicting batch: 11
Predicting batch: 12
Predicting batch: 13
Predicting batch: 14
Predicting batch: 15
Predicting batch: 16
Predicting batch: 17
Predicting batch: 18
Predicting batch: 19
Predicting batch: 20
Predictions for seed 43
Predicting batch: 0
Predicting batch: 1
Predicting batch: 2
Predicting batch: 3
Predicting batch: 4
Predicting batch: 5
Predicting batch: 6
Predicting batch: 7
Predicting batch: 8
Predicting batch: 9
Predicting batch: 10
Predicting batch: 11
Predicting batch: 12
Predicting batch: 13
Predicting batch: 14
Predicting batch: 15
Predicting batch: 16
Predicting batch: 17
Predicting batch: 18
Predicting batch: 19
Predicting batch: 20
Predictions for seed 44
Predicting batch: 0
Predicting batch: 1
Predicting batch: 2
Predic

In [24]:
########################### Export
#################################################################################
# Write the submission frame to CSV (without the DataFrame index) unless this
# is a local test run.
if not LOCAl_TEST:
    submission_path = '../output/ashrae-baseline-lgbm-predict/submission.csv'
    submission.to_csv(submission_path, index=False)