In [1]:
# General imports
import numpy as np
import pandas as pd
import os, gc, sys, warnings, random, math, psutil, pickle
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed the random number generators used in this notebook.

    Both Python's built-in ``random`` module and NumPy's legacy global
    generator are seeded with the same value.

    :param seed: value passed to every seeding function  # type: int
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return the current process's memory usage in GiB, rounded to 2 decimals.

    The value is the first field of ``psutil``'s ``memory_info()`` result
    (documented by psutil as the resident set size) divided by 2**30.
    """
    current_process = psutil.Process(os.getpid())
    used_bytes = current_process.memory_info()[0]
    return np.round(used_bytes / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024, and the
    matching prefix (Ki, Mi, ... Zi) is appended; anything larger is reported
    in 'Yi'.

    :param num: quantity to format                  # type: int or float
    :param suffix: unit appended after the prefix   # type: str
    :return: e.g. '3.2KiB'                          # type: str
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefixes[position], suffix)
        num /= 1024.0
        position += 1
    # Larger than every listed prefix: report in yobi-units.
    return "%.1f%s%s" % (num, 'Yi', suffix)

## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Integer columns are tried against int8, int16 and int32; float columns
    against float16 and float32 (falling back to float64). A candidate dtype
    is accepted when the column's min and max lie within its representable
    range, limits included. The column named by the notebook-level ``TARGET``
    variable, if that variable is defined, is left untouched.

    Note: float16 keeps only about three significant decimal digits, so
    downcast float columns may lose precision.

    :param df: dataframe to shrink (modified in place)   # type: pd.DataFrame
    :param verbose: print the resulting memory footprint # type: bool
    :return: the same dataframe object                   # type: pd.DataFrame
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    # TARGET lives in a later notebook cell; tolerate it being undefined so the
    # helper can be used before that cell has run.
    skip_col = globals().get('TARGET')

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if skip_col is not None and col == skip_col:
            continue
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            # Columns that do not fit int32 simply stay as they are.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range (or min/max is NaN): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
########################### Vars
#################################################################################
# Column holding the value to predict.
TARGET = 'meter_reading'
# When True, the prediction and export cells below are skipped.
LOCAl_TEST = False
# Seed shared by every random number generator used in the notebook.
SEED = 42
seed_everything(seed=SEED)

In [4]:
# from os.path import join as pjoin
# RAW_DATA_DIR = '/kaggle/input/ashrae-energy-prediction/'

# weather_dtypes = {
#     'site_id': np.uint8,
#     'air_temperature': np.float32,
#     'cloud_coverage': np.float32,
#     'dew_temperature': np.float32,
#     'precip_depth_1_hr': np.float32,
#     'sea_level_pressure': np.float32,
#     'wind_direction': np.float32,
#     'wind_speed': np.float32,
# }

# weather_train = pd.read_csv(pjoin(RAW_DATA_DIR, 'weather_train.csv'),dtype=weather_dtypes,
#     parse_dates=['timestamp'])
# weather_test = pd.read_csv(pjoin(RAW_DATA_DIR, 'weather_test.csv'),dtype=weather_dtypes,
#     parse_dates=['timestamp'])

# weather = pd.concat([weather_train,weather_test],ignore_index=True)
# del weather_train, weather_test
# weather_key = ['site_id', 'timestamp']
# temp_skeleton = weather[weather_key + ['air_temperature']].drop_duplicates(subset=weather_key).sort_values(by=weather_key).copy()
# del weather

In [5]:
# # calculate ranks of hourly temperatures within date/site_id chunks
# temp_skeleton['temp_rank'] = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.date])['air_temperature'].rank('average')

# # create a dataframe of site_ids (0-16) x mean hour rank of temperature within day (0-23)
# df_2d = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.hour])['temp_rank'].mean().unstack(level=1)

# # Subtract the columnID of temperature peak by 14, getting the timestamp alignment gap.
# site_ids_offsets = pd.Series(df_2d.values.argmax(axis=1) - 14)
# site_ids_offsets.index.name = 'site_id'

# def timestamp_align(df):
#     df['offset'] = df.site_id.map(site_ids_offsets)
#     df['timestamp_aligned'] = (df.timestamp - pd.to_timedelta(df.offset, unit='H'))
#     df['timestamp'] = df['timestamp_aligned']
#     del df['timestamp_aligned']
#     return df

In [6]:
# ########################### DATA LOAD
# #################################################################################
# print('Load Data')
# train_df = pd.read_pickle('../input/as-data-minification/train.pkl')
# test_df = pd.read_pickle('../input/as-data-minification/test.pkl')

# building_df = pd.read_pickle('../input/as-data-minification/building_metadata.pkl')

# train_weather_df = pd.read_pickle('../input/as-data-minification/weather_train.pkl')
# test_weather_df = pd.read_pickle('../input/as-data-minification/weather_test.pkl')

In [7]:
# train_weather_df = timestamp_align(train_weather_df)
# test_weather_df = timestamp_align(test_weather_df)

In [8]:
# # # Interpolating each site separately made the score worse
# # train_weather_df = train_weather_df.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))

# def add_lag_feature(weather_df, window=3):
#     group_df = weather_df.groupby('site_id')
#     cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
#     rolled = group_df[cols].rolling(window=window, min_periods=0)
#     lag_mean = rolled.mean().reset_index().astype(np.float16)
#     lag_max = rolled.max().reset_index().astype(np.float16)
#     lag_min = rolled.min().reset_index().astype(np.float16)
#     lag_std = rolled.std().reset_index().astype(np.float16)
#     lag_median = rolled.median().reset_index().astype(np.float16)
#     lag_skew = rolled.skew().reset_index().astype(np.float16)
#     for col in cols:
#         weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
#         weather_df[f'{col}_max_lag{window}'] = lag_max[col]
#         weather_df[f'{col}_min_lag{window}'] = lag_min[col]
#         weather_df[f'{col}_std_lag{window}'] = lag_std[col]
#         weather_df[f'{col}_median_lag{window}'] = lag_median[col]
#         weather_df[f'{col}_skew_lag{window}'] = lag_skew[col]

# add_lag_feature(train_weather_df, window=3)
# # add_lag_feature(train_weather_df, window=72)

# add_lag_feature(train_weather_df, window=18)

# add_lag_feature(test_weather_df, window=3)
# # add_lag_feature(test_weather_df, window=72)

# add_lag_feature(test_weather_df, window=18)

# # add_lag_feature(train_weather_df, window=24)
# # add_lag_feature(train_weather_df, window=168)

In [9]:
# weather_col = ['site_id', 'timestamp', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed', 'offset',
#               'air_temperature_min_lag3','air_temperature_min_lag18','air_temperature_std_lag18','cloud_coverage_median_lag18']

In [10]:
# train_weather_df = train_weather_df[weather_col]
# test_weather_df = test_weather_df[weather_col]

# # train_weather_df = reduce_mem_usage(train_weather_df)
# # test_weather_df = reduce_mem_usage(test_weather_df)
# gc.collect()

In [11]:
# ########################### Building DF merge through concat 
# #################################################################################
# # Benefits of concat:
# ## Faster for huge datasets (columns number)
# ## No dtype change for dataset
# ## Consume less memory

# temp_df = train_df[['building_id']]
# temp_df = temp_df.merge(building_df, on=['building_id'], how='left')
# del temp_df['building_id']
# train_df = pd.concat([train_df, temp_df], axis=1)

# temp_df = test_df[['building_id']]
# temp_df = temp_df.merge(building_df, on=['building_id'], how='left')
# del temp_df['building_id']
# test_df = pd.concat([test_df, temp_df], axis=1)

# del building_df, temp_df

In [12]:
# ########################### Weather DF merge over concat (to not lose type)
# #################################################################################
# # Benefits of concat:
# ## Faster for huge datasets (columns number)
# ## No dtype change for dataset
# ## Consume less memory

# temp_df = train_df[['site_id','timestamp']]
# temp_df = temp_df.merge(train_weather_df, on=['site_id','timestamp'], how='left')
# del temp_df['site_id'], temp_df['timestamp']
# train_df = pd.concat([train_df, temp_df], axis=1)

# temp_df = test_df[['site_id','timestamp']]
# temp_df = temp_df.merge(test_weather_df, on=['site_id','timestamp'], how='left')
# del temp_df['site_id'], temp_df['timestamp']
# test_df = pd.concat([test_df, temp_df], axis=1)

# del train_weather_df, test_weather_df, temp_df

In [13]:
# %%time
# # Building and site id and primary_use
# for enc_col in ['site_id']:
#     # NOTE: this operation involves duplicated (repeated) work
#     temp_df = train_df.groupby([enc_col])['meter'].agg(['unique'])
#     temp_df['unique'] = temp_df['unique'].apply(lambda x: '_'.join(str(x))).astype(str)

#     le = LabelEncoder()
#     temp_df['unique'] = le.fit_transform(temp_df['unique']).astype(np.int8)
#     temp_df = temp_df['unique'].to_dict()

#     train_df[enc_col+'_uid_enc'] = train_df[enc_col].map(temp_df)
#     test_df[enc_col+'_uid_enc'] = test_df[enc_col].map(temp_df)
# del temp_df

# for enc_col in ['site_id_uid_enc']:
#     train_df['g_meter_'+enc_col] = train_df['meter'].astype(str) +'_'+\
#                     train_df[enc_col].astype(str)

#     test_df['g_meter_'+enc_col] = test_df['meter'].astype(str) +'_'+\
#                         test_df[enc_col].astype(str)

#     le = LabelEncoder()
#     train_df['g_meter_'+enc_col] = le.fit_transform(train_df['g_meter_'+enc_col]).astype(np.int8)
#     test_df['g_meter_'+enc_col] = le.fit_transform(test_df['g_meter_'+enc_col]).astype(np.int8)
#     gc.collect()

In [14]:
# train_df['meter_reading_log1p'] = np.log1p(train_df['meter'])
# df_group = train_df.groupby('building_id')['meter_reading_log1p']
# # building_mean = df_group.mean().astype(np.float16)
# building_median = df_group.median().astype(np.float16)
# building_min = df_group.min().astype(np.float16)
# # building_max = df_group.max().astype(np.float16)
# # building_std = df_group.std().astype(np.float16)

# # train_df['building_mean'] = train_df['building_id'].map(building_mean)
# train_df['building_median'] = train_df['building_id'].map(building_median)
# train_df['building_min'] = train_df['building_id'].map(building_min)
# # train_df['building_max'] = train_df['building_id'].map(building_max)
# # train_df['building_std'] = train_df['building_id'].map(building_std)

# # test_df['building_mean'] = test_df['building_id'].map(building_mean)
# test_df['building_median'] = test_df['building_id'].map(building_median)
# test_df['building_min'] = test_df['building_id'].map(building_min)
# # test_df['building_max'] = test_df['building_id'].map(building_max)
# # test_df['building_std'] = test_df['building_id'].map(building_std)

# i_cols = [
#          'meter_reading_log1p',
#         ]

# for col in i_cols:
#     del train_df[col]

In [15]:
# # This improved the score
# i_cols = [
#         'building_id',
#         'site_id',
#         'primary_use',
# #         'DT_M',
#         'floor_count',
# #         'building_id_uid_enc', 
# #         'site_id_uid_enc',
#         'g_meter_site_id_uid_enc'
# ]

# for col in i_cols:
#     train_df[col] = train_df[col].astype('category')
#     test_df[col] = test_df[col].astype('category')

In [16]:
# del train_df
# gc.collect()

In [17]:
# Models saving
model_filename = 'xgb'

# Directory that holds the pre-built test feature frame
test_path = os.path.join('..', 'output', 'fork-of-as-2kfold-model6-test-df')

# Load test_df from hdd.
# The original cell read from `train_path`, which is never defined in this
# notebook and therefore fails on a fresh kernel; `test_path` is the directory
# declared just above for this purpose.
test_df = pd.read_pickle(os.path.join(test_path, 'test_df.pkl'))

In [18]:
########################### Check memory usage
#################################################################################
# Show the ten largest objects in the namespace (by sys.getsizeof), biggest
# first, followed by the memory usage of the whole process.
for var_name, var_size in sorted(
        ((key, sys.getsizeof(obj)) for key, obj in locals().items()),
        key=lambda entry: entry[1],
        reverse=True)[:10]:
    readable_size = sizeof_fmt(var_size)
    print("{:>30}: {:>8}".format(var_name, readable_size))
print('Memory in Gb', get_memory_usage())

                       test_df:   3.9GiB
                           _i8:   3.2KiB
                           _i2:   2.5KiB
                          _i13:   2.2KiB
                          _i14:   1.3KiB
                  LabelEncoder:   1.0KiB
                           _i4:   996.0B
                           _i5:   924.0B
                          _i12:   847.0B
                          _iii:   820.0B
Memory in Gb 4.2


In [19]:
# Force a garbage-collection pass; as the cell's last expression, the number
# of unreachable objects returned by gc.collect() is displayed below.
gc.collect()

22

In [20]:
import xgboost as xgb

In [21]:
# Run the garbage collector once more; the displayed value is the number of
# unreachable objects it found.
gc.collect()

0

In [22]:
########################### Predict
#################################################################################
# Predict the test set with every fold model, average the fold predictions and
# build the `submission` frame (columns: meter_reading, row_id).
if not LOCAl_TEST:

    # Final feature set from CV1
    features_columns = ['square_feet','building_id','meter','air_temperature','year_built','primary_use','floor_count','dew_temperature','DT_hour','sea_level_pressure','DT_day_week','site_id',
                       'DT_day_month','cloud_coverage','wind_speed','wind_direction']

    # Adding g_meter_site_id_uid_enc scored better than cv2
    features_columns += ['g_meter_site_id_uid_enc']

    # Slightly better on both validation and test
    features_columns += ['building_median']

    # Slightly better on validation, much better on test
    features_columns += ['building_min']

    # A tiny bit better
    features_columns += ['air_temperature_min_lag3']

    # Better on both
    features_columns += ['air_temperature_min_lag18']

    # Better on both
    features_columns += ['air_temperature_std_lag18']

    # Better on both
    features_columns += ['cloud_coverage_median_lag18']

    # Remove unused columns (this also fixes the column order for the models)
    test_df = test_df[features_columns]
    gc.collect()

    # Read submission file
    submission = pd.read_csv(os.path.join('..', 'input', 'ashrae-energy-prediction', 'sample_submission.csv'))

    # Remove row_id for a while
    del submission['row_id']

    n_folds = 2           # number of fold models to average
    batch_size = 2000000  # rows per predict() call, to bound peak memory
    n_rows = len(test_df)

    for i in range(1, n_folds + 1):
        print(f'Predictions for kfold_{i}')
        # Fold directories are numbered from 0, model files from 1.
        model_path = os.path.join('..', 'output',
                                  f'fork-of-as-2kfold-model6-xgb-fr7d12-fold{i - 1}',
                                  f'xgb_kfold_{i}.bin')
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files produced by trusted notebooks.
        with open(model_path, 'rb') as model_file:
            estimator = pickle.load(model_file)

        batch_predictions = []
        # Stepping by batch_size never yields an empty trailing batch, even
        # when n_rows is an exact multiple of batch_size.
        for batch, start in enumerate(range(0, n_rows, batch_size)):
            print('Predicting batch:', batch)
            # test_df already holds exactly features_columns, so slice it
            # directly instead of re-copying the whole frame for each batch.
            batch_df = test_df.iloc[start:start + batch_size]
            # expm1 is applied to the raw model output before accumulating.
            batch_predictions.append(np.expm1(estimator.predict(batch_df)))

        predictions = np.concatenate(batch_predictions) if batch_predictions else np.empty(0)

        submission['meter_reading'] += predictions
        del estimator
        gc.collect()

    # Average over models
    submission['meter_reading'] /= n_folds

    # Delete test_df
    del test_df

    # Fix negative values
    submission['meter_reading'] = submission['meter_reading'].clip(0,None)

    # Restore row_id
    submission['row_id'] = submission.index

    ########################### Check
    print(submission.iloc[:20])
    print(submission['meter_reading'].describe())

Predictions for kfold_1
Predicting batch: 0
Predicting batch: 1
Predicting batch: 2
Predicting batch: 3
Predicting batch: 4
Predicting batch: 5
Predicting batch: 6
Predicting batch: 7
Predicting batch: 8
Predicting batch: 9
Predicting batch: 10
Predicting batch: 11
Predicting batch: 12
Predicting batch: 13
Predicting batch: 14
Predicting batch: 15
Predicting batch: 16
Predicting batch: 17
Predicting batch: 18
Predicting batch: 19
Predicting batch: 20
Predictions for kfold_2
Predicting batch: 0
Predicting batch: 1
Predicting batch: 2
Predicting batch: 3
Predicting batch: 4
Predicting batch: 5
Predicting batch: 6
Predicting batch: 7
Predicting batch: 8
Predicting batch: 9
Predicting batch: 10
Predicting batch: 11
Predicting batch: 12
Predicting batch: 13
Predicting batch: 14
Predicting batch: 15
Predicting batch: 16
Predicting batch: 17
Predicting batch: 18
Predicting batch: 19
Predicting batch: 20
    meter_reading  row_id
0      137.287292       0
1       67.444572       1
2        7.2

In [23]:
########################### Export
#################################################################################
# Write the averaged predictions to submission.csv in the prediction output
# directory.
if not LOCAl_TEST:
    output_path = os.path.join('..', 'output', 'fork-of-as-2kfold-model6-xgb-fr7d12-pred')
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_path, exist_ok=True)
    submission.to_csv(os.path.join(output_path, 'submission.csv'), index=False)