In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [2]:
building_metadata = pd.read_csv("gs://123test_bucket/building_metadata.csv")
sample_submission = pd.read_csv("gs://123test_bucket/sample_submission.csv")
test = pd.read_csv("gs://123test_bucket/test.csv")
train = pd.read_csv("gs://123test_bucket/train.csv")
weather_test = pd.read_csv("gs://123test_bucket/weather_test.csv")
weather_train = pd.read_csv("gs://123test_bucket/weather_train.csv")

In [3]:
building_metadata.head(2)

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,


In [4]:
# Reducing memory

## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Reducing memory
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

weather_train = reduce_mem_usage(weather_train)
weather_test = reduce_mem_usage(weather_test)
building_metadata = reduce_mem_usage(building_metadata)

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


In [5]:
print(building_metadata.head(2))
print(test.head(2))
print(train.head(2))
print(weather_test.head(2))
print(weather_train.head(2))

   site_id  building_id primary_use  square_feet  year_built  floor_count
0        0            0   Education         7432      2008.0          NaN
1        0            1   Education         2720      2004.0          NaN
   row_id  building_id  meter            timestamp
0       0            0      0  2017-01-01 00:00:00
1       1            1      0  2017-01-01 00:00:00
   building_id  meter            timestamp  meter_reading
0            0      0  2016-01-01 00:00:00            0.0
1            1      0  2016-01-01 00:00:00            0.0
   site_id            timestamp  air_temperature  cloud_coverage  \
0        0  2017-01-01 00:00:00        17.796875             4.0   
1        0  2017-01-01 01:00:00        17.796875             2.0   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_direction  \
0        11.703125                NaN              1021.5           100.0   
1        12.796875                0.0              1022.0           130.0   

   wind_speed 

In [6]:
weather_train = weather_train.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))
weather_test = weather_test.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))

In [7]:
train_sub = train[train.building_id.isin(np.arange(0,10))]
# Sorting
train_sub = train_sub.sort_values(by = ['building_id', 'meter', 'timestamp'])

In [8]:
lag = 24*7
all_cols = set(train_sub.columns)
cols = set(all_cols) - set(['timestamp'])

def ts_lag(df, cols=cols,  group =['building_id', 'meter'], lag = lag):
    for i in range(0, lag):
        num = i+1
        cols_shift = {x:x + str(num) for x in cols}
        sub = df[cols]
        sub_shift = sub.groupby(group).shift(num).rename(cols_shift, axis=1)
        df = df.merge(sub_shift, 
                      left_on = df.index,
                      right_on = sub_shift.index).drop('key_0', axis = 1)
        
    return df
    

In [9]:
%%time
train_sub_lag = ts_lag(train_sub)

CPU times: user 12.1 s, sys: 748 ms, total: 12.9 s
Wall time: 12.9 s


In [None]:
%%time
train_lag = ts_lag(train)