In [None]:
import datetime
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Execution-environment switch: when True the competition files are read from
# /kaggle/input; when False they are downloaded with the Kaggle CLI and read
# from the current working directory.
kaggle = True  

In [None]:
if kaggle:
  # Starter cell of the kaggle/python docker image
  # (https://github.com/kaggle/docker-python), which comes with the usual
  # analytics libraries preinstalled.
  import os

  import numpy as np # linear algebra
  import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

  # Input data files are available under the input directory; print the full
  # path of every file found there so the exact locations show up in the output.
  input_files = (
      os.path.join(dirname, filename)
      for dirname, _, filenames in os.walk('/kaggle/input')
      for filename in filenames
  )
  for path in input_files:
      print(path)

  # Any results written to the current directory are saved as output.

In [None]:
if not kaggle:
  !pip install kaggle
  from getpass import getpass
  import os
  user = 'ahmadelsallab'
  key = '6b7ffe97ff5bc0656e325b746b72fa31'

  if '.kaggle' not in os.listdir('/root'):
      !mkdir ~/.kaggle
  !touch /root/.kaggle/kaggle.json
  !chmod 666 /root/.kaggle/kaggle.json
  with open('/root/.kaggle/kaggle.json', 'w') as f:
      f.write('{"username":"%s","key":"%s"}' % (user, key))
  !chmod 600 /root/.kaggle/kaggle.json

  !kaggle competitions download -c ashrae-energy-prediction

  !unzip -n train.csv.zip
  !unzip -n weather_train.csv.zip
  !unzip -n building_metadata.csv.zip

In [None]:
# Original code from https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling by @aitude

def fill_weather_dataset(weather_df, site_ids=None):
    """Add the hourly rows missing for each site and impute weather gaps.

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Weather table with a ``site_id`` column, a ``timestamp`` column holding
        strings formatted as ``%Y-%m-%d %H:%M:%S``, and the seven measurement
        columns imputed below.
    site_ids : iterable of int, optional
        Sites that must have one row per hour. Defaults to ``range(16)``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with ``site_id`` as the first
        column. For every site and every hour between the earliest and latest
        timestamp there is a row. Missing measurements are replaced by the mean
        of the same (site_id, day-of-month, month) group. For cloud_coverage,
        sea_level_pressure and precip_depth_1_hr, groups whose mean is itself
        missing take the previous group's mean, in sorted
        (site_id, day, month) order.
    """
    if site_ids is None:
        site_ids = range(16)

    # Every hour between the first and the last timestamp, as strings.
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(), time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(), time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format)
                  for x in range(total_hours)]

    # Collect the missing (site, hour) rows and concatenate once instead of
    # re-copying the whole frame for every site.
    frames = [weather_df]
    for site_id in site_ids:
        site_hours = np.array(weather_df.loc[weather_df['site_id'] == site_id, 'timestamp'])
        missing_hours = np.setdiff1d(hours_list, site_hours)
        if len(missing_hours) == 0:
            continue
        new_rows = pd.DataFrame(missing_hours, columns=['timestamp'])
        new_rows['site_id'] = site_id
        frames.append(new_rows)
    weather_df = pd.concat(frames).reset_index(drop=True)

    # Grouping keys are kept as separate Series so no helper columns have to be
    # added to (and later dropped from) the frame.
    stamps = pd.to_datetime(weather_df['timestamp'])
    group_keys = [weather_df['site_id'],
                  stamps.dt.day.rename('day'),
                  stamps.dt.month.rename('month')]
    row_keys = pd.MultiIndex.from_arrays(group_keys)

    # (column, carry the previous group's mean forward when a group mean is NaN)
    fill_plan = [
        ('air_temperature', False),
        ('cloud_coverage', True),
        ('dew_temperature', False),
        ('sea_level_pressure', True),
        ('wind_direction', False),
        ('wind_speed', False),
        ('precip_depth_1_hr', True),
    ]
    for column, carry_forward in fill_plan:
        gaps = weather_df[column].isna().to_numpy()
        if not gaps.any():
            continue
        group_means = weather_df.groupby(group_keys)[column].mean()
        if carry_forward:
            group_means = group_means.ffill()
        # Look up each row's group mean; only rows that are NaN are overwritten.
        fillers = group_means.reindex(row_keys).to_numpy()
        weather_df.loc[gaps, column] = fillers[gaps]

    # site_id is placed first, followed by the remaining columns in their
    # existing order.
    ordered = ['site_id'] + [c for c in weather_df.columns if c != 'site_id']
    return weather_df[ordered]


def fill_with_po3(df):
    """Return a copy of ``df`` whose NaNs are replaced by order-3 polynomial interpolation."""
    cubic = df.interpolate(method='polynomial', order=3)
    return df.fillna(cubic)

def fill_with_lin(df):
    """Return a copy of ``df`` whose NaNs are replaced by linearly interpolated values."""
    linear = df.interpolate(method='linear')
    return df.fillna(linear)

def fill_with_mix(df):
    """Fill NaNs with the average of linear and order-3 polynomial interpolation.

    Each column is interpolated twice (linear and polynomial of order 3, both
    with ``limit_direction='both'``) and the two results are averaged. Where
    either interpolation cannot produce a value, the average is NaN; those
    remaining NaNs are filled from the nearest valid row (next row first, then
    previous row).

    This replaces the former ``assert`` + "fill with second row" workaround.
    The result is the same when only the first row is missing. A gap at the end
    is now filled from its actual neighbour rather than from row 1, and larger
    edge gaps no longer raise ``AssertionError``.
    """
    linear = df.fillna(df.interpolate(method='linear', limit_direction='both'))
    cubic = df.fillna(df.interpolate(method='polynomial', order=3, limit_direction='both'))
    blended = (linear + cubic) * 0.5
    return blended.bfill().ffill()

def fill_temps(weather):
    """Gap-fill air and dew temperature per site.

    Each temperature column is pivoted to a timestamp x site_id table, filled
    with ``fill_with_mix`` and unstacked back to long form. The returned frame
    has one column per temperature and the index produced by ``unstack``
    (site_id, timestamp).
    """
    def _filled_long(col):
        # wide table: one row per timestamp, one column per site
        wide = weather.pivot(index='timestamp', columns='site_id', values=col)
        return fill_with_mix(wide).sort_index().unstack().to_frame(col)

    temps = _filled_long('air_temperature')
    temps['dew_temperature'] = _filled_long('dew_temperature')['dew_temperature']
    return temps

def fill_missing_weather(weather):
  """Return a weather table with no missing values.

  Air and dew temperature are interpolated by ``fill_temps``. The remaining
  columns (cloud coverage, precipitation depth, wind direction, wind speed) are
  joined back on (site_id, timestamp); negative values in them are set to 0 and
  every remaining NaN in the frame is replaced by 0.
  """
  other_cols = ['cloud_coverage', 'precip_depth_1_hr', 'wind_direction', 'wind_speed']
  wf = fill_temps(weather).reset_index().merge(
      weather[['site_id', 'timestamp'] + other_cols],
      how='left', on=['site_id', 'timestamp'])
  for col in other_cols:
      wf.loc[wf[col] < 0, col] = 0
  # Filling NaNs does not depend on the loop variable, so it is done once for
  # the whole frame instead of once per column.
  return wf.fillna(0)

def load_data(data_path, building_path, weather_path):
  """Read the meter, building and weather CSVs and join them into one frame.

  The weather table is gap-filled with ``fill_missing_weather`` first. Meter
  rows are left-joined to the buildings on ``building_id`` and then to the
  weather on (``timestamp``, ``site_id``).
  """
  weather = fill_missing_weather(pd.read_csv(weather_path, parse_dates=['timestamp']))
  readings = pd.read_csv(data_path, parse_dates=['timestamp'])
  buildings = pd.read_csv(building_path)
  with_buildings = pd.merge(readings, buildings, on='building_id', how='left')
  return pd.merge(with_buildings, weather, on=['timestamp', 'site_id'], how='left')

def load_train_data():
  """Load the merged training frame from the location selected by the ``kaggle`` flag."""
  if kaggle:
    paths = (
        '/kaggle/input/ashrae-energy-prediction/train.csv',
        '/kaggle/input/ashrae-energy-prediction/building_metadata.csv',
        '/kaggle/input/ashrae-energy-prediction/weather_train.csv',
    )
  else:
    paths = ('train.csv', 'building_metadata.csv', 'weather_train.csv')

  # paths are ordered as (meter readings, building metadata, weather)
  return load_data(*paths)
# Load the merged training table (meter readings + building metadata + weather)
# and preview the first rows.
train_df = load_train_data()
train_df.head()
  

In [None]:

# Model the target on the log1p scale. This cell is not idempotent:
# re-running it applies the transform a second time.
train_df['meter_reading'] = np.log1p(train_df['meter_reading'])

In [None]:
# Put square_feet on the log1p scale as well. This cell is not idempotent:
# re-running it applies the transform a second time.
train_df['square_feet'] = np.log1p(train_df['square_feet'])

In [None]:
# Function to reduce the DataFrame size.
# source: https://www.kaggle.com/kernels/scriptcontent/3684066/download
# See also https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin

def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    use_float16 : bool, default True
        Allow float columns to be stored as float16 when their range fits.
        float16 keeps only about three significant decimal digits; pass False
        to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    * object / string columns become ``category``.
    * integer columns (signed or unsigned) get the smallest signed integer type
      whose range strictly contains the column's min and max.
    * float columns get float16 (if allowed), float32 or float64.
    * every other dtype (datetime, timedelta, bool, categorical, nullable
      extension types) is left untouched. This lets the function be run again
      on its own output, and datetime columns are no longer turned into
      categories.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue
        # Only plain numpy integer / unsigned / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind in 'iu':
            for candidate in signed_ints:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            # Also reached when min/max are NaN (all-missing column).
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def description(df):
    """Variable description: one summary row per column of ``df``.

    Returns a DataFrame with the columns ``Name``, ``dtypes``, ``Missing``
    (NaN count), ``Uniques`` (distinct non-null values) and the values found in
    the first three rows (``First Value``, ``Second Value``, ``Third Value``).
    When ``df`` has fewer than three rows the unavailable value columns are NaN
    instead of raising ``IndexError``.
    """
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    for position, label in enumerate(['First Value', 'Second Value', 'Third Value']):
        summary[label] = df.iloc[position].values if position < len(df) else np.nan
    return summary

In [None]:
# Downcast column dtypes to shrink the merged training frame.
train_df = reduce_mem_usage(train_df)

In [None]:
# Per-column summary: dtype, missing count, unique count and first three values.
description(train_df)

In [None]:
# Derive calendar features from the reading timestamp.
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])
train_df["hour"] = train_df["timestamp"].dt.hour
train_df["day"] = train_df["timestamp"].dt.day
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` returns the
# same English day names.
train_df["weekday"] = train_df["timestamp"].dt.day_name()
train_df["month"] = train_df["timestamp"].dt.month
train_df['year'] = train_df['timestamp'].dt.year

In [None]:
# Preview the first rows, now including the calendar columns.
train_df.head()

In [None]:
def month_to_season(month):
    """Map a month number (1-12) to a season code 1-4.

    December, January and February map to 1; each following block of three
    months maps to the next code.
    """
    shifted = month % 12 + 3
    return shifted // 3

In [None]:
# month_to_season is plain integer arithmetic, so it can be applied to the
# whole column at once instead of being called once per row via .apply().
train_df['season'] = month_to_season(train_df['month'])

In [None]:
# Column summary again, now that the calendar and season columns exist.
description(train_df)

In [None]:
import lightgbm as lgb

In [None]:
# Training target: meter_reading, which an earlier cell put on the log1p scale.
y_train = train_df.meter_reading

Note: by default, LightGBM handles missing values (NaN) in the feature vector by itself, so training does not fail when features contain NaNs.
https://www.kaggle.com/c/home-credit-default-risk/discussion/57918

Setting `use_missing=false` disables this handling of missing values. You can also use the `zero_as_missing` option to change the behavior.

So we need to handle missing values ourselves for better performance, especially for the __temperature values__.

In [None]:
# Columns fed to the model. `meter` must be included because the features that
# correlate with meter_reading are not the same for every meter; the same holds
# for `site_id`, with lower priority.
# Candidates currently switched off: year_built, floor_count, cloud_coverage,
# wind_direction, wind_speed, precip_depth_1_hr, weekday, day.
features = [
    'meter', 'site_id', 'building_id', 'primary_use', 'square_feet',
    'air_temperature', 'dew_temperature',
    'hour', 'month', 'season',
]


def select_features(df):
    """Return the columns of ``df`` named in the module-level ``features`` list."""
    selected = df[features]
    return selected

In [None]:

# Keep only the model input columns listed in `features`.
x_train = select_features(train_df)

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
def prep_features(dataset):
    """Label-encode the object / categorical columns of ``dataset`` in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature frame; encoded columns are overwritten with integer codes.

    Returns
    -------
    tuple
        ``(dataset, features, categorical_features)``: the same frame, the list
        of all its column names, and the names of the columns that were encoded.

    Notes
    -----
    The module-level ``LabelEncoder`` ``le`` is re-fitted for every encoded
    column and on every call.
    NOTE(review): because of that, codes are only comparable between two frames
    (e.g. train and test) when both contain the same set of values. Confirm
    that this holds for the data being encoded.
    """
    feature_names = list(dataset.columns)
    categorical_names = []

    for column_name in feature_names:
        column_type = dataset[column_name].dtypes
        if column_type == 'object' or column_type.name == 'category':
            # Assign the encoded array directly. Wrapping it in a DataFrame gave
            # it a fresh 0..n-1 index, which pandas then aligned with
            # dataset.index, misplacing values (or producing NaN) whenever the
            # frame did not have a default RangeIndex.
            dataset[column_name] = le.fit_transform(dataset[column_name])
            categorical_names.append(column_name)

    return dataset, feature_names, categorical_names

In [None]:

# Encode the text/categorical columns; also returns the list of column names
# and the names of the columns that were encoded.
x_train, features, categorical_features = prep_features(x_train)

In [None]:
# Check the dtype of primary_use after encoding.
x_train.primary_use.dtype.name

In [None]:
# Columns that prep_features encoded.
categorical_features

In [None]:
# All feature column names returned by prep_features.
features

In [None]:
# Preview the encoded feature matrix.
x_train.head()

In [None]:
from sklearn.model_selection import KFold, train_test_split
# Make validation set based on train_test_split: 20% of the rows are held out,
# with a fixed random_state so the split is repeatable.
# Note: x_train / y_train are rebound to the 80% training part, so re-running
# this cell splits that part again.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

Although some categorical features have an integer dtype, we know they are categories (for example, site_id or meter).
For that reason, we set the categorical features manually.

In [None]:
# Manual override of the list returned by prep_features: these columns are
# passed to LightGBM as categorical features.
categorical_features = ['meter', 'site_id', 'building_id', 'primary_use', 'month', 'season']

In [None]:
# Wrap the train / validation splits in LightGBM datasets. The categorical
# columns are declared here, when the Dataset is constructed.
d_training = lgb.Dataset(x_train, label=y_train, categorical_feature=categorical_features, free_raw_data=False)
d_val = lgb.Dataset(x_val, label=y_val, categorical_feature=categorical_features, free_raw_data=False)

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

# LightGBM 4.0 removed the `verbose_eval` and `early_stopping_rounds` keyword
# arguments of lgb.train(); the equivalent callbacks are used instead
# (log every 25 rounds, stop after 50 rounds without improvement).
model = lgb.train(
    params,
    train_set=d_training,
    num_boost_round=1000,
    valid_sets=[d_training, d_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=25),
    ],
)



The warning above does not indicate a problem.
See: https://github.com/Microsoft/LightGBM/issues/1408
@JYLFamily This warning is raised because you're passing categorical features twice: when constructing the Dataset and when calling train.

The correct way is to pass them only while constructing the Dataset, via the categorical_feature argument.



In [None]:
# NOTE(review): graphviz is presumably only needed for the commented-out
# lgb.plot_tree call below. Confirm before removing the install.
!pip install graphviz

import matplotlib.pyplot as plt
# Bar chart of the 10 most important features of the trained model.
ax = lgb.plot_importance(model, max_num_features=10)
plt.show()
 
#ax = lgb.plot_tree(model)
#plt.show()

In [None]:
from sklearn.metrics import mean_squared_log_error as msle, mean_squared_error as mse
y_pred = model.predict(x_val)
# y_val and y_pred are both on the log1p scale, so plain MSE (not MSLE) is used;
# taking the square root gives the RMSE on that scale.
score = np.sqrt(mse(y_val, y_pred))
# The value is a root-mean-squared error, so label it as such.
print('Val RMSE = ', score)

In [None]:
# Drop the training artefacts and force a garbage-collection pass before the
# test data is loaded; the trained `model` is kept.
del train_df, x_train, y_train, x_val, y_val, d_training, d_val
gc.collect()

In [None]:
def load_test_data():
  """Load the merged test frame from the location selected by the ``kaggle`` flag.

  The files go through ``load_data``, exactly like the training files, so the
  test weather is gap-filled the same way as the weather the model was trained
  on. Previously the raw weather table was merged, leaving NaNs in the
  temperature features at prediction time.
  NOTE(review): this runs the weather interpolation on weather_test.csv;
  confirm that it completes on that file.
  """
  if kaggle:
    test_path = '/kaggle/input/ashrae-energy-prediction/test.csv'
    weather_test_path = '/kaggle/input/ashrae-energy-prediction/weather_test.csv'
    building_train_path = '/kaggle/input/ashrae-energy-prediction/building_metadata.csv'
  else:
    test_path = 'test.csv'
    weather_test_path = 'weather_test.csv'
    building_train_path = 'building_metadata.csv'

  return load_data(test_path, building_train_path, weather_test_path)

In [None]:

if not kaggle:
  !unzip -n test.csv.zip
  !unzip -n weather_test.csv.zip
  
test_df = load_test_data()
# Test prep and features extraction


test_df = reduce_mem_usage(test_df)

test_df["timestamp"] = pd.to_datetime(test_df["timestamp"])
test_df["hour"] = test_df["timestamp"].dt.hour
test_df["day"] = test_df["timestamp"].dt.day
test_df["weekday"] = test_df["timestamp"].dt.weekday_name 
test_df["month"] = test_df["timestamp"].dt.month
test_df['year'] = test_df['timestamp'].dt.year

test_df['season'] = test_df.month.apply(month_to_season)
#test_df = reduce_mem_usage(test_df)

x_test = select_features(test_df)
del test_df
x_test, features, categorical_features = prep_features(x_test)

The competition score is computed on the raw meter reading, while the model predicts log(1 + reading). So the predictions must be transformed back with exp(pred) - 1.

This is all done using np.expm1 (m1 = minus 1)

In [None]:
# Invert the log1p applied to the training target so the predictions are on the
# original meter-reading scale.
preds = np.expm1(model.predict(x_test))

In [None]:
#!unzip -n sample_submission.csv.zip
if kaggle:
  sample = pd.read_csv("/kaggle/input/ashrae-energy-prediction/sample_submission.csv")
else:
  !unzip -n sample_submission.csv.zip
  sample = pd.read_csv("sample_submission.csv")
sample['meter_reading'] = preds #np.expm1(gbm.predict(x_test))
sample.to_csv('submission.csv', index=False)
sample.head()