In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Locations of the three training inputs.
train_path = '/kaggle/input/ashrae-energy-prediction/train.csv'
building_train_path = '/kaggle/input/ashrae-energy-prediction/building_metadata.csv'
weather_train_path = '/kaggle/input/ashrae-energy-prediction/weather_train.csv'

# Left-join the building metadata onto the readings, then the weather onto that result.
train_df = (
    pd.read_csv(train_path)
    .merge(pd.read_csv(building_train_path), on='building_id', how='left')
    .merge(pd.read_csv(weather_train_path), on=['timestamp', 'site_id'], how='left')
)
train_df.head()


In [None]:
# Replace the target with log1p(meter_reading).
# NOTE: re-running this cell applies the transform a second time.
train_df['meter_reading'] = train_df['meter_reading'].apply(np.log1p)

In [None]:
'''Function to reduce the DF size'''
# source: https://www.kaggle.com/kernels/scriptcontent/3684066/download

def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned NumPy integer columns are cast to the smallest integer
      type of the same signedness whose range contains the column's min and max
      (bounds are inclusive, so e.g. a max of 127 still fits ``int8``).
    * NumPy float columns are cast to ``float16`` or ``float32`` when the value
      range fits, otherwise ``float64``.
    * Every other dtype (bool, datetime, timedelta, category, pandas extension
      dtypes, ...) is left untouched, so the function can safely be called on a
      frame that was already reduced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When False, float columns are never reduced below ``float32``. Only the
        value *range* is checked, not precision, so ``float16`` (about 3
        significant decimal digits) can round the stored values noticeably.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric
        # range to test; min()/max() on e.g. an unordered categorical raises.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind in ('i', 'u'):
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too large (or min/max is NaN/inf): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio: a frame that reports 0 bytes would otherwise divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Downcast train_df column by column; reduce_mem_usage modifies the frame it is given and returns it.
train_df = reduce_mem_usage(train_df)

In [None]:
'''Variable Description'''
def description(df):
    """Build a one-row-per-column summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        Columns: ``Name`` (column name), ``dtypes``, ``Missing`` (null count),
        ``Uniques`` (number of distinct non-null values) and the values of the
        first three rows (``First Value``, ``Second Value``, ``Third Value``).
        When ``df`` has fewer than three rows, the sample columns that have no
        corresponding row are filled with NaN instead of raising IndexError.
    """
    summary = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = float('nan')
    return summary

In [None]:
description(train_df)

In [None]:
# Parse the timestamp once, then derive calendar features from it.
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])
train_df["hour"] = train_df["timestamp"].dt.hour
train_df["day"] = train_df["timestamp"].dt.day
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` yields the same English day names.
train_df["weekday"] = train_df["timestamp"].dt.day_name()
train_df["month"] = train_df["timestamp"].dt.month
train_df['year'] = train_df['timestamp'].dt.year

In [None]:
train_df.head()

In [None]:
def month_to_season(month):
    """Map a month number (1-12) to a season index (1-4).

    Months 12, 1, 2 -> 1; 3-5 -> 2; 6-8 -> 3; 9-11 -> 4.
    """
    wrapped = month % 12  # December becomes 0 so it groups with January/February
    return (wrapped + 3) // 3

In [None]:
# Season index derived element-wise from the month column.
train_df['season'] = train_df['month'].map(month_to_season)

In [None]:
import lightgbm as lgb

In [None]:
# Training target: the meter_reading column of train_df.
y_train = train_df['meter_reading']

In [None]:
# Calendar-only baseline features.
# TODO: add 'meter' (the features do not relate to meter_reading the same way for every meter)
# and, with lower priority, 'site_id':
#   features = ['meter', 'site_id', 'hour', 'weekday', 'day', 'month', 'season']
features = ['hour','weekday', 'day', 'month', 'season']
# Take an explicit copy: x_train is label-encoded in place later on, and writing into
# a selection of train_df would trigger pandas' SettingWithCopyWarning.
x_train = train_df[features].copy()

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
def transform_features(dataset, is_regression=True, has_target=True):
    """Label-encode the object-typed columns of ``dataset`` in place.

    Uses the module-level ``LabelEncoder`` instance ``le``; after the call it
    remains fitted on the last object column that was encoded.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to encode. It is modified in place.
    is_regression : bool, default True
        When False, the number of classes of the target column is returned.
    has_target : bool, default True
        When True (the historical behaviour) the LAST column is treated as the
        target and is left out of the returned feature lists. Pass False for a
        frame that contains feature columns only.

    Returns
    -------
    tuple
        ``(dataset, features, categorical_features, num_of_classes)`` where
        ``features`` lists the non-target column names,
        ``categorical_features`` the non-target columns that were encoded, and
        ``num_of_classes`` is 1 for regression (or when there is no target) and
        the number of distinct target values otherwise.
    """
    num_of_columns = dataset.shape[1]
    target_position = num_of_columns - 1 if has_target else None
    features = []
    categorical_features = []
    num_of_classes = 1

    for i, column_name in enumerate(dataset.columns):
        is_target = i == target_position

        if not is_target:
            features.append(column_name)

        if dataset[column_name].dtypes == 'object':
            le.fit(dataset[column_name])
            # Assign the raw array so values are written positionally. Wrapping it
            # in a DataFrame would align on a fresh 0..n-1 index and scramble or
            # blank the values whenever `dataset` has a different index.
            dataset[column_name] = le.transform(dataset[column_name])

            if not is_target:
                categorical_features.append(column_name)

        if is_target and not is_regression:
            # Count the classes of the target itself rather than of whichever
            # object column happened to be encoded last.
            num_of_classes = dataset[column_name].nunique()

    return dataset, features, categorical_features, num_of_classes

In [None]:
# Task switch reused further down when the LightGBM parameters are built.
is_regression = True
# Label-encode the object columns of x_train.
# NOTE(review): transform_features treats the LAST column as the target and leaves it out of
# the list it returns. x_train holds features only, so the `features` list assigned here
# (overwriting the earlier one) no longer contains 'season'.
x_train, features, categorical_features, num_of_classes = transform_features(x_train, is_regression=is_regression)

In [None]:
x_train.dtypes

In [None]:
x_train.head()

In [None]:
features 

In [None]:
categorical_features

In [None]:
from sklearn.model_selection import KFold, train_test_split

# Hold out a random 20% of the rows as a validation set (fixed seed).
# The split overwrites x_train / y_train, so re-running only this cell splits the already-split data.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Take the feature names straight from the frame. Concatenating `features` and
# `categorical_features` repeats every categorical column name and does not follow
# the column order of x_train, so names would not line up with the data.
lgb_train = lgb.Dataset(
    x_train,
    y_train,
    feature_name=list(x_train.columns),
    # categorical_feature=categorical_features,  # not enabled
)

In [None]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression' if is_regression else 'multiclass',
    'num_class': num_of_classes,
    # 'rmsle' is not a LightGBM metric name. The target is already log1p-transformed,
    # so RMSE on it is the RMSLE of the raw meter readings.
    'metric': 'rmse' if is_regression else 'multi_logloss',
    'min_data': 1,
    'verbose': 1,
}

# Fit a booster for 50 rounds on the training Dataset.
gbm = lgb.train(params, lgb_train, num_boost_round=50)

In [None]:
# Install the graphviz Python package (unpinned).
# NOTE(review): presumably needed by lgb.plot_tree below — confirm.
!pip install graphviz

import matplotlib.pyplot as plt
# Feature-importance chart of the trained booster, limited to 10 features.
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()
 
# Draw a tree of the booster; plot_tree is called with its defaults.
ax = lgb.plot_tree(gbm)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_log_error as msle, mean_squared_error as mse

y_pred = gbm.predict(x_val)
# y_val and y_pred are both in log1p space, so the root of the plain MSE between them
# is the RMSLE of the raw readings (which is why mse is used here rather than msle).
score = np.sqrt(mse(y_val, y_pred))
print('Val RMSLE = ', score)

In [None]:
# Drop the references to the training frames that are no longer needed.
del train_df, x_train, x_val

In [None]:
test_path = '/kaggle/input/ashrae-energy-prediction/test.csv'
weather_test_path = '/kaggle/input/ashrae-energy-prediction/weather_test.csv'

# Test prep and feature extraction: same joins as for the training data
# (building metadata first, then weather). Left joins keep the row order of test.csv.
test_df = (
    pd.read_csv(test_path)
    .merge(pd.read_csv(building_train_path), on='building_id', how='left')
    .merge(pd.read_csv(weather_test_path), on=['timestamp', 'site_id'], how='left')
)

test_df = reduce_mem_usage(test_df)

test_df["timestamp"] = pd.to_datetime(test_df["timestamp"])
test_df["hour"] = test_df["timestamp"].dt.hour
test_df["day"] = test_df["timestamp"].dt.day
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` yields the same English day names.
test_df["weekday"] = test_df["timestamp"].dt.day_name()
test_df["month"] = test_df["timestamp"].dt.month
test_df['year'] = test_df['timestamp'].dt.year

test_df['season'] = test_df.month.apply(month_to_season)

# Select exactly the columns the model was trained on, in training order.
# The `features` variable cannot be used for this: transform_features leaves the last
# column out of the list it returns, so by now it no longer contains 'season'.
# Keep this list in sync with the feature list used to build x_train.
model_features = ['hour', 'weekday', 'day', 'month', 'season']
x_test = test_df[model_features].copy()  # copy: the frame is label-encoded in place below
del test_df
# NOTE(review): the encoder is re-fitted on the test data; codes match training only
# if both sets contain the same weekday names — confirm.
x_test, features, categorical_features, num_classes = transform_features(x_test, is_regression=is_regression)

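The competition is scored on the absolute (raw) meter reading, while the model predicts log(1 + reading), i.e. log1p of the reading. So each prediction must be converted back with exp(pred) - 1.

This is done with np.expm1 ("exp minus 1"), the inverse of np.log1p.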

In [None]:
sample = pd.read_csv("/kaggle/input/ashrae-energy-prediction/sample_submission.csv")
# Undo the log1p transform. A slightly negative log-space prediction would map to a
# negative meter reading, so clip the result at 0.
# NOTE(review): assumes the rows of x_test are in the same order as the sample submission — confirm.
sample['meter_reading'] = np.clip(np.expm1(gbm.predict(x_test)), 0, None)
sample.to_csv('submission.csv', index=False)
sample.head()