## Import

In [57]:
import random
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/open/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/open/test.csv')

## Train Data Pre-Processing

In [5]:
#결측값을 0으로 채웁니다
train_df = train_df.fillna(0)

In [6]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train_df['month'] = train_df['일시'].apply(lambda x : int(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : int(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : int(x[9:11]))

In [10]:
train_df = train_df.drop(columns = ['num_date_time', '일시'])
train_df

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),month,day,time
0,1,18.6,0.0,0.9,42.0,0.0,0.0,1085.28,6,1,0
1,1,18.0,0.0,1.1,45.0,0.0,0.0,1047.36,6,1,1
2,1,17.7,0.0,1.5,45.0,0.0,0.0,974.88,6,1,2
3,1,16.7,0.0,1.4,48.0,0.0,0.0,953.76,6,1,3
4,1,18.4,0.0,2.8,43.0,0.0,0.0,986.40,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.0,0.9,86.0,0.5,0.0,881.04,8,24,19
203996,100,22.4,0.0,1.3,86.0,0.0,0.0,798.96,8,24,20
203997,100,21.3,0.0,1.0,92.0,0.0,0.0,825.12,8,24,21
203998,100,21.0,0.0,0.3,94.0,0.0,0.0,640.08,8,24,22


In [19]:
train_x = train_df.drop(columns = ['강수량(mm)','일조(hr)','일사(MJ/m2)','전력소비량(kWh)'])
train_y = train_df['전력소비량(kWh)']
train_x

Unnamed: 0,건물번호,기온(C),풍속(m/s),습도(%),month,day,time
0,1,18.6,0.9,42.0,6,1,0
1,1,18.0,1.1,45.0,6,1,1
2,1,17.7,1.5,45.0,6,1,2
3,1,16.7,1.4,48.0,6,1,3
4,1,18.4,2.8,43.0,6,1,4
...,...,...,...,...,...,...,...
203995,100,23.1,0.9,86.0,8,24,19
203996,100,22.4,1.3,86.0,8,24,20
203997,100,21.3,1.0,92.0,8,24,21
203998,100,21.0,0.3,94.0,8,24,22


In [54]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.33, random_state = 42)

## Regression Model Fit

In [65]:
model = RandomForestRegressor(n_estimators = 100, max_depth = 5)
model.fit(x_train, y_train)

## Test Data Pre-Processing

In [43]:
test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))

In [44]:
test_x = test_df.drop(columns=['num_date_time', '일시','강수량(mm)'])

## Inference

In [69]:
preds = model.predict(test_x)

## Submission

In [70]:
submission = pd.DataFrame()
submission['num_date_time'] = test_df['num_date_time']
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1251.401552
1,1_20220825 01,1251.401552
2,1_20220825 02,1251.401552
3,1_20220825 03,1251.401552
4,1_20220825 04,1251.401552
...,...,...
16795,100_20220831 19,1205.083374
16796,100_20220831 20,1177.985227
16797,100_20220831 21,1084.743330
16798,100_20220831 22,1035.069062


In [71]:
submission.to_csv('./submission.csv', index=False)

In [67]:
model.score(x_test, y_test)

0.8007015234768715