In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

import warnings
# Silence every warning for the rest of the session: no category, message or module
# filter is given, so library deprecation/feature-name warnings are hidden as well.
warnings.filterwarnings(action='ignore') 

In [2]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with ``seed``
    and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value passed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this presumably
    # only affects processes launched after this call -- confirm if hash order matters.
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)


seed_everything(42)  # fix the seed

In [3]:
# Load the training rows, the test rows and the per-building metadata
# from the current working directory (read in that order).
train_df, test_df, building_info = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './building_info.csv')
)

In [4]:
# Preview the last five training rows.
train_df.tail(5)

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
203995,100_20220824 19,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04
203996,100_20220824 20,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96
203997,100_20220824 21,100,20220824 21,21.3,,1.0,92.0,,,825.12
203998,100_20220824 22,100,20220824 22,21.0,,0.3,94.0,,,640.08
203999,100_20220824 23,100,20220824 23,20.7,,0.1,95.0,,,540.24


In [5]:
# Preview the first five rows of the building metadata.
building_info.head(5)

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,110634.0,39570.0,-,-,-
1,2,건물기타,122233.47,99000.0,-,-,-
2,3,건물기타,171243.0,113950.0,40,-,-
3,4,건물기타,74312.98,34419.62,60,-,-
4,5,건물기타,205884.0,150000.0,-,2557,1000


In [6]:
# The three capacity columns use the string '-' as a placeholder (see the preview of
# building_info); map it to 0 and cast each column to float.
for capacity_col in ('태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'):
    building_info[capacity_col] = building_info[capacity_col].replace('-', 0).astype('float')

In [7]:
# Attach the building metadata to every train/test row via the building number.
# A left join keeps all original rows of train_df / test_df.
train_df = train_df.merge(building_info, how='left', on='건물번호')
test_df = test_df.merge(building_info, how='left', on='건물번호')

In [8]:
# Count missing values per column of the merged training frame.
train_df.isna().sum()

num_date_time         0
건물번호                  0
일시                    0
기온(C)                 0
강수량(mm)          160069
풍속(m/s)              19
습도(%)                 9
일조(hr)            75182
일사(MJ/m2)         87913
전력소비량(kWh)            0
건물유형                  0
연면적(m2)               0
냉방면적(m2)              0
태양광용량(kW)             0
ESS저장용량(kWh)          0
PCS용량(kW)             0
dtype: int64

In [9]:
# Fill every missing value in the training frame with 0.
train_df = train_df.fillna(value=0)

In [10]:
# Split the '일시' timestamp string into integer month / day / hour columns so the
# model can use calendar and time-of-day information.
# The slice positions follow the 'YYYYMMDD HH' layout seen in the previewed rows
# (e.g. '20220824 19'). The built-in .str accessor replaces the row-by-row lambdas.
train_stamp = train_df['일시']
train_df['month'] = train_stamp.str[4:6].astype('int64')
train_df['day'] = train_stamp.str[6:8].astype('int64')
train_df['time'] = train_stamp.str[9:11].astype('int64')  # hour of day; column name kept as 'time'

In [11]:
# Target: the electricity-consumption column.
train_y = train_df['전력소비량(kWh)']

# Features: everything except the row id, the raw timestamp string, the target,
# the building-type label and the two sunshine / solar-radiation columns.
# NOTE(review): the latter two are presumably dropped because the test frame does not
# carry them (the test-side drop list does not mention them) -- confirm against test.csv.
non_feature_cols = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)', '건물유형']
train_x = train_df.drop(columns=non_feature_cols)

In [12]:
# Normalization: learn each feature's min/max from the training features, then rescale
# them. The fitted scaler is reused later for the test features.
# train_x is rebound to the scaled array, so re-running this cell alone would refit
# the scaler on already-scaled values.
scaler = MinMaxScaler().fit(train_x)
train_x = scaler.transform(train_x)

In [13]:
# Fit a random forest on the scaled training features.
# random_state is pinned on the estimator (same value as passed to seed_everything) so
# the fitted forest does not depend on how many draws earlier or re-run cells have
# already consumed from NumPy's global random generator.
model = RandomForestRegressor(random_state=42)
model.fit(train_x, train_y)

In [14]:
# R^2 on the very rows the model was fitted on -- this is a training score,
# not an estimate of accuracy on unseen data.
train_score = model.score(train_x, train_y)
print('훈련용:', train_score)

훈련용: 0.9979120270443168


In [16]:
# #Hyperparameter tuning (RandomizedSearchCV) - disabled, kept for reference
# param_distribs = {
#     'n_estimators': randint(low=1, high=100),
#     'max_features': randint(low=1, high=8),
# }
# model = RandomForestRegressor()
# rnd_model = RandomizedSearchCV(model, param_distributions=param_distribs, cv=5)
# rnd_model.fit(train_x, train_y)

In [17]:
# print('훈련용:', rnd_model.score(train_x, train_y))

훈련용: 0.985223781912792
검증용: -1.1660239515607373


In [15]:
# Derive the same integer month / day / hour columns for the test rows as were built
# for the training rows, using the built-in .str accessor instead of row-by-row lambdas.
# NOTE(review): assumes test '일시' uses the same 'YYYYMMDD HH' layout as train -- confirm.
test_stamp = test_df['일시']
test_df['month'] = test_stamp.str[4:6].astype('int64')
test_df['day'] = test_stamp.str[6:8].astype('int64')
test_df['time'] = test_stamp.str[9:11].astype('int64')  # hour of day; column name kept as 'time'

In [16]:
# Build the test feature frame by removing the row id, the raw timestamp string and
# the building-type label; test_df itself is left unchanged.
test_drop_cols = ['num_date_time', '일시', '건물유형']
test_x = test_df.drop(columns=test_drop_cols)

In [17]:
# Rescale the test features with the scaler fitted on the training features.
# test_x is rebound to the scaled result, so running this cell a second time would
# scale the already-scaled values again.
test_x = scaler.transform(test_x)

In [18]:
# Predict electricity consumption for every scaled test row (one value per row, in row order).
preds = model.predict(X=test_x)

In [19]:
# Load the submission template and display it.
template_path = './sample_submission.csv'
submission = pd.read_csv(template_path)
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [20]:
# Write the predictions into the template's answer column.
# preds is positional (one value per row of the test frame), so first verify that the
# template lists exactly the same num_date_time keys in the same order; otherwise the
# values would be attached to the wrong building/hour without any error.
if not np.array_equal(
    submission['num_date_time'].to_numpy(),
    test_df['num_date_time'].to_numpy(),
):
    raise ValueError('sample_submission.csv rows do not line up with test.csv rows')
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2120.5488
1,1_20220825 01,2089.3200
2,1_20220825 02,1984.4784
3,1_20220825 03,1955.2032
4,1_20220825 04,1916.8800
...,...,...
16795,100_20220831 19,927.8688
16796,100_20220831 20,856.4040
16797,100_20220831 21,774.5304
16798,100_20220831 22,634.6296


In [21]:
# Save the filled-in submission next to the notebook, without the DataFrame index column.
submission.to_csv(path_or_buf='./submission.csv', index=False)