### 라이브러리

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from windpowerlib.wind_speed import logarithmic_profile

## 경주풍력

### 데이터 불러오기

In [2]:
gj_train = pd.read_parquet('train_ldaps_gyeongju.parquet')
gj_test = pd.read_parquet('test_ldaps_gyeongju.parquet')
meta = pd.read_excel('windfarm_metadata.xlsx', sheet_name = 1, header = 1, index_col = None)
meta.rename(columns = {'발전기 번호': 'turbine_id','정격 출력 [kW]': '정격출력', '소재지표고(지표) [m]': '소재지표고', '허브 높이(지표) [m]': '허브높이', '로터 반경 [m]': '로터반경'}, inplace = True)
meta = meta.iloc[:, 1:11]

### 데이터 확인

In [3]:
display(gj_train.head())
display(gj_test.head())
display(meta.head())

Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id
0,2020-01-02 00:00:00+09:00,387.640625,1.0,0.286911,10.428498,10.05958,97974.59375,91.796478,0.002686,269.46756,0.021201,-0.375756,7.353266,-2.640615,WTG01
1,2020-01-02 01:00:00+09:00,387.640625,1.0,0.286911,10.472921,10.044404,97970.132812,91.729774,0.002686,269.430847,0.020706,-0.371947,7.578446,-2.414076,WTG01
2,2020-01-02 02:00:00+09:00,387.640625,1.0,0.286911,10.682985,10.478634,97951.546875,92.788666,0.002686,269.37439,0.020556,-0.354397,7.654554,-2.259913,WTG01
3,2020-01-02 03:00:00+09:00,387.640625,1.0,0.286911,10.676681,10.090029,97908.96875,92.494576,0.002686,269.352112,0.021635,-0.328609,7.275844,-2.280371,WTG01
4,2020-01-02 04:00:00+09:00,387.640625,1.0,0.286911,10.079557,9.67262,97858.398438,88.986443,0.002686,269.413269,0.024231,-0.265124,6.911751,-1.920128,WTG01


Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id
0,2023-01-01 00:00:00+09:00,387.640625,1.0,0.286911,11.116076,10.788924,97858.4375,74.741974,0.002441,269.822052,0.053748,-0.066199,7.711938,-1.282177,WTG01
1,2023-01-01 01:00:00+09:00,387.640625,1.0,0.286911,10.945148,10.69934,97847.90625,73.057632,0.002263,269.79071,0.064034,-0.212118,7.628698,-1.50997,WTG01
2,2023-01-01 02:00:00+09:00,387.640625,1.0,0.286911,11.457981,10.870457,97863.867188,71.513168,0.002197,269.840515,0.071656,-0.213301,8.034372,-1.425219,WTG01
3,2023-01-01 03:00:00+09:00,387.640625,1.0,0.286911,11.583426,11.074321,97853.390625,73.938629,0.00223,269.616333,0.069788,-0.153121,7.834104,-0.680731,WTG01
4,2023-01-01 04:00:00+09:00,387.640625,1.0,0.286911,11.028671,10.432499,97808.242188,75.667435,0.00224,269.364868,0.068385,-0.084777,7.439806,-0.103689,WTG01


Unnamed: 0,발전단지,turbine_id,제작사 turbine ID,터빈 모델,정격출력,소재지표고,허브높이,로터반경,위도,경도
0,경주풍력,WTG01,UNISON,U113,2300,492.0,100,56.5,35.724089,129.374592
1,경주풍력,WTG02,UNISON,U113,2300,456.1,100,56.5,35.722233,129.3724
2,경주풍력,WTG03,UNISON,U113,2300,476.5,100,56.5,35.721336,129.37015
3,경주풍력,WTG04,UNISON,U113,2300,467.1,100,56.5,35.719208,129.368869
4,경주풍력,WTG05,UNISON,U113,2300,502.2,100,56.5,35.716156,129.367767


In [4]:
print(gj_train.shape)
print(gj_test.shape)

(235818, 15)
(78840, 15)


#### u, v벡터로 풍속, 풍향 구하기

In [5]:
## u, v벡터로 풍속, 풍향 구하기

def uv_to_wsd(u_wind_speed, v_wind_speed):
    """ 
        Convert u, v vector to wind speed and direction.
    """
    u_ws = u_wind_speed.to_numpy()
    v_ws = v_wind_speed.to_numpy()

    # NOTE: http://colaweb.gmu.edu/dev/clim301/lectures/wind/wind-uv
    wind_speed = np.nansum([u_ws**2, v_ws**2], axis=0)**(1/2.)

    # math degree
    wind_direction = np.rad2deg(np.arctan2(v_ws, u_ws+1e-6))
    wind_direction[wind_direction < 0] += 360

    # meteorological degree
    wind_direction = 270 - wind_direction
    wind_direction[wind_direction < 0] += 360

    return wind_speed, wind_direction


gj_train["wind_speed"], gj_train["wind_direction"] = uv_to_wsd(
    gj_train["wind_u_10m"], gj_train["wind_v_10m"])

gj_test["wind_speed"], gj_test["wind_direction"] = uv_to_wsd(
    gj_test["wind_u_10m"], gj_test["wind_v_10m"])


display(gj_train.head())
display(gj_test.head())

Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id,wind_speed,wind_direction
0,2020-01-02 00:00:00+09:00,387.640625,1.0,0.286911,10.428498,10.05958,97974.59375,91.796478,0.002686,269.46756,0.021201,-0.375756,7.353266,-2.640615,WTG01,7.813025,289.753601
1,2020-01-02 01:00:00+09:00,387.640625,1.0,0.286911,10.472921,10.044404,97970.132812,91.729774,0.002686,269.430847,0.020706,-0.371947,7.578446,-2.414076,WTG01,7.953654,287.669006
2,2020-01-02 02:00:00+09:00,387.640625,1.0,0.286911,10.682985,10.478634,97951.546875,92.788666,0.002686,269.37439,0.020556,-0.354397,7.654554,-2.259913,WTG01,7.981191,286.448578
3,2020-01-02 03:00:00+09:00,387.640625,1.0,0.286911,10.676681,10.090029,97908.96875,92.494576,0.002686,269.352112,0.021635,-0.328609,7.275844,-2.280371,WTG01,7.624827,287.401855
4,2020-01-02 04:00:00+09:00,387.640625,1.0,0.286911,10.079557,9.67262,97858.398438,88.986443,0.002686,269.413269,0.024231,-0.265124,6.911751,-1.920128,WTG01,7.173506,285.525635


Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id,wind_speed,wind_direction
0,2023-01-01 00:00:00+09:00,387.640625,1.0,0.286911,11.116076,10.788924,97858.4375,74.741974,0.002441,269.822052,0.053748,-0.066199,7.711938,-1.282177,WTG01,7.817798,279.439575
1,2023-01-01 01:00:00+09:00,387.640625,1.0,0.286911,10.945148,10.69934,97847.90625,73.057632,0.002263,269.79071,0.064034,-0.212118,7.628698,-1.50997,WTG01,7.776699,281.195984
2,2023-01-01 02:00:00+09:00,387.640625,1.0,0.286911,11.457981,10.870457,97863.867188,71.513168,0.002197,269.840515,0.071656,-0.213301,8.034372,-1.425219,WTG01,8.159803,280.059082
3,2023-01-01 03:00:00+09:00,387.640625,1.0,0.286911,11.583426,11.074321,97853.390625,73.938629,0.00223,269.616333,0.069788,-0.153121,7.834104,-0.680731,WTG01,7.863624,274.966156
4,2023-01-01 04:00:00+09:00,387.640625,1.0,0.286911,11.028671,10.432499,97808.242188,75.667435,0.00224,269.364868,0.068385,-0.084777,7.439806,-0.103689,WTG01,7.440528,270.798492


### 소재지표고를 활용하기 위해 merge

In [6]:
data_train = pd.merge(gj_train, meta[['소재지표고', 'turbine_id']], on = 'turbine_id')
data_test = pd.merge(gj_test, meta[['소재지표고', 'turbine_id']], on = 'turbine_id')

display(data_train.head())
display(data_test.head())

Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id,wind_speed,wind_direction,소재지표고
0,2020-01-02 00:00:00+09:00,387.640625,1.0,0.286911,10.428498,10.05958,97974.59375,91.796478,0.002686,269.46756,0.021201,-0.375756,7.353266,-2.640615,WTG01,7.813025,289.753601,492.0
1,2020-01-02 01:00:00+09:00,387.640625,1.0,0.286911,10.472921,10.044404,97970.132812,91.729774,0.002686,269.430847,0.020706,-0.371947,7.578446,-2.414076,WTG01,7.953654,287.669006,492.0
2,2020-01-02 02:00:00+09:00,387.640625,1.0,0.286911,10.682985,10.478634,97951.546875,92.788666,0.002686,269.37439,0.020556,-0.354397,7.654554,-2.259913,WTG01,7.981191,286.448578,492.0
3,2020-01-02 03:00:00+09:00,387.640625,1.0,0.286911,10.676681,10.090029,97908.96875,92.494576,0.002686,269.352112,0.021635,-0.328609,7.275844,-2.280371,WTG01,7.624827,287.401855,492.0
4,2020-01-02 04:00:00+09:00,387.640625,1.0,0.286911,10.079557,9.67262,97858.398438,88.986443,0.002686,269.413269,0.024231,-0.265124,6.911751,-1.920128,WTG01,7.173506,285.525635,492.0


Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id,wind_speed,wind_direction,소재지표고
0,2023-01-01 00:00:00+09:00,387.640625,1.0,0.286911,11.116076,10.788924,97858.4375,74.741974,0.002441,269.822052,0.053748,-0.066199,7.711938,-1.282177,WTG01,7.817798,279.439575,492.0
1,2023-01-01 01:00:00+09:00,387.640625,1.0,0.286911,10.945148,10.69934,97847.90625,73.057632,0.002263,269.79071,0.064034,-0.212118,7.628698,-1.50997,WTG01,7.776699,281.195984,492.0
2,2023-01-01 02:00:00+09:00,387.640625,1.0,0.286911,11.457981,10.870457,97863.867188,71.513168,0.002197,269.840515,0.071656,-0.213301,8.034372,-1.425219,WTG01,8.159803,280.059082,492.0
3,2023-01-01 03:00:00+09:00,387.640625,1.0,0.286911,11.583426,11.074321,97853.390625,73.938629,0.00223,269.616333,0.069788,-0.153121,7.834104,-0.680731,WTG01,7.863624,274.966156,492.0
4,2023-01-01 04:00:00+09:00,387.640625,1.0,0.286911,11.028671,10.432499,97808.242188,75.667435,0.00224,269.364868,0.068385,-0.084777,7.439806,-0.103689,WTG01,7.440528,270.798492,492.0


In [7]:
### 100 + meta['소재지표고'] 값을 만듦 height변수로 만들기.

data_train['height'] = 100 + data_train['소재지표고']
data_train.drop(['소재지표고'], axis = 1, inplace = True)

data_test['height'] = 100 + data_test['소재지표고']
data_test.drop(['소재지표고'], axis = 1, inplace = True)

In [8]:
gj_train = data_train.copy()
gj_test = data_test.copy()

In [9]:
# dt가 인덱스로 설정되어 있어야 아래 logarithmic으로 변경가능
gj_train.set_index('dt', inplace = True)
gj_test.set_index('dt', inplace = True)

#### logarithmic_profile 함수로 height(100 + 소재지표고)에 따라 풍속 보정하기

In [10]:
gj_train_x = gj_train[['pressure', 'relative_humid', 'specific_humid', 'temp_air', 'wind_speed', 'wind_direction', 'height', 'surf_rough', 'turbine_id']]
# 터빈 허브 높이의 풍속을 log-wall wind profile을 적용해 산출 (경주풍력의 터빈 지표고도 = 100m, 메타데이터 참고)
x_windspeed_100m = gj_train_x.groupby("turbine_id").apply(
    lambda x:logarithmic_profile(x.wind_speed, 10, x.height, x.surf_rough)
).T.reset_index().melt(
    value_vars = gj_train_x.turbine_id.unique().tolist(), id_vars="dt", value_name="wind_speed_height"
)
gj_train_x = pd.merge(gj_train_x.reset_index(), x_windspeed_100m, on=["dt", "turbine_id"])
gj_train_x.drop(['surf_rough', 'wind_speed'], axis = 1, inplace = True)

########################################################################################################

gj_test_x = gj_test[['pressure', 'relative_humid', 'specific_humid', 'temp_air', 'wind_speed', 'wind_direction', 'height', 'surf_rough', 'turbine_id']]
# 터빈 허브 높이의 풍속을 log-wall wind profile을 적용해 산출 (경주풍력의 터빈 지표고도 = 100m, 메타데이터 참고)
x_windspeed_100m = gj_test_x.groupby("turbine_id").apply(
    lambda x:logarithmic_profile(x.wind_speed, 10, x.height, x.surf_rough)
).T.reset_index().melt(
    value_vars = gj_test_x.turbine_id.unique().tolist(), id_vars="dt", value_name="wind_speed_height"
)
gj_test_x = pd.merge(gj_test_x.reset_index(), x_windspeed_100m, on=["dt", "turbine_id"])
gj_test_x.drop(['surf_rough', 'wind_speed'], axis = 1, inplace = True)

display(gj_train_x.head())
display(gj_test_x.head())


  x_windspeed_100m = gj_train_x.groupby("turbine_id").apply(
  x_windspeed_100m = gj_test_x.groupby("turbine_id").apply(


Unnamed: 0,dt,pressure,relative_humid,specific_humid,temp_air,wind_direction,height,turbine_id,wind_speed_height
0,2020-01-02 00:00:00+09:00,97974.59375,91.796478,0.002686,269.46756,289.753601,592.0,WTG01,16.791576
1,2020-01-02 01:00:00+09:00,97970.132812,91.729774,0.002686,269.430847,287.669006,592.0,WTG01,17.093811
2,2020-01-02 02:00:00+09:00,97951.546875,92.788666,0.002686,269.37439,286.448578,592.0,WTG01,17.152994
3,2020-01-02 03:00:00+09:00,97908.96875,92.494576,0.002686,269.352112,287.401855,592.0,WTG01,16.387105
4,2020-01-02 04:00:00+09:00,97858.398438,88.986443,0.002686,269.413269,285.525635,592.0,WTG01,15.417136


Unnamed: 0,dt,pressure,relative_humid,specific_humid,temp_air,wind_direction,height,turbine_id,wind_speed_height
0,2023-01-01 00:00:00+09:00,97858.4375,74.741974,0.002441,269.822052,279.439575,592.0,WTG01,16.801833
1,2023-01-01 01:00:00+09:00,97847.90625,73.057632,0.002263,269.79071,281.195984,592.0,WTG01,16.713504
2,2023-01-01 02:00:00+09:00,97863.867188,71.513168,0.002197,269.840515,280.059082,592.0,WTG01,17.536863
3,2023-01-01 03:00:00+09:00,97853.390625,73.938629,0.00223,269.616333,274.966156,592.0,WTG01,16.90032
4,2023-01-01 04:00:00+09:00,97808.242188,75.667435,0.00224,269.364868,270.798492,592.0,WTG01,15.991014


In [11]:
feature = ['pressure', 'relative_humid', 'specific_humid', 'temp_air', 'wind_direction', 'height', 'turbine_id', 'wind_speed_height']
temp_train = gj_train_x.copy()
gj_train_feature = temp_train.pivot(index = 'dt', columns = 'turbine_id', values = feature).reset_index()
gj_train_feature.columns = [' '.join(col).strip() for col in gj_train_feature.columns.values]

temp_test = gj_test_x.copy()
gj_test_feature = temp_test.pivot(index = 'dt', columns = 'turbine_id', values = feature).reset_index()
gj_test_feature.columns = [' '.join(col).strip() for col in gj_test_feature.columns.values]

In [12]:
gj_train_feature.drop(['turbine_id WTG01', 'turbine_id WTG02', 'turbine_id WTG03', 'turbine_id WTG04',
                 'turbine_id WTG05', 'turbine_id WTG06', 'turbine_id WTG07', 'turbine_id WTG08', 'turbine_id WTG09'],
                 axis = 1, inplace = True)
gj_test_feature.drop(['turbine_id WTG01', 'turbine_id WTG02', 'turbine_id WTG03', 'turbine_id WTG04',
                 'turbine_id WTG05', 'turbine_id WTG06', 'turbine_id WTG07', 'turbine_id WTG08', 'turbine_id WTG09'],
                 axis = 1, inplace = True)

print(gj_train_feature.shape)
print(gj_test_feature.shape)

(26202, 64)
(8760, 64)


In [13]:
gj_train_feature.columns
gj_test_feature.columns

Index(['dt', 'pressure WTG01', 'pressure WTG02', 'pressure WTG03',
       'pressure WTG04', 'pressure WTG05', 'pressure WTG06', 'pressure WTG07',
       'pressure WTG08', 'pressure WTG09', 'relative_humid WTG01',
       'relative_humid WTG02', 'relative_humid WTG03', 'relative_humid WTG04',
       'relative_humid WTG05', 'relative_humid WTG06', 'relative_humid WTG07',
       'relative_humid WTG08', 'relative_humid WTG09', 'specific_humid WTG01',
       'specific_humid WTG02', 'specific_humid WTG03', 'specific_humid WTG04',
       'specific_humid WTG05', 'specific_humid WTG06', 'specific_humid WTG07',
       'specific_humid WTG08', 'specific_humid WTG09', 'temp_air WTG01',
       'temp_air WTG02', 'temp_air WTG03', 'temp_air WTG04', 'temp_air WTG05',
       'temp_air WTG06', 'temp_air WTG07', 'temp_air WTG08', 'temp_air WTG09',
       'wind_direction WTG01', 'wind_direction WTG02', 'wind_direction WTG03',
       'wind_direction WTG04', 'wind_direction WTG05', 'wind_direction WTG06',
   

### train_y 불러오기

In [14]:
train_y = pd.read_csv('train_y.csv').rename(columns = {'end_datetime': 'dt'})
train_y['dt'] = pd.to_datetime(train_y['dt']).dt.tz_convert('Asia/Seoul')
train_y.head()

Unnamed: 0,plant_name,dt,period_hours,energy_kwh
0,경주풍력,2020-01-01 01:00:00+09:00,1,9767.578125
1,경주풍력,2020-01-01 02:00:00+09:00,1,5381.835938
2,경주풍력,2020-01-01 03:00:00+09:00,1,3021.484375
3,경주풍력,2020-01-01 04:00:00+09:00,1,4400.390625
4,경주풍력,2020-01-01 05:00:00+09:00,1,4501.953125


In [15]:
gj_y = train_y.loc[train_y['plant_name'] == '경주풍력', ['dt', 'energy_kwh']]
gj_y

Unnamed: 0,dt,energy_kwh
0,2020-01-01 01:00:00+09:00,9767.578125
1,2020-01-01 02:00:00+09:00,5381.835938
2,2020-01-01 03:00:00+09:00,3021.484375
3,2020-01-01 04:00:00+09:00,4400.390625
4,2020-01-01 05:00:00+09:00,4501.953125
...,...,...
26299,2022-12-31 20:00:00+09:00,18394.531250
26300,2022-12-31 21:00:00+09:00,18443.359375
26301,2022-12-31 22:00:00+09:00,18525.390625
26302,2022-12-31 23:00:00+09:00,18529.296875


In [16]:
# merge

gj_train_data = pd.merge(gj_train_feature, gj_y, on = 'dt', how = 'inner')
gj_train_data.head()

Unnamed: 0,dt,pressure WTG01,pressure WTG02,pressure WTG03,pressure WTG04,pressure WTG05,pressure WTG06,pressure WTG07,pressure WTG08,pressure WTG09,...,wind_speed_height WTG01,wind_speed_height WTG02,wind_speed_height WTG03,wind_speed_height WTG04,wind_speed_height WTG05,wind_speed_height WTG06,wind_speed_height WTG07,wind_speed_height WTG08,wind_speed_height WTG09,energy_kwh
0,2020-01-02 00:00:00+09:00,97974.59375,99067.59375,99190.34375,99190.34375,99190.34375,99190.34375,99190.34375,99206.46875,99206.46875,...,16.791576,10.283708,10.295996,10.273541,10.355569,10.455692,10.56054,10.252203,10.259615,17827.148438
1,2020-01-02 01:00:00+09:00,97970.132812,99066.257812,99190.007812,99190.007812,99190.007812,99190.007812,99190.007812,99207.257812,99207.257812,...,17.093811,10.158219,9.86824,9.846718,9.925339,10.021302,10.121794,9.662699,9.669685,17616.210938
2,2020-01-02 02:00:00+09:00,97951.546875,99049.546875,99173.671875,99173.671875,99173.671875,99173.671875,99173.671875,99191.296875,99191.296875,...,17.152994,10.135354,9.745689,9.724434,9.802078,9.896849,9.996094,9.455277,9.462113,17904.296875
3,2020-01-02 03:00:00+09:00,97908.96875,99004.09375,99128.71875,99128.71875,99128.71875,99128.71875,99128.71875,99146.71875,99146.71875,...,16.387105,9.674766,9.03227,9.012571,9.084532,9.172365,9.264345,8.609753,8.615978,17676.757812
4,2020-01-02 04:00:00+09:00,97858.398438,98949.898438,99075.523438,99075.523438,99075.523438,99075.523438,99075.523438,99094.148438,99094.148438,...,15.417136,8.023964,7.382783,7.366681,7.4255,7.497293,7.572475,7.178662,7.183853,18519.53125


In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error


# Extract datetime features (hour of day, day of week, etc.)
gj_train_data['hour'] = gj_train_data['dt'].dt.hour
gj_train_data['day_of_week'] = gj_train_data['dt'].dt.dayofweek
gj_train_data['month'] = gj_train_data['dt'].dt.month

# Lag features: creating lagged versions of the features (1-hour lag example)
cols = gj_train_data.columns.to_list()[1:]

for col in cols:
    gj_train_data[col] = gj_train_data[col].shift(24)

# Drop rows with NaN values introduced by lagging
gj_train_data = gj_train_data.dropna()

# Split data into features (X) and target (y)
X = gj_train_data.drop(['dt', 'energy_kwh'], axis = 1)
y = gj_train_data['energy_kwh']

################################################################################
# Extract datetime features (hour of day, day of week, etc.)
gj_test_feature['hour'] = gj_test_feature['dt'].dt.hour
gj_test_feature['day_of_week'] = gj_test_feature['dt'].dt.dayofweek
gj_test_feature['month'] = gj_test_feature['dt'].dt.month

# Lag features: creating lagged versions of the features (1-hour lag example)
cols = gj_test_feature.columns.to_list()[1:]

for col in cols:
    gj_test_feature[col] = gj_test_feature[col].shift(24)

# Drop rows with NaN values introduced by lagging
# gj_test_feature = gj_test_feature.dropna()
gj_test_feature.drop(['dt'], axis = 1, inplace = True)


# train data x, y로 나누기
# Split data into features (X) and target (y)
X = gj_train_data.drop(['dt', 'energy_kwh'], axis = 1)
y = gj_train_data['energy_kwh']

# 성능평가
# Train-test split
# train_size = int(0.8 * len(gj_data))  # 80% for training, 20% for testing
# X_train, X_test = X[:train_size], X[train_size:]
# y_train, y_test = y[:train_size], y[train_size:]

# # Create and train the Random Forest model
# rf = RandomForestRegressor(n_estimators=100, random_state=42)
# rf.fit(X_train, y_train)

# # Predict on the test set
# y_pred_rf = rf.predict(X_test)

# # Evaluate the model
# mape = mean_absolute_percentage_error(y_test, y_pred_rf)
# print(f"Mean Absolute Percentage Error: {mape}")

# # You can also visualize the predictions vs actual power production
# import matplotlib.pyplot as plt

# plt.plot(y_test.index, y_test, label='Actual')
# plt.plot(y_test.index, y_pred_rf, label='Predicted')
# plt.xlabel('Time')
# plt.ylabel('Power Production')
# plt.legend()
# plt.show()


In [18]:
# 예측하기
# Create and train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Predict on the test set
y_pred_rf = rf.predict(gj_test_feature)
y_pred_rf

array([1960.80078125, 1960.80078125, 1960.80078125, ..., 3805.65429688,
       4763.25195312, 4873.84765625])

In [19]:
len(y_pred_rf)

8760

In [22]:
pred_value = pd.DataFrame({'y_pred_rf': y_pred_rf})
pred_value

Unnamed: 0,y_pred_rf
0,1960.800781
1,1960.800781
2,1960.800781
3,1960.800781
4,1960.800781
...,...
8755,1177.031250
8756,3226.396484
8757,3805.654297
8758,4763.251953


### 경주풍력단지 submission 파일에 합치기

In [20]:
submission = pd.read_csv('2. submission_format.csv', encoding = 'cp949')
submission.head()


Unnamed: 0,plant_name,end_datetime,period_hours,energy_kwh
0,경주풍력,2023-01-01T01:00:00+09:00,1,0
1,경주풍력,2023-01-01T02:00:00+09:00,1,0
2,경주풍력,2023-01-01T03:00:00+09:00,1,0
3,경주풍력,2023-01-01T04:00:00+09:00,1,0
4,경주풍력,2023-01-01T05:00:00+09:00,1,0


In [24]:
submission.loc[pred_value.index, 'energy_kwh'] = pred_value['y_pred_rf']
submission

 4873.84765625]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  submission.loc[pred_value.index, 'energy_kwh'] = pred_value['y_pred_rf']


Unnamed: 0,plant_name,end_datetime,period_hours,energy_kwh
0,경주풍력,2023-01-01T01:00:00+09:00,1,1960.800781
1,경주풍력,2023-01-01T02:00:00+09:00,1,1960.800781
2,경주풍력,2023-01-01T03:00:00+09:00,1,1960.800781
3,경주풍력,2023-01-01T04:00:00+09:00,1,1960.800781
4,경주풍력,2023-01-01T05:00:00+09:00,1,1960.800781
...,...,...,...,...
17515,영광풍력,2023-12-31T20:00:00+09:00,1,0.000000
17516,영광풍력,2023-12-31T21:00:00+09:00,1,0.000000
17517,영광풍력,2023-12-31T22:00:00+09:00,1,0.000000
17518,영광풍력,2023-12-31T23:00:00+09:00,1,0.000000


## 영광풍력

In [26]:
### 데이터 불러오기
yg_train = pd.read_parquet('train_ldaps_yeonggwang.parquet')
yg_test = pd.read_parquet('test_ldaps_yeonggwang.parquet')
meta = pd.read_excel('windfarm_metadata.xlsx', sheet_name = 1, header = 1, index_col = None)
meta.rename(columns = {'발전기 번호': 'turbine_id','정격 출력 [kW]': '정격출력', '소재지표고(지표) [m]': '소재지표고', '허브 높이(지표) [m]': '허브높이', '로터 반경 [m]': '로터반경'}, inplace = True)
meta = meta.iloc[:, 1:11]

In [27]:
### 데이터 확인
display(yg_train.head())
display(yg_test.head())
display(meta.head())
print(yg_train.shape)
print(yg_test.shape)

Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id
0,2020-01-02 00:00:00+09:00,0.15625,0.021042,0.004259,3.177766,2.558847,103028.84375,61.177334,0.003413,280.260773,0.028556,-0.041619,2.841913,-4.775259,WTG01
1,2020-01-02 01:00:00+09:00,0.15625,0.021042,0.00425,3.907736,3.248139,103004.507812,61.698036,0.003413,280.260681,0.04213,-0.044463,3.091386,-3.75868,WTG01
2,2020-01-02 02:00:00+09:00,0.15625,0.021042,0.004248,3.552126,3.160641,102999.546875,62.101166,0.003418,280.268921,0.034533,-0.028805,3.005262,-3.511989,WTG01
3,2020-01-02 03:00:00+09:00,0.15625,0.021042,0.004248,3.26921,2.882998,102988.96875,61.944283,0.003418,280.276672,0.025693,-0.028163,2.610194,-3.798681,WTG01
4,2020-01-02 04:00:00+09:00,0.15625,0.021042,0.004248,3.065275,2.887952,102937.523438,61.080189,0.003418,280.266052,0.036102,-0.031177,2.678841,-3.966148,WTG01


Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id
0,2023-01-01 00:00:00+09:00,0.15625,0.021042,0.004261,5.453722,4.858016,102968.8125,53.894802,0.003174,280.996857,0.110175,-0.0297,4.781274,-3.15974,WTG01
1,2023-01-01 01:00:00+09:00,0.15625,0.021042,0.004269,6.164143,5.472655,102967.40625,52.638199,0.003174,281.004822,0.127694,-0.032369,5.404456,-3.56954,WTG01
2,2023-01-01 02:00:00+09:00,0.15625,0.021042,0.004278,6.524754,5.98325,102989.492188,51.950668,0.003169,281.012634,0.10373,-0.042067,5.573801,-3.875658,WTG01
3,2023-01-01 03:00:00+09:00,0.15625,0.021042,0.004284,6.66741,6.047954,102987.890625,51.815098,0.003169,281.012573,0.058741,-0.049514,5.666257,-4.311956,WTG01
4,2023-01-01 04:00:00+09:00,0.15625,0.021042,0.004286,7.081893,5.935673,102947.242188,53.145462,0.003169,281.015503,0.101192,-0.057647,5.362169,-4.881277,WTG01


Unnamed: 0,발전단지,turbine_id,제작사 turbine ID,터빈 모델,정격출력,소재지표고,허브높이,로터반경,위도,경도
0,경주풍력,WTG01,UNISON,U113,2300,492.0,100,56.5,35.724089,129.374592
1,경주풍력,WTG02,UNISON,U113,2300,456.1,100,56.5,35.722233,129.3724
2,경주풍력,WTG03,UNISON,U113,2300,476.5,100,56.5,35.721336,129.37015
3,경주풍력,WTG04,UNISON,U113,2300,467.1,100,56.5,35.719208,129.368869
4,경주풍력,WTG05,UNISON,U113,2300,502.2,100,56.5,35.716156,129.367767


(917070, 15)
(306600, 15)


In [28]:
## u, v벡터로 풍속, 풍향 구하기

def uv_to_wsd(u_wind_speed, v_wind_speed):
    """ 
        Convert u, v vector to wind speed and direction.
    """
    u_ws = u_wind_speed.to_numpy()
    v_ws = v_wind_speed.to_numpy()

    # NOTE: http://colaweb.gmu.edu/dev/clim301/lectures/wind/wind-uv
    wind_speed = np.nansum([u_ws**2, v_ws**2], axis=0)**(1/2.)

    # math degree
    wind_direction = np.rad2deg(np.arctan2(v_ws, u_ws+1e-6))
    wind_direction[wind_direction < 0] += 360

    # meteorological degree
    wind_direction = 270 - wind_direction
    wind_direction[wind_direction < 0] += 360

    return wind_speed, wind_direction


yg_train["wind_speed"], yg_train["wind_direction"] = uv_to_wsd(
    yg_train["wind_u_10m"], yg_train["wind_v_10m"])

yg_test["wind_speed"], yg_test["wind_direction"] = uv_to_wsd(
    yg_test["wind_u_10m"], yg_test["wind_v_10m"])


display(yg_train.head())
display(yg_test.head())

Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id,wind_speed,wind_direction
0,2020-01-02 00:00:00+09:00,0.15625,0.021042,0.004259,3.177766,2.558847,103028.84375,61.177334,0.003413,280.260773,0.028556,-0.041619,2.841913,-4.775259,WTG01,5.556939,329.24173
1,2020-01-02 01:00:00+09:00,0.15625,0.021042,0.00425,3.907736,3.248139,103004.507812,61.698036,0.003413,280.260681,0.04213,-0.044463,3.091386,-3.75868,WTG01,4.866656,320.563873
2,2020-01-02 02:00:00+09:00,0.15625,0.021042,0.004248,3.552126,3.160641,102999.546875,62.101166,0.003418,280.268921,0.034533,-0.028805,3.005262,-3.511989,WTG01,4.622301,319.445862
3,2020-01-02 03:00:00+09:00,0.15625,0.021042,0.004248,3.26921,2.882998,102988.96875,61.944283,0.003418,280.276672,0.025693,-0.028163,2.610194,-3.798681,WTG01,4.609023,325.505798
4,2020-01-02 04:00:00+09:00,0.15625,0.021042,0.004248,3.065275,2.887952,102937.523438,61.080189,0.003418,280.266052,0.036102,-0.031177,2.678841,-3.966148,WTG01,4.786076,325.963867


Unnamed: 0,dt,elevation,land_cover,surf_rough,frictional_vmax_50m,frictional_vmin_50m,pressure,relative_humid,specific_humid,temp_air,storm_u_5m,storm_v_5m,wind_u_10m,wind_v_10m,turbine_id,wind_speed,wind_direction
0,2023-01-01 00:00:00+09:00,0.15625,0.021042,0.004261,5.453722,4.858016,102968.8125,53.894802,0.003174,280.996857,0.110175,-0.0297,4.781274,-3.15974,WTG01,5.731016,303.459015
1,2023-01-01 01:00:00+09:00,0.15625,0.021042,0.004269,6.164143,5.472655,102967.40625,52.638199,0.003174,281.004822,0.127694,-0.032369,5.404456,-3.56954,WTG01,6.476863,303.444
2,2023-01-01 02:00:00+09:00,0.15625,0.021042,0.004278,6.524754,5.98325,102989.492188,51.950668,0.003169,281.012634,0.10373,-0.042067,5.573801,-3.875658,WTG01,6.788813,304.812225
3,2023-01-01 03:00:00+09:00,0.15625,0.021042,0.004284,6.66741,6.047954,102987.890625,51.815098,0.003169,281.012573,0.058741,-0.049514,5.666257,-4.311956,WTG01,7.120354,307.270721
4,2023-01-01 04:00:00+09:00,0.15625,0.021042,0.004286,7.081893,5.935673,102947.242188,53.145462,0.003169,281.015503,0.101192,-0.057647,5.362169,-4.881277,WTG01,7.251187,312.312134


In [29]:
# dt가 인덱스로 설정되어 있어야 아래 logarithmic으로 변경가능
yg_train.set_index('dt', inplace = True)
yg_test.set_index('dt', inplace = True)

In [34]:
#### logarithmic_profile 함수로 height(100)에 따라 풍속 보정하기
yg_train_x = yg_train[['pressure', 'relative_humid', 'specific_humid', 'temp_air', 'wind_speed', 'wind_direction', 'surf_rough', 'turbine_id']]
# 터빈 허브 높이의 풍속을 log-wall wind profile을 적용해 산출 (경주풍력의 터빈 지표고도 = 100m, 메타데이터 참고)
x_windspeed_100m = yg_train_x.groupby("turbine_id").apply(
    lambda x:logarithmic_profile(x.wind_speed, 10, 100, x.surf_rough)
).T.reset_index().melt(
    value_vars = yg_train_x.turbine_id.unique().tolist(), id_vars="dt", value_name="wind_speed_height"
)
yg_train_x = pd.merge(yg_train_x.reset_index(), x_windspeed_100m, on=["dt", "turbine_id"])
yg_train_x.drop(['surf_rough', 'wind_speed'], axis = 1, inplace = True)

########################################################################################################

yg_test_x = yg_test[['pressure', 'relative_humid', 'specific_humid', 'temp_air', 'wind_speed', 'wind_direction', 'surf_rough', 'turbine_id']]
# 터빈 허브 높이의 풍속을 log-wall wind profile을 적용해 산출 (경주풍력의 터빈 지표고도 = 100m, 메타데이터 참고)
x_windspeed_100m = yg_test_x.groupby("turbine_id").apply(
    lambda x:logarithmic_profile(x.wind_speed, 10, 100, x.surf_rough)
).T.reset_index().melt(
    value_vars = yg_test_x.turbine_id.unique().tolist(), id_vars="dt", value_name="wind_speed_height"
)
yg_test_x = pd.merge(yg_test_x.reset_index(), x_windspeed_100m, on=["dt", "turbine_id"])
yg_test_x.drop(['surf_rough', 'wind_speed'], axis = 1, inplace = True)

display(yg_train_x.head())
display(yg_test_x.head())

  x_windspeed_100m = yg_train_x.groupby("turbine_id").apply(
  x_windspeed_100m = yg_test_x.groupby("turbine_id").apply(


Unnamed: 0,dt,pressure,relative_humid,specific_humid,temp_air,wind_direction,turbine_id,wind_speed_height
0,2020-01-02 00:00:00+09:00,103028.84375,61.177334,0.003413,280.260773,329.24173,WTG01,7.205547
1,2020-01-02 01:00:00+09:00,103004.507812,61.698036,0.003413,280.260681,320.563873,WTG01,6.310057
2,2020-01-02 02:00:00+09:00,102999.546875,62.101166,0.003418,280.268921,319.445862,WTG01,5.993149
3,2020-01-02 03:00:00+09:00,102988.96875,61.944283,0.003418,280.276672,325.505798,WTG01,5.975933
4,2020-01-02 04:00:00+09:00,102937.523438,61.080189,0.003418,280.266052,325.963867,WTG01,6.205494


Unnamed: 0,dt,pressure,relative_humid,specific_humid,temp_air,wind_direction,turbine_id,wind_speed_height
0,2023-01-01 00:00:00+09:00,102968.8125,53.894802,0.003174,280.996857,303.459015,WTG01,7.431366
1,2023-01-01 01:00:00+09:00,102967.40625,52.638199,0.003174,281.004822,303.444,WTG01,8.398944
2,2023-01-01 02:00:00+09:00,102989.492188,51.950668,0.003169,281.012634,304.812225,WTG01,8.804049
3,2023-01-01 03:00:00+09:00,102987.890625,51.815098,0.003169,281.012573,307.270721,WTG01,9.234368
4,2023-01-01 04:00:00+09:00,102947.242188,53.145462,0.003169,281.015503,312.312134,WTG01,9.40417


In [42]:
feature = ['pressure', 'relative_humid', 'specific_humid', 'temp_air', 'wind_direction', 'turbine_id', 'wind_speed_height']
temp_train = yg_train_x.copy()
yg_train_feature = temp_train.pivot(index = 'dt', columns = 'turbine_id', values = feature).reset_index()
yg_train_feature.columns = [' '.join(col).strip() for col in yg_train_feature.columns.values]

temp_test = yg_test_x.copy()
yg_test_feature = temp_test.pivot(index = 'dt', columns = 'turbine_id', values = feature).reset_index()
yg_test_feature.columns = [' '.join(col).strip() for col in yg_test_feature.columns.values]


yg_train_feature.drop(['turbine_id WTG01', 'turbine_id WTG02', 'turbine_id WTG03',
       'turbine_id WTG04', 'turbine_id WTG05', 'turbine_id WTG06',
       'turbine_id WTG07', 'turbine_id WTG08', 'turbine_id WTG09',
       'turbine_id WTG10', 'turbine_id WTG11', 'turbine_id WTG12',
       'turbine_id WTG13', 'turbine_id WTG14', 'turbine_id WTG15',
       'turbine_id WTG16', 'turbine_id WTG17', 'turbine_id WTG18',
       'turbine_id WTG19', 'turbine_id WTG20', 'turbine_id WTG21',
       'turbine_id WTG22', 'turbine_id WTG23', 'turbine_id WTG24',
       'turbine_id WTG25', 'turbine_id WTG26', 'turbine_id WTG27',
       'turbine_id WTG28', 'turbine_id WTG29', 'turbine_id WTG30',
       'turbine_id WTG31', 'turbine_id WTG32', 'turbine_id WTG33',
       'turbine_id WTG34', 'turbine_id WTG35'],axis = 1, inplace = True)
yg_test_feature.drop(['turbine_id WTG01', 'turbine_id WTG02', 'turbine_id WTG03',
       'turbine_id WTG04', 'turbine_id WTG05', 'turbine_id WTG06',
       'turbine_id WTG07', 'turbine_id WTG08', 'turbine_id WTG09',
       'turbine_id WTG10', 'turbine_id WTG11', 'turbine_id WTG12',
       'turbine_id WTG13', 'turbine_id WTG14', 'turbine_id WTG15',
       'turbine_id WTG16', 'turbine_id WTG17', 'turbine_id WTG18',
       'turbine_id WTG19', 'turbine_id WTG20', 'turbine_id WTG21',
       'turbine_id WTG22', 'turbine_id WTG23', 'turbine_id WTG24',
       'turbine_id WTG25', 'turbine_id WTG26', 'turbine_id WTG27',
       'turbine_id WTG28', 'turbine_id WTG29', 'turbine_id WTG30',
       'turbine_id WTG31', 'turbine_id WTG32', 'turbine_id WTG33',
       'turbine_id WTG34', 'turbine_id WTG35'],axis = 1, inplace = True)

print(yg_train_feature.shape)
print(yg_test_feature.shape)
yg_train_feature.columns
yg_test_feature.columns

(26202, 211)
(8760, 211)


Index(['dt', 'pressure WTG01', 'pressure WTG02', 'pressure WTG03',
       'pressure WTG04', 'pressure WTG05', 'pressure WTG06', 'pressure WTG07',
       'pressure WTG08', 'pressure WTG09',
       ...
       'wind_speed_height WTG26', 'wind_speed_height WTG27',
       'wind_speed_height WTG28', 'wind_speed_height WTG29',
       'wind_speed_height WTG30', 'wind_speed_height WTG31',
       'wind_speed_height WTG32', 'wind_speed_height WTG33',
       'wind_speed_height WTG34', 'wind_speed_height WTG35'],
      dtype='object', length=211)

In [43]:
### train_y 불러오기
train_y = pd.read_csv('train_y.csv').rename(columns = {'end_datetime': 'dt'})
train_y['dt'] = pd.to_datetime(train_y['dt']).dt.tz_convert('Asia/Seoul')
train_y.head()
yg_y = train_y.loc[train_y['plant_name'] == '영광풍력', ['dt', 'energy_kwh']]
yg_y

Unnamed: 0,dt,energy_kwh
26304,2020-01-01 01:00:00+09:00,15010.949
26305,2020-01-01 02:00:00+09:00,21870.458
26306,2020-01-01 03:00:00+09:00,13869.402
26307,2020-01-01 04:00:00+09:00,6434.200
26308,2020-01-01 05:00:00+09:00,6702.713
...,...,...
52603,2022-12-31 20:00:00+09:00,1708.529
52604,2022-12-31 21:00:00+09:00,67.645
52605,2022-12-31 22:00:00+09:00,0.000
52606,2022-12-31 23:00:00+09:00,0.000


In [44]:
# merge

yg_train_data = pd.merge(yg_train_feature, yg_y, on = 'dt', how = 'inner')
yg_train_data.head()

Unnamed: 0,dt,pressure WTG01,pressure WTG02,pressure WTG03,pressure WTG04,pressure WTG05,pressure WTG06,pressure WTG07,pressure WTG08,pressure WTG09,...,wind_speed_height WTG27,wind_speed_height WTG28,wind_speed_height WTG29,wind_speed_height WTG30,wind_speed_height WTG31,wind_speed_height WTG32,wind_speed_height WTG33,wind_speed_height WTG34,wind_speed_height WTG35,energy_kwh
0,2020-01-02 00:00:00+09:00,103028.84375,103030.59375,103030.59375,103030.59375,103030.59375,103030.59375,103030.59375,103030.59375,103030.59375,...,7.407754,7.407754,7.407754,4.367079,4.367079,7.298501,2.010578,2.010578,2.010578,0.0
1,2020-01-02 01:00:00+09:00,103004.507812,103005.382812,103005.382812,103003.757812,103003.757812,103003.757812,103003.757812,103004.507812,103004.507812,...,6.418415,6.418415,6.418415,5.369586,5.369586,5.586196,4.949708,4.949708,4.949708,40.931
2,2020-01-02 02:00:00+09:00,102999.546875,103000.796875,103000.796875,102999.671875,102999.671875,102999.671875,102999.671875,103000.171875,103000.171875,...,6.285058,6.285058,6.285058,4.444695,4.444695,5.434196,3.768684,3.768684,3.768684,185.493
3,2020-01-02 03:00:00+09:00,102988.96875,102990.96875,102990.96875,102990.96875,102990.96875,102990.96875,102990.96875,102990.96875,102990.96875,...,6.252741,6.252741,6.252741,4.852137,4.852137,5.360041,4.261261,4.261261,4.261261,16.287
4,2020-01-02 04:00:00+09:00,102937.523438,102939.523438,102939.523438,102939.773438,102939.773438,102939.773438,102939.773438,102939.648438,102939.648438,...,6.520255,6.520255,6.520255,5.256749,5.256749,5.541148,4.599045,4.599045,4.599045,0.0


In [45]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error


# Extract datetime features (hour of day, day of week, etc.)
yg_train_data['hour'] = yg_train_data['dt'].dt.hour
yg_train_data['day_of_week'] = yg_train_data['dt'].dt.dayofweek
yg_train_data['month'] = yg_train_data['dt'].dt.month

# Lag features: creating lagged versions of the features (1-hour lag example)
cols = yg_train_data.columns.to_list()[1:]

for col in cols:
    yg_train_data[col] = yg_train_data[col].shift(24)

# Drop rows with NaN values introduced by lagging
yg_train_data = yg_train_data.dropna()

# Split data into features (X) and target (y)
X = yg_train_data.drop(['dt', 'energy_kwh'], axis = 1)
y = yg_train_data['energy_kwh']

################################################################################
# Extract datetime features (hour of day, day of week, etc.)
yg_test_feature['hour'] = yg_test_feature['dt'].dt.hour
yg_test_feature['day_of_week'] = yg_test_feature['dt'].dt.dayofweek
yg_test_feature['month'] = yg_test_feature['dt'].dt.month

# Lag features: creating lagged versions of the features (1-hour lag example)
cols = yg_test_feature.columns.to_list()[1:]

for col in cols:
    yg_test_feature[col] = yg_test_feature[col].shift(24)

# Drop rows with NaN values introduced by lagging
# gj_test_feature = gj_test_feature.dropna()
yg_test_feature.drop(['dt'], axis = 1, inplace = True)


# train data x, y로 나누기
# Split data into features (X) and target (y)
X = yg_train_data.drop(['dt', 'energy_kwh'], axis = 1)
y = yg_train_data['energy_kwh']

# 성능평가
# Train-test split
# train_size = int(0.8 * len(gj_data))  # 80% for training, 20% for testing
# X_train, X_test = X[:train_size], X[train_size:]
# y_train, y_test = y[:train_size], y[train_size:]

# # Create and train the Random Forest model
# rf = RandomForestRegressor(n_estimators=100, random_state=42)
# rf.fit(X_train, y_train)

# # Predict on the test set
# y_pred_rf = rf.predict(X_test)

# # Evaluate the model
# mape = mean_absolute_percentage_error(y_test, y_pred_rf)
# print(f"Mean Absolute Percentage Error: {mape}")

# # You can also visualize the predictions vs actual power production
# import matplotlib.pyplot as plt

# plt.plot(y_test.index, y_test, label='Actual')
# plt.plot(y_test.index, y_pred_rf, label='Predicted')
# plt.xlabel('Time')
# plt.ylabel('Power Production')
# plt.legend()
# plt.show()

In [46]:
# 예측하기
# Create and train the Random Forest model
rf_yg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_yg.fit(X, y)


In [47]:
# Predict on the test set
y_pred_rf_yg = rf_yg.predict(yg_test_feature)
y_pred_rf_yg


array([  115.64316,   115.64316,   115.64316, ..., 35372.46806,
       27873.61792, 26312.71126])

In [48]:
len(y_pred_rf)

8760

In [50]:
pred_value_yg = pd.DataFrame({'y_pred_rf_yg': y_pred_rf_yg})
pred_value_yg.index = range(8760, 8760 + len(pred_value_yg))

pred_value_yg.head()

Unnamed: 0,y_pred_rf_yg
8760,115.64316
8761,115.64316
8762,115.64316
8763,115.64316
8764,115.64316


In [52]:
submission.loc[pred_value_yg.index, 'energy_kwh'] = pred_value_yg['y_pred_rf_yg']
submission

Unnamed: 0,plant_name,end_datetime,period_hours,energy_kwh
0,경주풍력,2023-01-01T01:00:00+09:00,1,1960.800781
1,경주풍력,2023-01-01T02:00:00+09:00,1,1960.800781
2,경주풍력,2023-01-01T03:00:00+09:00,1,1960.800781
3,경주풍력,2023-01-01T04:00:00+09:00,1,1960.800781
4,경주풍력,2023-01-01T05:00:00+09:00,1,1960.800781
...,...,...,...,...
17515,영광풍력,2023-12-31T20:00:00+09:00,1,30656.068200
17516,영광풍력,2023-12-31T21:00:00+09:00,1,37208.675080
17517,영광풍력,2023-12-31T22:00:00+09:00,1,35372.468060
17518,영광풍력,2023-12-31T23:00:00+09:00,1,27873.617920


In [53]:
submission.to_csv('씽씽 불어라팀 1차 중간평가 제출(모델 2).csv', index = False)