In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


# 데이터 로드

In [7]:
import pandas as pd

# Single place that says where the competition CSVs live on the mounted Drive;
# change this one value to run the notebook against another data location.
DATA_DIR = '/content/drive/MyDrive/dacon_seoul/data'

# Historical daily observations used to fit the model.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
# Submission template: one row per date to predict, with a placeholder target.
submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [8]:
# Show the training frame; the rendered table elides the middle rows.
train_df

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
0,1960-01-01,2.2,-5.2,7.4,,68.3,1.7,6.7,,,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,,,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,,,4.0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,,,7.5
4,1960-01-05,1.3,-8.2,9.5,,44.0,5.1,8.2,,,-4.6
...,...,...,...,...,...,...,...,...,...,...,...
23006,2022-12-27,3.3,-7.3,10.6,,69.8,1.8,8.8,10.25,91.7,-2.6
23007,2022-12-28,0.1,-6.0,6.1,0.1,58.1,2.5,8.7,10.86,90.6,-3.3
23008,2022-12-29,2.1,-7.8,9.9,0.0,56.3,1.7,9.0,10.88,93.8,-2.9
23009,2022-12-30,2.3,-4.4,6.7,0.0,65.6,1.9,7.9,10.84,82.3,-1.8


- 일시: 과거 데이터들의 관측 날짜 (1960-01-01 ~ 2022-12-31)

- 최고기온, 최저기온, 일교차

- 강수량, 평균습도, 평균풍속

- 일조합, 일사합, 일조율

- 평균기온: 우리가 예측하고자 하는 대상

In [9]:
# Show the submission template; the rendered table elides the middle rows.
submission_df

Unnamed: 0,일시,평균기온
0,2023-01-01,0
1,2023-01-02,0
2,2023-01-03,0
3,2023-01-04,0
4,2023-01-05,0
...,...,...
353,2023-12-20,0
354,2023-12-21,0
355,2023-12-22,0
356,2023-12-23,0


- sample_submission.csv 파일에는 다음과 같은 컬럼이 포함되어 있음.

- 일시: 문제에서 정의된 미래에 대해 예측해야 할 날짜 (2023-01-01 ~ 2023-12-24)

- 평균기온: 이 값에 대한 예측을 생성

# 데이터 전처리

In [10]:
# Convert the date column to datetime and promote it to the index.
# The guard keeps this cell re-runnable: after the first execution '일시' is
# the index (no longer a column), so repeating the conversion would raise
# KeyError on train_df['일시'].
if train_df.index.name != '일시':
    train_df['일시'] = pd.to_datetime(train_df['일시'])
    train_df = train_df.set_index('일시')

# Declare the time step of the series as daily.
train_df.index.freq = 'D'

# The '일시' column is now the index.
train_df.head()

Unnamed: 0_level_0,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1960-01-01,2.2,-5.2,7.4,,68.3,1.7,6.7,,,-1.6
1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,,,-1.9
1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,,,4.0
1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,,,7.5
1960-01-05,1.3,-8.2,9.5,,44.0,5.1,8.2,,,-4.6


# ARIMA 모델

In [11]:
from statsmodels.tsa.arima.model import ARIMA

# (p, d, q) order of the ARIMA model; these values still need tuning.
arima_order = (2, 1, 3)

# Univariate target: the daily mean temperature column.
target_series = train_df['평균기온']

# Build the model and fit it on the full training history.
model = ARIMA(target_series, order=arima_order)
model_fit = model.fit()



In [12]:
# Parse the submission dates so they can serve as forecast bounds.
submission_df['일시'] = pd.to_datetime(submission_df['일시'])

# Forecast window: first through last date requested in the submission file.
start_date = submission_df['일시'].min()
end_date = submission_df['일시'].max()

# Predict with the fitted ARIMA model over that window.
# No `typ` argument is passed: it is a keyword of the removed legacy
# `statsmodels.tsa.arima_model` API and is not a parameter of this results
# class, which accepts and ignores it.
forecast = model_fit.predict(start=start_date, end=end_date)



# 예측 결과 확인 및 저장

In [13]:
# Copy the forecast values (by position) into the target column and preview them.
predicted_values = forecast.to_numpy()
submission_df = submission_df.assign(**{'평균기온': predicted_values})
display(submission_df.head())

# Write the filled-in submission next to the notebook, without the row index.
output_path = './baseline_submit.csv'
submission_df.to_csv(output_path, index=False)

Unnamed: 0,일시,평균기온
0,2023-01-01,-2.923193
1,2023-01-02,-4.701815
2,2023-01-03,-5.781971
3,2023-01-04,-6.478582
4,2023-01-05,-6.963905
