In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1-cp37-none-manylinux1_x86_64.whl (76.8 MB)
[K     |████████████████████████████████| 76.8 MB 28 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1


In [2]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used in the
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 모델링용
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor

# tuning
from sklearn.model_selection import GridSearchCV

# 회귀모델 평가용
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

#### Data Load

In [4]:
# Load the training data: one CSV per year, 2017-2021.
# NOTE(review): the files are named test<year>.csv although they are
# concatenated into the training frame — confirm the naming is intentional.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/KT_mini_5/kaggle'


def load_year(yr):
    """Read `test<yr>.csv` from DATA_DIR, using its first column as the index."""
    return pd.read_csv(f'{DATA_DIR}/test{yr}.csv', sep=',', index_col=0)


df2017 = load_year(2017)
df2018 = load_year(2018)
df2019 = load_year(2019)
df2020 = load_year(2020)
df2021 = load_year(2021)

In [5]:
# Stack the five yearly frames row-wise, oldest year first.
yearly_frames = [df2017, df2018, df2019, df2020, df2021]
df_train = pd.concat(yearly_frames)

In [6]:
# Move the index back into a regular column so the date key ('기준일ID') can be
# parsed, then split it into calendar parts.
df_train = df_train.reset_index()
base_date = pd.to_datetime(df_train['기준일ID'].astype('str'))
df_train['기준일ID'] = base_date
df_train = df_train.assign(
    year=base_date.dt.year,
    month=base_date.dt.month,
    day=base_date.dt.day,
)

# Drop unneeded columns: keep only the target and the calendar fields, and
# expose the hour-slot column ('시간대구분') under the name 'hour'.
keep_cols = ['총생활인구수', '시간대구분', 'year', 'month', 'day']
df_train = df_train.loc[:, keep_cols].rename(columns={'시간대구분': 'hour'})

#### 예측 대상 1,416행 생성 (2022년 1~2월, 59일 × 24시간)

In [7]:
import calendar

# Build one (year, month, day, hour) tuple for every hour of January and
# February 2022 — the rows that will be forecast.
year = 2022
month = [1, 2]
hour = range(24)

# calendar.monthrange returns each month's real length, so invalid dates such
# as Feb 29-31 are never generated and no hard-coded trailing trim is needed.
ls = [
    (year, m, d, h)
    for m in month
    for d in range(1, calendar.monthrange(year, m)[1] + 1)
    for h in hour
]

# Cell output: number of forecast rows (31 * 24 + 28 * 24 = 1416).
len(ls)

1416

In [8]:
# Build the forecast frame directly under the correct column names.
# `ls` holds (year, month, day, hour) tuples; the target is unknown for these
# rows, so it is inserted as an all-NaN first column. The resulting column
# order is ['총생활인구수', 'year', 'month', 'day', 'hour'].
df_test = pd.DataFrame(ls, columns=['year', 'month', 'day', 'hour'])
df_test.insert(0, '총생활인구수', np.nan)

In [9]:
# Append the forecast rows after the historical rows (columns are aligned by
# name) and renumber the rows 0..n-1 in the same step.
df_total_new = pd.concat([df_train, df_test], ignore_index=True)
df_total_new

Unnamed: 0,총생활인구수,hour,year,month,day
0,43922.2281,0,2017,1,1
1,43763.4929,1,2017,1,1
2,43561.5708,2,2017,1,1
3,43928.9929,3,2017,1,1
4,44150.2973,4,2017,1,1
...,...,...,...,...,...
44923,,19,2022,2,28
44924,,20,2022,2,28
44925,,21,2022,2,28
44926,,22,2022,2,28


In [10]:
# Weekend flag
from datetime import date


def is_weekend(d):
    """Return True when the date `d` falls on a Saturday or Sunday."""
    # date.weekday(): Monday is 0 ... Sunday is 6.
    return d.weekday() >= 5


# One 0/1 flag per row, computed from the row's year / month / day columns.
weekend_ls = [
    int(is_weekend(date(yy, mm, dd)))
    for yy, mm, dd in zip(df_total_new['year'], df_total_new['month'], df_total_new['day'])
]
df_total_new['weekend'] = weekend_ls

In [11]:
import os
import requests
import xml.etree.ElementTree as ET

# Public-holiday lookup (Korea Astronomy and Space Science Institute
# "special day" open API on data.go.kr).
HOLIDAY_API_URL = "http://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getHoliDeInfo"

# FIXME(security): supply the service key through the environment. The literal
# fallback is kept only so the notebook still runs unchanged; it should be
# rotated and removed from the notebook.
key = os.environ.get(
    "DATA_GO_KR_SERVICE_KEY",
    "tYQoDIlfuqQdGatJQ8Bonl2C4PzpWEeiMsQ8o4QxuRGwdg%2BNneMjn6H6wwRosTgV4pRf5p39OxZrPYya0XWjew%3D%3D",
)

# Query every year that appears in the combined frame. The forecast rows are in
# 2022, so a fixed 2017-2021 list would leave their holiday flag at 0.
holiday_years = sorted(int(y) for y in df_total_new['year'].unique())

# holidays[i] is the list of 'YYYYMMDD' strings flagged as holidays in holiday_years[i].
holidays = []
for yr in holiday_years:
    res = requests.get(
        HOLIDAY_API_URL + "?solYear=" + str(yr) + "&numOfRows=100&ServiceKey=" + key,
        timeout=30,  # fail instead of hanging the notebook if the API stalls
    )
    res.raise_for_status()
    # NOTE(review): encoding override kept from the original. Only the
    # isHoliday flag and the locdate digits are read below — confirm whether
    # the override is needed at all.
    res.encoding = 'EUC.KR'
    root = ET.fromstring(res.text)

    # Read fields by tag name rather than by child position.
    year_holidays = [
        item.findtext('locdate')
        for item in root.iter('item')
        if item.findtext('isHoliday') == "Y"
    ]
    holidays.append(year_holidays)

# Flat list of all holiday date strings, plus a set for fast membership tests.
holi = [day_str for per_year in holidays for day_str in per_year]
holiday_set = set(holi)

# 1 when the row's date (zero-padded YYYYMMDD) is a public holiday, else 0.
is_holiday = [
    int(f"{y}{m:02d}{d:02d}" in holiday_set)
    for y, m, d in zip(df_total_new['year'], df_total_new['month'], df_total_new['day'])
]

df_total_new['holiday'] = is_holiday

In [12]:
# Lag features of the target.
# The smallest lag equals the number of forecast rows (1,416), presumably so
# that every forecast row can still see an observed value — TODO confirm.
FORECAST_HORIZON = 1416
N_LAGS = 25

# shift1..shift25 hold the target lagged by 1416..1440 rows. The hand-written
# version skipped the label 'shift18' and therefore ended at 'shift26'; the
# lag values themselves are unchanged.
for i in range(N_LAGS):
    df_total_new[f'shift{i + 1}'] = df_total_new['총생활인구수'].shift(FORECAST_HORIZON + i)

In [13]:
# Moving-average features of the target ('이동평균' = moving average).
# Column '이동평균<i>' is the mean over a window of k rows of the target lagged
# by k rows, where k = 1416 + (i - 1); i.e. window length and lag are the same
# number, for k = 1416..1440.
FORECAST_HORIZON = 1416
N_WINDOWS = 25

for i in range(N_WINDOWS):
    k = FORECAST_HORIZON + i
    df_total_new[f'이동평균{i + 1}'] = df_total_new['총생활인구수'].shift(k).rolling(k).mean()

In [14]:
# Remove the raw calendar parts from the frame; 'hour' is kept.
calendar_cols = ['year', 'month', 'day']
df_total_new = df_total_new.drop(columns=calendar_cols)

## Modeling

In [15]:
# The last 1,416 rows are the forecast rows; everything before is history.
N_FORECAST_ROWS = 1416

# Build new frames instead of mutating an iloc slice in place, which is what
# raised SettingWithCopyWarning. Rows with any NaN (the leading rows that have
# no lag / moving-average values yet) are dropped from the training set.
train = df_total_new.iloc[:-N_FORECAST_ROWS].dropna().reset_index(drop=True)
test = df_total_new.iloc[-N_FORECAST_ROWS:].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [17]:
# Display the training frame after NaN rows were dropped and the index reset.
train

Unnamed: 0,총생활인구수,hour,weekend,holiday,shift1,shift2,shift3,shift4,shift5,shift6,...,이동평균16,이동평균17,이동평균18,이동평균19,이동평균20,이동평균21,이동평균22,이동평균23,이동평균24,이동평균25
0,42320.5288,23,1,0,43853.3069,42189.7593,41836.4101,41090.7605,40369.1912,39133.9053,...,44178.971881,44178.891307,44176.622457,44175.864612,44175.177980,44175.415620,44175.950612,44175.252555,44173.727269,44172.631553
1,42858.5817,0,0,0,44599.3216,43853.3069,42189.7593,41836.4101,41090.7605,40369.1912,...,44177.544565,44178.451254,44178.416588,44176.582770,44175.920184,44175.516776,44176.113739,44176.209292,44175.081422,44173.442384
2,43043.8801,1,0,0,44740.7718,44599.3216,43853.3069,42189.7593,41836.4101,41090.7605,...,44175.236606,44177.216933,44178.107151,44177.764891,44176.710344,44176.155554,44176.407492,44176.928954,44176.191285,44174.655373
3,43102.6951,2,0,0,42281.8376,44740.7718,44599.3216,43853.3069,42189.7593,41836.4101,...,44172.109454,44174.987856,44176.697894,44177.633310,44177.724436,44176.765288,44176.493434,44177.104436,44177.186774,44176.019619
4,43247.4065,3,0,0,42636.4990,42281.8376,44740.7718,44599.3216,43853.3069,42189.7593,...,44168.442818,44172.274022,44174.662237,44176.355254,44176.982613,44177.851126,44176.999906,44177.382851,44177.918396,44177.168101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40628,33304.5400,19,0,0,35708.4819,35189.4563,35196.9603,35332.1774,35093.4066,36163.3722,...,37935.648525,37932.015588,37929.695954,37929.463084,37931.647672,37934.243677,37936.532344,37941.529998,37944.950338,37945.686103
40629,33368.7201,20,0,0,35924.7026,35708.4819,35189.4563,35196.9603,35332.1774,35093.4066,...,37938.452529,37934.373943,37931.260442,37929.954352,37930.981841,37933.500021,37936.278168,37938.650450,37943.793181,37945.132636
40630,34083.1721,21,0,0,36597.7205,35924.7026,35708.4819,35189.4563,35196.9603,35332.1774,...,37941.478225,37936.938715,37933.427455,37930.881453,37930.811141,37932.961611,37935.587387,37938.286010,37940.834447,37943.966047
40631,34380.0806,22,0,0,37163.4665,36597.7205,35924.7026,35708.4819,35189.4563,35196.9603,...,37945.068053,37940.027369,37935.664122,37932.671851,37931.138845,37932.327902,37934.811757,37937.619528,37940.401426,37943.096541


In [None]:
# x, y split: the target column is '총생활인구수', every other column is a feature.
x_train = train.drop(columns=['총생활인구수'])
y_train = train.loc[:, '총생활인구수']

# Rebind instead of resetting the index in place, because `test` was taken as a
# slice of the combined frame.
test = test.reset_index(drop=True)
# NOTE(review): the forecast rows were created with a NaN target, so y_test
# presumably holds no observed values and cannot be used for scoring.
y_test = test['총생활인구수']
x_test = test.drop(columns=['총생활인구수'])

In [None]:
# Stacking ensemble: three tree-based base learners combined by a linear
# meta-model.
# All stochastic estimators share one seed so that a fresh
# Restart & Run All reproduces the same submission.
SEED = 42

models = {
    'cat': CatBoostRegressor(random_seed=SEED, verbose=0),  # verbose=0: no per-iteration log
    'rf': RandomForestRegressor(random_state=SEED),
    'lgbm': LGBMRegressor(random_state=SEED),
}

# NOTE(review): cv=5 produces the out-of-fold predictions for the meta-model;
# the folds are not time-ordered, which may be optimistic for time-series
# data — confirm this is acceptable.
stacking = StackingRegressor(
    estimators=list(models.items()),
    final_estimator=LinearRegression(),
    cv=5
)
stacking.fit(x_train, y_train)
pred_stacking = stacking.predict(x_test)

## Inference

In [None]:
# Load the submission template and fill its 'count' column with the stacked
# model's predictions (one value per forecast row, in row order).
submission_template_path = '/content/drive/MyDrive/Colab Notebooks/KT_mini_5/kaggle/test_submission.csv'
submission = pd.read_csv(submission_template_path, sep=',', index_col=False)
submission['count'] = pred_stacking

In [None]:
# Write the filled submission to Drive without the row index.
submission_output_path = '/content/drive/MyDrive/Colab Notebooks/KT_mini_5/stacking_new.csv'
submission.to_csv(submission_output_path, index=False)