# Import Library : 분석에 사용할 모듈 설치

"통계추론, 기계학습 및 딥러닝의 흐름에 시간패턴을 반영하려 진화"

"지도학습(예측, 분류), 비지도학습 문제에 모두 활용되는 필수 알고리즘"

"미래 예측을 포함한 추천 서비스와 같은 비즈니스에 활용 중"

<img src='./img/TS_Evolution.png'>

In [1]:
# !pip install --upgrade pip
# !python -m pip install --user --upgrade pip
# !pip install pandas-datareader
# !pip install tqdm
# !pip install xgboost
# !pip install lightgbm
# !pip install --user pmdarima

In [3]:
# Auto reload and user defined functions
%reload_ext autoreload
%autoreload 2
from module_regression import *
from module_classification import *
from module_timeseries import *

# Data manipulation and useful functions
import requests
from io import BytesIO
import random
from itertools import product
import scipy as sp

# Time series algorithms
from statsmodels.tsa.arima_process import arma_generate_sample
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import AutoARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, plot_predict

# **Base Algorithm:** Linear Regression

In [11]:
# Baseline pipeline: load the bike-sharing CSV, engineer features, split the
# data, adjust / scale / select the features, fit an OLS model, then plot and
# score the train and test predictions.
# The helpers called below (feature_engineering, datasplit_ts, ...) and the
# names os / pd / sm / preprocessing have no explicit import visible in this
# notebook's import cell; presumably they arrive through the
# `from module_* import *` lines -- TODO confirm.
# NOTE(review): the output saved with this cell shows the split-size printout
# (presumably emitted by datasplit_ts) followed by
# "ValueError: Must have equal len keys and value when setting with an iterable"
# and no 'Number_of_Total_X' line, so the run appears to stop in one of the
# helper calls between the split and the multicollinearity step. No traceback
# is saved, so the exact failing call still has to be confirmed.

# Load data
location = os.path.join('.', 'data', 'Bike_Sharing_Demand_Full.csv')
df_all = pd.read_csv(location)

# Preprocess data (project helper; its transformations are not visible here)
df_fe = feature_engineering(df_all)

# Split data: 'count' is the target; every column that is neither the target
# nor listed in X_remove becomes a feature.
Y_colname = ['count']
X_remove = ['datetime', 'DateTime', 'temp_group', 'casual', 'registered']
X_colname = [x for x in df_fe.columns if x not in Y_colname+X_remove]
# '2012-07-01' is presumably the train/test boundary date -- TODO confirm
# against datasplit_ts.
X_train, X_test, Y_train, Y_test = datasplit_ts(df_fe, Y_colname, X_colname, '2012-07-01')

# Preprocess data ("reality" adjustment)
### Reality ###
# Columns derived from the target are passed to a helper that presumably
# replaces them with values that would be known at prediction time -- verify
# in feature_engineering_year_duplicated.
target = ['count_trend', 'count_seasonal', 'count_Day', 'count_Week', 'count_diff']
X_train_R = feature_engineering_year_duplicated(X_train, target)
X_test_R = feature_engineering_year_duplicated(X_test, target)
# `target` is reused for a second column list: the lag features. Only the test
# features are passed through the lag helper; X_train_R is left as is.
target = ['count_lag1', 'count_lag2']
X_test_R = feature_engineering_lag_modified(Y_test, X_test_R, target)
###############
### Scaling ###
# NOTE(review): if `preprocessing` is sklearn.preprocessing, Normalizer
# rescales each row (sample) to unit norm rather than scaling each feature
# column -- confirm this is the intended scaler.
X_train_RS, X_test_RS = feature_engineering_scaling(preprocessing.Normalizer(), 
                                                    X_train_R, X_test_R)
###############
### Multicollinearity ###
print('Number_of_Total_X: ', len(X_train_RS.columns))
# The second argument (12) is presumably the number of features to keep after
# the VIF-based selection -- TODO confirm against feature_engineering_XbyVIF.
X_colname_vif = feature_engineering_XbyVIF(X_train_RS, 12)
print('Number_of_Selected_X: ', len(X_colname_vif))
# Column selection is derived from the training data and applied to both sets.
X_train_RSM, X_test_RSM = X_train_RS[X_colname_vif].copy(), X_test_RS[X_colname_vif].copy()
#########################

# Linear Regression
# NOTE(review): no constant column is added in this cell; if `sm` is
# statsmodels.api, OLS fits an intercept only when the design matrix already
# contains one -- confirm whether a no-intercept model is intended.
model_lr = sm.OLS(Y_train, X_train_RSM).fit()
display(model_lr.summary())

# Wrap the train/test predictions in single-column frames labelled 'Pred'.
Y_trpred = pd.DataFrame(model_lr.predict(X_train_RSM), columns=['Pred'])
Y_tepred = pd.DataFrame(model_lr.predict(X_test_RSM), columns=['Pred'])
# reset_index() moves the index into the first column and iloc[:,1:] drops it
# again (assuming a single-level index), so only the actual and predicted
# columns are handed to plot_prediction.
plot_prediction(pd.concat([Y_train, Y_trpred], axis=1).reset_index().iloc[:,1:])
plot_prediction(pd.concat([Y_test, Y_tepred], axis=1).reset_index().iloc[:,1:])

# Evaluate the fit on train and test
Score = evaluation_reg_trte(Y_train, Y_trpred, Y_test, Y_tepred)
display(Score)

# Error analysis
# `.values` makes the subtraction positional; subtracting the two DataFrames
# directly would align on column labels, which presumably differ
# ('count' vs 'Pred').
Resid_te = Y_test - Y_tepred.values
Resid_te.columns = ['Error']
# Note: the original X_test is passed here, not the adjusted/scaled X_test_RSM.
Error_te = error_analysis(X_test, Y_tepred, Resid_te, graph_on=True)
display(Error_te)

Train_size: (13128, 29) Test_size: (4416, 29)
X_train: (13128, 24) Y_train: (13128, 1)
X_test: (4416, 24) Y_test: (4416, 1)


ValueError: Must have equal len keys and value when setting with an iterable