# In [None]:
import pandas as pd
import numpy as np
import os
from glob import glob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from statsmodels.tsa.stattools import adfuller
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Global display settings: NanumGothic as the matplotlib font family, and no
# limit on the number of DataFrame columns shown.
plt.rcParams['font.family'] = 'NanumGothic'
pd.options.display.max_columns = None

# Read the CP949-encoded macro table and index it by its 'date' column, which
# is parsed from "YYYY/MM" strings.
macro_path = 'Panel/data/a_macro.csv'
df = pd.read_csv(macro_path, encoding='cp949')
df = df.assign(date=pd.to_datetime(df['date'], format='%Y/%m')).set_index('date')

# Column (stock code) extracted from every per-company CSV.
c_code = "A005930"  # Samsung Electronics

# glob() returns paths in an unspecified, platform-dependent order, so slicing
# off "the first file" is not a reliable way to skip the macro table that is
# already loaded into df. Exclude it by name instead and sort the rest so the
# order of the added columns is reproducible.
# NOTE(review): assumes a_macro.csv is the file the original [1:] slice was
# meant to skip -- confirm.
cdataList = sorted(
    path for path in glob("Panel/data/*.csv")
    if os.path.basename(path) != "a_macro.csv"
)
for cdata in cdataList:
    # os.path.basename works with the native separator of the running OS,
    # unlike splitting on a hard-coded backslash.
    fname = os.path.basename(cdata).split('.')[0]
    temp_cdf = pd.read_csv(cdata, encoding='utf-8')
    # Positional assignment (.values): the company file must have exactly as
    # many rows as df, in the same order.
    df[fname] = temp_cdf[c_code].values


### visualization for all columns
# for feature in df.columns:
#     plt.figure(figsize=(10, 8))
#     plt.plot(df.index, df[feature])
#     plt.xlabel('date')
#     plt.ylabel(feature)
#     plt.title('the change in {} over time'.format(feature))
#     plt.show()

### ADF test
# Run the Augmented Dickey-Fuller test on every column of df (NaNs dropped
# first) and record the statistic, the p-value and a flag: "o" when
# p-value <= 0.05, otherwise "x".
adf_df = pd.DataFrame(index=df.columns, columns=['stat', 'p-value', 'stationary'])
for feature in df.columns:
    print(feature)
    series = df[feature].dropna()
    adf_stat, adf_pval = adfuller(series)[:2]
    if adf_pval <= 0.05:
        flag = "o"
    else:
        flag = "x"
    adf_df.loc[feature] = [adf_stat, adf_pval, flag]


### 1st difference (of logs) and ADF test
# Columns flagged "x" by the first ADF pass, with 'kospi' left out of the list.
# errors='ignore' prevents a KeyError when 'kospi' is not among the flagged
# columns (i.e. it already tested stationary).
# The misspelled name `non_staionary` is kept because later cells use it.
non_staionary = adf_df[adf_df['stationary'] == 'x'].index.drop('kospi', errors='ignore')
adf_df2 = pd.DataFrame(columns=['stat', 'p-value', 'stationary'], index=non_staionary)
for feature in non_staionary:
    print(feature)
    # Log-transform, take the first difference, then re-run the ADF test.
    log_diff = np.log(df[feature].dropna()).diff().dropna()
    temp_stat, temp_pval = adfuller(log_diff)[:2]
    temp_stationary = "o" if temp_pval <= 0.05 else "x"
    adf_df2.loc[feature] = [temp_stat, temp_pval, temp_stationary]

###
adf_df2

### replace the non-stationary columns with their log first differences
# The first row of each transformed column becomes NaN because diff() has no
# previous value to subtract.
df[non_staionary] = np.log(df[non_staionary]).diff()

### RE-visualization for all columns
# One figure per column of df, plotted against the date index.
title_template = 'the change in {} over time'
for feature in df.columns:
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(df.index, df[feature])
    ax.set_xlabel('date')
    ax.set_ylabel(feature)
    ax.set_title(title_template.format(feature))
    plt.show()

###
df.columns

###
# Build the modelling table: 'kospi' is the target, the other listed columns
# are the features.
model_columns = ['kospi', 'koribor', 'xp', 'cpi', 'ppi', 'esi', 'pis',
                 'dowjones', 'nasdaq', 'shcomp', 'e_dollar', 'oil']
# NOTE(review): the 55:-1 row window is a hard-coded trim of the first 55 rows
# and the last row -- presumably to skip rows with missing data; confirm.
# .copy() makes target_df an independent frame so the fills below do not
# trigger chained-assignment warnings.
target_df = df[model_columns].iloc[55:-1, :].copy()
# Back-fill gaps with the next available value. Series.bfill() replaces the
# deprecated fillna(method='bfill') spelling.
target_df['kospi'] = target_df['kospi'].bfill()
target_df['pis'] = target_df['pis'].bfill()
# Discard the date index in favour of a plain 0..n-1 integer index.
target_df = target_df.reset_index(drop=True)
# (disabled) manual percent-change computation for kospi:
# for i in range(1, len(target_df)):
#     target_df.loc[i+1, 'kospi'] = (target_df.loc[i, 'kospi'] - target_df.loc[i-1, 'kospi']) / target_df.loc[i-1, 'kospi']
target_df = target_df.dropna()
X = target_df.drop(['kospi'], axis=1)
y = target_df['kospi']

# Rows are in chronological order, so hold out the final 20% instead of a
# random shuffle; shuffling would place later rows in the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


###
# Rolling-window evaluation: for each start position, fit a fresh LightGBM
# regressor on `window_size` consecutive rows and predict the row immediately
# after the window. Predictions and actual values are collected for the RMSE.
# NOTE(review): with only 3 training rows per fit, LightGBM's default minimum
# leaf size may prevent any split, leaving a constant prediction -- confirm.
window_size = 3
predictions = []
true_values = []
n_steps = len(target_df) - window_size
for start in range(n_steps):
    stop = start + window_size
    X_train = X.iloc[start:stop, :].values
    y_train = y.iloc[start:stop].values
    X_test = X.iloc[stop:stop + 1, :].values
    # model = CatBoostRegressor(iterations=100, learning_rate=0.3, depth=6)
    model = LGBMRegressor(max_depth=9, reg_lambda=0.9, num_leaves=5)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    predictions.append(prediction[0])
    true_values.append(y.iloc[stop])
    print(start, prediction)

mse = mean_squared_error(true_values, predictions)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE): {}".format(rmse))

# Overlay the rolling-window predictions on the actual kospi series.
# The first `window_size` rows have no prediction, so the predicted line
# starts at index position `window_size`.
# NOTE(review): target_df's date index was replaced by integers earlier, so
# the x-axis shows integer row labels even though it is labelled "Date".
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(target_df.index, target_df['kospi'], label='True Values')
ax.plot(target_df.index[window_size:], predictions, label='Predicted Values')
ax.set_xlabel("Date")
ax.set_ylabel("KOSPI")
ax.set_title("Rolling Window(size={}) Prediction using LightGBM".format(window_size))
ax.legend()
plt.show()


###

