In [None]:
# Mount Google Drive at /content/drive; the data paths used further down live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORT

In [None]:
import numpy as np
import pandas as pd
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.stattools import pacf
from statsmodels.regression.linear_model import yule_walker
import matplotlib.pyplot as plt
%matplotlib inline
from keras.models import Sequential
from keras.optimizers import RMSprop, SGD
from keras.metrics import MeanSquaredError
import tensorflow as tf
import keras
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, roc_auc_score

# 하이퍼파라미터
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# 분류모델
from sklearn.ensemble import RandomForestClassifier     # Random Forest Classifier
from sklearn.tree import DecisionTreeClassifier         # DecisionTree
from sklearn.naive_bayes import GaussianNB              # Naive bayes
from sklearn.linear_model import LogisticRegression     # Logistic Regression
from sklearn.svm import SVC                             # SVM
from sklearn.neighbors import KNeighborsClassifier      # KNN
from sklearn.ensemble import GradientBoostingClassifier # GBT

# LOAD DATA

In [None]:
# 기존 30% 주식 데이터를 불러오는 함수
def load_data(file_name, data_dir="/content/drive/MyDrive/0_Capstone/data/merged_completed/"):
    """Load one stock CSV and return the chronologically last 30% of its rows.

    The CSV columns TRD_DD / TDD_CLSPRC / TDD_OPNPRC / TDD_HGPRC / TDD_LWPRC /
    ACC_TRDVOL are renamed to Timestamp / Close / Open / High / Low / Volume,
    thousands separators are stripped from string cells, the rows are sorted by
    ascending Timestamp, and the five price/volume columns are then dropped.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str, optional
        Directory that holds the CSV. Defaults to the Drive folder the notebook
        has always read from.

    Returns
    -------
    pandas.DataFrame
        Rows from position ``ceil(0.7 * N)`` onwards, keeping their positional
        index labels. The frame is an independent copy, so callers may add
        columns to it without triggering SettingWithCopyWarning.
    """
    import os  # local import so this cell does not depend on edits to the import cell

    file_path = os.path.join(data_dir, file_name)

    stock_data = pd.read_csv(file_path).rename(columns={
        'TRD_DD': 'Timestamp',
        'TDD_CLSPRC': 'Close',
        'TDD_OPNPRC': 'Open',
        'TDD_HGPRC': 'High',
        'TDD_LWPRC': 'Low',
        'ACC_TRDVOL': 'Volume'
    })
    stock_data['Timestamp'] = pd.to_datetime(stock_data['Timestamp'])

    def remove_commas(value):
        """Convert '1,234'-style strings to numbers; return anything else unchanged."""
        if not isinstance(value, str):
            return value
        stripped = value.replace(',', '')
        try:
            return int(stripped)
        except ValueError:
            pass
        try:
            # decimal strings such as '1,234.5' used to abort the whole load
            return float(stripped)
        except ValueError:
            # not numeric at all: keep the text as it was
            return value

    # Element-wise clean-up, column by column (DataFrame.applymap is deprecated in recent pandas).
    for column in stock_data.columns:
        stock_data[column] = stock_data[column].map(remove_commas)

    # Oldest row first, with a fresh 0..N-1 index.
    stock_data['Timestamp'] = pd.to_datetime(stock_data['Timestamp'])
    stock_data = stock_data.sort_values(by='Timestamp', ascending=True).reset_index(drop=True)

    stock_data = stock_data.drop(columns=['Close', 'Open', 'High', 'Low', 'Volume'])

    # Keep the final 30% of rows, which the notebook uses for evaluating predictions.
    traintest_cutoff = int(np.ceil(0.7 * len(stock_data)))

    return stock_data.iloc[traintest_cutoff:].copy()

## TRAIN, TEST 분리

In [None]:
#트렌드 데이터 불러오는 함수
def load_trend_data(file_name, data_dir="/content/drive/MyDrive/0_Capstone/data/merged_trend/"):
  """Load the pre-processed trend CSV for one stock and add a TREND_NEW_NO label column.

  Parameters
  ----------
  file_name : str
      Stock file name such as ``"new_KR7005930003.csv"``. Everything from the
      first ``.`` onwards is discarded and ``"_preprocessed.csv"`` is appended.
  data_dir : str, optional
      Directory that holds the ``*_preprocessed.csv`` files. Defaults to the
      Drive folder the notebook has always read from.

  Returns
  -------
  pandas.DataFrame
      The CSV contents plus ``TREND_NEW_NO``: 1 where ``TREND_new`` is 1,
      -1 where it is -1, and 0 for every other value.
  """
  import os  # local import so this cell does not depend on edits to the import cell

  base_name = file_name.split('.')[0]
  file_path = os.path.join(data_dir, base_name + "_preprocessed.csv")
  all_data = pd.read_csv(file_path)

  # Start from 0 and overwrite only the rows that carry an explicit +1 / -1 trend.
  trend = all_data['TREND_new']
  all_data['TREND_NEW_NO'] = 0
  all_data.loc[trend == 1, 'TREND_NEW_NO'] = 1
  all_data.loc[trend == -1, 'TREND_NEW_NO'] = -1

  return all_data

In [None]:
#트렌드 데이터 학습-검증 분할 함수
def ts_train_test(file_name):
  """Build flattened sliding-window train/test sets from one stock's trend data.

  The first ``ceil(0.7 * N)`` rows form the training region and the rest the
  test region. Each sample is the 5 feature rows that precede position ``i``
  (8 signal columns each, flattened to 40 values); its label is
  ``TREND_NEW_NO`` at position ``i + 1``.

  The test windows are now taken from the held-out rows. Previously the test
  loop indexed the full ``features`` / ``target`` arrays from position 0, so the
  "test" samples were really the first training samples.

  The number of test samples is kept at ``N - cutoff`` because callers attach
  one prediction to every row of the 30% price frame. The very last window has
  no row at ``i + 1``, so it is labelled with the final available target value.

  Parameters
  ----------
  file_name : str
      Stock file name, passed through to ``load_trend_data``.

  Returns
  -------
  tuple of numpy.ndarray
      ``(X_train_1d, Y_train_1d, X_test_1d, Y_test_1d)``, all float64.
      The X arrays have shape ``(n_samples, 40)`` and the Y arrays ``(n_samples,)``.

  Raises
  ------
  ValueError
      If the training region is too short to form a single window.
  """
  all_data = load_trend_data(file_name)

  time_steps = 5    # feature rows of history per sample
  for_periods = 2   # the label is the last of this many rows starting at the window end
  N = len(all_data)
  traintest_cutoff = int(np.ceil(0.7*N))

  feature_columns = ['RSI_sig', 'SMA_sig', 'BBND_sig', 'ROC_sig', 'DPO_sig', 'STOCH_sig', 'MACD_sig', 'GDC_sig']
  features = all_data[feature_columns].values
  target = all_data['TREND_NEW_NO'].values

  split_index = traintest_cutoff
  if split_index < time_steps + for_periods:
    # A shorter training region would make the negative slice start below wrap around.
    raise ValueError(
        "ts_train_test needs at least %d training rows, got %d"
        % (time_steps + for_periods, split_index))

  # The test arrays begin time_steps rows early so the first test sample has a full history window.
  ts_X_train = features[:split_index]
  ts_y_train = target[:split_index]
  ts_X_test = features[split_index - time_steps:]
  ts_Y_test = target[split_index - time_steps:]

  # Training windows stop early enough that every label lies inside the training region.
  train_ends = range(time_steps, len(ts_X_train) - for_periods + 1)
  X_train = np.array([ts_X_train[i - time_steps:i] for i in train_ends])
  y_train = np.array([ts_y_train[i + for_periods - 1] for i in train_ends])

  # One test window per held-out row; clamp the label index for the final window.
  test_ends = range(time_steps, len(ts_X_test))
  last_label = len(ts_Y_test) - 1
  X_test = np.array([ts_X_test[i - time_steps:i] for i in test_ends])
  Y_test = np.array([ts_Y_test[min(i + for_periods - 1, last_label)] for i in test_ends])

  # Flatten each (time_steps, n_features) window to one row for the scikit-learn estimators.
  X_train_1d = X_train.astype(np.float64).reshape(X_train.shape[0], -1)
  Y_train_1d = y_train.astype(np.float64)

  X_test_1d = X_test.astype(np.float64).reshape(X_test.shape[0], -1)
  Y_test_1d = Y_test.astype(np.float64)

  return X_train_1d, Y_train_1d, X_test_1d, Y_test_1d


# 분류 모델의 평가 함수

In [None]:
# evaluate_classifier 함수에서 sig_column을 사용하기 때문에 반환값으로 all_data_orign_30을 기대
def making_sig(pred,all_data_orign_30):
  """Attach predictions to the price frame in place and return that same frame.

  Adds two columns: ``pred`` (the predictions as given) and ``pred_sig``
  (the predictions cast to int). The index is then reset to 0..n-1 in place.
  """
  frame = all_data_orign_30

  new_columns = (
      ('pred', pred),
      ('pred_sig', pred.astype(int)),
  )
  for column_name, values in new_columns:
    frame[column_name] = values

  # Renumber the rows in place so later code can address them by position.
  frame.reset_index(drop=True, inplace=True)
  return frame

In [None]:
# 혼동행렬
def plot_confusion_matrix(Y_test, pred):
    """Draw a heatmap of actual vs. predicted class counts.

    The count table is reindexed to the full class set (-1, 0, 1, plus any
    other label that occurs) before plotting. A class that is missing from the
    labels or the predictions therefore shows up as a zero row/column, and the
    tick labels always match the cells they describe. Previously the tick
    labels were fixed at '-1', '0', '1' regardless of the table's shape.

    Parameters
    ----------
    Y_test : array-like
        True class labels.
    pred : array-like
        Predicted class labels, same length as ``Y_test``.
    """
    cm = pd.crosstab(Y_test, pred, rownames=['Actual'], colnames=['Predicted'])

    # Equal int/float labels (e.g. -1 and -1.0) collapse to one entry in the set union.
    class_labels = sorted({-1, 0, 1} | set(cm.index) | set(cm.columns))
    cm = cm.reindex(index=class_labels, columns=class_labels, fill_value=0)
    tick_labels = [str(label) for label in class_labels]

    fig, ax = plt.subplots(ncols=1, figsize=(5, 5))
    sns.heatmap(cm,
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                annot=True, fmt='d', ax=ax,  # fmt='d': show counts as whole numbers, never as 1.2e+02
                linewidths=.2, linecolor="Darkblue", cmap="Blues")
    plt.title('Confusion Matrix', fontsize=14)
    plt.show()

# 각 클래스당 실제 값으로 가진 샘플 수에 대한 비율
def calculate_normalized_confusion_matrix(Y_test, pred):
    """Print the confusion matrix normalised over the true labels, as percentages."""
    row_percentages = 100 * confusion_matrix(Y_test, pred, normalize='true')
    print('Normalized Confusion Matrix (%):', row_percentages, sep='\n')

# 모델 평가
def evaluate_classifier(model, X_train, Y_train, X_test, Y_test, all_data_orign_30):
    """Fit ``model``, score it on both splits and back-test its test predictions.

    Returns ``(train_auc, test_auc, profits, pred)``. The two ``*_auc`` values
    are ``model.score(...)`` rounded to two decimals (for scikit-learn
    classifiers that is presumably mean accuracy rather than ROC AUC).
    ``profits`` is the result of ``result_profit`` on the 'pred_sig' column
    that ``making_sig`` adds, and ``pred`` is the raw test-set predictions.
    """
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)

    # Score the training split first, then the test split.
    splits = ((X_train, Y_train), (X_test, Y_test))
    train_auc, test_auc = [model.score(X, y).round(2) for X, y in splits]

    # Attach the predictions to the price frame, then simulate trading on them.
    signal_frame = making_sig(pred, all_data_orign_30)
    profits = result_profit(signal_frame, 'pred_sig')

    return train_auc, test_auc, profits, pred

In [None]:
# 모델 모든 평가 지표 시각화 출력
def visualization(Y_test, pred) :
  """Show every evaluation output for one model and return its RMSE.

  Plots the confusion matrix, prints the normalised confusion matrix and the
  scikit-learn classification report, then prints and returns the root mean
  squared error between ``Y_test`` and ``pred``, rounded to two decimals.

  Parameters
  ----------
  Y_test : array-like
      True class labels.
  pred : array-like
      Predicted class labels, same length as ``Y_test``.

  Returns
  -------
  float
      The RMSE rounded to two decimals.
  """
  plot_confusion_matrix(Y_test, pred)
  calculate_normalized_confusion_matrix(Y_test, pred)

  print('Classification Report:')
  print(classification_report(Y_test, pred))

  # mean_squared_error returns the MSE; take the square root so the value
  # matches the "RMSE" name it is printed and stored under.
  RMSE = float(np.round(np.sqrt(mean_squared_error(Y_test, pred)), 2))
  print('RMSE Error:', RMSE)
  return RMSE

# Backtesting 함수

In [None]:
#시그널 수익률 계산 함수
def result_profit(df, sig_column):
  """Back-test a buy/sell/hold signal column and return the percentage return.

  Starting from a fixed cash balance, the rows are walked in order from the
  second row onwards (the first row's signal is never acted on):

  * signal ``1``  - spend as much cash as possible on whole units at that row's ``MKTCAP`` value
  * signal ``-1`` - sell every unit held at that row's ``MKTCAP`` value
  * anything else - hold

  Rows are read by position, so the frame's index labels do not matter.
  Previously ``df[col][i]`` required a 0..n-1 index.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``sig_column`` and ``MKTCAP`` (used here as the unit price).
  sig_column : str
      Name of the column holding the -1 / 0 / 1 signals.

  Returns
  -------
  float
      ``(final portfolio value - initial capital) / initial capital * 100``.
      The result is 0.0 when the frame has fewer than two rows.
  """
  initial_capital = 1000000000000000  # 1,000 trillion

  # Plain Python lists: positional access, and integer maths that cannot overflow.
  signals = df[sig_column].tolist()
  prices = df['MKTCAP'].tolist()

  shares_held = 0
  capital = initial_capital
  portfolio_value = initial_capital  # cash plus holdings after the latest processed row

  for signal, price in zip(signals[1:], prices[1:]):
      if signal == 1:  # buy
          if price > 0:  # a non-positive price cannot be traded; treat the row as a hold
              shares_to_buy = capital // price
              shares_held += shares_to_buy
              capital -= shares_to_buy * price
      elif signal == -1:  # sell everything
          capital += shares_held * price
          shares_held = 0

      # Only the most recent valuation is needed for the final return.
      portfolio_value = capital + shares_held * price

  returns = (portfolio_value - initial_capital) / initial_capital * 100

  return returns

# Model

In [None]:
# CSV file names, one per stock, that the tuning loop iterates over; each name is passed
# to both load_data and ts_train_test.
file_names = [
    "new_KR7005930003.csv",
    "new_KR7000250001.csv",
    "new_KR7036570000.csv",
    "new_KR7051910008.csv",
    "new_KR7066700006.csv",
    "new_KR7066970005.csv",
    "new_KR7068760008.csv",
    "new_KR7078600004.csv",
    "new_KR7096770003.csv",
    "new_KR7128940004.csv",
    "new_KR7185750007.csv",
    "new_KR7192080000.csv",
    "new_KR7207940008.csv",
    "new_KR7225570001.csv",
    "new_KR7247540008.csv",
    "new_KR7251270005.csv",
    "new_KR7263750002.csv",
    "new_KR7293490009.csv",
    "new_KR7373220003.csv",
]

In [None]:
# Model-selection metric for GridSearchCV: agreement rate between labels and predictions.
def custom_scorer(y_true, y_pred):
    """Score one cross-validation fold from the labels and predictions it is given.

    ``make_scorer`` calls this function as ``custom_scorer(y_true, y_pred)`` for
    each candidate and fold, so the score has to be derived from those two
    arguments. The previous body ignored them. It refitted the notebook-global
    ``model`` on the full training set and scored it on the test set, which
    gave every hyper-parameter candidate the same score and let test data
    drive model selection.

    The back-test profit that used to be mixed in (0.8 * profit + 0.2 * match
    rate) cannot be computed here, because a ``(y_true, y_pred)`` metric has no
    access to prices aligned with the fold. Profit is still reported for the
    selected model by ``evaluate_classifier``. To bring profit back into model
    selection, pass GridSearchCV a ``scorer(estimator, X, y)`` callable that
    carries fold-aligned prices.

    Parameters
    ----------
    y_true : array-like
        True labels of the validation fold.
    y_pred : array-like
        Labels predicted for the same rows.

    Returns
    -------
    float
        Fraction of positions where ``y_pred`` equals ``y_true`` (higher is
        better); 0.0 for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape))
    if y_true.size == 0:
        return 0.0
    return float(np.mean(y_true == y_pred))

In [None]:
# Hyper-parameter grid for each model; the keys match those of models_to_tune,
# and each grid is handed to GridSearchCV as param_grids[model_name].
param_grids = {
    'RandomForest': {
        'n_estimators': [10, 50, 100],
        'max_depth': [None, 10, 20],
        'class_weight': ['balanced', None]
    },
    'DecisionTree': {
        'max_depth': [None, 10, 20],
        'criterion': ['gini', 'entropy']
    },
    'NaiveBayes': {},  # empty grid: no hyper-parameters are tuned for Naive Bayes
    'LogisticRegression': {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'max_iter': [1000, 2000]
    },
    'SVM': {
        'C': [0.1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': [ 0.01, 0.1, 10],
        'class_weight': ['balanced', None]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    },
    'GBT': {
       'n_estimators': [10, 50, 100],
       'learning_rate': [0.05, 0.1],
       'max_depth': [None, 10, 20]
    }
}

In [None]:
# 튜닝할 모델 정의
models_to_tune = {
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'NaiveBayes': GaussianNB(),
    'LogisticRegression': LogisticRegression(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'GBT': GradientBoostingClassifier()
}

# One record per (file, model). The results frame is built once after the loops,
# not grown with pd.concat on every iteration.
result_records = []

# Tune and evaluate every model on every stock file.
for file_name in file_names:
    print(f"file_name: {file_name}")
    print(f"---------------------------------------------------------------------")
    all_data_orign_30 = load_data(file_name)
    X_train_1d, Y_train_1d, X_test_1d, Y_test_1d = ts_train_test(file_name)

    for model_name, model in models_to_tune.items():
        print(f"model_name: {model_name}")

        # Grid search over this model's hyper-parameters (3-fold CV on the training set).
        grid_search = GridSearchCV(model, param_grids[model_name], cv=3, scoring=make_scorer(custom_scorer))
        grid_search.fit(X_train_1d, Y_train_1d)

        # best_estimator_ is a separate clone carrying the winning parameters, so the
        # template estimators in models_to_tune keep their defaults for the next file.
        # Previously set_params() changed the shared template in place.
        best_params = grid_search.best_params_
        model = grid_search.best_estimator_

        # Refit on the training set, score both splits and back-test the predictions.
        train_auc, test_auc, profits, pred = evaluate_classifier(model, X_train_1d, Y_train_1d, X_test_1d, Y_test_1d, all_data_orign_30)
        rmse = visualization(Y_test_1d, pred)  # confusion matrix, classification report, RMSE
        print(f"---------------------------------------------------------------------")
        result_records.append({'file_name': file_name,
                               'model_name': model_name,
                               'best_params': best_params,
                               'train_auc': train_auc,
                               'test_auc': test_auc,
                               'rmse': rmse,
                               'profits': profits
                               })

# Explicit column list keeps the frame well-formed even if no record was produced.
all_results_df = pd.DataFrame(
    result_records,
    columns=['file_name', 'model_name', 'best_params', 'train_auc', 'test_auc', 'rmse', 'profits'])

# Print the combined results table.
print("---------------------------------------------------------------------")
print("All Models")
print(all_results_df)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Display the per-file, per-model results table collected by the tuning loop.
all_results_df

Unnamed: 0,file_name,model_name,best_params,train_auc,test_auc,rmse,profits
0,new_KR7005930003.csv,RandomForest,"{'class_weight': None, 'max_depth': None, 'n_e...",0.99,0.99,0.01,64.474070
1,new_KR7005930003.csv,DecisionTree,"{'criterion': 'gini', 'max_depth': None}",1.00,1.00,0.00,47.862869
2,new_KR7005930003.csv,NaiveBayes,{},0.04,0.04,0.99,89.262823
3,new_KR7005930003.csv,LogisticRegression,"{'C': 0.001, 'max_iter': 1000, 'penalty': 'l1'...",0.98,0.97,0.03,0.000000
4,new_KR7005930003.csv,SVM,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.42,0.42,0.59,27.808112
...,...,...,...,...,...,...,...
128,new_KR7373220003.csv,NaiveBayes,{},0.29,0.27,0.82,30.934800
129,new_KR7373220003.csv,LogisticRegression,"{'C': 0.001, 'max_iter': 1000, 'penalty': 'l1'...",0.06,0.06,1.13,0.000000
130,new_KR7373220003.csv,SVM,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.59,0.58,0.45,9.605700
131,new_KR7373220003.csv,KNN,"{'n_neighbors': 3, 'weights': 'uniform'}",0.88,0.92,0.08,-1.638000


In [None]:
# Save the results table to Google Drive as CSV, without the index column.
all_results_df.to_csv('/content/drive/MyDrive/0_Capstone/희선 작업실/분류/Classification.csv', index=False)

In [None]:
# Print the results one stock at a time: all_results_df grouped by source file.
grouped_results = all_results_df.groupby('file_name', group_keys=False)
for name, group in grouped_results:
    # The file name is already in the header line, so drop that column from the body.
    group = group.drop(columns=['file_name'])
    print(f"File Name: {name}",
          group,
          "---------------------------------------------------------------------",
          sep="\n")

File Name: new_KR7000250001.csv
            model_name                                        best_params  \
7         RandomForest  {'class_weight': None, 'max_depth': 20, 'n_est...   
8         DecisionTree           {'criterion': 'gini', 'max_depth': None}   
9           NaiveBayes                                                 {}   
10  LogisticRegression  {'C': 0.001, 'max_iter': 1000, 'penalty': 'l1'...   
11                 SVM  {'C': 0.1, 'class_weight': 'balanced', 'gamma'...   
12                 KNN           {'n_neighbors': 3, 'weights': 'uniform'}   
13                 GBT  {'learning_rate': 0.05, 'max_depth': None, 'n_...   

    train_auc  test_auc  rmse      profits  
7        0.93      0.92  0.08    47.907257  
8        0.97      0.96  0.04   570.906067  
9        0.53      0.52  0.57   166.168729  
10       0.92      0.90  0.10     0.000000  
11       0.31      0.31  0.78   205.385222  
12       0.92      0.91  0.10    56.571242  
13       0.97      0.96  0.04  1631.



---

