In [1]:
import numpy as np 
import pandas as pd 
from math import ceil

from tqdm import tqdm
from datetime import datetime, timedelta
import os

import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Show the (rows, columns) shape of `data`.
# NOTE(review): in the visible notebook `data` is first assigned by the read_csv
# cell that follows, so on a fresh kernel (Restart & Run All) this cell raises
# NameError unless the load cell has already been executed.
data.shape

In [16]:
# Read the daily price table and convert the Date column to datetimes.
data = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv"
).assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
data.head()

In [25]:
# Rows that carry a Target value but have no Close price.
data.loc[data['Target'].notna() & data['Close'].isna()]

In [20]:
# Number of missing values in each column.
data.isna().sum()

In [4]:
# Keep only rows with a known Target, then fill missing OHLC prices with the
# mean of the same column for that security.
data = data[data['Target'].notnull()].reset_index(drop = True)

price_cols = ['Open','High','Low','Close']
col_names = ['SecuritiesCode','Open_R','High_R','Low_R','Close_R']

# Select the price columns *before* aggregating. Averaging every column first is
# wasted work and, on pandas >= 2.0 (where numeric_only defaults to False),
# raises TypeError as soon as the frame holds a string column.
rep_data = data.groupby('SecuritiesCode')[price_cols].mean().reset_index()
rep_data.columns = col_names

data = pd.merge(data, rep_data, on = 'SecuritiesCode', how = 'left')
for col in price_cols:
    # Only missing entries are replaced; observed prices are left untouched.
    data[col] = data[col].fillna(data[col + '_R'])

# The *_R helper columns were only needed for the fill.
data = data.drop(columns = col_names[1:])

data.isnull().sum()

In [5]:
# Summary statistics of the Date column (pandas `describe`).
data['Date'].describe()

In [27]:
# Bar chart of row counts per month. The label is built from characters 0-3 and
# 5-6 of the date's string form (year digits followed by month digits).
tmp = data.copy()
tmp['yymm'] = tmp['Date'].map(lambda stamp: str(stamp)[:4] + str(stamp)[5:7])
tmp['yymm'].value_counts().plot.bar(figsize = (10,6))

In [7]:
## Create the total traded value (close price times volume)
data['amount'] = data['Close'].mul(data['Volume'])  # amount

In [8]:
def week_of_month(date):
    """Return the 1-based week-of-month bucket for `date`.

    Days 1-7 map to 1, days 8-14 to 2, and so on (i.e. ceil(day / 7)).
    `date` must expose an integer `.day` attribute.
    """
    # Integer form of ceil(day / 7) for positive integer days.
    return (date.day - 1) // 7 + 1

def day_feature(data):
    """Attach market-wide daily aggregates to every row.

    For each Date, 'amount' and 'Close' are summed over all rows of that date.
    Two columns are merged back onto `data` by Date:

    - day_amount: the summed 'amount' for that date
    - day_roc: percentage change of the summed 'Close' versus the previous
      date present in the data (NaN for the first date)

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date', 'amount' and 'Close'.

    Returns
    -------
    pandas.DataFrame
        A new frame: `data` left-merged with day_amount and day_roc.
    """
    # Select the two columns before summing so unrelated (possibly non-numeric)
    # columns are never aggregated.
    day_df = data.groupby('Date')[['amount', 'Close']].sum().reset_index()

    prev_close = day_df['Close'].shift(1)
    day_df['day_roc'] = (day_df['Close'].diff() / prev_close) * 100

    day_df = day_df.rename(columns={'amount': 'day_amount'})
    day_df = day_df[['Date', 'day_amount', 'day_roc']]

    return pd.merge(data, day_df, on='Date', how='left')

# Per-security reference table, reduced to the code plus the four
# classification columns that day_list_feature groups by.
stock_list = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv"
)[['SecuritiesCode','Section/Products','NewMarketSegment','33SectorName','17SectorName']]

def day_list_feature(data, stock_info=None):
    """Attach daily aggregates per market segment / sector to every row.

    `data` is first left-merged with the classification table on
    'SecuritiesCode'. Then, for each of the four classification columns
    ('Section/Products', 'NewMarketSegment', '33SectorName', '17SectorName'),
    'amount' and 'Close' are summed per (Date, segment) and two columns are
    merged back:

    - '<segment column>_amount': summed 'amount' of that segment on that date
    - '<segment column>_roc': percentage change of the segment's summed 'Close'
      versus the segment's previous date (NaN on its first date)

    The classification columns themselves stay in the returned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date', 'SecuritiesCode', 'amount' and 'Close'.
    stock_info : pandas.DataFrame, optional
        Classification table with 'SecuritiesCode' and the four segment
        columns. Defaults to the module-level `stock_list`.

    Returns
    -------
    pandas.DataFrame
        A new frame with the classification and aggregate columns appended.
    """
    if stock_info is None:
        stock_info = stock_list

    data = pd.merge(data, stock_info, on='SecuritiesCode', how='left')
    segs = ['Section/Products', 'NewMarketSegment', '33SectorName', '17SectorName']

    for seg in segs:
        # Aggregate only the needed columns; summing the whole frame would also
        # try to add up the string classification columns merged in above.
        seg_day = data.groupby(['Date', seg])[['amount', 'Close']].sum().reset_index()

        # groupby sorts by Date first, so within each segment the rows are in
        # date order and a grouped shift yields the previous date's value.
        prev_close = seg_day.groupby(seg)['Close'].shift(1)
        seg_day[seg + '_roc'] = ((seg_day['Close'] - prev_close) / prev_close) * 100

        seg_day = seg_day.rename(columns={'amount': seg + '_amount'}).drop(columns='Close')
        data = pd.merge(data, seg_day, on=['Date', seg], how='left')

    return data

In [9]:
print(data.shape)
## Day-related variables
data['weekday'] = data["Date"].map(lambda stamp: stamp.weekday())
data['weeknum'] = data["Date"].map(week_of_month)

# Market-wide aggregates first, then the per-segment aggregates on top of them.
data = day_list_feature(day_feature(data))
print(data.shape)
# Disabled: sin/cos encoding derived from the date timestamp.
# day_to_sec = 24 * 60 * 60
# month_to_sec = 20 * day_to_sec
# timestamp_s = data["Date"].apply(datetime.timestamp)
# timestamp_freq = round((timestamp_s / month_to_sec).diff(20)[20], 1)

# data['dayofmonth_freq_sin'] = np.sin((timestamp_s / month_to_sec) * ((2 * np.pi) / timestamp_freq))
# data['dayofmonth_freq_cos'] = np.cos((timestamp_s / month_to_sec) * ((2 * np.pi) / timestamp_freq))

In [10]:
def Stochastic(df, n=14, m=5, t=5):
    """Add stochastic-oscillator columns to `df` (modified in place) and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'High', 'Low' and 'Close'.
    n : int
        Look-back window for the highest high / lowest low.
    m : int
        EWM span used to smooth Fast %K into Fast %D (Slow %K).
    t : int
        EWM span used to smooth Slow %K into Slow %D.

    Returns
    -------
    pandas.DataFrame
        `df` with 'fast_k', 'fast_d' and 'slow_d' added.
    """
    # Highest high over the last n rows.
    ndays_high = df['High'].rolling(window=n, min_periods=1).max()
    # Lowest low over the last n rows (must be a rolling minimum, not maximum).
    ndays_low = df['Low'].rolling(window=n, min_periods=1).min()

    # A zero high-low range would divide by zero; use NaN so a single flat
    # window cannot inject +/-inf into the exponentially weighted averages.
    price_range = (ndays_high - ndays_low).replace(0, np.nan)

    # Fast %K
    fast_k = ((df['Close'] - ndays_low) / price_range) * 100
    # Fast %D (Slow %K)
    slow_k = fast_k.ewm(span=m).mean()
    # Slow %D
    slow_d = slow_k.ewm(span=t).mean()

    df['fast_k'] = fast_k
    df['fast_d'] = slow_k
    df['slow_d'] = slow_d

    return df

def SMA(data, period=30, column = 'Close'):
    """Return the simple moving average of `data[column]` over `period` rows.

    The first `period - 1` entries are NaN (default rolling min_periods).
    """
    series = data[column]
    windowed = series.rolling(window=period)
    return windowed.mean()

def RSI(data, period = 14, column = 'Close'):
    """Add a Relative Strength Index column to `data` (in place) and return it.

    Row-to-row changes of `data[column]` are split into gains ('up') and
    losses ('down'); both helper columns are stored on `data`. Their simple
    moving averages over `period` rows give RS = avg gain / |avg loss| and
    RSI = 100 - 100 / (1 + RS), stored in 'RSI'.
    """
    change = data[column].diff(1).dropna()

    # Gains keep non-negative changes, losses keep non-positive ones; the
    # other side is zeroed.
    gains = change.where(change >= 0, 0)
    losses = change.where(change <= 0, 0)

    # Assignment aligns on the index, so rows dropped above become NaN.
    data['up'] = gains
    data['down'] = losses

    avg_gain = SMA(data, period, column = 'up')
    avg_loss = SMA(data, period, column = 'down').abs()

    relative_strength = avg_gain / avg_loss
    data['RSI'] = 100.0 - (100.0/(1.0 + relative_strength))

    return data

def OBV(data):
    """Add On-Balance Volume columns to `data` (in place) and return it.

    The running total starts at 0 on the first row. On each later row the
    row's 'Volume' is added when 'Close' rose versus the previous row,
    subtracted when it fell, and the total is carried forward otherwise
    (including when either close is NaN).

    Adds:
    - 'OBV': the running total
    - 'OBV_EMA': exponentially weighted mean of 'OBV' with com=20

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Close' and 'Volume'.
    """
    # +1 / -1 / 0 for a rising / falling / unchanged close. The first row and
    # NaN comparisons have no direction and count as unchanged.
    direction = np.sign(data['Close'].diff()).fillna(0)

    # Rows without a direction contribute exactly 0 regardless of their volume.
    signed_volume = (direction * data['Volume']).where(direction != 0, 0)

    data['OBV'] = signed_volume.cumsum()
    data['OBV_EMA'] = data['OBV'].ewm(com = 20).mean()

    return data

def MFI(data):
    """Add a 10-row Money Flow Index to `data` (in place) and return it.

    Columns added:
    - 'avg_price': (High + Low + Close) / 3
    - 'PMF': avg_price * Volume on rows whose avg_price is greater than the
      previous row's, else 0
    - 'NMF': avg_price * Volume on rows whose avg_price is less than or equal
      to the previous row's, else 0
    - 'MFR': rolling 10-row sum of PMF divided by rolling 10-row sum of NMF
    - 'MFI10': 100 - 100 / (1 + MFR)

    The first row has no previous row, so both of its flows are 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'High', 'Low', 'Close' and 'Volume'.
    """
    typical = (data['High'] + data['Low'] + data['Close']) / 3
    data['avg_price'] = typical

    raw_flow = typical * data['Volume']

    # Comparing against the shifted series is False on the first row (NaN).
    rising = (typical > typical.shift(1)).to_numpy()
    not_rising = ~rising
    if len(not_rising):
        # The first row has nothing to compare with: no negative flow either.
        not_rising[0] = False

    # Build the flows as float columns in one step rather than writing element
    # by element through `.values` into an integer column, which truncates the
    # fractional part and depends on `.values` being a writable view.
    data['PMF'] = raw_flow.where(rising, 0.0)
    data['NMF'] = raw_flow.where(not_rising, 0.0)

    data['MFR'] = data['PMF'].rolling(window=10).sum()/data['NMF'].rolling(window=10).sum()
    data['MFI10'] = 100 - 100/(1+data['MFR'])

    return data

def CCI(data, window=20, constant=0.015):
    """Add a Commodity Channel Index column to `data` (in place) and return it.

    CCI = (TP - SMA(TP)) / (constant * MAD(TP)), where TP is
    (High + Low + Close) / 3 and SMA / MAD are the rolling mean and the rolling
    mean absolute deviation of TP over `window` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'High', 'Low' and 'Close'.
    window : int
        Rolling window length (default 20).
    constant : float
        Scaling constant in the denominator (default 0.015).

    Returns
    -------
    pandas.DataFrame
        `data` with a 'CCI' column added; the first `window - 1` rows are NaN.
    """
    # Intermediate series stay local, so no scratch columns are written to
    # (and then dropped from) the caller's frame.
    typical = (data['High'] + data['Low'] + data['Close']) / 3
    rolling_tp = typical.rolling(window=window)
    mean_tp = rolling_tp.mean()

    # Series.mad() was removed in pandas 2.0, so the mean absolute deviation is
    # computed directly on the raw window array.
    mad_tp = rolling_tp.apply(lambda w: np.abs(w - w.mean()).mean(), raw=True)

    data['CCI'] = (typical - mean_tp) / (constant * mad_tp)

    return data

def MACD(data, m_NumFast=12, m_NumSlow=26, m_NumSignal=9):
    """Add MACD columns to `data` (in place) and return it.

    Columns added, in order: 'EMAFast', 'EMASlow' (exponentially weighted
    means of 'Close'), 'MACD' (fast minus slow), 'MACDSignal' (exponentially
    weighted mean of MACD) and 'MACDDiff' (MACD minus signal). Each EWM uses
    min_periods equal to its span minus one.
    """
    close = data['Close']

    fast_ema = close.ewm(span = m_NumFast, min_periods = m_NumFast - 1).mean()
    slow_ema = close.ewm(span = m_NumSlow, min_periods = m_NumSlow - 1).mean()
    data['EMAFast'] = fast_ema
    data['EMASlow'] = slow_ema

    data['MACD'] = fast_ema - slow_ema

    signal_line = data['MACD'].ewm(span = m_NumSignal, min_periods = m_NumSignal-1).mean()
    data['MACDSignal'] = signal_line
    data['MACDDiff'] = data['MACD'] - signal_line

    return data

def bollinger(data, window=20, num_std=2):
    """Add Bollinger band columns to `data` (in place) and return it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Close'.
    window : int
        Rolling window length for the moving average and the moving standard
        deviation (default 20).
    num_std : float
        Number of standard deviations between the moving average and each
        band (default 2).

    Returns
    -------
    pandas.DataFrame
        `data` with 'band_upper' and 'band_lower' added; the first
        `window - 1` rows are NaN.
    """
    rolling_close = data['Close'].rolling(window=window)
    moving_avg = rolling_close.mean()           # moving average
    band_width = num_std * rolling_close.std()  # scaled moving standard deviation

    # Intermediates stay local, so no scratch columns are written to (and then
    # dropped from) the caller's frame.
    data['band_upper'] = moving_avg + band_width  # upper band
    data['band_lower'] = moving_avg - band_width  # lower band

    return data

def williams(data, n_days = 14):
    """Add Williams %R to `data` (in place) and return it.

    Adds 'low_min' (rolling minimum of 'Low'), 'high_max' (rolling maximum of
    'High'), both over `n_days` rows, and
    'willr' = (high_max - Close) / (high_max - low_min) * -100.
    """
    lowest_low = data['Low'].rolling(window = n_days, center = False).min()
    highest_high = data['High'].rolling(window = n_days, center = False).max()
    data['low_min'] = lowest_low
    data['high_max'] = highest_high

    distance_from_high = data['high_max'] - data['Close']
    channel = data['high_max'] - data['low_min']
    data['willr'] = (distance_from_high / channel) * -100

    return data

def ROC(data):
    """Add 'rate_of_change' to `data` (in place) and return it.

    rate_of_change is the percentage change of 'Close' versus the previous
    row: (Close - previous Close) / previous Close * 100. The first row is NaN.
    The 'diff' and 'shift1' columns are used as scratch space and removed
    before returning.
    """
    close = data['Close']
    data['diff'] = close.diff()
    data['shift1'] = close.shift(1)

    data['rate_of_change'] = data['diff'].div(data['shift1']).mul(100)

    # Remove the scratch columns from the frame itself.
    for scratch in ('diff', 'shift1'):
        del data[scratch]

    return data

In [None]:
# Spot-check ROC on a single security (code 1332): NaN count and summary stats.
ttmp = ROC(data[data['SecuritiesCode'] == 1332].reset_index(drop = True))
roc_values = ttmp['rate_of_change']
display(roc_values.isnull().sum(), roc_values.describe())

In [None]:
## Derived stock variables
data['amount'] = data["Close"] * data["Volume"] # amount

# OBV, Stochastic, RSI, MFI: each indicator is computed on one security's rows
# at a time so that rolling windows never span two different securities.
stock_frames = []

for stock in tqdm(data['SecuritiesCode'].unique()):
    tmp_stock = data[data['SecuritiesCode'] == stock].reset_index(drop = True)

    tmp_stock = Stochastic(tmp_stock)
    tmp_stock = RSI(tmp_stock)
    tmp_stock = OBV(tmp_stock)
    tmp_stock = MFI(tmp_stock)

    # Collect the pieces and concatenate once after the loop; concatenating
    # inside the loop re-copies everything gathered so far on every iteration.
    stock_frames.append(tmp_stock)

tmp = pd.concat(stock_frames, ignore_index = True)

# NOTE(review): 'MFI10' is included here because MFI() is computed above and its
# result was otherwise discarded; drop it from the list if that was intentional.
data = pd.merge(data, tmp[['Date','SecuritiesCode',
                           'fast_k','fast_d','slow_d','RSI','OBV','OBV_EMA','MFI10']],
                on = ['Date','SecuritiesCode'], how = 'left')


In [None]:
## Moving averages
# `data` stacks the rows of many securities, so rolling windows and shifts are
# taken within each SecuritiesCode. Applied to the whole frame they would mix
# the prices of different securities into one window.
# Assumes rows are already in chronological order within each security - TODO confirm.
for source, prefix in (("Close", "close"), ("Volume", "volume"), ("amount", "amount")):
    for window in (5, 10, 20):
        data[prefix + "_mv" + str(window)] = (
            data.groupby("SecuritiesCode")[source]
            .transform(lambda s, w=window: s.rolling(w, min_periods=w).mean())
        )


## Past (lagged) data
for i in range(1,6,1):
    # Close of the same security i rows earlier; NaN for its first i rows.
    data["close_" + str(i) + "shift"] = data.groupby("SecuritiesCode")["Close"].shift(i)

In [None]:
# Preview the first rows of `data` after the feature cells above.
data.head()