In [53]:
from pykrx import stock
import sys
import pandas as pd
import numpy as np

In [45]:
# Load the NASDAQ table, keep only the merge key ('date') and the 'nasdaq'
# column, and parse 'date' into datetimes so it can be joined against the
# stock frame's date values later on.
nsq_p = (
    pd.read_csv("nasdaq.csv")
    .loc[:, ['date', 'nasdaq']]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [25]:
def labellingD0(d0) -> str:
    '''
    Classify a single (D0) candlestick into one of 25 shape labels.

    Parameters
    ----------
    d0 : mapping-like
        Anything indexable by the keys 'open', 'high', 'low' and 'close'
        (e.g. a DataFrame row or a plain dict).

    Returns
    -------
    str
        A three-character label:
        * "P1x" - long bullish bar  (close >= 1.1 * open)
        * "P0x" - short bullish bar (close >= 1.005 * open)
        * "K0x" - flat bar          (close >= open)
        * "M0x" - short bearish bar (close >= 0.9 * open)
        * "M1x" - long bearish bar  (everything else)
        The last digit encodes which wicks exist and how long they are
        relative to the body.

    Notes
    -----
    Any comparison that is not satisfied falls through to the next case, so
    a bar whose prices cannot be compared (e.g. NaN) ends up as "M10".
    '''
    op, hi, lo, cl = d0['open'], d0['high'], d0['low'], d0['close']

    # Long bullish bar
    if cl >= 1.1*op:
        upper_wick = cl < hi
        lower_wick = op > lo
        if upper_wick and lower_wick:
            return "P15"
        if upper_wick:
            # upper wick at least as long as the body -> P14
            return "P14" if hi - 2*cl + op >= 0 else "P13"
        return "P11" if lower_wick else "P10"

    # Short bullish bar
    if cl >= 1.005*op:
        upper_wick = cl < hi
        lower_wick = op > lo
        if upper_wick:
            if lower_wick:
                return "P05"
            # upper wick at least twice the body -> P04
            return "P04" if hi - 3*cl + 2*op >= 0 else "P03"
        if lower_wick:
            return "P01" if 2*hi - 3*op + lo >= 0 else "P02"
        return "P00"

    # Flat bar: compare the distance from the close to each extreme
    if cl >= op:
        below_close = cl - lo
        above_close = hi - cl
        if below_close > above_close*3:
            return "K01"
        if below_close*3 < above_close:
            return "K02"
        return "K00"

    # Short bearish bar
    if cl >= 0.9*op:
        upper_wick = op < hi
        lower_wick = cl > lo
        if upper_wick:
            if lower_wick:
                return "M05"
            # upper wick shorter than twice the body -> M03
            return "M03" if hi - 3*op + 2*cl < 0 else "M04"
        if lower_wick:
            # lower wick shorter than twice the body -> M01
            return "M01" if 3*cl - lo - 2*op < 0 else "M02"
        return "M00"

    # Long bearish bar
    upper_wick = op < hi
    lower_wick = cl > lo
    if lower_wick:
        return "M15" if upper_wick else "M11"
    if upper_wick:
        # upper wick at least as long as the body -> M14
        return "M14" if hi - 2*op + cl >= 0 else "M13"
    return "M10"


def labellingD1(d10) -> str:
    '''
    Label a two-bar window: 5 coarse classes for D1 x 25 classes for D0.

    Parameters
    ----------
    d10 : DataFrame-like
        Two consecutive bars accessed positionally through ``.iloc``;
        row 0 is the earlier bar (D1), row 1 the later bar (D0).
        Row 0 needs 'open' and 'close'; row 1 is passed to ``labellingD0``.

    Returns
    -------
    str
        Six characters: the D1 body class ("P10", "P00", "K00", "M00" or
        "M10", using the same open/close thresholds as ``labellingD0``)
        followed by the ``labellingD0`` label of the D0 bar.
    '''
    earlier_bar = d10.iloc[0]
    op = earlier_bar['open']
    cl = earlier_bar['close']

    # Only the body of D1 matters here; its wicks are ignored.
    body_class = (
        "P10" if cl >= 1.1*op else      # long bullish
        "P00" if cl >= 1.005*op else    # short bullish
        "K00" if cl >= op else          # flat
        "M00" if cl >= 0.9*op else      # short bearish
        "M10"                           # long bearish
    )

    return body_class + labellingD0(d10.iloc[1])


def labellingD2(d210):
    '''
    Label a three-bar window: 12 classes for the D2/D1 pair x 25 for D0.

    Parameters
    ----------
    d210 : DataFrame-like
        Three consecutive bars accessed positionally through ``.iloc``;
        row 0 is D2 (oldest), row 1 is D1, row 2 is D0 (latest).
        Rows 0 and 1 need 'open' and 'close'; row 2 is passed to
        ``labellingD0``.

    Returns
    -------
    str
        Six characters: the D2/D1 relationship code followed by the
        ``labellingD0`` label of the D0 bar.
    '''
    d2_bar, d1_bar = d210.iloc[0], d210.iloc[1]
    d2_o, d2_c = d2_bar['open'], d2_bar['close']
    d1_o, d1_c = d1_bar['open'], d1_bar['close']

    # Highest body price and the midpoint of the four open/close prices.
    top = max(d2_o, d2_c, d1_o, d1_c)
    mid = (top + min(d2_o, d2_c, d1_o, d1_c))/2

    if top/mid <= 1.005:
        # All four body prices sit within a narrow band around the midpoint.
        pair_code = "S04"
    elif d2_o <= d2_c:
        # D2 closed at or above its open
        pair_code = (
            "P10" if d1_o <= d1_c else
            "S07" if d2_o >= d1_c else
            "S06" if d2_c > d1_o else
            "S03" if d2_c >= d1_c else
            "S05"
        )
    elif d2_o >= d2_c:
        # D2 closed below its open
        pair_code = (
            "M10" if d1_o >= d1_c else
            "S01" if d2_c < d1_o else
            "S02" if d2_o <= d1_c else
            "S08" if d2_c >= d1_c else
            "S00"
        )
    else:
        # Reached only when neither ordering of D2's open/close holds,
        # e.g. when the prices are NaN.
        pair_code = "S09"

    return pair_code + labellingD0(d210.iloc[2])


In [28]:
today = "20201023"
stockCode = 5930



###########################

# The ticker is passed to pykrx as a six-character, zero-padded string.
stockCode = str(stockCode).zfill(6)

stockData = stock.get_market_ohlcv_by_date("20120101", today, stockCode)
comName = stockData.columns.name
stockData.index.name = 'date'
# NOTE(review): this assumes the download returns exactly five columns in
# open/high/low/close/volume order - confirm against the installed pykrx.
stockData.columns = pd.Index(
    ["open", "high", "low", "close", "volume"], name=comName)

# Label from a price-only view so the rows handed to the labelling functions
# are unaffected by the pattern columns added below.
n_rows = len(stockData)
ohlc = stockData[["open", "high", "low", "close"]]

# Build every label column as a complete list and assign it in one step.
# Writing element-by-element into `stockData[col].values` only reaches the
# frame when pandas happens to hand back a writable view of its storage.

# pattern1: shape of the bar itself.
stockData['pattern1'] = [labellingD0(ohlc.iloc[i]) for i in range(n_rows)]

# pattern2: previous bar + current bar; the first row has no predecessor.
stockData['pattern2'] = [
    labellingD1(ohlc.iloc[i-1:i+1]) if i >= 1 else None
    for i in range(n_rows)
]

# pattern3: two previous bars + current bar; the first two rows stay None.
stockData['pattern3'] = [
    labellingD2(ohlc.iloc[i-2:i+1]) if i >= 2 else None
    for i in range(n_rows)
]

In [118]:
# Turn the 'date' index into a regular column so it can serve as a merge key.
stockData2 = stockData.reset_index()
stockData2

삼성전자,date,open,high,low,close,volume,pattern1,pattern2,pattern3
0,2012-01-02,21400,21600,21300,21600,263300,P01,,
1,2012-01-03,21860,22100,21840,22100,339046,P01,P00P01,
2,2012-01-04,22100,22200,21500,21600,342389,M05,P00M05,P10M05
3,2012-01-05,21460,21580,21100,21100,346691,M03,M00M03,S07M03
4,2012-01-06,21120,21319,20600,20800,376753,M05,M00M05,M10M05
...,...,...,...,...,...,...,...,...,...
2163,2020-10-19,59600,60200,59500,60000,14474985,P05,M00P05,M10P05
2164,2020-10-20,60300,60900,60100,60900,19326115,P01,P00P01,S04P01
2165,2020-10-21,61200,61500,60600,60900,15703443,M05,P00M05,P10M05
2166,2020-10-22,60300,60500,59800,60100,14294095,M05,M00M05,S03M05


In [128]:
# Left join: keep every stock row and attach the NASDAQ value for the same
# date; dates missing from the NASDAQ table get NaN in 'nasdaq'.
test = stockData2.merge(nsq_p, how='left', on='date')
test

Unnamed: 0,date,open,high,low,close,volume,pattern1,pattern2,pattern3,nasdaq
0,2012-01-02,21400,21600,21300,21600,263300,P01,,,
1,2012-01-03,21860,22100,21840,22100,339046,P01,P00P01,,
2,2012-01-04,22100,22200,21500,21600,342389,M05,P00M05,P10M05,U04
3,2012-01-05,21460,21580,21100,21100,346691,M03,M00M03,S07M03,T00
4,2012-01-06,21120,21319,20600,20800,376753,M05,M00M05,M10M05,U04
...,...,...,...,...,...,...,...,...,...,...
2163,2020-10-19,59600,60200,59500,60000,14474985,P05,M00P05,M10P05,D04
2164,2020-10-20,60300,60900,60100,60900,19326115,P01,P00P01,S04P01,D04
2165,2020-10-21,61200,61500,60600,60900,15703443,M05,P00M05,P10M05,U04
2166,2020-10-22,60300,60500,59800,60100,14294095,M05,M00M05,S03M05,D04


In [129]:
# Index labels of the rows that received no NASDAQ value in the merge.
nan_list = test.loc[test['nasdaq'].isna()].index
nan_list

Int64Index([   0,    1,   10,   33,   66,  125,  167,  205,  206,  223,  261,
             280,  308,  347,  374,  415,  473,  507,  525,  569,  592,  619,
             659,  717,  751,  771,  802,  864,  909,  964,  998, 1015, 1043,
            1086, 1110, 1154, 1208, 1230, 1234, 1244, 1267, 1305, 1332, 1357,
            1400, 1452, 1486, 1509, 1537, 1575, 1600, 1642, 1695, 1704, 1734,
            1751, 1794, 1818, 1845, 1886, 1945, 1979, 1997, 2036, 2063, 2092,
            2137],
           dtype='int64')

In [130]:
# Forward-fill the gaps in 'nasdaq': each row without a value inherits the
# value of the closest earlier row that has one. Rows that precede the first
# available value have nothing to inherit and remain NaN.
#
# The result must be assigned back: fillna()/ffill() return a new Series and
# leave the frame untouched, and writing through `.values` is not guaranteed
# to reach the frame's storage.
test['nasdaq'] = test['nasdaq'].ffill()
test

Unnamed: 0,date,open,high,low,close,volume,pattern1,pattern2,pattern3,nasdaq
0,2012-01-02,21400,21600,21300,21600,263300,P01,,,
1,2012-01-03,21860,22100,21840,22100,339046,P01,P00P01,,
2,2012-01-04,22100,22200,21500,21600,342389,M05,P00M05,P10M05,U04
3,2012-01-05,21460,21580,21100,21100,346691,M03,M00M03,S07M03,T00
4,2012-01-06,21120,21319,20600,20800,376753,M05,M00M05,M10M05,U04
...,...,...,...,...,...,...,...,...,...,...
2163,2020-10-19,59600,60200,59500,60000,14474985,P05,M00P05,M10P05,D04
2164,2020-10-20,60300,60900,60100,60900,19326115,P01,P00P01,S04P01,D04
2165,2020-10-21,61200,61500,60600,60900,15703443,M05,P00M05,P10M05,U04
2166,2020-10-22,60300,60500,59800,60100,14294095,M05,M00M05,S03M05,D04
