In [1]:
import os
import re
import sqlite3
import time
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import requests
from pykrx import stock
from tqdm import tqdm

import QueryStockDataDB

In [2]:
# setting the path
# Work inside the 'DataStock' folder below the starting directory.
# The guard keeps the cell re-runnable: once the kernel is already inside
# 'DataStock', running it again must not try to enter 'DataStock/DataStock'.
folderPath = Path.cwd()
if folderPath.name != 'DataStock':
    folderPath = folderPath.joinpath('DataStock')
    os.chdir(folderPath)

## **Stock Number of Share DB**

### **1. Create The Historical Number of Shares DF : The Step 1**

#### **1.1. Definition**

In [3]:
def getDayList(startDay, endDay):
    """Return every calendar day from startDay to endDay (inclusive) as 'YYYYMMDD' strings.

    Both arguments are 'YYYYMMDD' strings. An empty list is returned when
    endDay is earlier than startDay.
    """
    dayFormat = "%Y%m%d"
    first = datetime.strptime(startDay, dayFormat)
    last = datetime.strptime(endDay, dayFormat)
    spanDays = (last - first).days

    return [
        (first + timedelta(days=offset)).strftime(dayFormat)
        for offset in range(spanDays + 1)
    ]

In [4]:
def uniqueTickerDf() :
    """Return one row per ticker, taken from the last stock-list record of each ticker.

    The stock list is loaded through QueryStockDataDB from the current working
    directory ('.'), grouped by 'ticker' keeping the last row of every group,
    and the 'firstDay' / 'endDay' columns are converted to 'YYYYMMDD' strings.

    Returns:
        DataFrame indexed by ticker with the remaining stock-list columns.
    """
    # NOTE(review): the original comment says these are the tickers that have
    # existed since 2010 -- that depends on the contents of the stock list DB.
    query = QueryStockDataDB.QueryStockData('.')
    stockListDf = query.queryStockListDB(by='parquet')
    groupbyDf = stockListDf.groupby('ticker').last()
    # Bracket assignment is used (instead of attribute assignment) so the
    # target is always a column.
    for col in ('firstDay', 'endDay'):
        groupbyDf[col] = groupbyDf[col].apply(lambda x: x.strftime('%Y%m%d'))
    return groupbyDf

#	        name	   key             firstDay    endDay
# ticker				
# 000020	동화약품	000020동화약품	20100101	20211008
# 000030	우리은행	000030우리은행	20141119	20190212
# 000040	KR모터스	000040KR모터스	20100101	20211008

In [5]:
def getFirstEndDay(ticker) :
    """Look up the listing period of ``ticker``.

    Returns a dict with the keys 'firstDay' and 'endDay', read from the
    per-ticker table produced by uniqueTickerDf() (rebuilt on every call).
    """
    tickerTable = uniqueTickerDf()
    return {col: tickerTable.loc[ticker, col] for col in ('firstDay', 'endDay')}

# {'firstDay': '20100101', 'endDay': '20211008'}

In [6]:
def getNOfSharesDf(ticker):
    """Step 1: build the number-of-shares history of one ticker.

    Queries pykrx for the market-cap table between the ticker's first and end
    day, keeps the '날짜' and '상장주식수' columns (renamed to 'date' and
    'nOfShare') and adds the ticker and its name.
    Family stock info is added in a later step.

    Returns:
        DataFrame with the columns date, ticker, name, nOfShare.
    """
    period = getFirstEndDay(ticker)
    marketCapDf = stock.get_market_cap_by_date(period['firstDay'], period['endDay'], ticker)

    sharesDf = marketCapDf.reset_index()[['날짜', '상장주식수']]
    sharesDf.columns = ['date', 'nOfShare']
    sharesDf = sharesDf.assign(ticker=ticker, name=stock.get_market_ticker_name(ticker))

    return sharesDf[['date', 'ticker', 'name', 'nOfShare']]

# 	date	ticker	name	nOfShare
# 0	2010-01-04	005930	삼성전자	147299337
# 1	2010-01-05	005930	삼성전자	147299337
# 2	2010-01-06	005930	삼성전자	147299337
# 3	2010-01-07	005930	삼성전자	147299337
# 4	2010-01-08	005930	삼성전자	147299337
# ...	...	...	...	...
# 2899	2021-10-01	005930	삼성전자	5969782550
# 2900	2021-10-05	005930	삼성전자	5969782550
# 2901	2021-10-06	005930	삼성전자	5969782550
# 2902	2021-10-07	005930	삼성전자	5969782550
# 2903	2021-10-08	005930	삼성전자	5969782550

#### **1.2. Creating The Historical Number of Shares DF**

In [8]:
# Step 1: collect the number-of-shares history of every known ticker and stack
# the per-ticker frames. Family stock info is added in the next step.
lst = uniqueTickerDf().index.tolist()

dfs = [getNOfSharesDf(ticker) for ticker in tqdm(lst)]
historicalDf = pd.concat(dfs)
historicalDf

### **2. Get The Number of Shares for Family Stock And merge it with the step1 Df : Step2**
 - using the apply-expand function

#### **2.1. Definition**

In [26]:
def getDayStockListDf(day) :
    """Return the stock-list rows whose [firstDay, endDay] range contains ``day``.

    The stock list is loaded through QueryStockDataDB on every call.
    """
    listing = QueryStockDataDB.QueryStockData('.').queryStockListDB(by='parquet')
    listedOnDay = (listing['firstDay'] <= day) & (listing['endDay'] >= day)
    return listing.loc[listedOnDay]

# 	ticker	name	key	firstDay	endDay
# 0	004560	현대비앤지스틸	004560현대비앤지스틸	2010-01-01	2021-10-08
# 1	004565	현대비앤지스틸우	004565현대비앤지스틸우	2010-01-01	2021-10-08
# 2	001460	BYC	001460BYC	2010-01-01	2021-10-08
# 3	001465	BYC우	001465BYC우	2010-01-01	2021-10-08
# 4	084680	이월드	084680이월드	2010-01-01	2021-10-08
# ...	...	...	...	...	...
# 920	069260	휴켐스	069260휴켐스	2010-01-01	2021-10-08
# 921	000540	흥국화재	000540흥국화재	2010-01-01	2021-10-08
# 922	000547	흥국화재2우B	000547흥국화재2우B	2010-01-01	2021-10-08
# 923	000545	흥국화재우	000545흥국화재우	2010-01-01	2021-10-08
# 924	003280	흥아해운	003280흥아해운	2010-01-01	2021-10-08

In [27]:
def getOriginStockInfo(day, originText) :
    """Classify a stock name as common or preferred and derive its base name.

    The name (with blanks removed) is cut just before its last '우'; if the
    remainder ends with a digit, that digit is dropped as well. When the
    resulting text equals the name of a stock listed on ``day``, the stock is
    treated as a preferred stock of that base name.

    Args:
        day: day used to select the stock list (passed to getDayStockListDf).
        originText: stock name to classify.

    Returns:
        dict: {'kind': 'preferredStock', 'filteredText': <base name>} when a
        listed base name was found, otherwise
        {'kind': 'commonStock', 'filteredText': originText}.
    """
    targetText = originText.replace(' ', '')  # delete blank texts

    # '.+' is greedy, so the match ends at the LAST '우' and needs at least one
    # character in front of it.
    parsedWoo = re.search(r'.+우', targetText)
    if not parsedWoo:
        return {'kind': 'commonStock', 'filteredText': originText}

    # Candidate base name: the matched text without the trailing '우'.
    filteredText = targetText[parsedWoo.start():parsedWoo.end() - 1]

    # Drop a trailing series digit (e.g. '2' in '...2우').
    parsedNumber = re.search(r'.+[0-9]$', filteredText)
    if parsedNumber:
        # The span is relative to filteredText, so slice filteredText itself
        # (the original sliced targetText with these offsets).
        filteredText = filteredText[parsedNumber.start():parsedNumber.end() - 1]

    stockListDf = getDayStockListDf(day)
    if (stockListDf.name == filteredText).any():
        return {'kind': 'preferredStock', 'filteredText': filteredText}

    # No listed stock carries the candidate base name: treat as common stock.
    return {'kind': 'commonStock', 'filteredText': originText}

In [28]:
def checkIfRelative(targetText, x):
    """Tell whether the stock name ``x`` belongs to the family of ``targetText``.

    ``x`` is a relative when it contains ``targetText`` followed by an optional
    blank, an optional digit, an optional blank and '우', or when it ends with
    ``targetText``.

    Args:
        targetText: base stock name; matched literally.
        x: stock name to test.

    Returns:
        bool: True when one of the two patterns is found in ``x``.
    """
    # Escape the name so characters such as '(', ')', '.' or '+' are matched
    # literally instead of being interpreted as regex syntax.
    base = re.escape(targetText)

    # NOTE(review): neither pattern is anchored at the start of x, so a name
    # that merely contains / ends with targetText also matches -- confirm this
    # is intended.
    preferredPattern = rf'{base}\s?[0-9]?\s?[우]'
    sameTailPattern = rf'{base}$'

    return bool(re.search(preferredPattern, x) or re.search(sameTailPattern, x))

In [29]:
def getFamilyStock(day, targetText) :
    """List the names listed on ``day`` that checkIfRelative() accepts for ``targetText``."""
    names = getDayStockListDf(day).name
    isRelative = names.apply(lambda candidate: checkIfRelative(targetText, candidate))
    return names.loc[isRelative].tolist()

In [30]:
def getTickerNNofShares(day, stockName, df) :
    """Return ``[ticker, nOfShare]`` of the first row of ``df`` matching day and name.

    ``df`` must provide the columns 'date', 'name', 'ticker' and 'nOfShare'.
    Raises IndexError when no row matches.
    """
    sameDay = df['date'] == day
    sameName = df['name'] == stockName
    matched = df[sameDay & sameName]

    return [matched['ticker'].iat[0], matched['nOfShare'].iat[0]]

In [43]:
def additionalInfo(day, originStock, df, maxFamily=4):
    """Build the family-stock columns for one (day, stock) row.

    Args:
        day: day of the row; used to select the stock list and to look up rows in ``df``.
        originStock: stock name of the row.
        df: frame with 'date', 'name', 'ticker' and 'nOfShare' columns in which
            the share counts of the family members are looked up.
        maxFamily: number of family slots in the returned row. The default of 4
            matches the fixed familyStock1..4 column layout used by the callers.

    Returns:
        list: [kindOfStock, familyStock1, nOfShare_familyStock1, ...,
        familyStock<maxFamily>, nOfShare_familyStock<maxFamily>].
        Unused slots are None. kindOfStock stays '' when no family member is
        listed for that day. Relatives beyond ``maxFamily`` are dropped so every
        row keeps the same width.

    Raises:
        ValueError: when family members are found but ``originStock`` itself is
            not among them.
        IndexError: from getTickerNNofShares when a family member has no row in ``df``.
    """
    originStockInfo = getOriginStockInfo(day, originStock)      # e.g. {'kind': 'preferredStock', 'filteredText': 'CJ'}
    familyStock = getFamilyStock(day, originStockInfo['filteredText'])     # e.g. ['CJ', 'CJ우', 'CJ4우(전환)']

    # Same exception type the original list.index() lookup raised, with a
    # message that says what went wrong.
    if familyStock and originStock not in familyStock:
        raise ValueError(
            f'{originStock!r} is not among its family stocks {familyStock!r} on {day}'
        )

    # Fixed-width result: kind first, then (name, number of shares) per slot.
    dic = {'kindOfStock': ''}
    for k in range(1, maxFamily + 1):
        dic[f'familyStock{k}'] = None
        dic[f'nOfShare_familyStock{k}'] = None

    if originStock in familyStock:
        dic['kindOfStock'] = originStockInfo['kind']

    # Every family member except the row's own stock fills the next free slot.
    relatives = [member for member in familyStock if member != originStock]
    for num, member in enumerate(relatives[:maxFamily], start=1):
        dic[f'familyStock{num}'] = member
        dic[f'nOfShare_familyStock{num}'] = getTickerNNofShares(day, member, df)[1]

    return list(dic.values())

#### 2.2. Get The Number of Shares for Family Stock And merge it with the step1 Df

In [14]:
# Use this cell to divide the historical Df obtained in step 1 into four parts by year

# historicalDf = historicalDf.sort_values(by='date').reset_index(drop=True)
# historicalDf1 = historicalDf.iloc[0:694480]
# historicalDf2 = historicalDf.iloc[694480:1581561]
# historicalDf3 = historicalDf.iloc[1581561:2237677]
# historicalDf4 = historicalDf.iloc[2237677:]

# display(historicalDf1)
# display(historicalDf2)
# display(historicalDf3)
# display(historicalDf4)

In [15]:
# Load the step-1 frame and append the family stock columns to every row.
query = QueryStockDataDB.QueryStockData(folderPath)
historicalDf = query.queryStockNumberOfSharesDB(by='parquet')


cols = ['date', 'ticker', 'name', 'nOfShare', 'kindOfStock', 'familyStock1', 'nOfShare_familyStock1', 'familyStock2', 'nOfShare_familyStock2', 'familyStock3', 'nOfShare_familyStock3', 'familyStock4', 'nOfShare_familyStock4']

historicalDf = historicalDf.sort_values(by='date').reset_index(drop=True)
# historicalDf = historicalDf.iloc[0:694480]  # use this code when operating divided task

# Split the frame by date once instead of re-filtering the whole frame for
# every single row inside the apply below.
dailyDfs = {day: dayDf for day, dayDf in historicalDf.groupby('date')}
emptyDayDf = historicalDf.iloc[0:0]  # fallback for a date without a group

tqdm.pandas()
expandDf = historicalDf.progress_apply(
    lambda row: additionalInfo(row['date'], row['name'], dailyDfs.get(row['date'], emptyDayDf)),
    axis=1,
    result_type='expand',
)
# merge the family stock df with the step1 df
historicalDfFinal = pd.concat([historicalDf.reset_index(drop=True), expandDf.reset_index(drop=True)], axis=1, join='inner')
historicalDfFinal.columns = cols
historicalDfFinal
# historicalDfFinal.reset_index(drop=True).to_csv(folderPath/'stockNumberOfSharesDB_1.csv')  # use this code when operating divided task

100%|██████████| 694480/694480 [4:45:46<00:00, 40.50it/s]


Unnamed: 0,name,date,ticker,nOfShare,kindOfStock,familyStock1,nOfShare_familyStock1,familyStock2,nOfShare_familyStock2,familyStock3,nOfShare_familyStock3,familyStock4,nOfShare_familyStock4
0,2010-01-04,000020,동화약품,27931470,commonStock,,,,,,,,
1,2010-01-04,078420,동북아1호,3222000,commonStock,,,,,,,,
2,2010-01-04,002840,미원상사,1300000,commonStock,,,,,,,,
3,2010-01-04,078930,GS,92915378,commonStock,GS우,1784826,,,,,,
4,2010-01-04,002820,SUN&L,2000000,commonStock,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
694475,2012-12-28,047050,포스코인터내셔널,113876291,commonStock,,,,,,,,
694476,2012-12-28,004770,써니전자,19460000,commonStock,,,,,,,,
694477,2012-12-28,010640,진양폴리,10000000,commonStock,,,,,,,,
694478,2012-12-28,012630,HDC,75384180,commonStock,,,,,,,,


In [13]:
# use this code when operating divided task

# cols = ['date', 'ticker', 'name', 'nOfShare', 'kindOfStock', 'familyStock1', 'nOfShare_familyStock1', 'familyStock2', 'nOfShare_familyStock2', 'familyStock3', 'nOfShare_familyStock3', 'familyStock4', 'nOfShare_familyStock4']
# historicalDfFinal_1 = pd.read_csv(folderPath/'stockNumberOfSharesDB_1.csv', encoding='cp949', index_col=0)
# historicalDfFinal_2 = pd.read_csv(folderPath/'stockNumberOfSharesDB_2.csv', encoding='cp949', index_col=0)
# historicalDfFinal_3 = pd.read_csv(folderPath/'stockNumberOfSharesDB_3.csv', encoding='cp949', index_col=0)
# historicalDfFinal_4 = pd.read_csv(folderPath/'stockNumberOfSharesDB_4.csv', encoding='cp949', index_col=0)
# historicalDfFinal_total = pd.concat([historicalDfFinal_1,historicalDfFinal_2,historicalDfFinal_3,historicalDfFinal_4])

# historicalDfFinal_total.columns = cols
# historicalDfFinal_total

#### 2.3. **Save DB as the format of Parquet**

In [41]:
# Write the merged frame (step-1 columns + family stock columns) to a parquet file under folderPath.
historicalDfFinal.to_parquet(folderPath/'stockNumberOfSharesDB.parquet')

In [27]:
# Load the number-of-shares table through QueryStockDataDB and display it.
# NOTE(review): presumably this reads the parquet file written above -- confirm in QueryStockDataDB.
historicalDfFinal = QueryStockDataDB.QueryStockData(folderPath).queryStockNumberOfSharesDB(by='parquet')
historicalDfFinal

Unnamed: 0,date,ticker,name,nOfShare,kindOfStock,familyStock1,nOfShare_familyStock1,familyStock2,nOfShare_familyStock2,familyStock3,nOfShare_familyStock3,familyStock4,nOfShare_familyStock4
0,2010-01-04,000020,동화약품,27931470,commonStock,,,,,,,,
1,2010-01-04,078420,동북아1호,3222000,commonStock,,,,,,,,
2,2010-01-04,002840,미원상사,1300000,commonStock,,,,,,,,
3,2010-01-04,078930,GS,92915378,commonStock,GS우,1784826.0,,,,,,
4,2010-01-04,002820,SUN&L,2000000,commonStock,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2640211,2021-10-08,009200,무림페이퍼,41609310,commonStock,,,,,,,,
2640212,2021-10-08,001630,종근당홀딩스,5009861,commonStock,,,,,,,,
2640213,2021-10-08,008060,대덕,33890150,commonStock,대덕1우,1212357.0,,,,,,
2640214,2021-10-08,004835,덕성우,1392000,preferredStock,덕성,15680000.0,,,,,,


In [28]:
# Print the column dtypes and memory usage of the loaded table.
historicalDfFinal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2640216 entries, 0 to 2640215
Data columns (total 13 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   date                   datetime64[ns]
 1   ticker                 object        
 2   name                   object        
 3   nOfShare               int64         
 4   kindOfStock            object        
 5   familyStock1           object        
 6   nOfShare_familyStock1  float64       
 7   familyStock2           object        
 8   nOfShare_familyStock2  float64       
 9   familyStock3           object        
 10  nOfShare_familyStock3  float64       
 11  familyStock4           float64       
 12  nOfShare_familyStock4  float64       
dtypes: datetime64[ns](1), float64(5), int64(1), object(6)
memory usage: 261.9+ MB


#### 2.4. Creating DB file and Insert data into DB

In [21]:
# creating DB file
conn = sqlite3.connect('stockNumberOfSharesDB.db')
try:
    # IF NOT EXISTS keeps this cell re-runnable: a second run no longer fails
    # with "table stockNumberOfSharesDB already exists".
    conn.execute(
        'CREATE TABLE IF NOT EXISTS stockNumberOfSharesDB (id INTEGER PRIMARY KEY AUTOINCREMENT, date TIMESTAMP, ticker TEXT,  name TEXT, nOfShares REAL, kindOfStock TEXT, familyStock1 TEXT, nOfShare_familyStock1 REAL, familyStock2 TEXT, nOfShare_familyStock2 REAL, familyStock3 TEXT, nOfShare_familyStock3 REAL, familyStock4 TEXT, nOfShare_familyStock4 REAL)')
    conn.commit()
finally:
    # Always release the file handle, even when the statement fails.
    conn.close()

In [29]:
# ready the staged changes
# Store the date as a 'YYYY-MM-DD' string for the DB insert. Going through
# pd.to_datetime makes the cell re-runnable: a column that was already
# converted to strings is parsed back before being formatted again.
historicalDfFinal['date'] = pd.to_datetime(historicalDfFinal['date']).dt.strftime('%Y-%m-%d')

# Preview the first five rows exactly as they will be inserted (index skipped).
for row in historicalDfFinal.iloc[:5].itertuples():
    print(*row[1:14], sep='/')

2010-01-04/000020/동화약품/27931470/commonStock/None/nan/None/nan/None/nan/nan/nan
2010-01-04/078420/동북아1호/3222000/commonStock/None/nan/None/nan/None/nan/nan/nan
2010-01-04/002840/미원상사/1300000/commonStock/None/nan/None/nan/None/nan/nan/nan
2010-01-04/078930/GS/92915378/commonStock/GS우/1784826.0/None/nan/None/nan/nan/nan
2010-01-04/002820/SUN&L/2000000/commonStock/None/nan/None/nan/None/nan/nan/nan


In [33]:
# insert data into DB
sql = "insert into stockNumberOfSharesDB (date, ticker, name, nOfShares, kindOfStock, familyStock1, nOfShare_familyStock1, familyStock2, nOfShare_familyStock2, familyStock3, nOfShare_familyStock3, familyStock4, nOfShare_familyStock4) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

connect = sqlite3.connect('./stockNumberOfSharesDB.db')
try:
    cursor = connect.cursor()
    # One tuple per row, limited to the 13 columns named in the statement
    # (the DataFrame index is not stored; the table's id auto-increments).
    rows = historicalDfFinal.iloc[:, :13].itertuples(index=False, name=None)
    # executemany binds all rows against the single prepared statement
    # instead of issuing one execute() call per row.
    cursor.executemany(sql, rows)
    connect.commit()
finally:
    # Close the connection even when an insert fails; uncommitted rows are discarded.
    connect.close()

### **3. Update DB (insert today data into the historical data)**

#### 3.1. Web Scraping The Number of Shares <a href="http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020201"> (more info) </a> : The Step 1


In [44]:
# Date stamped on the scraped rows. It is hard-coded: change it for every update run.
today = datetime(2021,10,11)
url = 'http://data.krx.co.kr/comm/bldAttendant/getJsonData.cmd'
data = {
        'bld': 'dbms/MDC/STAT/standard/MDCSTAT01901',
        'mktId': 'STK',
        'share': '1',
        'csvxls_isNo': 'false'
}
# A timeout prevents the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops on HTTP errors instead of failing later while
# parsing the response body.
r = requests.post(url, data = data, timeout=30)
r.raise_for_status()
dataKrxStockBasicInfo = pd.DataFrame(r.json()['OutBlock_1'])
originCols=['ISU_SRT_CD','ISU_ABBRV','LIST_SHRS']
customizedCols=['ticker', 'name', 'nOfShare']

# Built as one chain so that no column is assigned on a slice of the raw frame
# (which can trigger SettingWithCopyWarning).
numbersOfSharesDf = (
    dataKrxStockBasicInfo[originCols]
    .rename(columns=dict(zip(originCols, customizedCols)))
    .assign(
        date=today,
        # share counts arrive as strings with thousands separators, e.g. '46,822,295'
        nOfShare=lambda frame: frame['nOfShare'].str.replace(',', '').astype('int64'),
    )
    .loc[:, ['date', 'ticker', 'name', 'nOfShare']]
)
numbersOfSharesDf.info()
numbersOfSharesDf

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 939 entries, 0 to 938
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      939 non-null    datetime64[ns]
 1   ticker    939 non-null    object        
 2   name      939 non-null    object        
 3   nOfShare  939 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 29.5+ KB


Unnamed: 0,date,ticker,name,nOfShare
0,2021-10-11,095570,AJ네트웍스,46822295
1,2021-10-11,006840,AK홀딩스,13247561
2,2021-10-11,282330,BGF리테일,17283906
3,2021-10-11,027410,BGF,95716791
4,2021-10-11,138930,BNK금융지주,325935246
...,...,...,...,...
934,2021-10-11,069260,휴켐스,40878588
935,2021-10-11,000545,흥국화재우,768000
936,2021-10-11,000547,흥국화재2우B,153600
937,2021-10-11,000540,흥국화재,64242645


#### **3.2. Get The Number of Shares for Family Stock And merge it with the step1 Df : Step2**  
**Note:** Make sure the stockListDB is up to date before proceeding. If it has not been updated, the family-stock filter functions do not work correctly.

In [45]:
cols = ['date', 'ticker', 'name', 'nOfShare', 'kindOfStock', 'familyStock1', 'nOfShare_familyStock1', 'familyStock2', 'nOfShare_familyStock2', 'familyStock3', 'nOfShare_familyStock3', 'familyStock4', 'nOfShare_familyStock4']

# Always start from the four step-1 columns. This keeps the cell re-runnable:
# on a second run numbersOfSharesDf already carries the family columns, and
# concatenating onto it again would make the column assignment below fail.
baseDf = numbersOfSharesDf[['date', 'ticker', 'name', 'nOfShare']].reset_index(drop=True)

tqdm.pandas()
expandDf = baseDf.progress_apply(lambda row : additionalInfo(row['date'], row['name'], baseDf), axis=1, result_type='expand')
# merge the family stock df with the step1 df
numbersOfSharesDf = pd.concat([baseDf, expandDf.reset_index(drop=True)], axis=1, join='inner')
numbersOfSharesDf.columns = cols
numbersOfSharesDf

100%|██████████| 939/939 [00:07<00:00, 130.06it/s]


Unnamed: 0,date,ticker,name,nOfShare,kindOfStock,familyStock1,nOfShare_familyStock1,familyStock2,nOfShare_familyStock2,familyStock3,nOfShare_familyStock3,familyStock4,nOfShare_familyStock4
0,2021-10-11,095570,AJ네트웍스,46822295,commonStock,,,,,,,,
1,2021-10-11,006840,AK홀딩스,13247561,commonStock,,,,,,,,
2,2021-10-11,282330,BGF리테일,17283906,commonStock,,,,,,,,
3,2021-10-11,027410,BGF,95716791,commonStock,,,,,,,,
4,2021-10-11,138930,BNK금융지주,325935246,commonStock,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
934,2021-10-11,069260,휴켐스,40878588,commonStock,,,,,,,,
935,2021-10-11,000545,흥국화재우,768000,preferredStock,흥국화재,64242645.0,흥국화재2우B,153600.0,,,,
936,2021-10-11,000547,흥국화재2우B,153600,preferredStock,흥국화재,64242645.0,흥국화재우,768000.0,,,,
937,2021-10-11,000540,흥국화재,64242645,commonStock,흥국화재2우B,153600.0,흥국화재우,768000.0,,,,
