# 전 세계 주식 데이터 수집하기

## 유료 데이터 벤더 이용하기

[https://api.tiingo.com/](https://api.tiingo.com)

In [1]:
import keyring

# Store the Tiingo API token in the OS keyring without ever writing it into
# the notebook. The token is read interactively so it is not saved in the
# file or in version control.
# NOTE(review): a token that was previously committed in this cell should be
# treated as compromised and rotated in the Tiingo account settings.
from getpass import getpass

keyring.set_password('tiingo', 'ahn283', getpass('Tiingo API token: '))

In [2]:
# download data
from tiingo import TiingoClient
import pandas as pd
import keyring

# Read the Tiingo token back from the keyring and build the API client.
# The client is configured with the 'session' option enabled and the token
# passed as 'api_key'.
api_key = keyring.get_password('tiingo', 'ahn283')
config = {'session': True, 'api_key': api_key}
client = TiingoClient(config)

In [3]:
# The list_stock_tickers() method returns the ticker listing; the records are
# loaded into a DataFrame for inspection.
tickers = client.list_stock_tickers()
tickers_df = pd.DataFrame.from_records(tickers)

tickers_df.head()

Unnamed: 0,ticker,exchange,assetType,priceCurrency,startDate,endDate
0,-P-H,NYSE,Stock,USD,,
1,-P-S,NYSE,Stock,USD,2018-08-22,2023-05-05
2,000001,SHE,Stock,CNY,2007-01-04,2023-11-23
3,000002,SHE,Stock,CNY,2007-01-04,2023-11-23
4,000003,SHE,Stock,CNY,,


In [4]:
# Break the tickers down by exchange and price currency
# (minor exchanges and over-the-counter markets are not available).
tickers_df.groupby(['exchange', 'priceCurrency'])['ticker'].count()

exchange   priceCurrency
           USD               2459
AMEX       USD                 78
ASX        AUD                169
           USD               2172
BATS       USD                 13
CSE        USD                 32
EXPM       USD               1840
LSE        USD                 12
NASDAQ     USD              12612
NMFQS      USD                 36
NYSE       USD               7733
NYSE ARCA  USD                 79
NYSE MKT   USD                466
NYSE NAT   USD                  3
OTCBB      USD                651
OTCCE      USD               1105
OTCGREY    USD               4253
OTCMKTS    USD               1190
OTCQB      USD               1260
OTCQX      USD                788
PINK       USD              15229
SHE        CNY               2554
           HKD                 12
SHEB       HKD                 42
SHG        CNY               1933
           USD                  6
SHGB       USD                 44
Name: ticker, dtype: int64

In [5]:
# Look up the detailed metadata for the AAPL ticker and print it.
ticker_metadata = client.get_ticker_metadata("AAPL")
print(ticker_metadata)

{'ticker': 'AAPL', 'name': 'Apple Inc', 'description': "Apple Inc. (Apple) designs, manufactures and markets mobile communication and media devices, personal computers, and portable digital music players, and a variety of related software, services, peripherals, networking solutions, and third-party digital content and applications. The Company's products and services include iPhone, iPad, Mac, iPod, Apple TV, a portfolio of consumer and professional software applications, the iOS and OS X operating systems, iCloud, and a variety of accessory, service and support offerings. The Company also delivers digital content and applications through the iTunes Store, App StoreSM, iBookstoreSM, and Mac App Store. The Company distributes its products worldwide through its retail stores, online stores, and direct sales force, as well as through third-party cellular network carriers, wholesalers, retailers, and value-added resellers. In February 2012, the Company acquired app-search engine Chomp.", 

In [6]:
# Price history: daily AAPL prices from 2017-08-01 onwards as a DataFrame.
# divCash: cash dividend, splitFactor: stock-split adjustment factor
historical_prices = client.get_dataframe("AAPL", 
                                         startDate='2017-08-01',
                                         frequency='daily')
historical_prices.head()

Unnamed: 0_level_0,close,high,low,open,volume,adjClose,adjHigh,adjLow,adjOpen,adjVolume,divCash,splitFactor
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-08-01 00:00:00+00:00,150.05,150.22,148.41,149.1,24725526,35.201397,35.241278,34.816656,34.978529,98902104,0.0,1.0
2017-08-02 00:00:00+00:00,157.14,159.75,156.16,159.28,69222793,36.864695,37.476995,36.634789,37.366734,276891172,0.0,1.0
2017-08-03 00:00:00+00:00,155.57,157.21,155.02,157.05,26000738,36.496376,36.881117,36.367348,36.843581,104002952,0.0,1.0
2017-08-04 00:00:00+00:00,156.39,157.4,155.69,156.07,20349532,36.688746,36.92569,36.524528,36.613675,81398128,0.0,1.0
2017-08-07 00:00:00+00:00,158.81,158.92,156.6701,157.06,21870321,37.256473,37.282279,36.754457,36.845927,87481284,0.0,1.0


In [7]:
# Daily valuation metrics (free accounts only get the tickers that are part of
# the Dow Jones 30 index).
fundamentals_daily = client.get_fundamentals_daily('AAPL')
fundamentals_daily_df = pd.DataFrame.from_records(fundamentals_daily)

fundamentals_daily_df.head()

Unnamed: 0,date,marketCap,enterpriseVal,peRatio,pbRatio,trailingPEG1Y
0,2020-11-24T00:00:00.000Z,1969699000000.0,1991192000000.0,34.30874,30.14584,-26.074642
1,2020-11-25T00:00:00.000Z,1984407000000.0,2005900000000.0,34.564931,30.370946,-26.269348
2,2020-11-27T00:00:00.000Z,1993985000000.0,2015478000000.0,34.731753,30.517527,-26.396132
3,2020-11-30T00:00:00.000Z,2036057000000.0,2057550000000.0,35.464578,31.161434,-26.95308
4,2020-12-01T00:00:00.000Z,2098823000000.0,2120316000000.0,36.557859,32.122059,-27.783973


In [8]:
# Financial statements: download the as-reported AAPL statements as CSV text
# and turn them into a DataFrame indexed by report date.
import csv
import io

fundamentals_stmnts = client.get_fundamentals_statements(
    'AAPL', startDate='2019-01-01', asReported=True, fmt='csv'
)
# Parse the text with the csv module instead of splitting on ',' and '\n':
# this handles quoted fields that contain commas and CRLF line endings.
# Blank lines (for example the one after a trailing newline) are skipped.
rows = [row for row in csv.reader(io.StringIO(fundamentals_stmnts, newline=''))
        if row]
# The first row holds the column names; the remaining rows are the data.
df_fs = pd.DataFrame(rows[1:], columns=rows[0])
# Use the 'date' column as the index.
df_fs = df_fs.set_index('date', drop=True)
# Drop any record whose date field is empty.
df_fs = df_fs[df_fs.index != '']

df_fs.head()

Unnamed: 0_level_0,year,quarter,statementType,dataCode,value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-11-03,2023,4,incomeStatement,ebitda,30653000000.0
2023-11-03,2023,4,balanceSheet,debtCurrent,15807000000.0
2023-11-03,2023,4,incomeStatement,netIncComStock,22956000000.0
2023-11-03,2023,4,cashFlow,ncfi,2394000000.0
2023-11-03,2023,4,balanceSheet,taxAssets,0.0


## 티커 수집하기

[https://www.investing.com/](https://www.investing.com/)

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import math
import pandas as pd

# Start Chrome using the driver binary that ChromeDriverManager installs.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Stock-screener URL; the trailing ';1' is the page number.
url = 'https://www.investing.com/stock-screener/?sp=country::5|sector::a|industry::a|equityType::a%3Ceq_market_cap;1'
# open url
driver.get(url)

In [4]:
# driver.page_source holds the page's HTML; parse it into a BeautifulSoup object.
html = BeautifulSoup(driver.page_source, 'lxml')

In [6]:
# Check the country name shown in the search drop-down input.
html.find(class_='js-search-input inputDropDown')['value']

'United States'

In [5]:
# Select the screener results table by its CSS classes and print the first match.
html_table = html.select('table.genTbl.openTbl.resultsStockScreenerTbl.elpTbl')
print(html_table[0])

<table class="genTbl openTbl resultsStockScreenerTbl elpTbl" id="resultsTable">
<thead>
<tr>
<th class="flag"> </th>
<th class="symbol left js-sortable pointer" data-column-name="name_trans"><i title="Name">Name</i><span class="headerSortDefault"></span></th><th class="left js-sortable pointer" data-column-name="viewData.symbol"><i title="Symbol">Symbol</i><span class="headerSortDefault"></span></th><th class="left displayNone js-sortable pointer" data-column-name="exchange_trans"><i title="Exchange">Exchange</i><span class="headerSortDefault"></span></th><th class="left displayNone js-sortable pointer" data-column-name="sector_trans"><i title="Sector">Sector</i><span class="headerSortDefault"></span></th><th class="left displayNone js-sortable pointer" data-column-name="industry_trans"><i title="Industry">Industry</i><span class="headerSortDefault"></span></th><th class="js-sortable pointer" data-column-name="last"><i title="Last">Last</i><span class="headerSortDefault"></span></th><t

In [10]:
# Convert the table to a DataFrame.
# prettify() turns the parse tree produced by BeautifulSoup back into a
# Unicode string, which read_html() then parses into a list of tables.
# The string is wrapped in StringIO because passing literal HTML to
# read_html() is deprecated in recent pandas versions.
from io import StringIO

df_table = pd.read_html(StringIO(html_table[0].prettify()))
df_table_result = df_table[0]
df_table_result.head()

Unnamed: 0.1,Unnamed: 0,Name,Symbol,Exchange,Sector,Industry,Last,Chg. %,Market Cap,Vol.,...,Bull/Bear Power (13 / 1D),CCI (14 / 1D),Highs/Lows (14 / 1D),ROC (1D),RSI (14 / 1D),STOCH (14 / 1D),STOCHRSI (14 / 1D),Ultimate Oscillator (14 /1D),Williams %R (1D),"var columnsSettings_stock_screener = new ColumnsSettings( 'ltr', // strHtmlDir 'resultsContainer', // containerId 'resultsContainer', // tab 'stock_screener', // id 'Currently selected: X (max 8)', // message '8', // intMaxCheckboxes '1' // intMinOptCheckboxes \t); Adjust table Name Symbol Exchange Sector Industry Last Chg. % Market Cap Vol. P/E Ratio MACD (12,26 / 1D) Revenue Average Vol. (3m) EPS Beta Dividend Yield 15 Minutes Hourly Daily Weekly Monthly Daily 1 Week 1 Month YTD 1 Year 3 Years 1-Year Change Dividend Yield (%) P/E Ratio (TTM) Price to Sales (TTM) Price to Cash Flow (MRQ) Price to Free Cash Flow (TTM) Price to Book (MRQ) Price to Tangible Book (MRQ) EPS(MRQ) vs Qtr. 1 Yr. Ago EPS(TTM) vs TTM 1 Yr. Ago 5 Year EPS Growth Sales (MRQ) vs Qtr. 1 Yr. Ago Sales (TTM) vs TTM 1 Yr. Ago (TTM) 5 Year Sales Growth 5 Year Capital Spending Growth Asset Turnover (TTM) Inventory Turnover (TTM) Revenue/Employee (TTM) Net Income/Employee (TTM) Receivable Turnover (TTM) 52 wk Range - High 52 wk Range - Low % Change from 52 wk High % Change from 52 wk Low Previous Month % Change Gross margin (TTM) Gross Margin (5YA) Operating margin (TTM) Operating margin (5YA) Pretax margin (TTM) Pretax margin (5YA) Net Profit margin (TTM) Net Profit margin (5YA) Quick Ratio (MRQ) Current Ratio (MRQ) LT Debt to Equity (MRQ) Total Debt to Equity Dividend Yield 5 Year Avg. 
(5YA) Dividend Growth Rate (ANN) Payout Ratio (TTM) ADX (14 / 1D) ATR (14 / 1D) Bull/Bear Power (13 / 1D) CCI (14 / 1D) Highs/Lows (14 / 1D) ROC (1D) RSI (14 / 1D) STOCH (14 / 1D) STOCHRSI (14 / 1D) Ultimate Oscillator (14 /1D) Williams %R (1D) Advanced Metrics Currently selected: 6 (max 8 ) Apply $('#colSelectPopup_stock_screener').click(function(e) { e.stopPropagation(); \t});"
0,,Apple,AAPL,NASDAQ,Technology,"Computers, Phones & Household Electronics",191.31,0.35%,2.98T,39.63M,...,11.12,109.85,4.58,7.73,71.49,80.29,91.19,60.63,-8.38,
1,,Microsoft,MSFT,NASDAQ,Technology,Software & IT Services,377.85,1.28%,2.81T,23.36M,...,21.21,126.31,8.41,8.48,68.57,75.76,28.96,60.78,-5.77,
2,,Alphabet A,GOOGL,NASDAQ,Technology,Software & IT Services,138.49,1.11%,1.74T,17.82M,...,8.05,158.43,4.28,8.64,62.5,71.87,100.0,64.23,-7.56,
3,,Alphabet C,GOOG,NASDAQ,Technology,Software & IT Services,140.02,1.01%,1.74T,14.95M,...,8.27,150.8,4.23,8.89,62.55,72.41,100.0,63.89,-8.71,
4,,Amazon.com,AMZN,NASDAQ,Consumer Cyclicals,Diversified Retail,146.71,1.95%,1.52T,45.70M,...,7.97,157.47,2.65,6.29,63.92,65.87,54.69,62.47,-9.17,


In [9]:
# Keep only the identifying columns of the first parsed table.
df_table_select = df_table[0][['Name', 'Symbol', 'Exchange', 'Sector', 'Market Cap']]
df_table_select.head()

Unnamed: 0,Name,Symbol,Exchange,Sector,Market Cap
0,Apple,AAPL,NASDAQ,Technology,2.98T
1,Microsoft,MSFT,NASDAQ,Technology,2.81T
2,Alphabet A,GOOGL,NASDAQ,Technology,1.74T
3,Alphabet C,GOOG,NASDAQ,Technology,1.74T
4,Amazon.com,AMZN,NASDAQ,Consumer Cyclicals,1.52T


In [11]:
# Compute the number of pages: read the total result count from the page and
# divide by 50, rounding up (presumably 50 results per page - confirm).
end_num = driver.find_element(By.CLASS_NAME, value='js-total-results').text
print(math.ceil(int(end_num) / 50))

226


In [12]:
# Shut down the driver.
driver.quit()

## 전 종목 티커 크롤링

In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from datetime import datetime
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

# Load the Chrome driver.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# '5' is the country code for the United States.
nationcode = '5'
# First-page URL. It is assembled from adjacent string literals so that it
# stays on one logical line: a triple-quoted string split across lines would
# embed a newline in the URL.
url = (
    'https://investing.com/stock-screener/?sp=country::'
    f'{nationcode}|sector::a|industry::a|equityType::ORD%3Ceq_market_cap;1'
)
# Open the page with Selenium.
driver.get(url)

# The 'Screener Results' section only appears once the table holding the
# stocks has loaded, so wait (up to 10 seconds) until the table body is visible.
# The XPATH of the table body is '//*[@id="resultsTable"]/tbody'.
WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
    (By.XPATH, '//*[@id="resultsTable"]/tbody')
))

# Scrape the total number of stocks and derive the number of pages from it
# (the count is divided by 50 and rounded up).
end_num = driver.find_element(By.CLASS_NAME, value='js-total-results').text
end_num = math.ceil(int(end_num) / 50)

In [15]:
# Crawl every screener page, collect name/ticker/exchange/sector/market cap
# for each stock, then clean the combined table.
from io import StringIO

from selenium.common.exceptions import WebDriverException

# Locator of the results table body; its visibility signals a loaded page.
results_tbody = (By.XPATH, '//*[@id="resultsTable"]/tbody')

all_data_df = []

try:
    for i in tqdm(range(1, end_num + 1)):
        # Build the URL on one logical line so that no newline or indentation
        # whitespace ends up inside the query string.
        url = (
            'https://investing.com/stock-screener/?sp=country::'
            f'{nationcode}|sector::a|industry::a|equityType::ORD'
            f'%3Ceq_market_cap;{i}'
        )
        driver.get(url)

        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(results_tbody)
            )
        except WebDriverException:
            # The table did not show up (timeout or another Selenium error):
            # refresh once and wait again. A second failure propagates.
            time.sleep(1)
            driver.refresh()
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(results_tbody)
            )

        html = BeautifulSoup(driver.page_source, 'lxml')
        html_table = html.select(
            'table.genTbl.openTbl.resultsStockScreenerTbl.elpTbl'
        )
        # Wrap the markup in StringIO: passing literal HTML to read_html()
        # is deprecated in recent pandas versions.
        df_table = pd.read_html(StringIO(html_table[0].prettify()))
        df_table_select = df_table[0][
            ['Name', 'Symbol', 'Exchange', 'Sector', 'Market Cap']
        ]

        all_data_df.append(df_table_select)

        # Pause between page requests.
        time.sleep(2)
finally:
    # Always close the browser, even when a page fails to load or parse.
    driver.quit()

# Once the loop is done, stack all per-page DataFrames row-wise.
all_data_df_bind = pd.concat(all_data_df, axis=0)

# The country name is read from the last page that was parsed.
data_country = html.find(class_='js-search-input inputDropDown')['value']
all_data_df_bind['country'] = data_country
all_data_df_bind['date'] = datetime.today().strftime('%Y-%m-%d')
# Some stocks come in with an empty name; drop them.
all_data_df_bind = all_data_df_bind[~all_data_df_bind['Name'].isnull()]
# Keep only the tradable exchanges.
all_data_df_bind = all_data_df_bind[all_data_df_bind['Exchange'].isin(
    ['NASDAQ', 'NYSE', 'NYSE Amex']
)]
# When a symbol appears more than once, keep a single row.
all_data_df_bind = all_data_df_bind.drop_duplicates(['Symbol'])
all_data_df_bind.reset_index(inplace=True, drop=True)
# Replace NaN with None in the final table.
all_data_df_bind = all_data_df_bind.replace({np.nan: None})

100%|██████████| 167/167 [12:40<00:00,  4.56s/it]


In [20]:
# Preview rows 1150-1158 of the cleaned ticker table.
all_data_df_bind[1150:1159]

Unnamed: 0,Name,Symbol,Exchange,Sector,Market Cap,country,date
1150,Enstar,ESGR,NASDAQ,Financials,4.09B,United States,2023-11-25
1151,Madison Square Garden Sports,MSGS,NYSE,Consumer Cyclicals,4.09B,United States,2023-11-25
1152,Lincoln National,LNC,NYSE,Financials,4.09B,United States,2023-11-25
1153,Grand Canyon Education,LOPE,NASDAQ,Academic & Educational Services,4.08B,United States,2023-11-25
1154,Schneider National,SNDR,NYSE,Industrials,4.08B,United States,2023-11-25
1155,Crescent Point Energy,CPG,NYSE,Energy,4.07B,United States,2023-11-25
1156,Synaptics,SYNA,NASDAQ,Technology,4.07B,United States,2023-11-25
1157,Ashland Global,ASH,NYSE,Basic Materials,4.05B,United States,2023-11-25
1158,Hashicorp,HCP,NASDAQ,Technology,4.05B,United States,2023-11-25


In [24]:
# Insert the crawled tickers into the ticker_global table of the local
# 'stock' MySQL database, updating the descriptive columns when the row's
# key already exists.
from getpass import getpass

import pymysql

# The database password is prompted for instead of being stored in the
# notebook, so it never ends up in the file or in version control.
con = pymysql.connect(user='root',
                      passwd=getpass('MySQL password for root: '),
                      host='127.0.0.1',
                      db='stock',
                      charset='utf8')
# Each column is refreshed from its own incoming value; sector must use
# VALUES(sector), otherwise the exchange name is written into the sector column.
query = """
INSERT INTO ticker_global (name, symbol, exchange, sector, market_cap, country, date)
VALUES (%s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
name=VALUES(name), exchange=VALUES(exchange), sector=VALUES(sector), market_cap=VALUES(market_cap);
"""

# One parameter list per DataFrame row, in the column order used by the query.
args = all_data_df_bind.values.tolist()

try:
    mycursor = con.cursor()
    mycursor.executemany(query, args)
    con.commit()
finally:
    # Release the connection even if the insert fails.
    con.close()

## 주가 다운로드