# Yahoo! Finance Scraper
Extract financial data and historical stock prices from Yahoo! Finance using the JSON strings embedded in the page's background JavaScript and a hidden API.

In [None]:
import re
import json
import csv
from io import StringIO
from bs4 import BeautifulSoup
import requests

- First, navigate to [https://finance.yahoo.com/](https://finance.yahoo.com/) and enter the stock you want to look up.  
- You'll notice several tabs along the page such as "Stats", "Chart", "Financials", "Analysis", etc...  
- Navigate to the "Financials" tab. Notice that the **Income Statement** and the **Balance Sheet** are available as well as **Annual** and **Quarterly** options.  
- Copy the url for this tab, and for the "Profile" and "Statistics" tabs. We are going to scrape the data from these 3 tabs first.  

Replace each occurrence of the stock symbol in the url with a pair of curly braces (`{}`) to turn it into a `str.format` template.

In [None]:
# URL templates: every quote page shares the same base path; the ticker
# symbol is substituted twice (once in the path, once in the `p` query field).
quote_page = 'https://finance.yahoo.com/quote/{}/'
url_stats = quote_page + 'key-statistics?p={}'
url_profile = quote_page + 'profile?p={}'
url_financials = quote_page + 'financials?p={}'

# Ticker symbol of the stock to scrape.
ticker = 'AAPL'

## Extracting and parsing the html & json data

Now, use the "Financials" template to request the webpage, passing in the `ticker` variable to fill in the url template.

In [None]:
# Browser-like User-Agent sent with the request.
# NOTE(review): presumably Yahoo rejects the default python-requests agent — confirm.
headers = {'User-Agent': 'Mozilla/5.0'}

# Other pages can be fetched the same way, e.g.
# https://finance.yahoo.com/most-active?count=100&offset=0
# The timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(
    url_financials.format(ticker, ticker),
    headers=headers,
    timeout=30,
)
# Report the outcome instead of raising, so the notebook keeps running either way.
print(f"response.ok : {response.ok} , response.status_code : {response.status_code}")

Next, parse the html using `BeautifulSoup`

In [None]:
# Parse the downloaded HTML text with the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')
# Uncomment to inspect every <script> tag, or the pretty-printed document:
# sc=soup.find_all('script')
# sc
#print(soup.prettify())

In [None]:
#data found in the line 237 onwards

If you were to look at the raw html, you would notice that there is a lot of javascript code and not a lot of html to work with. You may also notice that embedded in the code there are json formatted text strings. Fortunately for us, there is a javascript function, appropriately commented with "--Data--". This function is located inside of a generic "script" tag. However, we can use regular expressions with BeautifulSoup in order to identify the script tag with the function we're looking for.

In [None]:
# Locate the <script> tag whose JavaScript text contains the "-- Data --" marker.
pattern = re.compile(r'\s--\sData\s--\s')  # \s matches a single whitespace character

# `string=` is the current name of the argument formerly spelled `text=`.
script_tag = soup.find('script', string=pattern)
if script_tag is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("no <script> tag containing the '-- Data --' marker was found")

# First child of the tag, i.e. the script's text.
script_data = script_tag.contents[0]

In [None]:
# Show which type the extracted script contents have.
#print(script_data.prettify())
type(script_data)

In [None]:
#script_data

There's a lot of good json data here, but it's wrapped in a javascript function, as you can clearly see. However, if we can identify the starting and ending position of this json data, we can slice it and then parse it with the `json.loads` function.

In [None]:
# Beginning: the first 500 characters of the script text.
script_data[:500]

In [None]:
# The end: the last 500 characters of the script text.
script_data[-500:]

In [None]:
# The very last character of the script text.
script_data[-1]

In [None]:
# Index of the first occurrence of "Ordinary Shares Number" in the script text
# (str.find returns -1 when the text is absent).
shares_number=script_data.find("Ordinary Shares Number")
shares_number

In [None]:
# Find the starting position of the json string: two characters before the
# first occurrence of "context".
# NOTE(review): presumably those two characters are the opening `{"` — confirm.
start = script_data.find("context")-2

# Slice the json string. The -12 removes the trailing JavaScript wrapper
# ';\n}(this));\n' (12 characters) so that only JSON is left to parse.
json_data = json.loads(script_data[start:-12]) # json.loads parses JSON from a string

In [None]:
#script_data[start:]

In [None]:
#json_data

In [None]:
# Display the 'stores' mapping; append .keys() to list only the store names.
json_data['context']['dispatcher']['stores']#.keys()

## Financial statements

Now that you have the data, you can explore the dictionary to discover what's inside. This dataset contains both Annual and Quarterly financial statements, as you can see from the dictionary paths listed below.

In [None]:
# Quarterly balance sheet statements stored under QuoteSummaryStore.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['balanceSheetHistoryQuarterly']['balanceSheetStatements']

In [None]:
# Names of every section available in QuoteSummaryStore.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore'].keys()

In [None]:
# Keep the QuoteSummaryStore section names for the exploration loop, then display them.
dic_keys=json_data['context']['dispatcher']['stores']['QuoteSummaryStore'].keys()
dic_keys

In [None]:
import pandas as pd

# Walk every section of QuoteSummaryStore and print its dict-valued entries.
summary_store = json_data['context']['dispatcher']['stores']['QuoteSummaryStore']
for c, i in enumerate(dic_keys, start=1):
    print(f"\n{i:=^130}\n")
    print(f"{c}){i}")
    section = summary_store[i]
    if not isinstance(section, dict):
        # A non-dict section has no .keys(); report it and move on instead of
        # aborting the whole walk with an AttributeError.
        # NOTE(review): presumably some entries are plain strings — confirm.
        print("not dictionary !\n")
        continue
    temp = section.keys()
    print(f"temp:\n{temp}")
    for d, j in enumerate(temp, start=1):
        temp1 = section[j]
        if isinstance(temp1, dict):
            print(f"\ntemp1 {d}->{j}):\n{temp1}\n")
        else:
            print("not dictionary !\n")
           
#         stmts = []
#         for s in temp1:
#             statement = {}
#             for key, val in s.items():
#                 try:
#                     statement[key] = val['raw']
#                 except TypeError:
#                     continue
#                 except KeyError:
#                     continue
#             stmts.append(statement)
    
    #
        
        #pass
    #temp_list=list(temp.values())[0]
    #print(f"{type(temp)}")
    # print(f"keys: {temp.keys()}")
    # print(f"\nfirst: {temp_list}\n")
    #print(f"{temp.to_string()}")
    #print(json_data['context']['dispatcher']['stores']['QuoteSummaryStore'][i])
    #if test in ['cashflowStatements']
    
    
    #stmts = []

# consolidate annual
    # for s in temp:
    #     statement = {}
    #     for key, val in s.items():
    #         try:
    #             statement[key] = val['raw']
    #         except TypeError:
    #             continue
    #         except KeyError:
    #             continue
    #     stmts.append(statement)
    
    #
    
    

In [None]:
#json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['cashflowStatementHistory']#['financialsTemplate']#.keys()

In [None]:
# Look the quarterly income statements up here so this cell also works when the
# notebook is run top to bottom (the later lookup cell assigns the same value).
quarterly_is = json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['incomeStatementHistoryQuarterly']['incomeStatementHistory']

# Consolidate the quarterly income statements: keep, for every account, only
# its 'raw' value. Entries that are not dicts or have no 'raw' key are skipped.
quaterly_is_stmts = [
    {
        key: val['raw']
        for key, val in s.items()
        if isinstance(val, dict) and 'raw' in val
    }
    for s in quarterly_is
]

In [None]:
# All financial statements live under the same QuoteSummaryStore mapping.
summary_store = json_data['context']['dispatcher']['stores']['QuoteSummaryStore']

# Income statement (annual, quarterly).
annual_is = summary_store['incomeStatementHistory']['incomeStatementHistory']
quarterly_is = summary_store['incomeStatementHistoryQuarterly']['incomeStatementHistory']

# Cash flow statement (annual, quarterly).
annual_cf = summary_store['cashflowStatementHistory']['cashflowStatements']
quarterly_cf = summary_store['cashflowStatementHistoryQuarterly']['cashflowStatements']

# Balance sheet (annual, quarterly).
annual_bs = summary_store['balanceSheetHistory']['balanceSheetStatements']
quarterly_bs = summary_store['balanceSheetHistoryQuarterly']['balanceSheetStatements']

In [None]:
# The whole 'balanceSheetHistoryQuarterly' section (not just its statements list).
quarterly_bs_test = json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['balanceSheetHistoryQuarterly']
# Uncomment to display it:
#quarterly_bs_test

In [None]:
# Example of balance sheet accounts: the first quarterly balance sheet statement.
print(quarterly_bs[0])

In [None]:
# Account names present in the first quarterly balance sheet statement.
quarterly_bs[0].keys()

In [None]:
# Consolidate the quarterly income statements: one dict per statement, mapping
# each account to its 'raw' value. Entries that are not dicts, or that carry
# no 'raw' key, are left out.
quaterly_is_stmts = [
    {
        account: figures['raw']
        for account, figures in stmt.items()
        if isinstance(figures, dict) and 'raw' in figures
    }
    for stmt in quarterly_is
]

In [None]:
import pandas as pd
# One row per consolidated quarterly income statement ...
df=pd.DataFrame(quaterly_is_stmts)
# ... displayed transposed, so that each statement becomes a column.
df.T

In [None]:
# Consolidate the quarterly balance sheets: one dict per statement, mapping
# each account to its 'raw' value. Entries that are not dicts, or that carry
# no 'raw' key, are left out.
quaterly_bs_stmts = [
    {
        account: figures['raw']
        for account, figures in stmt.items()
        if isinstance(figures, dict) and 'raw' in figures
    }
    for stmt in quarterly_bs
]

In [None]:
# pandas is already imported as `pd` by an earlier cell.
#import pandas as pd
# One row per consolidated quarterly balance sheet ...
df1=pd.DataFrame(quaterly_bs_stmts)
# ... then transposed, so that each statement becomes a column.
df1=df1.T
df1

In [None]:
# shares=64849000000;
# shares_in_millions=shares*1.0/10^3
# shares_in_millions

In [None]:
# There's a variety of number formats provided for each account; the loops
# below keep only the 'raw' one.
annual_is[0]['operatingIncome']

The data can be consolidated into an easy-to-read (or easy-to-export) data set with a loop

In [None]:
# Consolidate the annual income statements: one dict per statement, mapping
# each account to its 'raw' value. Entries that are not dicts, or that carry
# no 'raw' key, are left out.
annual_is_stmts = [
    {
        account: figures['raw']
        for account, figures in stmt.items()
        if isinstance(figures, dict) and 'raw' in figures
    }
    for stmt in annual_is
]

In [None]:
# First consolidated annual income statement.
annual_is_stmts[0]

This model can be applied to all other financial statements, as you can see from the examples below.

In [None]:
def _raw_values(statements):
    """Return one dict per statement, mapping each account to its 'raw' value.

    Entries that are not dicts, or that carry no 'raw' key, are left out.
    """
    return [
        {
            account: figures['raw']
            for account, figures in stmt.items()
            if isinstance(figures, dict) and 'raw' in figures
        }
        for stmt in statements
    ]


# Consolidated cash flow statements: annual first, then quarterly.
annual_cf_stmts = _raw_values(annual_cf)
quarterly_cf_stmts = _raw_values(quarterly_cf)

In [None]:
# First consolidated annual cash flow statement.
annual_cf_stmts[0]

## Profile Data

We can apply the same steps used for the financial statements to the Profile data

In [None]:
# Same extraction steps as for the Financials page, applied to the Profile page.
# The symbol comes from `ticker`, and the request carries the same browser-like
# `headers` as the Financials request plus a timeout so it cannot hang forever.
response = requests.get(url_profile.format(ticker, ticker), headers=headers, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the <script> tag whose text contains the "-- Data --" marker.
pattern = re.compile(r'\s--\sData\s--\s')
script_tag = soup.find('script', string=pattern)
if script_tag is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("no <script> tag containing the '-- Data --' marker was found")
script_data = script_tag.contents[0]

# Slice from two characters before "context" up to the 12-character
# JavaScript tail, then parse the JSON.
start = script_data.find("context")-2
json_data = json.loads(script_data[start:-12])

In [None]:
# Sections available in QuoteSummaryStore for the Profile page.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore'].keys()

In [None]:
# Fields available in the 'assetProfile' section.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile'].keys()

In [None]:
# Data for company officers (just the first 3 are listed for brevity).
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['companyOfficers'][:3]

In [None]:
# Business description, stored under 'longBusinessSummary'.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['longBusinessSummary']

In [None]:
# SEC filings from EDGAR (just the first 3 are listed for brevity).
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['secFilings']['filings'][:3]

In [None]:
# A lot of other data is available, e.g. the 'summaryDetail' section.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['summaryDetail']

# Statistics

In [None]:
# Same extraction steps as for the Financials page, applied to the Statistics page.
# The symbol comes from `ticker`, and the request carries the same browser-like
# `headers` as the Financials request plus a timeout so it cannot hang forever.
response = requests.get(url_stats.format(ticker, ticker), headers=headers, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the <script> tag whose text contains the "-- Data --" marker.
pattern = re.compile(r'\s--\sData\s--\s')
script_tag = soup.find('script', string=pattern)
if script_tag is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("no <script> tag containing the '-- Data --' marker was found")
script_data = script_tag.contents[0]

# Slice from two characters before "context" up to the 12-character
# JavaScript tail, then parse the JSON.
start = script_data.find("context")-2
json_data = json.loads(script_data[start:-12])

In [None]:
# The 'defaultKeyStatistics' section of the Statistics page data.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']

# Historical Stock Data

This data uses a hidden api, as you can see from the "query" prefix, the version number (V7), and the variety of parameters.

In [None]:
# Full download URL for symbol "F": endpoint first, query string second.
stock_url = (
    'https://query1.finance.yahoo.com/v7/finance/download/F'
    '?period1=1568483641&period2=1600106041&interval=1d&events=history'
)

In [None]:
# Send the same browser-like headers as the Financials page request, and bound
# the wait so the cell cannot hang indefinitely.
response = requests.get(stock_url, headers=headers, timeout=30)

In [None]:
# Parse the response body as CSV into a list of rows (each row a list of strings).
data = list(csv.reader(StringIO(response.text)))

# Preview: print the first 5 records, one per line.
preview = data[:5]
for row in preview:
    print(row)

You can start to customize this by pulling out the parameters from the URL and putting them into a dictionary. 

In [None]:
# Download endpoint with a placeholder for the ticker symbol.
stock_url = 'https://query1.finance.yahoo.com/v7/finance/download/{}?'

# Query parameters lifted out of the URL.
params = dict(
    period1='1568483641',
    period2='1600106041',
    interval='1d',
    events='history',
)

By inspecting the request headers and parameters online, it's possible to see how this can be simplified further... by using the range parameter instead of the periods.

In [None]:
# Query parameters using a relative range instead of explicit period bounds.
params = dict(range='5y', interval='1d', events='history')

In [None]:
# Fill the template with `ticker` (the symbol chosen earlier) and let requests
# encode the query parameters. The browser-like headers match the Financials
# page request, and the timeout keeps the cell from hanging indefinitely.
response = requests.get(
    stock_url.format(ticker),
    params=params,
    headers=headers,
    timeout=30,
)

In [None]:
# Parse the response body as CSV into a list of rows (each row a list of strings).
data = list(csv.reader(StringIO(response.text)))

# Preview: print the first 5 records, one per line.
preview = data[:5]
for row in preview:
    print(row)