In [None]:
## Date: May 15 2021
## Author: Benjamin Diethelm-Varela
## Tools for extracting stock data using Python

This is a small demonstration of how to collect, web-scrape and visualize stock data using Python libraries.

Two key tools used here are yfinance and BeautifulSoup.

In [83]:
%matplotlib widget
import yfinance as yf
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() 

In [71]:
# Create a yfinance Ticker object for a company, identified by its stock symbol. Say: IBM
ibm = yf.Ticker('IBM')

# Read the info attribute (accessed without a call) to display the company info. It is displayed as a dict
ibm.info

{'zip': '10504',
 'sector': 'Technology',
 'fullTimeEmployees': 345900,
 'longBusinessSummary': "International Business Machines Corporation provides integrated solutions and services worldwide. Its Cloud & Cognitive Software segment offers software for vertical and domain-specific solutions in health, financial services, supply chain, and asset management, weather, and security software and services application areas; and customer information control system and storage, and analytics and integration software solutions to support client mission critical on-premise workloads in banking, airline, and retail industries. It also offers middleware and data platform software, including Red Hat that enables the operation of clients' hybrid multi-cloud environments; and Cloud Paks, WebSphere distributed, and analytics platform software, such as DB2 distributed, information integration, and enterprise content management, as well as IoT, Blockchain and AI/Watson platforms. The company's Global B

In [72]:
# Look up the 'country' key of the info dict to see where this stock is from
ibm.info['country']

'United States'

In [73]:
# Use the history method to get the share price of a stock over the defined time.
# period='max' asks for the longest stretch possible; the returned frame is indexed by Date.
ibmhist = ibm.history(period='max')
ibmhist

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1962-01-02,1.858593,1.858593,1.837710,1.837710,390000,0.0,0.0
1962-01-03,1.837710,1.853774,1.837710,1.853774,292500,0.0,0.0
1962-01-04,1.853772,1.853772,1.834496,1.835299,262500,0.0,0.0
1962-01-05,1.832889,1.832889,1.795942,1.799155,367500,0.0,0.0
1962-01-08,1.797550,1.797550,1.750964,1.765422,547500,0.0,0.0
...,...,...,...,...,...,...,...
2021-05-10,145.800003,148.380005,145.800003,146.169998,6983400,0.0,0.0
2021-05-11,144.990005,145.190002,142.899994,144.220001,7126400,0.0,0.0
2021-05-12,143.839996,144.149994,141.139999,141.300003,5959600,0.0,0.0
2021-05-13,141.449997,144.899994,141.279999,144.169998,4595800,0.0,0.0


In [74]:
# Move the Date index into a regular column so it can be referenced by name (e.g. x='Date').
# The guard keeps this cell idempotent: without it, re-running the cell would call
# reset_index() on the already-reset frame and add a spurious 'index' column.
if 'Date' not in ibmhist.columns:
    ibmhist = ibmhist.reset_index()

In [75]:
# Display the frame again: after the reset, Date is an ordinary column next to the price columns
ibmhist

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1962-01-02,1.858593,1.858593,1.837710,1.837710,390000,0.0,0.0
1,1962-01-03,1.837710,1.853774,1.837710,1.853774,292500,0.0,0.0
2,1962-01-04,1.853772,1.853772,1.834496,1.835299,262500,0.0,0.0
3,1962-01-05,1.832889,1.832889,1.795942,1.799155,367500,0.0,0.0
4,1962-01-08,1.797550,1.797550,1.750964,1.765422,547500,0.0,0.0
...,...,...,...,...,...,...,...,...
14940,2021-05-10,145.800003,148.380005,145.800003,146.169998,6983400,0.0,0.0
14941,2021-05-11,144.990005,145.190002,142.899994,144.220001,7126400,0.0,0.0
14942,2021-05-12,143.839996,144.149994,141.139999,141.300003,5959600,0.0,0.0
14943,2021-05-13,141.449997,144.899994,141.279999,144.169998,4595800,0.0,0.0


In [76]:
# Let's plot open prices against dates!!
# Reuse the ibmhist frame prepared above (Date is already a column) instead of
# downloading the complete price history a second time just for the plot.
fig, ax = plt.subplots()
sns.lineplot(data=ibmhist, x='Date', y='Open', alpha=0.75, color='darkred', ax=ax)
ax.set_title('IBM stocks open price history')
ax.set_ylabel('Open price (US dollars)')
ax.set_xlabel('Date (year)');  # trailing ';' keeps the Text(...) repr out of the cell output

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'Date (year)')

In [77]:
# Wrap the dividends reported for the ticker in a DataFrame so they can be tabulated and plotted
divs = pd.DataFrame(ibm.dividends)

In [78]:
# Turn the Date index into a regular column so it can be referenced by name when plotting.
# Guarded so that re-running this cell does not add a stray 'index' column.
if 'Date' not in divs.columns:
    divs = divs.reset_index()
divs

Unnamed: 0,Date,Dividends
0,1962-02-06,0.001000
1,1962-05-08,0.001000
2,1962-08-07,0.001000
3,1962-11-05,0.001000
4,1963-02-05,0.001333
...,...,...
231,2020-05-07,1.630000
232,2020-08-07,1.630000
233,2020-11-09,1.630000
234,2021-02-09,1.630000


In [79]:
# Dividends over time, drawn on an explicitly created axes
fig, ax = plt.subplots()
sns.lineplot(data=divs, x='Date', y='Dividends', color='darkgreen', ax=ax)
ax.set_title('IBM dividends over time')
ax.set_ylabel('Dividends (dollar return per owned share)')
ax.set_xlabel('Date (year)');

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Further data extraction with BeautifulSoup webscraping

In [84]:
# Request the Yahoo Finance history page for AMZN and keep its HTML text.
# The query parameters are the ones from the original URL, passed separately for readability
# (period1/period2 are presumably Unix timestamps bounding the window - confirm if changed).
response = requests.get(
    'https://finance.yahoo.com/quote/AMZN/history',
    params={
        'period1': '1451606400',
        'period2': '1612137600',
        'interval': '1mo',
        'filter': 'history',
        'frequency': '1mo',
        'includeAdjustedClose': 'true',
    },
    timeout=30,  # requests waits indefinitely by default; don't let a stalled connection hang the cell
)
# Fail here on an HTTP error status instead of silently parsing an error page further down
response.raise_for_status()
page = response.text

In [86]:
# Build the BeautifulSoup parse tree from the downloaded HTML, using the html5lib parser
beautiful_soup = BeautifulSoup(markup=page, features="html5lib")

In [87]:
# Check the title attribute
beautiful_soup.title

<title>Amazon.com, Inc. (AMZN) Stock Historical Prices &amp; Data - Yahoo Finance</title>

In [95]:
# Let's extract a table with historical share prices. We first take a look at the columns we need to make
for row in beautiful_soup.find("tbody").find_all("tr"):
    # Print the text of every cell in the row, one value per line
    for cell in row.find_all("td"):
        print(cell.text)

# We can see that we get one date item followed by 6 items. In stocks jargon these correspond to open, high, low, close,
# adjusted close, and volume

Jan 01, 2021
3,270.00
3,363.89
3,086.00
3,206.20
3,206.20
71,528,900
Dec 01, 2020
3,188.50
3,350.65
3,072.82
3,256.93
3,256.93
77,556,200
Nov 01, 2020
3,061.74
3,366.80
2,950.12
3,168.04
3,168.04
90,810,500
Oct 01, 2020
3,208.00
3,496.24
3,019.00
3,036.15
3,036.15
116,226,100
Sep 01, 2020
3,489.58
3,552.25
2,871.00
3,148.73
3,148.73
115,899,300
Aug 01, 2020
3,180.51
3,495.00
3,073.00
3,450.96
3,450.96
83,516,600
Jul 01, 2020
2,757.99
3,344.29
2,754.00
3,164.68
3,164.68
127,502,000
Jun 01, 2020
2,448.00
2,796.00
2,437.13
2,758.82
2,758.82
87,818,300
May 01, 2020
2,336.80
2,525.45
2,256.38
2,442.37
2,442.37
82,584,400
Apr 01, 2020
1,932.97
2,475.00
1,889.15
2,474.00
2,474.00
124,609,800
Mar 01, 2020
1,906.49
1,996.33
1,626.03
1,949.72
1,949.72
163,809,100
Feb 01, 2020
2,010.60
2,185.95
1,811.13
1,883.75
1,883.75
92,510,100
Jan 01, 2020
1,875.00
2,055.72
1,815.34
2,008.72
2,008.72
84,698,300
Dec 01, 2019
1,804.40
1,901.40
1,735.00
1,847.84
1,847.84
68,149,600
Nov 01, 2019
1,788.01
1,824.6

In [96]:
# Let's generate the table now
# Column order is kept identical to what the previous row-by-row construction produced
# ("Adj Close" ended up last because it was missing from the declared column list).
AMAZON_COLUMNS = ["Date", "Open", "High", "Low", "Close", "Volume", "Adj Close"]

price_table = beautiful_soup.find("tbody")
if price_table is None:
    # No table body in the parsed page: fail with a clear message instead of an AttributeError
    raise ValueError("No <tbody> element found in the downloaded page; cannot extract the price table")

amazon_records = []
for row in price_table.find_all("tr"):
    cells = [cell.text for cell in row.find_all("td")]
    # A price row holds exactly 7 cells: date, open, high, low, close, adj close, volume.
    # Rows with any other layout are skipped rather than failing with an IndexError.
    if len(cells) != 7:
        continue
    date, Open, high, low, close, adj_close, volume = cells
    amazon_records.append({"Date":date, "Open":Open, "High":high, "Low":low, "Close":close, "Adj Close":adj_close, "Volume":volume})

# Build the frame once from the collected records. DataFrame.append was removed in pandas 2.0,
# and calling it inside the loop copied the whole frame for every row.
# Values are kept as the raw cell text, exactly as scraped.
amazon_data = pd.DataFrame(amazon_records, columns=AMAZON_COLUMNS)

In [97]:
# Check the data: display up to the first 50 rows of the scraped table
amazon_data.head(50)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,"Jan 01, 2021",3270.0,3363.89,3086.0,3206.2,71528900,3206.2
1,"Dec 01, 2020",3188.5,3350.65,3072.82,3256.93,77556200,3256.93
2,"Nov 01, 2020",3061.74,3366.8,2950.12,3168.04,90810500,3168.04
3,"Oct 01, 2020",3208.0,3496.24,3019.0,3036.15,116226100,3036.15
4,"Sep 01, 2020",3489.58,3552.25,2871.0,3148.73,115899300,3148.73
5,"Aug 01, 2020",3180.51,3495.0,3073.0,3450.96,83516600,3450.96
6,"Jul 01, 2020",2757.99,3344.29,2754.0,3164.68,127502000,3164.68
7,"Jun 01, 2020",2448.0,2796.0,2437.13,2758.82,87818300,2758.82
8,"May 01, 2020",2336.8,2525.45,2256.38,2442.37,82584400,2442.37
9,"Apr 01, 2020",1932.97,2475.0,1889.15,2474.0,124609800,2474.0
