# Project 9: Web Scraping, APIs & Wrappers (US Stocks)

## Web Scraping - the Dow Jones Constituents

In [1]:
import pandas as pd

In [2]:
# Exploratory step: read_html returns a list with one DataFrame per HTML table
# found on the page; the list is displayed to locate the constituents table.
pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")

[                                                   0  \
 0  Historical logarithmic graph of the DJIA from ...   
 1                                         Foundation   
 2                                           Operator   
 3                                          Exchanges   
 4                                     Trading symbol   
 5                                       Constituents   
 6                                               Type   
 7                                         Market cap   
 8                                   Weighting method   
 9                                            Website   
 
                                                    1  
 0  Historical logarithmic graph of the DJIA from ...  
 1  February 16, 1885; 136 years ago[1]May 26, 189...  
 2                              S&P Dow Jones Indices  
 3                      New York Stock ExchangeNASDAQ  
 4                                  ^DJI$INDU.DJIDJIA  
 5                                 

In [26]:
# Read every HTML table on the DJIA Wikipedia page; the constituents table is
# expected at position 1.
wiki_tables = pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")
const = wiki_tables[1]

# A positional index breaks silently when the page layout changes, so fail
# fast unless the selected table has the columns later cells use by name.
required_cols = {"Symbol", "Date added"}
missing_cols = required_cols.difference(const.columns)
if missing_cols:
    raise ValueError(
        "Table 1 is not the DJIA constituents table; missing columns: {}".format(
            sorted(missing_cols)
        )
    )

In [4]:
# Inspect the selected table to confirm it is the constituents list.
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added,Notes,Index weighting
0,3M,NYSE,MMM,Conglomerate,1976-08-09,As Minnesota Mining and Manufacturing,3.62%
1,American Express,NYSE,AXP,Financial services,1982-08-30,,3.00%
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31,,4.18%
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,,2.78%
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,,4.12%
5,Caterpillar Inc.,NYSE,CAT,Construction and Mining,1991-05-06,,3.96%
6,Chevron Corporation,NYSE,CVX,Petroleum industry,2008-02-19,Also 1930-07-18 to 1999-11-01,1.82%
7,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08,,1.10%
8,The Coca-Cola Company,NYSE,KO,Food industry,1987-03-12,Also 1932-05-26 to 1935-11-20,1.04%
9,Dow Inc.,NYSE,DOW,Chemical industry,2019-04-02,,1.18%


In [27]:
# Keep the first five columns by position (Company, Exchange, Symbol, Industry,
# Date added), dropping Notes and Index weighting. The .copy() makes `const` an
# independent frame rather than a slice of the scraped table.
const = const.iloc[:, :5].copy()
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added
0,3M,NYSE,MMM,Conglomerate,1976-08-09
1,American Express,NYSE,AXP,Financial services,1982-08-30
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12
5,Caterpillar Inc.,NYSE,CAT,Construction and Mining,1991-05-06
6,Chevron Corporation,NYSE,CVX,Petroleum industry,2008-02-19
7,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08
8,The Coca-Cola Company,NYSE,KO,Food industry,1987-03-12
9,Dow Inc.,NYSE,DOW,Chemical industry,2019-04-02


In [28]:
# Rename to a valid Python identifier so the column can be reached with
# attribute access (const.Date_Added). The result is rebound instead of using
# inplace = True, which keeps the step chainable and avoids in-place mutation.
const = const.rename(columns = {"Date added": "Date_Added"})

In [29]:
# Parse the date strings into pandas datetimes and store them back in the
# same column.
const["Date_Added"] = pd.to_datetime(const["Date_Added"])

In [30]:
# Verify the result: column dtypes and non-null counts (Date_Added should now
# be datetime64).
const.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
Company       30 non-null object
Exchange      30 non-null object
Symbol        30 non-null object
Industry      30 non-null object
Date_Added    30 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


## Normalizing Unicode Strings and Getting the Ticker Symbols

In [31]:
import unicodedata

In [32]:
# Inspect the Symbol column before normalization.
const.Symbol

0      MMM
1      AXP
2     AMGN
3     AAPL
4       BA
5      CAT
6      CVX
7     CSCO
8       KO
9      DOW
10      GS
11      HD
12     HON
13     IBM
14    INTC
15     JNJ
16     JPM
17     MCD
18     MRK
19    MSFT
20     NKE
21      PG
22     CRM
23     TRV
24     UNH
25      VZ
26       V
27     WBA
28     WMT
29     DIS
Name: Symbol, dtype: object

In [34]:
# Look at a single symbol (index label 1) as a plain Python string; presumably
# to spot characters that the column display would hide — confirm intent.
const.Symbol[1]

'AXP'

In [11]:
# Same values as a plain Python list, which shows the repr of every element.
const.Symbol.to_list()

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [12]:
# Check column dtypes and non-null counts before applying string operations.
const.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
Company       30 non-null object
Exchange      30 non-null object
Symbol        30 non-null object
Industry      30 non-null object
Date_Added    30 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


In [13]:
# Preview NFKD normalization of the symbols without modifying `const`.
# The vectorized .str.normalize replaces apply + unicodedata.normalize and
# passes missing values through as NaN instead of raising TypeError.
const.Symbol.str.normalize("NFKD")

0      MMM
1      AXP
2     AMGN
3     AAPL
4       BA
5      CAT
6      CVX
7     CSCO
8       KO
9      DOW
10      GS
11      HD
12     HON
13     IBM
14    INTC
15     JNJ
16     JPM
17     MCD
18     MRK
19    MSFT
20     NKE
21      PG
22     CRM
23     TRV
24     UNH
25      VZ
26       V
27     WBA
28     WMT
29     DIS
Name: Symbol, dtype: object

In [14]:
# Store the NFKD-normalized symbols back in the Symbol column. The vectorized
# .str.normalize replaces apply + unicodedata.normalize and passes missing
# values through as NaN instead of raising TypeError.
const["Symbol"] = const["Symbol"].str.normalize("NFKD")

In [15]:
# Spot-check the first symbol after normalization.
const.Symbol[0]

'MMM'

In [16]:
# Take the part after the last ": " separator as the ticker (presumably for
# symbols written as "EXCHANGE: TICKER" — confirm against the source table);
# symbols without the separator are kept unchanged. .str[-1] is the vectorized
# form of apply(lambda x: x[-1]) and yields NaN for a missing symbol instead of
# raising TypeError.
const["Ticker"] = const["Symbol"].str.split(": ").str[-1]

In [17]:
# Review the table with the new Ticker column.
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date_Added,Ticker
0,3M,NYSE,MMM,Conglomerate,1976-08-09,MMM
1,American Express,NYSE,AXP,Financial services,1982-08-30,AXP
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31,AMGN
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,AAPL
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,BA
5,Caterpillar Inc.,NYSE,CAT,Construction and Mining,1991-05-06,CAT
6,Chevron Corporation,NYSE,CVX,Petroleum industry,2008-02-19,CVX
7,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08,CSCO
8,The Coca-Cola Company,NYSE,KO,Food industry,1987-03-12,KO
9,Dow Inc.,NYSE,DOW,Chemical industry,2019-04-02,DOW


In [18]:
# Collect the ticker symbols into a plain Python list for the price download.
ticker_list = const["Ticker"].tolist()

In [19]:
# Show the tickers that will be passed to the price download.
ticker_list

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [20]:
# Save the cleaned constituents table; index = False leaves the 0..29
# RangeIndex out of the file.
const.to_csv("const.csv", index = False)

## Loading and Saving Historical Stock Prices

In [35]:
import pandas as pd
import yfinance as yf

ModuleNotFoundError: No module named 'yfinance'

In [27]:
# Recap of the tickers to download; `ticker_list` was built from const.Ticker
# in the previous section, so that section must have been run first.
ticker_list

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [None]:
# Download historical data for every ticker in `ticker_list` for the given
# start/end dates. Requires the yfinance import above to succeed; in the saved
# run it failed with ModuleNotFoundError, so install the package first.
prices = yf.download(ticker_list, start = "2007-01-01", end = "2020-03-31")

In [None]:
# Inspect the downloaded data.
prices

In [None]:
# Summarize the index, columns and dtypes of the download.
prices.info()

In [None]:
# Reduce the download to its "Close" data and detach it with a copy.
# This rebinds `prices`, so the cell is meant to run once per download:
# re-running it would look up "Close" in the already reduced result.
prices = prices["Close"].copy()

In [None]:
# Summarize the frame again after keeping only the "Close" data.
prices.info()

In [None]:
# Inspect the closing prices.
prices

In [None]:
# Save the closing prices. The index is written too (to_csv default),
# presumably because it holds the dates — confirm.
prices.to_csv("const_prices.csv")

In [None]:
# Download data for the index itself (symbol ^DJI) over the same start/end
# dates as the constituents.
dji = yf.download("^DJI", start = "2007-01-01",  end = "2020-03-31")

In [None]:
# Inspect the downloaded index data.
dji

In [None]:
# Save the index data; the index is written as well (to_csv default).
dji.to_csv("dji.csv")