## Web scraping - DJIA 

In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# Fetch every HTML table on the DJIA Wikipedia page; the table of interest is at index 1.
pd.read_html(io="https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")

[                                                   0  \
 0  Historical logarithmic graph of the DJIA from ...   
 1                                         Foundation   
 2                                           Operator   
 3                                          Exchanges   
 4                                     Trading symbol   
 5                                       Constituents   
 6                                               Type   
 7                                         Market cap   
 8                                   Weighting method   
 9                                            Website   
 
                                                    1  
 0  Historical logarithmic graph of the DJIA from ...  
 1  February 16, 1885; 135 years ago[1]May 26, 189...  
 2                              S&P Dow Jones Indices  
 3                      New York Stock ExchangeNASDAQ  
 4                                               ^DJI  
 5                                 

In [3]:
# Load the constituents table from the DJIA Wikipedia page.
# Select it by content instead of by position: `match` keeps only the tables whose
# text contains "Date added" (a column header of the constituents table), so the
# lookup keeps working if tables are added or removed earlier on the page.
df_djia = pd.read_html(
    "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average",
    match="Date added",
)[0]
df_djia.head()

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added,Notes,"Index weighting(as of September 1, 2020"
0,3M Company,NYSE,NYSE: MMM,Conglomerate,1976-08-09,As Minnesota Mining and Manufacturing,3.80%
1,American Express,NYSE,NYSE: AXP,Financial services,1982-08-30,,2.35%
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31,,5.81%
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,,2.87%
4,Boeing,NYSE,NYSE: BA,Aerospace and defense,1987-03-12,,4.04%


In [4]:
# Keep only the first five columns (Company ... Date added); the trailing
# Notes and index-weighting columns are dropped.
df_djia = df_djia.loc[:, df_djia.columns[:5]].copy()

In [5]:
# Preview the first five rows of the trimmed table.
df_djia.head(n=5)

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added
0,3M Company,NYSE,NYSE: MMM,Conglomerate,1976-08-09
1,American Express,NYSE,NYSE: AXP,Financial services,1982-08-30
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19
4,Boeing,NYSE,NYSE: BA,Aerospace and defense,1987-03-12


In [6]:
# Print the column dtypes and non-null counts of the trimmed table.
df_djia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Company     30 non-null     object
 1   Exchange    30 non-null     object
 2   Symbol      30 non-null     object
 3   Industry    30 non-null     object
 4   Date added  30 non-null     object
dtypes: object(5)
memory usage: 1.3+ KB


In [7]:
# Give the date column an identifier-friendly name (no space).
# Reassign the returned frame instead of mutating with inplace=True.
df_djia = df_djia.rename(columns={'Date added': 'Date_added'})

In [8]:
# Convert the date strings in 'Date_added' to datetimes.
df_djia['Date_added'] = df_djia['Date_added'].pipe(pd.to_datetime)

In [9]:
# The symbol will be used to pull prices from Yahoo; look at the first raw value.
df_djia.loc[0, 'Symbol']

'NYSE:\xa0MMM'

In [10]:
#the symbol contains a non-breaking space (\xa0) - its unicode needs to be normalized
import unicodedata

In [11]:
#first - normalize the unicode text (preview only; nothing is assigned here)
# NFKD replaces the non-breaking space (\xa0) with a plain space. The vectorised
# .str.normalize is used instead of apply + lambda; it leaves missing values untouched.
df_djia['Symbol'].str.normalize('NFKD')

0     NYSE: MMM
1     NYSE: AXP
2          AMGN
3          AAPL
4      NYSE: BA
5     NYSE: CAT
6     NYSE: CVX
7          CSCO
8      NYSE: KO
9     NYSE: DOW
10     NYSE: GS
11     NYSE: HD
12    NYSE: HON
13    NYSE: IBM
14         INTC
15    NYSE: JNJ
16    NYSE: JPM
17    NYSE: MCD
18    NYSE: MRK
19         MSFT
20    NYSE: NKE
21     NYSE: PG
22    NYSE: CRM
23    NYSE: TRV
24    NYSE: UNH
25     NYSE: VZ
26      NYSE: V
27          WBA
28    NYSE: WMT
29    NYSE: DIS
Name: Symbol, dtype: object

In [12]:
# Store the NFKD-normalised symbols, turning the non-breaking space into a plain space.
# The vectorised .str.normalize replaces apply + lambda and leaves missing values untouched.
df_djia['Symbol'] = df_djia['Symbol'].str.normalize('NFKD')

In [13]:
# Check the first two symbols after normalisation.
df_djia['Symbol'].head(2)

0    NYSE: MMM
1    NYSE: AXP
Name: Symbol, dtype: object

In [14]:
#second - grab ticker
# The ticker is the last piece after splitting on ': ' (e.g. 'NYSE: MMM' -> 'MMM');
# symbols without an exchange prefix are kept whole. The .str[-1] accessor replaces
# apply + lambda and returns NaN for missing values instead of raising.
df_djia['Ticker'] = df_djia['Symbol'].str.split(': ').str[-1]

In [15]:
# Preview the first five rows, now including the 'Ticker' column.
df_djia.head(n=5)

Unnamed: 0,Company,Exchange,Symbol,Industry,Date_added,Ticker
0,3M Company,NYSE,NYSE: MMM,Conglomerate,1976-08-09,MMM
1,American Express,NYSE,NYSE: AXP,Financial services,1982-08-30,AXP
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31,AMGN
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,AAPL
4,Boeing,NYSE,NYSE: BA,Aerospace and defense,1987-03-12,BA


In [16]:
#third - use ticker to pull price from yahoo

import yfinance as yf

In [17]:
# Collect the tickers into a plain Python list for the price download.
ticker_list = df_djia['Ticker'].tolist()
ticker_list

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [18]:
# Download the price history of every ticker in ticker_list from Yahoo Finance
# for the requested date window, then preview the first rows.
df_price = yf.download(
    tickers=ticker_list,
    start='2000-01-01',
    end='2020-12-31',
)
df_price.head()

[*********************100%***********************]  30 of 30 completed


Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2000-01-03,0.86217,50.17403,34.383446,25.741438,11.685953,,40.733959,16.678095,23.115221,,...,5978800.0,53228400.0,8014400.0,4275000.0,336400.0,3431200.0,,4663700.0,2095900.0,8369900.0
2000-01-04,0.789479,46.337494,33.085163,25.701408,11.535748,,38.44873,16.678095,24.469255,,...,7533100.0,54119000.0,9810400.0,4270800.0,494400.0,5997600.0,,5005800.0,1709100.0,6745100.0
2000-01-05,0.801033,47.931908,32.244839,27.302753,11.806119,,38.330936,16.977245,25.484774,,...,7598300.0,64059600.0,6542400.0,5098400.0,736000.0,10562400.0,,6368500.0,2167800.0,7018700.0
2000-01-06,0.731713,48.729095,32.90176,27.582977,12.406937,,37.694832,17.700218,24.469255,,...,4760500.0,54976600.0,4891200.0,6524200.0,660400.0,15300000.0,,4705600.0,2595400.0,6544500.0
2000-01-07,0.766373,54.209888,33.380791,28.383648,12.812489,,39.909409,18.011839,24.082386,,...,10373300.0,62013600.0,3993600.0,9832000.0,594700.0,16160800.0,,5043800.0,3629900.0,7976900.0


In [19]:
# List the labels of the top column level, i.e. the available price fields.
df_price.columns.levels[0]

Index(['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')

In [20]:
# Only the close prices are of interest.
# Alias pd.IndexSlice, the pandas helper for building slices on MultiIndex axes.

idx = pd.IndexSlice

In [21]:
# Take all rows and only the 'Close' group of the top column level,
# leaving one column per ticker.
df_close_price = df_price.loc[:, 'Close']

In [22]:
# Let pandas display up to 30 columns so every ticker column is shown.
pd.set_option('display.max_columns', 30)

In [23]:
# Preview the first five rows of the close prices.
df_close_price.head(n=5)

Unnamed: 0_level_0,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,GS,HD,HON,IBM,INTC,JNJ,JPM,KO,MCD,MMM,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
2000-01-03,0.999442,62.9375,45.88031,40.1875,24.3125,,54.03125,41.8125,29.471687,,88.3125,65.1875,54.044136,116.0,43.5,46.09375,48.583332,28.1875,39.625,47.1875,67.625,58.28125,6.015625,53.59375,33.0,6.71875,,53.903156,28.5625,66.8125
2000-01-04,0.915179,58.125,44.147945,40.125,24.0,,51.0,41.8125,31.198063,,82.75,61.75,53.090767,112.0625,41.46875,44.40625,47.25,28.21875,38.8125,45.3125,65.25,56.3125,5.6875,52.5625,32.5625,6.632812,,52.160721,27.5,64.3125
2000-01-05,0.928571,60.125,42.962643,42.625,24.5625,,50.84375,42.5625,32.492844,,78.875,63.0,52.37574,116.0,41.8125,44.875,46.958332,28.46875,39.4375,46.625,67.8125,56.90625,6.015625,51.5625,32.3125,6.617188,,53.903156,27.8125,63.0
2000-01-06,0.848214,61.125,43.83794,43.0625,25.8125,,50.0,44.375,31.198063,,82.25,60.0,53.388695,114.0,39.375,46.28125,47.625,28.5,38.875,50.375,68.375,55.0,5.984375,53.9375,32.9375,6.859375,,53.284874,27.0,63.6875
2000-01-07,0.888393,68.0,44.476181,44.3125,26.65625,,52.9375,45.15625,30.704813,,82.5625,63.5,56.248802,113.5,41.0,48.25,48.5,30.375,39.875,51.375,74.9375,55.71875,5.984375,58.25,34.25,7.664062,,52.891418,27.6875,68.5


In [24]:
# Write the close prices to a CSV file in the working directory.
df_close_price.to_csv(path_or_buf='DJIA_Close_Price.csv')

In [25]:
# Reload the saved close prices to verify the round trip.
# Use 'Date' as the index and parse it, so the index comes back as datetimes
# (as in df_close_price) rather than as plain strings.
pd.read_csv("DJIA_Close_Price.csv", index_col='Date', parse_dates=True)

Unnamed: 0_level_0,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,GS,HD,HON,IBM,INTC,JNJ,JPM,KO,MCD,MMM,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
2000-01-03,0.999442,62.937500,45.880310,40.187500,24.312500,,54.031250,41.812500,29.471687,,88.312500,65.187500,54.044136,116.000000,43.500000,46.093750,48.583332,28.187500,39.625000,47.187500,67.625000,58.281250,6.015625,53.593750,33.000000,6.718750,,53.903156,28.562500,66.812500
2000-01-04,0.915179,58.125000,44.147945,40.125000,24.000000,,51.000000,41.812500,31.198063,,82.750000,61.750000,53.090767,112.062500,41.468750,44.406250,47.250000,28.218750,38.812500,45.312500,65.250000,56.312500,5.687500,52.562500,32.562500,6.632812,,52.160721,27.500000,64.312500
2000-01-05,0.928571,60.125000,42.962643,42.625000,24.562500,,50.843750,42.562500,32.492844,,78.875000,63.000000,52.375740,116.000000,41.812500,44.875000,46.958332,28.468750,39.437500,46.625000,67.812500,56.906250,6.015625,51.562500,32.312500,6.617188,,53.903156,27.812500,63.000000
2000-01-06,0.848214,61.125000,43.837940,43.062500,25.812500,,50.000000,44.375000,31.198063,,82.250000,60.000000,53.388695,114.000000,39.375000,46.281250,47.625000,28.500000,38.875000,50.375000,68.375000,55.000000,5.984375,53.937500,32.937500,6.859375,,53.284874,27.000000,63.687500
2000-01-07,0.888393,68.000000,44.476181,44.312500,26.656250,,52.937500,45.156250,30.704813,,82.562500,63.500000,56.248802,113.500000,41.000000,48.250000,48.500000,30.375000,39.875000,51.375000,74.937500,55.718750,5.984375,58.250000,34.250000,7.664062,,52.891418,27.687500,68.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-23,130.960007,222.940002,116.949997,219.690002,178.419998,227.429993,44.380001,85.690002,173.550003,54.750000,256.450012,269.809998,209.990005,123.900002,46.570000,151.940002,125.070000,53.080002,212.020004,173.990005,79.760002,221.020004,141.759995,136.339996,137.949997,337.589996,205.300003,58.959999,39.869999,143.220001
2020-12-24,131.970001,222.929993,117.349998,217.149994,179.559998,225.779999,44.540001,85.330002,173.729996,54.790001,256.160004,270.920013,209.529999,124.690002,47.070000,152.470001,124.519997,53.439999,211.389999,174.520004,80.139999,222.750000,141.600006,137.720001,138.490005,340.790009,208.699997,58.840000,39.619999,143.500000
2020-12-28,136.690002,223.649994,118.360001,216.089996,178.369995,224.639999,44.919998,84.900002,178.860001,54.290001,259.589996,269.250000,209.940002,124.820000,47.070000,153.190002,125.339996,54.160000,214.020004,174.710007,80.449997,224.960007,142.429993,138.679993,139.190002,345.950012,212.630005,58.980000,39.660000,145.220001
2020-12-29,134.869995,226.869995,118.139999,216.250000,176.649994,222.460007,44.639999,84.610001,177.300003,54.330002,258.010010,266.190002,209.130005,123.800003,49.389999,154.139999,125.010002,54.130001,212.710007,173.850006,81.419998,224.149994,141.570007,138.419998,138.350006,347.350006,214.369995,58.810001,39.410000,144.300003
