## Web Scraping : Dow Jones Industrial Average (DJIA) Stock Data from Wikipedia

In [13]:
# !pip install lxml

In [75]:
import pandas as pd

In [79]:
companies = pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")[2]
companies

# Rename the 'Symbol' Column to 'Ticker'
companies.columns = ['Company', 'Exchange', 'Ticker', 'Industry', 'Date_Added', 'Notes', 'Index_weighting']
companies
# Note: the pd.read_html may already normalize the unicode strings, but we'll still normalize later as best practice.

Unnamed: 0,Company,Exchange,Ticker,Industry,Date_Added,Notes,Index_weighting
0,3M,NYSE,MMM,Conglomerate,1976-08-09,As Minnesota Mining and Manufacturing,1.83%
1,American Express,NYSE,AXP,Financial services,1982-08-30,,4.12%
2,Amgen,NASDAQ,AMGN,Biopharmaceutical,2020-08-31,,3.76%
3,Amazon,NASDAQ,AMZN,Retailing,2024-02-26,,3.02%
4,Apple,NASDAQ,AAPL,Information technology,2015-03-19,,3.33%
5,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,,2.15%
6,Caterpillar,NYSE,CAT,Construction and mining,1991-05-06,,5.41%
7,Chevron,NYSE,CVX,Petroleum industry,2008-02-19,Also 1930-07-18 to 1999-11-01,2.18%
8,Cisco,NASDAQ,CSCO,Information technology,2009-06-08,,0.82%
9,Coca-Cola,NYSE,KO,Drink industry,1987-03-12,Also 1932-05-26 to 1935-11-20,0.86%


In [80]:
# Using only relevant columns : the first 5 
companies = companies.iloc[:, :5].copy()
companies

Unnamed: 0,Company,Exchange,Ticker,Industry,Date_Added
0,3M,NYSE,MMM,Conglomerate,1976-08-09
1,American Express,NYSE,AXP,Financial services,1982-08-30
2,Amgen,NASDAQ,AMGN,Biopharmaceutical,2020-08-31
3,Amazon,NASDAQ,AMZN,Retailing,2024-02-26
4,Apple,NASDAQ,AAPL,Information technology,2015-03-19
5,Boeing,NYSE,BA,Aerospace and defense,1987-03-12
6,Caterpillar,NYSE,CAT,Construction and mining,1991-05-06
7,Chevron,NYSE,CVX,Petroleum industry,2008-02-19
8,Cisco,NASDAQ,CSCO,Information technology,2009-06-08
9,Coca-Cola,NYSE,KO,Drink industry,1987-03-12


In [81]:
# Rename the Date Added Column to remove space. This avoids column access issues.
companies.rename(columns = {"Date added": "Date_Added"}, inplace = True)

In [82]:
companies['Date_Added'] = pd.to_datetime(companies['Date_Added'])

In [83]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Company     30 non-null     object        
 1   Exchange    30 non-null     object        
 2   Ticker      30 non-null     object        
 3   Industry    30 non-null     object        
 4   Date_Added  30 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


In [20]:
# companies['Date_Added']

## Getting Ticker Symbols by Normalizing Unicode Strings

In [84]:
# This would be applicable if there were any Unicode Strings that need normalization. In this case, the 
# pd.read_html already parsed the data to normal form.
import unicodedata

In [58]:
# The Unicode Strings in HTML format would need to be converted to normal form for proper ticker symbol format.
# Unicode is a sytem that lets computers read, store and display text 
# from different languages and symbols by assigning unique numbers or code points Example letter 'A' = unicode is U+0041.

companies['Ticker'][0]

'MMM'

In [87]:
# check for any hidden unicode. repr show raw representation form
print(repr(companies['Ticker']))


0      MMM
1      AXP
2     AMGN
3     AMZN
4     AAPL
5       BA
6      CAT
7      CVX
8     CSCO
9       KO
10     DIS
11      GS
12      HD
13     HON
14     IBM
15     JNJ
16     JPM
17     MCD
18     MRK
19    MSFT
20     NKE
21    NVDA
22      PG
23     CRM
24     SHW
25     TRV
26     UNH
27      VZ
28       V
29     WMT
Name: Ticker, dtype: object


In [89]:
# Unicode Normalization
# NFKD is a normalization form: Normalization Form Compatibility Decomposition
companies['Ticker']= companies['Ticker'].apply(lambda x: unicodedata.normalize('NFKD', x))
companies['Ticker']

0      MMM
1      AXP
2     AMGN
3     AMZN
4     AAPL
5       BA
6      CAT
7      CVX
8     CSCO
9       KO
10     DIS
11      GS
12      HD
13     HON
14     IBM
15     JNJ
16     JPM
17     MCD
18     MRK
19    MSFT
20     NKE
21    NVDA
22      PG
23     CRM
24     SHW
25     TRV
26     UNH
27      VZ
28       V
29     WMT
Name: Ticker, dtype: object

In [35]:
# ## The different Unicode normalization forms
# text = '½ ℌ ²'

# # Normalize to different forms
# nfc = unicodedata.normalize('NFC', text) # Composed form (combines characters into one).
# nfd = unicodedata.normalize('NFD', text) # Decomposed form (splits combined characters).
# nfkc = unicodedata.normalize('NFKC', text) #  Composed, but removes formatting differences.
# nfkd = unicodedata.normalize('NFKD', text) # Decomposed and removes formatting

# # Print and display Unicode representation
# print("Original:", repr(text))
# print("NFC:", repr(nfc))
# print("NFD:", repr(nfd))
# print("NFKC:", repr(nfkc))
# print("NFKD:", repr(nfkd))

In [90]:
companies.Ticker.str.split(": ").apply(lambda x: x[-1])

0      MMM
1      AXP
2     AMGN
3     AMZN
4     AAPL
5       BA
6      CAT
7      CVX
8     CSCO
9       KO
10     DIS
11      GS
12      HD
13     HON
14     IBM
15     JNJ
16     JPM
17     MCD
18     MRK
19    MSFT
20     NKE
21    NVDA
22      PG
23     CRM
24     SHW
25     TRV
26     UNH
27      VZ
28       V
29     WMT
Name: Ticker, dtype: object

In [92]:
ticker_list = companies.Ticker.to_list()
ticker_list 

['MMM',
 'AXP',
 'AMGN',
 'AMZN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DIS',
 'GS',
 'HD',
 'HON',
 'IBM',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'NVDA',
 'PG',
 'CRM',
 'SHW',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WMT']

In [93]:
# Save the Ticker List in CSV file
companies.to_csv("ticker_list", index = False)