In [45]:
# Import dependencies
import pandas as pd
from pathlib import Path
import re

In [5]:
# Locations of the two raw exchange listings, relative to the working directory
asx_path, nasdaq_path = (
    Path("Resources/asx-companies-list.csv"),
    Path("Resources/nasdaq-companies-list.csv"),
)

In [42]:
# Load the ASX company listing from disk and preview its first five rows
asx_df = pd.read_csv(filepath_or_buffer=asx_path)
asx_df.head(n=5)

Unnamed: 0,Code,Company,Link,Market Cap,Last trade,Change,%Change,Sector
0,ASX:BHP,BHP Group Limited (ASX:BHP),https://www.listcorp.com/asx/bhp/bhp-group-lim...,225204000000,43.36,-1.07,-2.41,Materials
1,ASX:CBA,Commonwealth Bank (ASX:CBA),https://www.listcorp.com/asx/cba/commonwealth-...,165522000000,97.82,-0.93,-0.94,Financials
2,ASX:CSL,CSL Limited (ASX:CSL),https://www.listcorp.com/asx/csl/csl-limited,112598000000,236.8,3.71,1.59,Health Care
3,ASX:NAB,National Australia Bank (ASX:NAB),https://www.listcorp.com/asx/nab/nab,89644400000,28.56,-0.09,-0.31,Financials
4,ASX:ANZ,ANZ Group Holdings Limited (ASX:ANZ),https://www.listcorp.com/asx/anz/anz-group-hol...,75943600000,25.22,-0.05,-0.2,Financials


In [9]:
# Load the NASDAQ company listing from disk and preview its first five rows
nasdaq_df = pd.read_csv(filepath_or_buffer=nasdaq_path)
nasdaq_df.head(n=5)

Unnamed: 0,Symbol,Name,Last Sale,Net Change,% Change,Market Cap,Country,IPO Year,Volume,Sector,Industry
0,A,Agilent Technologies Inc. Common Stock,$109.23,-0.28,-0.256%,31959310000.0,United States,1999.0,2367325,Industrials,Electrical Products
1,AA,Alcoa Corporation Common Stock,$24.01,-0.23,-0.949%,4284575000.0,United States,2016.0,9203147,Industrials,Aluminum
2,AAC,Ares Acquisition Corporation Class A Ordinary ...,$10.76,0.0,0.00%,759701800.0,,2021.0,22059,Industrials,Metal Fabrications
3,AACG,ATA Creativity Global American Depositary Shares,$1.05,0.02,1.942%,33203280.0,China,2008.0,2014,Real Estate,Other Consumer Services
4,AACI,Armada Acquisition Corp. I Common Stock,$10.69,0.0,0.00%,0.0,United States,2021.0,133,Finance,Blank Checks


## Data Cleaning

### ASX Companies

In [39]:
# Inspect the current column labels before renaming them
asx_df.columns.tolist()

['Code',
 'Company',
 'Link',
 'Market Cap',
 'Last trade',
 'Change',
 '%Change',
 'Sector']

In [43]:
# Map the source headers onto lower snake_case column names.
# 'pct_change' is also the name of a DataFrame method, so this column must be
# read with bracket access (asx_df['pct_change']), not attribute access.
asx_df = asx_df.rename(
    {
        'Code': 'code',
        'Company': 'company',
        'Link': 'link',
        'Market Cap': 'market_cap',
        'Last trade': 'last_trade',
        'Change': 'change',
        '%Change': 'pct_change',
        'Sector': 'sector',
    },
    axis='columns',
)
asx_df.head(n=5)

Unnamed: 0,code,company,link,market_cap,last_trade,change,pct_change,sector
0,ASX:BHP,BHP Group Limited (ASX:BHP),https://www.listcorp.com/asx/bhp/bhp-group-lim...,225204000000,43.36,-1.07,-2.41,Materials
1,ASX:CBA,Commonwealth Bank (ASX:CBA),https://www.listcorp.com/asx/cba/commonwealth-...,165522000000,97.82,-0.93,-0.94,Financials
2,ASX:CSL,CSL Limited (ASX:CSL),https://www.listcorp.com/asx/csl/csl-limited,112598000000,236.8,3.71,1.59,Health Care
3,ASX:NAB,National Australia Bank (ASX:NAB),https://www.listcorp.com/asx/nab/nab,89644400000,28.56,-0.09,-0.31,Financials
4,ASX:ANZ,ANZ Group Holdings Limited (ASX:ANZ),https://www.listcorp.com/asx/anz/anz-group-hol...,75943600000,25.22,-0.05,-0.2,Financials


In [44]:
# Remove the leading "ASX:" exchange prefix from the ticker code.
# The prefix is matched literally and anchored at the start of the string
# (instead of slicing off the first four characters), so a code that does not
# carry the prefix is left untouched and re-running this cell cannot eat into
# the ticker itself.
asx_df['code'] = asx_df['code'].str.replace(r'^ASX:', '', regex=True)
asx_df.head()

Unnamed: 0,code,company,link,market_cap,last_trade,change,pct_change,sector
0,BHP,BHP Group Limited (ASX:BHP),https://www.listcorp.com/asx/bhp/bhp-group-lim...,225204000000,43.36,-1.07,-2.41,Materials
1,CBA,Commonwealth Bank (ASX:CBA),https://www.listcorp.com/asx/cba/commonwealth-...,165522000000,97.82,-0.93,-0.94,Financials
2,CSL,CSL Limited (ASX:CSL),https://www.listcorp.com/asx/csl/csl-limited,112598000000,236.8,3.71,1.59,Health Care
3,NAB,National Australia Bank (ASX:NAB),https://www.listcorp.com/asx/nab/nab,89644400000,28.56,-0.09,-0.31,Financials
4,ANZ,ANZ Group Holdings Limited (ASX:ANZ),https://www.listcorp.com/asx/anz/anz-group-hol...,75943600000,25.22,-0.05,-0.2,Financials


In [77]:
# Preview every "(WORD:WORD)" tag, e.g. "(ASX:BHP)", found in the company names.
# This cell only extracts the matches for inspection; it does not modify asx_df.
# The pattern is a raw string so that "\(" and "\w" reach the regex engine
# as-is instead of being treated as (invalid) Python string escapes.
pattern = r'(\(\w+:\w+\))'
asx_df['company'].str.extractall(pattern, flags=re.I)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,(ASX:BHP)
1,0,(ASX:CBA)
2,0,(ASX:CSL)
3,0,(ASX:NAB)
4,0,(ASX:ANZ)
...,...,...
2760,0,(ASX:RCO)
2761,0,(ASX:CVO)
2762,0,(ASX:AYS)
2763,0,(ASX:BAL)


In [87]:
# Split each company string into runs of word characters, '+' and whitespace.
# extractall returns one row per run, indexed by (original row, match number);
# match 0 is the first such run in the string.
# NOTE: a run ends at the first character outside the class, so match 0 keeps
# any whitespace that precedes "(" and stops early at punctuation such as
# '.', '&', "'" or '-'.
# Raw string so "\w" and "\s" are regex escapes, not Python string escapes.
pattern = r'([\w+\s]+)'
parsed_company = asx_df['company'].str.extractall(pattern, flags=re.I)
parsed_company.head(12)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,BHP Group Limited
0,1,ASX
0,2,BHP
1,0,Commonwealth Bank
1,1,ASX
1,2,CBA
2,0,CSL Limited
2,1,ASX
2,2,CSL
3,0,National Australia Bank


In [104]:
asx_df['company'] = parsed_company.loc(axis=0)[:,0].reset_index(drop=True)
asx_df.head()

Unnamed: 0,code,company,link,market_cap,last_trade,change,pct_change,sector
0,BHP,BHP Group Limited,https://www.listcorp.com/asx/bhp/bhp-group-lim...,225204000000,43.36,-1.07,-2.41,Materials
1,CBA,Commonwealth Bank,https://www.listcorp.com/asx/cba/commonwealth-...,165522000000,97.82,-0.93,-0.94,Financials
2,CSL,CSL Limited,https://www.listcorp.com/asx/csl/csl-limited,112598000000,236.8,3.71,1.59,Health Care
3,NAB,National Australia Bank,https://www.listcorp.com/asx/nab/nab,89644400000,28.56,-0.09,-0.31,Financials
4,ANZ,ANZ Group Holdings Limited,https://www.listcorp.com/asx/anz/anz-group-hol...,75943600000,25.22,-0.05,-0.2,Financials
