In [1]:
import pandas as pd
from sqlalchemy import create_engine

### Extract data science jobs csv into DataFrame

In [2]:
# Load the data science job listings CSV from the Resources folder
datasci_file = "../Resources/datascientist_jobs_in_australia_Oct_25_2019.csv"
datasci_df = pd.read_csv(filepath_or_buffer=datasci_file)
# Display the first five rows as a quick sanity check of the load
datasci_df.head(5)

Unnamed: 0.1,Unnamed: 0,title,company,cpage,ratings,location,days_ago,summary
0,1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4.0,Sydney NSW,30,The data will be both temporal and spatial. Ad...
1,2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
2,3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,https://au.indeed.comn/a,0.0,Sydney NSW,10,"In data science and big data analytics, the ID..."
3,4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
4,5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...


In [3]:
# Summary of DataFrame: column names, non-null counts, dtypes and memory usage
datasci_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619 entries, 0 to 618
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  619 non-null    int64  
 1   title       619 non-null    object 
 2   company     619 non-null    object 
 3   cpage       619 non-null    object 
 4   ratings     619 non-null    float64
 5   location    619 non-null    object 
 6   days_ago    619 non-null    int64  
 7   summary     619 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 38.8+ KB


### Transform data science jobs csv

In [4]:
# Give the unnamed first column a proper name ("id") and promote it to the
# index, producing a new frame rather than mutating in place
datasci_df = (
    datasci_df
    .rename(columns={"Unnamed: 0": "id"})
    .set_index("id")
)

In [5]:
# Preview resulting DataFrame (now indexed by id)
datasci_df.head()

Unnamed: 0_level_0,title,company,cpage,ratings,location,days_ago,summary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4.0,Sydney NSW,30,The data will be both temporal and spatial. Ad...
2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,https://au.indeed.comn/a,0.0,Sydney NSW,10,"In data science and big data analytics, the ID..."
4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...


In [6]:
# Collect every row that repeats an earlier row across all columns.
# The id index is not part of the comparison; the first occurrence of each
# repeated row is not flagged (default keep="first").
# Inspect `duplicate_rows` or `duplicate_rows.count()` to review them.
duplicate_rows = datasci_df.loc[datasci_df.duplicated()]

In [7]:
# Keep only the first occurrence of each fully-duplicated row
datasci_df = datasci_df.drop_duplicates()

In [8]:
# Clean up cpage values: the placeholder URL 'https://au.indeed.comn/a' is not
# a real company page, so swap it for 'Not available'.
# The match is on the exact cell value, anywhere in the frame.
datasci_df = datasci_df.replace(
    to_replace="https://au.indeed.comn/a",
    value="Not available",
)

In [9]:
# Replace 0.0 ratings with 'Not available'.
# The replacement is limited to the ratings column: a frame-wide replace
# would also rewrite zeros in other numeric columns such as days_ago.
# Note: after this step the ratings column holds both numbers and text.
datasci_df = datasci_df.assign(
    ratings=datasci_df["ratings"].replace(0.00, "Not available")
)

In [10]:
# Preview the DataFrame after the cpage and ratings clean-up
datasci_df.head()

Unnamed: 0_level_0,title,company,cpage,ratings,location,days_ago,summary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4,Sydney NSW,30,The data will be both temporal and spatial. Ad...
2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,Not available,Not available,Sydney NSW,10,"In data science and big data analytics, the ID..."
4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...


In [11]:
# Remove the days_ago column from the frame
datasci_df = datasci_df.drop(columns=["days_ago"])

In [12]:
# Clean up location column; separate city and state into two columns
location = datasci_df["location"]
# location.unique()
# location.value_counts()
location_df = location.str.rsplit(" ",n=1,expand=True)

In [13]:
# Add new city and state columns into existing DataFrame
datasci_df["city"] = location_df[0]
datasci_df["state"] = location_df[1]
# Drop existing location column
datasci_df.drop(columns="location",inplace=True)

In [29]:
datasci_df["city"].unique()
# TODO: revisit the location split. 'New South' appears as a value in the city column, so splitting on the last space is not ideal. The column also needs a different name, as not all values are cities.

array(['Sydney', 'Canberra', 'Melbourne', 'Taringa', 'Victoria',
       'Sydney Inner Suburbs', 'Bella Vista', 'Australia', 'Brisbane',
       'Heidelberg', 'Campbelltown', 'Parkville', 'Wollongong',
       'North Ryde', 'Surry Hills', 'Toowoomba', 'Sydney Western Suburbs',
       'Perth', 'Adelaide', 'Hunter Valley', 'Artarmon', 'Camperdown',
       'St Leonards', 'Perth Airport', 'Rydalmere', 'Gosford',
       'Shepparton', 'Eagle Farm', 'New South', 'Lane Cove', 'Whyalla',
       'Lucas Heights', 'Melbourne City Centre', 'Griffith',
       'Macquarie Park', 'Coburg', 'Broadway', 'Tumut', 'West Perth',
       'Newcastle', 'Edinburgh', 'Rutherford', 'Southbank', 'Narrabri',
       'Brisbane Central Business District', 'Randwick', 'Katoomba',
       'Tweed Heads', 'St Lucia', 'New Lambton', 'Chatswood', 'Bourke',
       'Richmond', 'Campsie', 'Townsville', 'Sydney Eastern Suburbs',
       'Sydney Central Business District', 'Queensland',
       'Melbourne Northern Suburbs', 'Gold Coast

In [15]:
# Rename columns with meaningful titles e.g. 'Indeed landing page'

In [16]:
# Find number of listings for each company. Could add this information as a new column in the company information table
company_count = datasci_df["company"].value_counts()
company_count

NSW Health Pathology           16
TechSkills Accelerator         11
Talenza                        10
Deloitte                       10
CSIRO                           9
                               ..
Talent International            1
Blue Mountains City Council     1
Agilex Biolabs                  1
Hydrosphere Consulting          1
Blackroc                        1
Name: company, Length: 265, dtype: int64

In [17]:
# Select columns from data sci DF to create a job information table

In [18]:
# Nice to have...

# Exploratory only: count how often each whitespace-separated token appears
# across all summaries, to see whether any keywords stand out.
all_summary_text = " ".join(datasci_df["summary"])
summary_words = pd.Series(all_summary_text.split()).value_counts()
summary_words_df = pd.DataFrame(summary_words)
# Save the counts next to the notebook for inspection outside it
summary_words_df.to_csv(path_or_buf="summary_words.csv")

### Extract ASX companies csv into DataFrame

In [19]:
# Read in CSV, skipping first row (contained report metadata)
asxlisting_file = "../Resources/ASXListedCompanies.csv"
asx_df = pd.read_csv(asxlisting_file, skiprows=1)
asx_df.head()

Unnamed: 0,Company name,ASX code,GICS industry group
0,MOQ LIMITED,MOQ,Software & Services
1,1300 SMILES LIMITED,ONT,Health Care Equipment & Services
2,1414 DEGREES LIMITED,14D,Capital Goods
3,1ST GROUP LIMITED,1ST,Health Care Equipment & Services
4,333D LIMITED,T3D,Commercial & Professional Services


### Transform ASX companies csv

In [20]:
# Set ASX code as index

In [21]:
# Join select columns from data sci DF to ASX DF using company name as match point

In [22]:
# Fill in blank values where there is no match on company name? Use sector 'Other' or populate in some other way? Could be government organisations, independent companies, etc.

### Extract market capitalisation CSV into DataFrame

In [23]:
# Read in csv

### Transform market cap csv

In [24]:
# Set ASX code as index

In [25]:
# Drop weight, total index market cap and unnamed columns

In [26]:
# Select columns to create a sector/industry DataFrame