In [50]:
import pandas as pd
from sqlalchemy import create_engine

### Extract data science jobs csv into DataFrame

In [51]:
# Read in the data science jobs csv
datasci_file = "../Resources/datascientist_jobs_in_australia_Oct_25_2019.csv"
datasci_df = pd.read_csv(datasci_file)
datasci_df.head()

Unnamed: 0.1,Unnamed: 0,title,company,cpage,ratings,location,days_ago,summary
0,1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4.0,Sydney NSW,30,The data will be both temporal and spatial. Ad...
1,2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
2,3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,https://au.indeed.comn/a,0.0,Sydney NSW,10,"In data science and big data analytics, the ID..."
3,4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
4,5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...


In [52]:
# Summary of DataFrame
datasci_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619 entries, 0 to 618
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  619 non-null    int64  
 1   title       619 non-null    object 
 2   company     619 non-null    object 
 3   cpage       619 non-null    object 
 4   ratings     619 non-null    float64
 5   location    619 non-null    object 
 6   days_ago    619 non-null    int64  
 7   summary     619 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 38.8+ KB


### Transform data science jobs csv

In [53]:
# Rename the unnamed column to id and set as index
datasci_df.rename(columns={"Unnamed: 0":"id"},inplace=True)
datasci_df.set_index("id",inplace=True)

In [54]:
# Preview resulting DataFrame
datasci_df.head()

Unnamed: 0_level_0,title,company,cpage,ratings,location,days_ago,summary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4.0,Sydney NSW,30,The data will be both temporal and spatial. Ad...
2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,https://au.indeed.comn/a,0.0,Sydney NSW,10,"In data science and big data analytics, the ID..."
4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...


In [55]:
# Check for duplicate rows, based on all columns
duplicate_rows = datasci_df[datasci_df.duplicated(keep="first")]
# duplicate_rows
# duplicate_rows.count()

In [56]:
# Drop duplicate rows from DataFrame
datasci_df.drop_duplicates(inplace=True)

In [57]:
# Clean up cpage column (replace invalid URL 'https://au.indeed.comn/a' with 'Not available')
datasci_df.replace("https://au.indeed.comn/a","Not available",inplace=True)

In [58]:
# Replace 0.0 ratings with 'No rating'
datasci_df.replace(0.00,"Not available",inplace=True)

In [59]:
# Drop days_ago columns
datasci_df.drop(columns="days_ago",inplace=True)

In [60]:
# Replace state only values in location with "N/A, <state>" to enable splitting of location into city and state
datasci_df["location"].replace(to_replace="New South Wales",value="Not available, NSW",inplace=True)
datasci_df["location"].replace(to_replace="Victoria",value="Not available, VIC",inplace=True)
datasci_df["location"].replace(to_replace="Queensland",value="Not available, QLD",inplace=True)
datasci_df["location"].replace(to_replace="Tasmania",value="Not available, TAS",inplace=True)

In [61]:
# Clean up location column; separate city and state into two columns
location = datasci_df["location"]
# location.unique()
# location.value_counts()
location_df = location.str.rsplit(" ",n=1,expand=True)

In [62]:
# Add new city and state columns into existing DataFrame
datasci_df["city"] = location_df[0]
datasci_df["state"] = location_df[1]
# Drop existing location column
datasci_df.drop(columns="location",inplace=True)

In [63]:
# Check city and state data
# datasci_df["city"].unique()
# datasci_df["state"].unique()

In [64]:
# Clean up city column to replace "Australia" values? OR leave "Australia" amd rename column to location
# datasci_df[datasci_df["state"].isna()]
# datasci_df.loc[datasci_df["state"] == "Australia"]

In [65]:
# Rename columns with meaningful titles e.g. 'Indeed landing page'
datasci_df.rename(columns={"title": "job posting title",
                            "company": "organisation",
                            "cpage": "Indeed profile page",
                            "ratings": "Indeed rating",
                            "summary": "job summary",
                            "city": "location"},
                            inplace=True)

In [66]:
datasci_df.head()

Unnamed: 0_level_0,job posting title,organisation,Indeed profile page,Indeed rating,job summary,location,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4,The data will be both temporal and spatial. Ad...,Sydney,NSW
2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,They lead the implementation of data science a...,Canberra,ACT
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,Not available,Not available,"In data science and big data analytics, the ID...",Sydney,NSW
4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,"Experience working with big data sets, especia...",Melbourne,VIC
5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Collaborates with data/software engineers to i...,Taringa,QLD


In [67]:
# Select columns from data sci DF to create a job information table
jobinfo_cols = ["job posting title","organisation","location","state"]
jobinfo_df = datasci_df[jobinfo_cols].copy()
jobinfo_df.head()

Unnamed: 0_level_0,job posting title,organisation,location,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,Sydney,NSW
2,Data Scientist,Australian Government Department of Human Serv...,Canberra,ACT
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,Sydney,NSW
4,IBM Research Scientist Data Scientist – Melbourne,IBM,Melbourne,VIC
5,Data Scientist,Cash Converters,Taringa,QLD


In [83]:
# Select columns to create organisation DF
orginfo_cols = ["organisation","Indeed profile page","Indeed rating"]
orginfo_df = datasci_df[orginfo_cols].copy()
orginfo_df.head()

Unnamed: 0_level_0,organisation,Indeed profile page,Indeed rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Deloitte,https://au.indeed.com/cmp/Deloitte,4
2,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9
3,International Institute of Data & Analytics,Not available,Not available
4,IBM,https://au.indeed.com/cmp/IBM,3.9
5,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5


In [104]:
# Drop duplicates from org DF
orginfo_df.drop_duplicates(subset="organisation")

ValueError: Length of passed values is 265, index implies 496.

### Extract ABS labour market data csv into DataFrame



In [105]:
# Read in csv
abslabour_file = "../Resources/SA4 Time Series - July 2020.csv"
abslabour_df = pd.read_csv(abslabour_file)
abslabour_df.head()

Unnamed: 0,State/Territory,Date,Employment Rate (15-64),Unemployment Rate (15+),Participation Rate (15+)
0,NSW,Feb-78,,6.9,60.8
1,NSW,Mar-78,,6.4,59.8
2,NSW,Apr-78,63.7,6.5,60.2
3,NSW,May-78,63.7,6.1,60.0
4,NSW,Jun-78,63.7,6.4,60.1


### Transform ABS labour market data

In [None]:
# Filter for most recent labour market data (July 2020)

In [None]:
# Select and rename columns to create labour market table

### Extract ASX company data

In [98]:
asx_file = "../Resources/20200601-asx200.csv"
asx_df = pd.read_csv(asx_file, skiprows=1,usecols=range())
asx_df.head()

Unnamed: 0,Code,Company,Sector
0,ABP,Abacus Property Group,Real Estate
1,ABC,Adbri Ltd,Materials
2,APT,Afterpay Ltd,Information Technology
3,AGL,AGL Energy Ltd,Utilities
4,ALQ,Als Ltd,Industrials


### Transform market cap csv

In [103]:
# Join select columns from org info DF to ASX DF using company name as match point
jobs_asx_join = pd.merge(orginfo_df,asx_df,left_on="organisation",right_on="Company",how="left")
jobs_asx_join

Unnamed: 0,organisation,Indeed profile page,Indeed rating,Code,Company,Sector
0,Deloitte,https://au.indeed.com/cmp/Deloitte,4,,,
1,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,,,
2,International Institute of Data & Analytics,Not available,Not available,,,
3,IBM,https://au.indeed.com/cmp/IBM,3.9,,,
4,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,,,
...,...,...,...,...,...,...
260,Feedzai,Not available,Not available,,,
261,AECOM,https://au.indeed.com/cmp/AECOM,3.7,,,
262,Commonwealth Bank,https://au.indeed.com/cmp/Commonwealth-Bank,4.1,,,
263,NSW Health Pathology,https://au.indeed.com/cmp/Nsw-Health,3.9,,,


In [None]:
jobs_asx_join.to_csv("jobs_asx_join.csv")
# This approach only found one match! So ASX data is not useful.

In [100]:
# Nice to have...

# Find number of job listings for each company. Could add this information as a new column in the company information table
job_count = datasci_df["organisation"].value_counts()
job_count

NSW Health Pathology      16
TechSkills Accelerator    11
Deloitte                  10
Talenza                   10
CSIRO                      9
                          ..
Citi                       1
Horizon Consulting         1
DXC                        1
ANSTO                      1
Fusion Sport Pty Ltd       1
Name: organisation, Length: 265, dtype: int64