In [2]:
import pandas as pd
from sqlalchemy import create_engine

### Extract data science jobs csv into DataFrame

In [3]:
# Load the data science job listings CSV into a DataFrame and preview the first rows
datasci_file = "../Resources/datascientist_jobs_in_australia_Oct_25_2019.csv"
datasci_df = pd.read_csv(filepath_or_buffer=datasci_file)
datasci_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,company,cpage,ratings,location,days_ago,summary
0,1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4.0,Sydney NSW,30,The data will be both temporal and spatial. Ad...
1,2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
2,3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,https://au.indeed.comn/a,0.0,Sydney NSW,10,"In data science and big data analytics, the ID..."
3,4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
4,5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...


In [4]:
# Summary of DataFrame: column names, non-null counts, dtypes and memory usage
datasci_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619 entries, 0 to 618
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  619 non-null    int64  
 1   title       619 non-null    object 
 2   company     619 non-null    object 
 3   cpage       619 non-null    object 
 4   ratings     619 non-null    float64
 5   location    619 non-null    object 
 6   days_ago    619 non-null    int64  
 7   summary     619 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 38.8+ KB


### Transform data science jobs csv

In [5]:
# Rename the unnamed column to id and set it as the index.
# The guard makes this cell safe to re-run: once "Unnamed: 0" has been turned
# into the index, a second set_index("id") would raise a KeyError because "id"
# is no longer a column. Chaining (instead of inplace=True) builds the result
# in one step, so the frame is never left half-transformed.
# NOTE(review): the preview shows ids ending at 618 while the frame has 619
# rows, so ids may not be unique/contiguous - confirm before using id as a key.
if "Unnamed: 0" in datasci_df.columns:
    datasci_df = datasci_df.rename(columns={"Unnamed: 0": "id"}).set_index("id")

In [6]:
# Review resulting DataFrame: "id" should now be the index rather than a column
datasci_df

Unnamed: 0_level_0,title,company,cpage,ratings,location,days_ago,summary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4.0,Sydney NSW,30,The data will be both temporal and spatial. Ad...
2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,https://au.indeed.comn/a,0.0,Sydney NSW,10,"In data science and big data analytics, the ID..."
4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...
...,...,...,...,...,...,...,...
615,Senior Research Analyst,Hudson,https://au.indeed.com/cmp/Hudson,3.6,Melbourne VIC,14,This role will lead to candidates who have a s...
616,Senior Product Manager,Sustainability Consulting,https://au.indeed.comn/a,0.0,Sydney NSW,25,Strong affiliation and experience with Data or...
617,Technical Product Manager,Talenza,https://au.indeed.comn/a,0.0,Sydney NSW,4,This fast growing Tech start up are making wav...
618,6M Contract - Automation Engineer - JavaScript...,Real Time Australia,https://au.indeed.comn/a,0.0,Melbourne VIC,7,You will be working with an exceptional cross ...


In [7]:
# Check for duplicate rows

In [8]:
# Clean up cpage column (invalid URL 'https://au.indeed.comn/a' appears where rating is 0)

In [9]:
# Replace 0.0 ratings with 'No rating'

In [10]:
# Clean up location column... potentially separate city and state into two columns, but not every entry follows the same format

In [11]:
# Drop days_ago column

In [12]:
# Rename columns with meaningful titles e.g. 'Indeed landing page'

In [13]:
# Tally how many listings each company has (value_counts lists the most frequent first).
# This could later be added as a new column in the company information table.
company_count = datasci_df.loc[:, "company"].value_counts()
company_count

Real Time Australia              20
Talenza                          18
NSW Health Pathology             16
Onset                            15
Sustainability Consulting        12
                                 ..
MediaCom                          1
University of New South Wales     1
AQ1 Systems                       1
IDP Connect                       1
Mindcloud Consultants             1
Name: company, Length: 265, dtype: int64

In [15]:
# Select columns from data sci DF to create a job information table

In [14]:
# Nice to have

# Exploratory: count how often each word appears in the summary column to see
# whether any keywords stand out. Not yet used elsewhere - just curious!
summary_tokens = (
    datasci_df["summary"]
    .dropna()                        # a missing summary would break tokenising
    .str.lower()                     # count "Data" and "data" as one keyword
    .str.split()                     # split on whitespace
    .explode()                       # one row per token
    .dropna()                        # empty summaries explode to NaN
    .str.strip(".,;:!?()[]{}\"'")    # "data," and "(data)" -> "data"
)
# Tokens that were pure punctuation (e.g. "...") are now empty strings; skip them.
summary_words = summary_tokens[summary_tokens != ""].value_counts()

# Label the index and count column so the CSV has a readable "word,count" header.
summary_words_df = summary_words.rename_axis("word").to_frame("count")
summary_words_df.to_csv("summary_words.csv")

### Extract ASX companies csv into DataFrame

In [16]:
# Load the ASX listed companies CSV and preview the first rows.
# The first line of the file holds report metadata, not the header, so skip it.
asxlisting_file = "../Resources/ASXListedCompanies.csv"
asx_df = pd.read_csv(filepath_or_buffer=asxlisting_file, skiprows=1)
asx_df.head(n=5)

Unnamed: 0,Company name,ASX code,GICS industry group
0,MOQ LIMITED,MOQ,Software & Services
1,1300 SMILES LIMITED,ONT,Health Care Equipment & Services
2,1414 DEGREES LIMITED,14D,Capital Goods
3,1ST GROUP LIMITED,1ST,Health Care Equipment & Services
4,333D LIMITED,T3D,Commercial & Professional Services


### Transform ASX companies csv

In [17]:
# Set ASX code as index

In [18]:
# Join select columns from data sci DF to ASX DF using company name as match point

In [19]:
# Fill in blank values where there is no match on company name? Use sector 'Other' or populate in some other way? Could be government organisations, independent companies, etc.

### Extract market capital csv into DataFrame

In [None]:
# Read in csv

### Transform market cap csv

In [None]:
# Set ASX code as index

In [None]:
# Drop weight, total index market cap and unnamed columns

In [None]:
# Select columns to create a sector/industry DataFrame