In [27]:
import pandas as pd
from sqlalchemy import create_engine

### Extract data science jobs csv into DataFrame

In [28]:
# Load the data science job listings csv from the Resources folder into a DataFrame
datasci_file = "../Resources/datascientist_jobs_in_australia_Oct_25_2019.csv"
datasci_df = pd.read_csv(filepath_or_buffer=datasci_file)
# Show the first rows as a quick sanity check of the load
datasci_df.head()

Unnamed: 0.1,Unnamed: 0,title,company,cpage,ratings,location,days_ago,summary
0,1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4.0,Sydney NSW,30,The data will be both temporal and spatial. Ad...
1,2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
2,3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,https://au.indeed.comn/a,0.0,Sydney NSW,10,"In data science and big data analytics, the ID..."
3,4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
4,5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...


In [29]:
# Summary of DataFrame: row count, column names, non-null counts and dtypes
datasci_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619 entries, 0 to 618
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  619 non-null    int64  
 1   title       619 non-null    object 
 2   company     619 non-null    object 
 3   cpage       619 non-null    object 
 4   ratings     619 non-null    float64
 5   location    619 non-null    object 
 6   days_ago    619 non-null    int64  
 7   summary     619 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 38.8+ KB


### Transform data science jobs csv

In [30]:
# Give the unnamed csv column the name "id", then promote it to the index
datasci_df = (
    datasci_df
    .rename(columns={"Unnamed: 0": "id"})
    .set_index("id")
)

In [31]:
# Preview resulting DataFrame (first rows) to confirm "id" is now the index
datasci_df.head()

Unnamed: 0_level_0,title,company,cpage,ratings,location,days_ago,summary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4.0,Sydney NSW,30,The data will be both temporal and spatial. Ad...
2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,Canberra ACT,10,They lead the implementation of data science a...
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,https://au.indeed.comn/a,0.0,Sydney NSW,10,"In data science and big data analytics, the ID..."
4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,Melbourne VIC,30,"Experience working with big data sets, especia..."
5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Taringa QLD,1,Collaborates with data/software engineers to i...


In [32]:
# Collect every row that repeats an earlier row across all columns
# (keep="first" leaves the first occurrence of each row unflagged)
duplicate_rows = datasci_df.loc[datasci_df.duplicated(keep="first")]
# To inspect the result, evaluate duplicate_rows or duplicate_rows.count()

In [33]:
# Remove rows that are exact repeats of an earlier row, keeping the first occurrence
datasci_df = datasci_df.drop_duplicates()

In [34]:
# The placeholder URL 'https://au.indeed.comn/a' is not a valid profile page;
# swap it for 'Not available' wherever it appears in the DataFrame
datasci_df = datasci_df.replace(
    to_replace="https://au.indeed.comn/a",
    value="Not available",
)

In [35]:
# Replace 0.0 ratings with 'Not available'.
# The replacement is limited to the ratings column so that zero values in other
# numeric columns (e.g. days_ago) are not overwritten as a side effect.
datasci_df["ratings"] = datasci_df["ratings"].replace(0.00, "Not available")

In [36]:
# Remove the days_ago column from the DataFrame
datasci_df = datasci_df.drop(columns=["days_ago"])

In [37]:
# Replace state-only values in location with "Not available <state>" so that every
# location can be split into a city part and a state part on its last space.
# No comma is used: the split is on whitespace, so a comma would be left dangling
# on the city value ("Not available,").
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection, because in-place methods on
# a selected column are not guaranteed to update datasci_df under pandas
# Copy-on-Write.
state_only_locations = {
    "New South Wales": "Not available NSW",
    "Victoria": "Not available VIC",
    "Queensland": "Not available QLD",
    "Tasmania": "Not available TAS",
}
datasci_df["location"] = datasci_df["location"].replace(state_only_locations)

In [38]:
# Break each location into two parts at its last space
# (e.g. "Sydney NSW" -> "Sydney" in column 0 and "NSW" in column 1)
location = datasci_df["location"]
# To inspect the raw values first: location.unique() / location.value_counts()
location_df = location.str.rsplit(pat=" ", n=1, expand=True)

In [39]:
# Attach the two split parts as city and state columns, then remove the
# original combined location column
datasci_df = (
    datasci_df
    .assign(city=location_df[0], state=location_df[1])
    .drop(columns="location")
)

In [40]:
# Check city and state data
# datasci_df["city"].unique()
# datasci_df["state"].unique()

In [41]:
# Clean up city column to replace "Australia" values? OR leave "Australia" and rename column to location
# datasci_df[datasci_df["state"].isna()]
# datasci_df.loc[datasci_df["state"] == "Australia"]

In [42]:
# Give the columns more descriptive titles (note that "city" becomes "location")
datasci_df = datasci_df.rename(
    columns={
        "title": "job posting title",
        "company": "organisation",
        "cpage": "Indeed profile page",
        "ratings": "Indeed rating",
        "summary": "job summary",
        "city": "location",
    }
)

In [43]:
# Preview the DataFrame after the cleaning steps and column renames
datasci_df.head()

Unnamed: 0_level_0,job posting title,organisation,Indeed profile page,Indeed rating,job summary,location,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,https://au.indeed.com/cmp/Deloitte,4,The data will be both temporal and spatial. Ad...,Sydney,NSW
2,Data Scientist,Australian Government Department of Human Serv...,https://au.indeed.com/cmp/Australian-Governmen...,3.9,They lead the implementation of data science a...,Canberra,ACT
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,Not available,Not available,"In data science and big data analytics, the ID...",Sydney,NSW
4,IBM Research Scientist Data Scientist – Melbourne,IBM,https://au.indeed.com/cmp/IBM,3.9,"Experience working with big data sets, especia...",Melbourne,VIC
5,Data Scientist,Cash Converters,https://au.indeed.com/cmp/Cash-Converters,3.5,Collaborates with data/software engineers to i...,Taringa,QLD


In [44]:
# Build the job information table as an independent copy of the job-level columns
jobinfo_cols = ["job posting title", "organisation", "location", "state"]
jobinfo_df = datasci_df.loc[:, jobinfo_cols].copy()
jobinfo_df.head()

Unnamed: 0_level_0,job posting title,organisation,location,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Deloitte Access Economics - Junior Data Scientist,Deloitte,Sydney,NSW
2,Data Scientist,Australian Government Department of Human Serv...,Canberra,ACT
3,Junior Data Analyst/Scientist,International Institute of Data & Analytics,Sydney,NSW
4,IBM Research Scientist Data Scientist – Melbourne,IBM,Melbourne,VIC
5,Data Scientist,Cash Converters,Taringa,QLD


### Extract market capital csv into DataFrame

In [None]:
# Load the ASX listed companies csv, skipping the first line of the file
asxlisting_file = "../Resources/ASXListedCompanies.csv"
asx_df = pd.read_csv(filepath_or_buffer=asxlisting_file, skiprows=1)
asx_df.head()

### Transform market cap csv

In [None]:
# Set ASX code as index... not necessary?
# asx_df.set_index("ASX code",inplace=True)

In [None]:
# Select the organisation-level columns from the data sci DF to create the
# organisation DataFrame.
# Several columns must be selected with a list of labels; the labels are also
# separated by commas so that adjacent string literals are not silently joined
# into a single (non-existent) column name.
# .copy() makes the result independent of datasci_df.
# TODO: transform values in company name and organisation columns to lowercase to enable merge
organisation_df = datasci_df[["organisation", "Indeed profile page", "Indeed rating"]].copy()

In [49]:
# Join select columns from data sci DF to ASX DF using company name as match point.
# The ASX file stores names in upper case (e.g. "MOQ LIMITED") while the job
# listings use mixed case (e.g. "Deloitte"), so an exact string match finds
# nothing. Both sides are therefore compared on a temporary lower-cased,
# whitespace-trimmed key, which is dropped again so the result keeps the same
# columns as a direct merge on organisation / Company name.
# NOTE(review): names that differ by more than case (e.g. a trailing "LIMITED")
# will still not match - confirm whether further normalisation is wanted.
jobs_asx_join = pd.merge(
    jobinfo_df.assign(_match_key=jobinfo_df["organisation"].str.strip().str.lower()),
    asx_df.assign(_match_key=asx_df["Company name"].str.strip().str.lower()),
    on="_match_key",
    how="inner",
).drop(columns="_match_key")
jobs_asx_join

Unnamed: 0,job posting title,organisation,location,state,Company name,ASX code,GICS industry group


In [None]:
# Select columns from data sci and asx to create company information table

In [None]:
# Nice to have...

# Count how many job listings each organisation has (one entry per distinct organisation value).
# Could add this information as a new column in the company information table.
company_count = datasci_df["organisation"].value_counts()
company_count

In [None]:
# Read in csv

### Transform market cap csv

In [45]:
# Load the ASX listed companies csv; the first row is skipped because it
# contained report metadata rather than column headers
asxlisting_file = "../Resources/ASXListedCompanies.csv"
asx_df = pd.read_csv(filepath_or_buffer=asxlisting_file, skiprows=1)
asx_df.head()

Unnamed: 0,Company name,ASX code,GICS industry group
0,MOQ LIMITED,MOQ,Software & Services
1,1300 SMILES LIMITED,ONT,Health Care Equipment & Services
2,1414 DEGREES LIMITED,14D,Capital Goods
3,1ST GROUP LIMITED,1ST,Health Care Equipment & Services
4,333D LIMITED,T3D,Commercial & Professional Services


In [None]:
# Set ASX code as index

In [None]:
# Drop weight, total index market cap and unnamed columns

In [None]:
# Select columns to create a sector/industry DataFrame

### Extract ASX companies csv into DataFrame