In [1]:
# Import our dependencies
import pandas as pd

# Raw Data

In [2]:
## RAW - read inital raw data and drop redundant data
#raw = pd.read_csv('14100328.csv')

#dropping bad data quality rows
#data = raw.loc[(df['STATUS'] == 'A')|(raw['STATUS'] == 'B')|(raw['STATUS'] == 'C')|(raw['STATUS'] == 'D')]
#dropping non required years
#data = data.drop(data[(df2['REF_DATE'] == '2020-01')|(data['REF_DATE'] == '2020-10')|
#                      (data['REF_DATE'] == '2021-01')|(data['REF_DATE'] == '2021-04')|
#                      (data['REF_DATE'] == '2015-01')].index)
#resetting index for continous index range
#data = data.reset_index(drop=True)

#data.to_csv('VacanciesRawData.csv',index=False)

In [3]:
## CLEANED - Read cleansed raw data
raw2 = pd.read_csv("Resources/VacanciesRawData.csv", error_bad_lines=False)
df = raw2.copy()

# Data Cleansing

In [4]:
#splitting NOC Description and Code
df[['NOCdesc','NOCcode']] = df['National Occupational Classification'].str.split("[",expand=True)

#removing junk from column
df[['NOCcode','junk']] = df['NOCcode'].str.split("]", expand=True)

#splitting date to year and month
df[['Year','Quarter']] = df['REF_DATE'].str.split("-",expand=True)                  

#converting Month to Quarter
df['Quarter'] = df['Quarter'].replace(['01','04','07','10'],['1','2','3','4'])

#drop redundant columns
df = df.drop(columns=['junk','National Occupational Classification','DGUID','UOM_ID','SCALAR_FACTOR',
                      'SCALAR_ID','VECTOR','COORDINATE','STATUS','SYMBOL','TERMINATED','DECIMALS','UOM'])

#rename columns
df = df.rename(columns={"GEO": "Location", "NOCdesc": "NOC_Desc", "NOCcode": "NOC_Code", 
                        "Job vacancy characteristics" :"JobDetails"})

#drop null NOC
df = df.dropna(subset = ['NOC_Code'])

#creating Id col
df['ID'] = df.index

In [5]:
#creating Canada only dataframe including "total, all occupations"
CANADA = df.loc[(df['Location'] =='Canada')]
CANADA = CANADA.loc[(CANADA['JobDetails'] =='Full-time')|(CANADA['JobDetails'] =='Part-time')]
CANADA = CANADA.loc[(CANADA['Statistics'] !='Proportion of job vacancies')]
CANADA

Unnamed: 0,REF_DATE,Location,JobDetails,Statistics,VALUE,NOC_Desc,NOC_Code,Year,Quarter,ID
1304,2015-04,Canada,Full-time,Job vacancies,23030.00,Management occupations,0,2015,2,1304
1306,2015-04,Canada,Full-time,Average offered hourly wage,33.45,Management occupations,0,2015,2,1306
1307,2015-04,Canada,Part-time,Job vacancies,1065.00,Management occupations,0,2015,2,1307
1309,2015-04,Canada,Part-time,Average offered hourly wage,24.00,Management occupations,0,2015,2,1309
1406,2015-04,Canada,Full-time,Job vacancies,825.00,Senior management occupations,00,2015,2,1406
...,...,...,...,...,...,...,...,...,...,...
2996755,2019-10,Canada,Part-time,Average offered hourly wage,16.90,Labourers in food and beverage processing,9617,2019,4,2996755
2996825,2019-10,Canada,Full-time,Average offered hourly wage,13.25,Labourers in fish and seafood processing,9618,2019,4,2996825
2996826,2019-10,Canada,Part-time,Average offered hourly wage,14.35,Labourers in fish and seafood processing,9618,2019,4,2996826
2996857,2019-10,Canada,Full-time,Average offered hourly wage,14.55,"Other labourers in processing, manufacturing a...",9619,2019,4,2996857


In [6]:
#provincal data 
PROVINCE = df.loc[(df['Location'] != 'Canada')]
PROVINCE = PROVINCE.loc[(PROVINCE['NOC_Desc'] !='Total, all occupations')]
PROVINCE = PROVINCE.loc[(PROVINCE['JobDetails'] =='Full-time')|(PROVINCE['JobDetails'] =='Part-time')]

#remove % vacancies
PROVINCE = PROVINCE.loc[(PROVINCE['Statistics'] !='Proportion of job vacancies')]
PROVINCE

Unnamed: 0,REF_DATE,Location,JobDetails,Statistics,VALUE,NOC_Desc,NOC_Code,Year,Quarter,ID
37387,2015-04,Newfoundland and Labrador,Full-time,Average offered hourly wage,30.35,Management occupations,0,2015,2,37387
37430,2015-04,Newfoundland and Labrador,Full-time,Job vacancies,50.00,Specialized middle management occupations,01-05,2015,2,37430
37432,2015-04,Newfoundland and Labrador,Full-time,Average offered hourly wage,32.80,Specialized middle management occupations,01-05,2015,2,37432
37488,2015-04,Newfoundland and Labrador,Full-time,Average offered hourly wage,37.30,Administrative services managers,011,2015,2,37488
37515,2015-04,Newfoundland and Labrador,Full-time,Average offered hourly wage,36.40,"Managers in engineering, architecture, science...",021,2015,2,37515
...,...,...,...,...,...,...,...,...,...,...
3127113,2019-10,Nunavut,Part-time,Average offered hourly wage,26.25,"Industrial, electrical and construction trades",72,2019,4,3127113
3127141,2019-10,Nunavut,Full-time,Job vacancies,20.00,Maintenance and equipment operation trades,73,2019,4,3127141
3127143,2019-10,Nunavut,Full-time,Average offered hourly wage,38.00,Maintenance and equipment operation trades,73,2019,4,3127143
3127182,2019-10,Nunavut,Full-time,Job vacancies,10.00,Occupations in manufacturing and utilities,9,2019,4,3127182


# Job Vacancies

In [7]:
#splitting vacancies for province - 'total, all occupations' excluded
job_vacancies = PROVINCE.loc[PROVINCE['Statistics'] == "Job vacancies"] 
job_vacancies = job_vacancies.rename(columns={"VALUE": "TotalVacancies"})

#reordering columns
job_vacancies = job_vacancies[['ID','REF_DATE','Year','Quarter','Location','NOC_Code', 'JobDetails','TotalVacancies']]
job_vacancies

Unnamed: 0,ID,REF_DATE,Year,Quarter,Location,NOC_Code,JobDetails,TotalVacancies
37430,37430,2015-04,2015,2,Newfoundland and Labrador,01-05,Full-time,50.0
37584,37584,2015-04,2015,2,Newfoundland and Labrador,1,Full-time,430.0
37795,37795,2015-04,2015,2,Newfoundland and Labrador,122,Full-time,15.0
38169,38169,2015-04,2015,2,Newfoundland and Labrador,2,Full-time,170.0
38224,38224,2015-04,2015,2,Newfoundland and Labrador,21,Full-time,90.0
...,...,...,...,...,...,...,...,...
3127002,3127002,2019-10,2019,4,Nunavut,67,Part-time,0.0
3127026,3127026,2019-10,2019,4,Nunavut,6733,Part-time,0.0
3127057,3127057,2019-10,2019,4,Nunavut,7,Full-time,40.0
3127141,3127141,2019-10,2019,4,Nunavut,73,Full-time,20.0


In [8]:
#splitting vacancies for province - 'totals, all occupations' included
job_vacanciesCAD = CANADA.loc[CANADA['Statistics'] == "Job vacancies"] 
job_vacanciesCAD = job_vacanciesCAD.rename(columns={"VALUE": "TotalVacancies"})

#reordering columns
job_vacanciesCAD = job_vacanciesCAD[['ID','REF_DATE','Year','Quarter','Location','NOC_Code', 'JobDetails','TotalVacancies']]
job_vacanciesCAD

Unnamed: 0,ID,REF_DATE,Year,Quarter,Location,NOC_Code,JobDetails,TotalVacancies
1304,1304,2015-04,2015,2,Canada,0,Full-time,23030.0
1307,1307,2015-04,2015,2,Canada,0,Part-time,1065.0
1406,1406,2015-04,2015,2,Canada,00,Full-time,825.0
1482,1482,2015-04,2015,2,Canada,001,Full-time,825.0
1557,1557,2015-04,2015,2,Canada,0011,Full-time,15.0
...,...,...,...,...,...,...,...,...
2996578,2996578,2019-10,2019,4,Canada,9614,Full-time,1210.0
2996653,2996653,2019-10,2019,4,Canada,9615,Full-time,445.0
2996709,2996709,2019-10,2019,4,Canada,9616,Full-time,155.0
2996750,2996750,2019-10,2019,4,Canada,9617,Full-time,3250.0


In [9]:
#export Data without index
job_vacancies.to_csv('Resources/Tables_Cleaned_Totals/JobVacancies.csv', index=False)
job_vacanciesCAD.to_csv('Resources/Tables_Cleaned_Totals/JobVacanciesCAD.csv', index=False)

# Average Wage

In [10]:
## filter PROVINCE Average Wage data
avg_wage = PROVINCE.loc[PROVINCE['Statistics'] == "Average offered hourly wage"] 
avg_wage = avg_wage.rename(columns={"VALUE": "AvgWage"})

#drop null values
avg_wage = avg_wage.dropna(subset=['AvgWage'])

#reordering columns
avg_wage = avg_wage[['ID','REF_DATE','Year','Quarter','Location','NOC_Code', 'JobDetails','AvgWage']]

avg_wage

Unnamed: 0,ID,REF_DATE,Year,Quarter,Location,NOC_Code,JobDetails,AvgWage
37387,37387,2015-04,2015,2,Newfoundland and Labrador,0,Full-time,30.35
37432,37432,2015-04,2015,2,Newfoundland and Labrador,01-05,Full-time,32.80
37488,37488,2015-04,2015,2,Newfoundland and Labrador,011,Full-time,37.30
37515,37515,2015-04,2015,2,Newfoundland and Labrador,021,Full-time,36.40
37544,37544,2015-04,2015,2,Newfoundland and Labrador,062,Full-time,17.60
...,...,...,...,...,...,...,...,...
3127060,3127060,2019-10,2019,4,Nunavut,7,Part-time,26.25
3127112,3127112,2019-10,2019,4,Nunavut,72,Full-time,31.40
3127113,3127113,2019-10,2019,4,Nunavut,72,Part-time,26.25
3127143,3127143,2019-10,2019,4,Nunavut,73,Full-time,38.00


In [11]:
## filter CANADA Average Wage data
avg_wageCAD = CANADA.loc[CANADA['Statistics'] == "Average offered hourly wage"] 
avg_wageCAD = avg_wage.rename(columns={"VALUE": "AvgWage"})

#drop null values
avg_wageCAD = avg_wage.dropna(subset=['AvgWage'])

#reordering columns
avg_wageCAD = avg_wage[['ID','REF_DATE','Year','Quarter','Location','NOC_Code', 'JobDetails','AvgWage']]
avg_wageCAD

Unnamed: 0,ID,REF_DATE,Year,Quarter,Location,NOC_Code,JobDetails,AvgWage
37387,37387,2015-04,2015,2,Newfoundland and Labrador,0,Full-time,30.35
37432,37432,2015-04,2015,2,Newfoundland and Labrador,01-05,Full-time,32.80
37488,37488,2015-04,2015,2,Newfoundland and Labrador,011,Full-time,37.30
37515,37515,2015-04,2015,2,Newfoundland and Labrador,021,Full-time,36.40
37544,37544,2015-04,2015,2,Newfoundland and Labrador,062,Full-time,17.60
...,...,...,...,...,...,...,...,...
3127060,3127060,2019-10,2019,4,Nunavut,7,Part-time,26.25
3127112,3127112,2019-10,2019,4,Nunavut,72,Full-time,31.40
3127113,3127113,2019-10,2019,4,Nunavut,72,Part-time,26.25
3127143,3127143,2019-10,2019,4,Nunavut,73,Full-time,38.00


In [12]:
#export to csv
avg_wage.to_csv('Resources/Tables_Cleaned_Totals/AverageWage.csv', index=False)
#export Data without index
avg_wageCAD.to_csv('Resources/Tables_Cleaned_Totals/AverageWageCAD.csv', index=False)

# NOC

In [13]:
#creating NOC dataframe
NOC = df[['NOC_Desc','NOC_Code']]

#dropping duplicate rows
NOC = NOC.drop_duplicates()

#resetting indexes
NOC = NOC.reset_index(drop=True)

#creating ID from index
NOC['ID'] = NOC.index

#reordering columns
NOC = NOC[['ID','NOC_Code','NOC_Desc']]
NOC

Unnamed: 0,ID,NOC_Code,NOC_Desc
0,0,0,Management occupations
1,1,00,Senior management occupations
2,2,001,Legislators and senior management
3,3,0012,Senior government managers and officials
4,4,0013,"Senior managers - financial, communications an..."
...,...,...,...
659,659,4422,Correctional service officers
660,660,7361,Railway and yard locomotive engineers
661,661,7234,Boilermakers
662,662,5135,Actors and comedians


In [14]:
#exporting to csv
NOC.to_csv('Resources/Tables_Cleaned_Totals/NOC.csv', index=False)