In [1]:
# Import our dependencies
import pandas as pd
import numpy as np
from pathlib import Path


In [2]:
# Read a starting database
file_path = "Resources/14100328.csv"
raw = pd.read_csv(file_path, error_bad_lines=False)
raw.sample(n=3)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,REF_DATE,GEO,DGUID,National Occupational Classification,Job vacancy characteristics,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
18390860,2019-01,Prince Edward Island,2016A000211,"Crane operators, drillers and blasters [737]",Newspaper ads,Proportion of job vacancies,Percentage,242,units,0,v104748386,3.160.43.2,,..,,,1
12141448,2017-07,Saskatchewan,2016A000247,Telecommunications line and cable workers [7245],30 to 59 days,Average offered hourly wage,Dollars,81,units,0,v105290064,9.534.26.5,,..,,,2
4441178,2015-10,Northwest Territories,2016A000261,Massage therapists [3236],No minimum level of education required,Job vacancies,Number,223,units,0,v105627919,13.388.5.1,,..,,,0


<h1><span style="color:red"> I. Removing Redundant Data</span></h1>

<h3><span style="color:black"> Filter data quality: A - excellent, B - very good, C - good, acceptable - D</span></h3>

In [3]:
df = raw.copy()
df = df.loc[(df['STATUS'] == 'A')|(df['STATUS'] == 'B')|(df['STATUS'] == 'C')|(df['STATUS'] == 'D')]

#appending date to allow for date time data type
df['REF_DATE'] = df['REF_DATE'] + '-01'

df['REF_DATE'].unique()

array(['2015-01-01', '2015-04-01', '2015-07-01', '2015-10-01',
       '2016-01-01', '2016-04-01', '2016-07-01', '2016-10-01',
       '2017-01-01', '2017-04-01', '2017-07-01', '2017-10-01',
       '2018-01-01', '2018-04-01', '2018-07-01', '2018-10-01',
       '2019-01-01', '2019-04-01', '2019-07-01', '2019-10-01',
       '2020-01-01', '2020-10-01', '2021-01-01', '2021-04-01'],
      dtype=object)

## Filtering for Vacancies and Average Wage only

In [4]:
#Filtering for Vacancies and Average Wage
df = df.loc[(df['Statistics'] =='Job vacancies') | (df['Statistics'] == 'Average offered hourly wage')]

In [5]:
#splitting NOC Description and Code
df[['NOCdesc','NOC']] = df['National Occupational Classification'].str.split("[",expand=True)

#removing junk from column
df[['NOC','junk']] = df['NOC'].str.split("]", expand=True)

#dropping columns not required, renaming and reordering columns
df = df.drop(columns=['DGUID','UOM','UOM_ID','SCALAR_FACTOR','SCALAR_ID','VECTOR','STATUS','SYMBOL','COORDINATE',
                 'TERMINATED','DECIMALS','junk','National Occupational Classification'])

df = df.rename(columns={"Job vacancy characteristics" :"JOB_CHAR"})
df = df[['REF_DATE','GEO','NOC','NOCdesc','JOB_CHAR','Statistics','VALUE']]

df.head()

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
2,2015-01-01,Canada,,"Total, all occupations","Type of work, all types",Average offered hourly wage,19.15
11,2015-01-01,Canada,,"Total, all occupations","Minimum level of education required, all levels",Average offered hourly wage,19.15
38,2015-01-01,Canada,,"Total, all occupations","Certification requirement, all types",Average offered hourly wage,19.15
47,2015-01-01,Canada,,"Total, all occupations","Minimum experience level sought, all levels",Average offered hourly wage,19.15
65,2015-01-01,Canada,,"Total, all occupations","Duration of job vacancy, all durations",Average offered hourly wage,19.15


In [6]:
#df.to_csv('Resources/1.2_VacanciesRawData.csv',index=False)

In [7]:
df.isnull().sum()

REF_DATE          0
GEO               0
NOC           25599
NOCdesc           0
JOB_CHAR          0
Statistics        0
VALUE             0
dtype: int64

In [8]:
#checking NOC Descriptions of null NOCs
nullNoc = df[df['NOC'].isna()]
nullNoc['NOCdesc'].unique()

array(['Total, all occupations', 'Unclassified occupations'], dtype=object)

In [9]:
#replacing Null NOCs for Total,all occupations and Unclassified Occupations
temp = df['NOC'].isna()
df.loc[temp, 'NOC'] = np.where(df.loc[temp, 'NOCdesc'].eq('Total, all occupations'), '101', 'X')

In [10]:
#checking outcome
df[df['NOCdesc'] == 'Total, all occupations']
#df[df['NOCdesc'] == 'Unclassified occupations']

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
2,2015-01-01,Canada,101,"Total, all occupations","Type of work, all types",Average offered hourly wage,19.15
11,2015-01-01,Canada,101,"Total, all occupations","Minimum level of education required, all levels",Average offered hourly wage,19.15
38,2015-01-01,Canada,101,"Total, all occupations","Certification requirement, all types",Average offered hourly wage,19.15
47,2015-01-01,Canada,101,"Total, all occupations","Minimum experience level sought, all levels",Average offered hourly wage,19.15
65,2015-01-01,Canada,101,"Total, all occupations","Duration of job vacancy, all durations",Average offered hourly wage,19.15
...,...,...,...,...,...,...,...
27254446,2021-04-01,Nunavut,101,"Total, all occupations",Seasonal,Average offered hourly wage,32.95
27254447,2021-04-01,Nunavut,101,"Total, all occupations","Recruitment strategies, all types",Job vacancies,615.00
27254449,2021-04-01,Nunavut,101,"Total, all occupations","Personal contacts, referrals, informal networks",Job vacancies,445.00
27254455,2021-04-01,Nunavut,101,"Total, all occupations",Online job boards,Job vacancies,475.00


<h2><span style="color:#3346FF"> UPDATE:::: 1.3 Filtering and Saving .csv for ML model. No totals for Provinces, Filtering Broad NOC and Full/Part-time</span></h2>

<p> creating ML Vacancy inputs for both models.
#Broad NOCs and Full/Part time only.
#showing canada and provincal level metrics

In [11]:
df_vac = df.copy()

# Filter job vacancies
df_vac = df_vac.loc[(df_vac['Statistics'] =='Job vacancies')]

# Filter Job vacancy characteristics
df_vac = df_vac.loc[(df_vac['JOB_CHAR'] == 'Full-time') | (df_vac['JOB_CHAR'] == 'Part-time')]

#drop column not required
df_vac = df_vac.drop(columns=['Statistics'])

df_vac

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,VALUE
408283,2015-01-01,Quebec,101,"Total, all occupations",Full-time,45385.0
653251,2015-01-01,Saskatchewan,101,"Total, all occupations",Full-time,10375.0
1143187,2015-04-01,Canada,101,"Total, all occupations",Full-time,322450.0
1143190,2015-04-01,Canada,101,"Total, all occupations",Part-time,129475.0
1143305,2015-04-01,Canada,0,Management occupations,Full-time,23030.0
...,...,...,...,...,...,...
27208011,2021-04-01,Northwest Territories,421,"Paraprofessional occupations in legal, social,...",Part-time,25.0
27254351,2021-04-01,Nunavut,101,"Total, all occupations",Full-time,515.0
27263184,2021-04-01,Nunavut,1,"Business, finance and administration occupations",Full-time,70.0
27309527,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...",Full-time,160.0


In [12]:
# Machine Learning only needs Broad NOC
inc_list = ['101','0','1', '2', '3', '4', '5', '6', '7', '8', '9']

#extract into new dataframe
df_ml = df_vac[df_vac.NOC.isin(inc_list)]

#adding null column to hold predicted vacancies
df_ml["Predicted_Vacancies"] = np.nan

df_ml

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,VALUE,Predicted_Vacancies
408283,2015-01-01,Quebec,101,"Total, all occupations",Full-time,45385.0,
653251,2015-01-01,Saskatchewan,101,"Total, all occupations",Full-time,10375.0,
1143187,2015-04-01,Canada,101,"Total, all occupations",Full-time,322450.0,
1143190,2015-04-01,Canada,101,"Total, all occupations",Part-time,129475.0,
1143305,2015-04-01,Canada,0,Management occupations,Full-time,23030.0,
...,...,...,...,...,...,...,...
27170619,2021-04-01,Northwest Territories,101,"Total, all occupations",Full-time,945.0,
27170622,2021-04-01,Northwest Territories,101,"Total, all occupations",Part-time,165.0,
27254351,2021-04-01,Nunavut,101,"Total, all occupations",Full-time,515.0,
27263184,2021-04-01,Nunavut,1,"Business, finance and administration occupations",Full-time,70.0,


In [13]:
df_ml.to_csv('Resources/MachineLearningTable.csv')

In [45]:
#adding YEAR and Quarter for Vacancies table

#splitting date to year and month
df_vac[['Year','Quarter','Date']] = df_vac['REF_DATE'].str.split("-",expand=True)                  

#converting Month to Quarter
df_vac['Quarter'] = df_vac['Quarter'].replace(['01','04','07','10'],['1','2','3','4'])

#reordeing columns
df_vac = df_vac[['REF_DATE','Year', 'Quarter', 'GEO','NOC','JOB_CHAR','VALUE']]

df_vac


Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE
408283,2015-01-01,2015,1,Quebec,101,Full-time,45385.0
653251,2015-01-01,2015,1,Saskatchewan,101,Full-time,10375.0
1143187,2015-04-01,2015,2,Canada,101,Full-time,322450.0
1143190,2015-04-01,2015,2,Canada,101,Part-time,129475.0
1143305,2015-04-01,2015,2,Canada,0,Full-time,23030.0
...,...,...,...,...,...,...,...
27208011,2021-04-01,2021,2,Northwest Territories,421,Part-time,25.0
27254351,2021-04-01,2021,2,Nunavut,101,Full-time,515.0
27263184,2021-04-01,2021,2,Nunavut,1,Full-time,70.0
27309527,2021-04-01,2021,2,Nunavut,7,Full-time,160.0


In [46]:
#Vacancies table output
df_vac.to_csv('Resources/AllVacancies.csv')

<h2><span style="color:#3346FF"> 2.4_NOC_Table</span></h2>

In [42]:
## creating NOC dataframe
NOC = df[['NOC','NOCdesc']]

#dropping duplicate rows
NOC = NOC.drop_duplicates()

NOC

Unnamed: 0,NOC,NOCdesc
2,101,"Total, all occupations"
8616,1,"Business, finance and administration occupations"
44134,6,Sales and service occupations
45432,63,Service supervisors and specialized service oc...
48500,65,Service representatives and other customer and...
...,...,...
14866806,7234,Boilermakers
15972060,5135,Actors and comedians
15999422,826,Fishing vessel masters and fishermen/women
22720601,844,Other workers in fishing and trapping and hunt...


In [43]:
#exporting to csv
NOC.to_csv('Resources/NOCtable.csv', index=False)

 <h1><span style="color:red"> II. Cleaning and Saving Data to .csv</span></h1>

In [18]:
#df2 = df.copy()

In [19]:
#splitting date to year and month
#df2[['Year','Quarter','Date']] = df2['REF_DATE'].str.split("-",expand=True)                  

#converting Month to Quarter
#df2['Quarter'] = df2['Quarter'].replace(['01','04','07','10'],['1','2','3','4'])

In [20]:
#df2.head()

In [21]:
# Cleaning data

#splitting NOC Description and Code
#df2[['NOCdesc','NOCcode']] = df2['National Occupational Classification'].str.split("[",expand=True)

#removing junkdf2_stat  from column
#df2[['NOCcode','junk']] = df2['NOCcode'].str.split("]", expand=True)

#splitting date to year and month
#df2[['Year','Quarter','Date']] = df2['REF_DATE'].str.split("-",expand=True)                  

#converting Month to Quarter
#df2['Quarter'] = df2['Quarter'].replace(['01','04','07','10'],['1','2','3','4'])

#drop redundant columns
#df2 = df2.drop(columns=['junk', 'National Occupational Classification','DGUID','UOM_ID','SCALAR_FACTOR',
                      #'SCALAR_ID','VECTOR','STATUS','SYMBOL','COORDINATE','TERMINATED','DECIMALS','UOM','Date'])
#rename columns
#df_cleaned = df2.rename(columns={"Job vacancy characteristics" :"JOB_CHAR"})

#creating Id col
#df_cleaned['ID'] = df_cleaned.index
#df_cleaned.head()
# df_cleaned.shape

<h2><span style="color:#3346FF"> Creating New Totals for Canada Level Vacancies Per Quarter in consideration of Data Cleansing</span></h2>

In [22]:
df_no_ttl = df.copy()

In [23]:
# Filter job vacancies
df_no_ttl = df_no_ttl.loc[(df_no_ttl['Statistics'] =='Job vacancies')]

In [24]:
# Remove Totals of Provinces
df_no_ttl = df_no_ttl.loc[(df_no_ttl['GEO'] != 'Canada')]
df_no_ttl.sample(n=3)

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
15362188,2018-04-01,Ontario,8432,Nursery and greenhouse workers,Government employment centre or website,Job vacancies,750.0
20840886,2019-07-01,Quebec,323,Other technical occupations in health care,Full-time,Job vacancies,485.0
9928889,2017-01-01,Alberta,65,Service representatives and other customer and...,High school diploma or equivalent,Job vacancies,1055.0


In [25]:
# Filter number of Job vacancies
df_no_ttl = df_no_ttl.loc[(df_no_ttl['JOB_CHAR'] == 'Full-time') | (df_no_ttl['JOB_CHAR'] == 'Part-time')]

In [26]:
# Filter Broad NOC
inc_list = ['0','1', '2', '3', '4', '5', '6', '7', '8', '9']
df_no_ttl = df_no_ttl[df_no_ttl.NOC.isin(inc_list)]

In [27]:
# Store accurate Canada level totals post data cleansing
TotalCAD = df_no_ttl.groupby(["REF_DATE"]).sum()["VALUE"]
TotalCAD

REF_DATE
2015-04-01    410650.0
2015-07-01    380405.0
2015-10-01    338460.0
2016-01-01    309350.0
2016-04-01    367470.0
2016-07-01    389610.0
2016-10-01    357035.0
2017-01-01    364960.0
2017-04-01    445760.0
2017-07-01    458405.0
2017-10-01    460440.0
2018-01-01    448035.0
2018-04-01    535320.0
2018-07-01    541080.0
2018-10-01    535430.0
2019-01-01    496350.0
2019-04-01    570340.0
2019-07-01    549145.0
2019-10-01    501025.0
2020-01-01    501075.0
2020-10-01    548335.0
2021-01-01    536680.0
2021-04-01    713280.0
Name: VALUE, dtype: float64

In [28]:
#comparing to actuals in raw file for 1 quarter:
comp = df.loc[(df['NOC'] == '101') & (df['REF_DATE'] == '2015-04-01') & (df['Statistics'] =='Job vacancies') & ((df['JOB_CHAR'] == 'Full-time') | (df['JOB_CHAR'] == 'Part-time'))]

comp.groupby(["REF_DATE"]).sum()["VALUE"]

REF_DATE
2015-04-01    903620.0
Name: VALUE, dtype: float64

In [37]:
#adding column for ML output
TotalCAD['Predicted_Vacancies'] = np.nan
TotalCAD

Unnamed: 0_level_0,VALUE,Predicted_Vacancies
REF_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-04-01,410650.0,
2015-07-01,380405.0,
2015-10-01,338460.0,
2016-01-01,309350.0,
2016-04-01,367470.0,
2016-07-01,389610.0,
2016-10-01,357035.0,
2017-01-01,364960.0,
2017-04-01,445760.0,
2017-07-01,458405.0,


In [44]:
TotalCAD.to_csv('Resources/CanadaVacancies.csv')

<h2><span style="color:#3346FF"> 2.5_Average_Hourly_Wage</span></h2>

In [30]:
avg_wage = df.copy()

In [31]:
# Filter job vacancies
avg_wage = avg_wage.loc[(avg_wage['Statistics'] =='Average offered hourly wage')]

# Filter number of Job vacancies
avg_wage = avg_wage.loc[(avg_wage['JOB_CHAR'] == 'Full-time') | (avg_wage['JOB_CHAR'] == 'Part-time')]

avg_wage

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
1143189,2015-04-01,Canada,101,"Total, all occupations",Full-time,Average offered hourly wage,20.40
1143192,2015-04-01,Canada,101,"Total, all occupations",Part-time,Average offered hourly wage,14.60
1143307,2015-04-01,Canada,0,Management occupations,Full-time,Average offered hourly wage,33.45
1143310,2015-04-01,Canada,0,Management occupations,Part-time,Average offered hourly wage,24.00
1143425,2015-04-01,Canada,00,Senior management occupations,Full-time,Average offered hourly wage,62.40
...,...,...,...,...,...,...,...
27309532,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...",Part-time,Average offered hourly wage,20.00
27314611,2021-04-01,Nunavut,73,Maintenance and equipment operation trades,Full-time,Average offered hourly wage,36.00
27315458,2021-04-01,Nunavut,731,Machinery and transportation equipment mechani...,Full-time,Average offered hourly wage,37.30
27315579,2021-04-01,Nunavut,7311,Construction millwrights and industrial mechan...,Full-time,Average offered hourly wage,34.70


In [32]:
#splitting date to year and month
avg_wage[['Year','Quarter','Date']] = avg_wage['REF_DATE'].str.split("-",expand=True)                  

#converting Month to Quarter
avg_wage['Quarter'] = avg_wage['Quarter'].replace(['01','04','07','10'],['1','2','3','4'])

#drop column not required
avg_wage = avg_wage.drop(columns=['Date'])

#reordeing columns
avg_wage = avg_wage[['REF_DATE','Year', 'Quarter', 'GEO','NOC','JOB_CHAR','VALUE']]

avg_wage

Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE
1143189,2015-04-01,2015,2,Canada,101,Full-time,20.40
1143192,2015-04-01,2015,2,Canada,101,Part-time,14.60
1143307,2015-04-01,2015,2,Canada,0,Full-time,33.45
1143310,2015-04-01,2015,2,Canada,0,Part-time,24.00
1143425,2015-04-01,2015,2,Canada,00,Full-time,62.40
...,...,...,...,...,...,...,...
27309532,2021-04-01,2021,2,Nunavut,7,Part-time,20.00
27314611,2021-04-01,2021,2,Nunavut,73,Full-time,36.00
27315458,2021-04-01,2021,2,Nunavut,731,Full-time,37.30
27315579,2021-04-01,2021,2,Nunavut,7311,Full-time,34.70


In [38]:
#exporting to csv
avg_wage.to_csv('Resources/AvgWagetable.csv')