In [1]:
# Import our dependencies
import pandas as pd
import numpy as np
from pathlib import Path


In [2]:
# Read a starting database
file_path = "Resources/14100328.csv"
try:
    # `error_bad_lines` was removed in pandas 2.0; on_bad_lines="warn" is the
    # equivalent spelling (malformed rows are skipped and reported as warnings).
    raw = pd.read_csv(file_path, on_bad_lines="warn")
except TypeError:
    # pandas releases that predate `on_bad_lines` reject the keyword,
    # so fall back to the legacy flag.
    raw = pd.read_csv(file_path, error_bad_lines=False)
raw.sample(n=3)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,REF_DATE,GEO,DGUID,National Occupational Classification,Job vacancy characteristics,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
27296239,2021-04,Nunavut,2016A000262,Actors and comedians [5135],"College, CEGEP and other non-university certif...",Proportion of job vacancies,Percentage,242,units,0,v105729440,14.442.9.2,,..,,,1
12163638,2017-07,Saskatchewan,2016A000247,Aircraft assemblers and aircraft assembly insp...,90 days or more,Average offered hourly wage,Dollars,81,units,0,v105314886,9.670.28.5,,..,,,2
25783220,2021-01,Alberta,2016A000248,Employment counsellors [4156],"Job or recruitment fairs at schools, colleges ...",Job vacancies,Number,223,units,0,v105357688,10.406.40.1,,x,,,0


<h1><span style="color:red"> I. Removing Redundant Data</span></h1>

### Filter data quality: A - excellent, B - very good, C - good, D - acceptable
### Turning REF_DATE (YYYY-MM) into a full date string (YYYY-MM-01)

In [3]:
# Keep only the rows whose STATUS quality flag is A, B, C or D
quality_flags = ['A', 'B', 'C', 'D']
df = raw.copy()
df = df.loc[df['STATUS'].isin(quality_flags)]

# Append a day component so REF_DATE reads as a full YYYY-MM-DD string
df['REF_DATE'] = df['REF_DATE'] + '-01'

df['REF_DATE'].unique()

array(['2015-01-01', '2015-04-01', '2015-07-01', '2015-10-01',
       '2016-01-01', '2016-04-01', '2016-07-01', '2016-10-01',
       '2017-01-01', '2017-04-01', '2017-07-01', '2017-10-01',
       '2018-01-01', '2018-04-01', '2018-07-01', '2018-10-01',
       '2019-01-01', '2019-04-01', '2019-07-01', '2019-10-01',
       '2020-01-01', '2020-10-01', '2021-01-01', '2021-04-01'],
      dtype=object)

## Filtering for Vacancies and Average Wage only

In [4]:
# Keep only the vacancy-count and offered-wage statistics
wanted_stats = ['Job vacancies', 'Average offered hourly wage']
df = df.loc[df['Statistics'].isin(wanted_stats)]

In [5]:
# Break "description [code]" into the description text and the bracketed part
noc_split = df['National Occupational Classification'].str.split("[", expand=True)
df[['NOCdesc', 'NOC']] = noc_split

# Cut the closing bracket off the code; whatever follows it lands in 'junk'
code_split = df['NOC'].str.split("]", expand=True)
df[['NOC', 'junk']] = code_split

# Drop the columns that are no longer needed
unused_columns = ['DGUID', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR',
                  'STATUS', 'SYMBOL', 'COORDINATE', 'TERMINATED', 'DECIMALS', 'junk',
                  'National Occupational Classification']
df = df.drop(columns=unused_columns)

# Shorten the characteristics column name, then fix the column order
df = df.rename(columns={"Job vacancy characteristics": "JOB_CHAR"})
column_order = ['REF_DATE', 'GEO', 'NOC', 'NOCdesc', 'JOB_CHAR', 'Statistics', 'VALUE']
df = df[column_order]

df.head()

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
2,2015-01-01,Canada,,"Total, all occupations","Type of work, all types",Average offered hourly wage,19.15
11,2015-01-01,Canada,,"Total, all occupations","Minimum level of education required, all levels",Average offered hourly wage,19.15
38,2015-01-01,Canada,,"Total, all occupations","Certification requirement, all types",Average offered hourly wage,19.15
47,2015-01-01,Canada,,"Total, all occupations","Minimum experience level sought, all levels",Average offered hourly wage,19.15
65,2015-01-01,Canada,,"Total, all occupations","Duration of job vacancy, all durations",Average offered hourly wage,19.15


In [6]:
#df.to_csv('Resources/1.2_VacanciesRawData.csv',index=False)

In [6]:
# Remove the whitespace left at the end of each NOC description
stripped_desc = df["NOCdesc"].str.rstrip()
df["NOCdesc"] = stripped_desc

In [7]:
# Count the missing values in each column
df.isna().sum()

REF_DATE          0
GEO               0
NOC           25599
NOCdesc           0
JOB_CHAR          0
Statistics        0
VALUE             0
dtype: int64

In [8]:
# List the distinct descriptions of the rows that have no NOC code
nullNoc = df.loc[df['NOC'].isnull()]
nullNoc['NOCdesc'].unique()

array(['Total, all occupations', 'Unclassified occupations'], dtype=object)

In [9]:
# Give code-less rows a placeholder NOC: '101' for the all-occupations total, 'X' otherwise
temp = df['NOC'].isna()
is_total = df.loc[temp, 'NOCdesc'] == 'Total, all occupations'
df.loc[temp, 'NOC'] = np.where(is_total, '101', 'X')

In [10]:
# Inspect the rows described as the all-occupations total
df.loc[df['NOCdesc'].eq('Total, all occupations')]
#df[df['NOCdesc'] == 'Unclassified occupations']

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
2,2015-01-01,Canada,101,"Total, all occupations","Type of work, all types",Average offered hourly wage,19.15
11,2015-01-01,Canada,101,"Total, all occupations","Minimum level of education required, all levels",Average offered hourly wage,19.15
38,2015-01-01,Canada,101,"Total, all occupations","Certification requirement, all types",Average offered hourly wage,19.15
47,2015-01-01,Canada,101,"Total, all occupations","Minimum experience level sought, all levels",Average offered hourly wage,19.15
65,2015-01-01,Canada,101,"Total, all occupations","Duration of job vacancy, all durations",Average offered hourly wage,19.15
...,...,...,...,...,...,...,...
27254446,2021-04-01,Nunavut,101,"Total, all occupations",Seasonal,Average offered hourly wage,32.95
27254447,2021-04-01,Nunavut,101,"Total, all occupations","Recruitment strategies, all types",Job vacancies,615.00
27254449,2021-04-01,Nunavut,101,"Total, all occupations","Personal contacts, referrals, informal networks",Job vacancies,445.00
27254455,2021-04-01,Nunavut,101,"Total, all occupations",Online job boards,Job vacancies,475.00



### Creating input table for Machine Learning, filtered on Broad NOC Code; Full-time, Part-time and 'Type of work, all types' rows only

In [11]:
# Start from the cleaned frame and keep only the vacancy counts
df_vac = df.copy()
df_vac = df_vac.loc[df_vac['Statistics'].eq('Job vacancies')]

# Keep the type-of-work rows: full-time, part-time and the all-types total
work_types = ['Full-time', 'Part-time', 'Type of work, all types']
df_vac = df_vac.loc[df_vac['JOB_CHAR'].isin(work_types)]
#df_vac = df_vac.loc[(df_vac['JOB_CHAR'] == 'Full-time') | (df_vac['JOB_CHAR'] == 'Part-time')]

# Statistics now holds a single value, so the column is dropped
df_vac = df_vac.drop(columns=['Statistics'])

df_vac

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,VALUE
408280,2015-01-01,Quebec,101,"Total, all occupations","Type of work, all types",60505.0
408283,2015-01-01,Quebec,101,"Total, all occupations",Full-time,45385.0
653248,2015-01-01,Saskatchewan,101,"Total, all occupations","Type of work, all types",14230.0
653251,2015-01-01,Saskatchewan,101,"Total, all occupations",Full-time,10375.0
1143184,2015-04-01,Canada,101,"Total, all occupations","Type of work, all types",451925.0
...,...,...,...,...,...,...
27263184,2021-04-01,Nunavut,1,"Business, finance and administration occupations",Full-time,70.0
27309524,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...","Type of work, all types",160.0
27309527,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...",Full-time,160.0
27314606,2021-04-01,Nunavut,73,Maintenance and equipment operation trades,"Type of work, all types",65.0


In [18]:
# Machine Learning only needs Broad NOC
# ('101' is the placeholder code given to 'Total, all occupations'; '0'-'9' are one-digit codes)
inc_list = ['101','0','1', '2', '3', '4', '5', '6', '7', '8', '9']

# Extract into a new dataframe. The explicit .copy() makes df_ml independent of
# df_vac, so adding a column below no longer raises SettingWithCopyWarning.
df_ml = df_vac[df_vac.NOC.isin(inc_list)].copy()

# Adding an empty (NaN) column to hold predicted vacancies
df_ml["Predicted_Vacancies"] = np.nan

df_ml

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,VALUE,Predicted_Vacancies
408280,2015-01-01,Quebec,101,"Total, all occupations","Type of work, all types",60505.0,
408283,2015-01-01,Quebec,101,"Total, all occupations",Full-time,45385.0,
653248,2015-01-01,Saskatchewan,101,"Total, all occupations","Type of work, all types",14230.0,
653251,2015-01-01,Saskatchewan,101,"Total, all occupations",Full-time,10375.0,
1143184,2015-04-01,Canada,101,"Total, all occupations","Type of work, all types",451925.0,
...,...,...,...,...,...,...,...
27254351,2021-04-01,Nunavut,101,"Total, all occupations",Full-time,515.0,
27263181,2021-04-01,Nunavut,1,"Business, finance and administration occupations","Type of work, all types",75.0,
27263184,2021-04-01,Nunavut,1,"Business, finance and administration occupations",Full-time,70.0,
27309524,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...","Type of work, all types",160.0,


In [21]:
# Write the machine-learning input table to disk (the row index is written as well)
ml_output_path = 'Resources/MachineLearningTable.csv'
df_ml.to_csv(ml_output_path)

## Creating Vacancies Table data

In [15]:
# Derive Year and Quarter columns for the Vacancies table

# REF_DATE is "YYYY-MM-DD": split it into its year, month and day pieces
date_parts = df_vac['REF_DATE'].str.split("-", expand=True)
df_vac[['Year', 'Quarter', 'Date']] = date_parts

# Map the first month of each quarter to the quarter number (kept as text)
month_to_quarter = {'01': '1', '04': '2', '07': '3', '10': '4'}
df_vac['Quarter'] = df_vac['Quarter'].replace(month_to_quarter)

# Reordering columns, keeping only those needed for the table
vacancy_columns = ['REF_DATE', 'Year', 'Quarter', 'GEO', 'NOC', 'JOB_CHAR', 'VALUE']
df_vac = df_vac[vacancy_columns]

df_vac


Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE
408283,2015-01-01,2015,1,Quebec,101,Full-time,45385.0
653251,2015-01-01,2015,1,Saskatchewan,101,Full-time,10375.0
1143187,2015-04-01,2015,2,Canada,101,Full-time,322450.0
1143190,2015-04-01,2015,2,Canada,101,Part-time,129475.0
1143305,2015-04-01,2015,2,Canada,0,Full-time,23030.0
...,...,...,...,...,...,...,...
27208011,2021-04-01,2021,2,Northwest Territories,421,Part-time,25.0
27254351,2021-04-01,2021,2,Nunavut,101,Full-time,515.0
27263184,2021-04-01,2021,2,Nunavut,1,Full-time,70.0
27309527,2021-04-01,2021,2,Nunavut,7,Full-time,160.0


In [16]:
# Write the Vacancies table to disk (the row index is written as well)
vacancies_output_path = 'Resources/AllVacancies.csv'
df_vac.to_csv(vacancies_output_path)

## NOC Table

In [17]:
# Build the NOC lookup: one row per distinct (code, description) pair
NOC = df.loc[:, ['NOC', 'NOCdesc']].drop_duplicates()

NOC

Unnamed: 0,NOC,NOCdesc
2,101,"Total, all occupations"
8616,1,"Business, finance and administration occupations"
44134,6,Sales and service occupations
45432,63,Service supervisors and specialized service oc...
48500,65,Service representatives and other customer and...
...,...,...
14866806,7234,Boilermakers
15972060,5135,Actors and comedians
15999422,826,Fishing vessel masters and fishermen/women
22720601,844,Other workers in fishing and trapping and hunt...


In [18]:
# Write the NOC lookup to disk without the row index
noc_output_path = 'Resources/NOCtable.csv'
NOC.to_csv(noc_output_path, index=False)

## Canada Vacancies Table
## Filtering for Canada and JOB_CHAR = 'Type of work, all types'

In [23]:
# Work on an independent copy of the cleaned frame
CAD_vac = df.copy(deep=True)
CAD_vac.head(5)

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
2,2015-01-01,Canada,101,"Total, all occupations","Type of work, all types",Average offered hourly wage,19.15
11,2015-01-01,Canada,101,"Total, all occupations","Minimum level of education required, all levels",Average offered hourly wage,19.15
38,2015-01-01,Canada,101,"Total, all occupations","Certification requirement, all types",Average offered hourly wage,19.15
47,2015-01-01,Canada,101,"Total, all occupations","Minimum experience level sought, all levels",Average offered hourly wage,19.15
65,2015-01-01,Canada,101,"Total, all occupations","Duration of job vacancy, all durations",Average offered hourly wage,19.15


In [24]:
# One combined mask: vacancy counts, Canada level, all work types, all occupations
is_vacancy_count = CAD_vac['Statistics'] == 'Job vacancies'
is_canada = CAD_vac['GEO'] == 'Canada'
is_all_work_types = CAD_vac['JOB_CHAR'] == 'Type of work, all types'
is_all_occupations = CAD_vac['NOCdesc'] == 'Total, all occupations'

CAD_vac = CAD_vac.loc[is_vacancy_count & is_canada & is_all_work_types & is_all_occupations]

CAD_vac.head()

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
1143184,2015-04-01,Canada,101,"Total, all occupations","Type of work, all types",Job vacancies,451925.0
2286368,2015-07-01,Canada,101,"Total, all occupations","Type of work, all types",Job vacancies,407865.0
3429552,2015-10-01,Canada,101,"Total, all occupations","Type of work, all types",Job vacancies,358755.0
4572736,2016-01-01,Canada,101,"Total, all occupations","Type of work, all types",Job vacancies,330215.0
5715920,2016-04-01,Canada,101,"Total, all occupations","Type of work, all types",Job vacancies,391190.0


In [26]:
# Keep the all-work-types rows of the all-occupations total
keep_rows = (CAD_vac['JOB_CHAR'] == 'Type of work, all types') & (CAD_vac['NOCdesc'] == 'Total, all occupations')
CAD_vac = CAD_vac.loc[keep_rows]

In [28]:
# Store accurate Canada level totals post data cleansing.
# VALUE is selected before aggregating so only that column is summed; the
# string columns (GEO, NOC, ...) are never passed through sum().
TotalCAD = CAD_vac.groupby(["REF_DATE"])["VALUE"].sum()
TotalCAD

REF_DATE
2015-04-01    451925.0
2015-07-01    407865.0
2015-10-01    358755.0
2016-01-01    330215.0
2016-04-01    391190.0
2016-07-01    404860.0
2016-10-01    381010.0
2017-01-01    387080.0
2017-04-01    459685.0
2017-07-01    467395.0
2017-10-01    469360.0
2018-01-01    461845.0
2018-04-01    546820.0
2018-07-01    550670.0
2018-10-01    547330.0
2019-01-01    506140.0
2019-04-01    581595.0
2019-07-01    562910.0
2019-10-01    508590.0
2020-01-01    512760.0
2020-10-01    560215.0
2021-01-01    553480.0
2021-04-01    731905.0
Name: VALUE, dtype: float64

In [None]:
#comparing to actuals in raw file for 1 quarter:
#comp = df.loc[(df['NOC'] == '101') & (df['REF_DATE'] == '2015-04-01') & (df['Statistics'] =='Job vacancies') & ((df['JOB_CHAR'] == 'Full-time') | (df['JOB_CHAR'] == 'Part-time'))]

#comp.groupby(["REF_DATE"]).sum()["VALUE"]

In [41]:
# Turn the per-date totals into a frame with an empty column for predictions
TotCAD = pd.DataFrame(TotalCAD)
TotCAD['Predicted_Vacancies'] = np.nan

# Keep only the dates with a positive total
has_vacancies = TotCAD['VALUE'] > 0
TotCAD = TotCAD.loc[has_vacancies]

In [42]:
# Preview the first five rows of the Canada totals
TotCAD.head(5)

Unnamed: 0_level_0,VALUE,Predicted_Vacancies
REF_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-04-01,451925.0,
2015-07-01,407865.0,
2015-10-01,358755.0,
2016-01-01,330215.0,
2016-04-01,391190.0,


In [40]:
# Write the Canada totals to disk (the REF_DATE index is written as the first column)
canada_output_path = 'Resources/CanadaVacancies.csv'
TotCAD.to_csv(canada_output_path)

## Creating Data for Average Wage Table

In [34]:
# Work on an independent copy of the cleaned frame
avg_wage = df.copy(deep=True)

In [35]:
# Keep only the average offered hourly wage statistic
avg_wage = avg_wage.loc[avg_wage['Statistics'].eq('Average offered hourly wage')]

# Keep only the full-time and part-time rows
avg_wage = avg_wage.loc[avg_wage['JOB_CHAR'].isin(['Full-time', 'Part-time'])]

avg_wage

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
1143189,2015-04-01,Canada,101,"Total, all occupations",Full-time,Average offered hourly wage,20.40
1143192,2015-04-01,Canada,101,"Total, all occupations",Part-time,Average offered hourly wage,14.60
1143307,2015-04-01,Canada,0,Management occupations,Full-time,Average offered hourly wage,33.45
1143310,2015-04-01,Canada,0,Management occupations,Part-time,Average offered hourly wage,24.00
1143425,2015-04-01,Canada,00,Senior management occupations,Full-time,Average offered hourly wage,62.40
...,...,...,...,...,...,...,...
27309532,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...",Part-time,Average offered hourly wage,20.00
27314611,2021-04-01,Nunavut,73,Maintenance and equipment operation trades,Full-time,Average offered hourly wage,36.00
27315458,2021-04-01,Nunavut,731,Machinery and transportation equipment mechani...,Full-time,Average offered hourly wage,37.30
27315579,2021-04-01,Nunavut,7311,Construction millwrights and industrial mechanics,Full-time,Average offered hourly wage,34.70


In [36]:
# REF_DATE is "YYYY-MM-DD": split it into its year, month and day pieces
wage_date_parts = avg_wage['REF_DATE'].str.split("-", expand=True)
avg_wage[['Year', 'Quarter', 'Date']] = wage_date_parts

# Map the first month of each quarter to the quarter number (kept as text)
month_to_quarter = {'01': '1', '04': '2', '07': '3', '10': '4'}
avg_wage['Quarter'] = avg_wage['Quarter'].replace(month_to_quarter)

# The day piece is not needed
avg_wage = avg_wage.drop(columns=['Date'])

# Reordering columns, keeping only those needed for the table
wage_columns = ['REF_DATE', 'Year', 'Quarter', 'GEO', 'NOC', 'JOB_CHAR', 'VALUE']
avg_wage = avg_wage[wage_columns]

avg_wage

Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE
1143189,2015-04-01,2015,2,Canada,101,Full-time,20.40
1143192,2015-04-01,2015,2,Canada,101,Part-time,14.60
1143307,2015-04-01,2015,2,Canada,0,Full-time,33.45
1143310,2015-04-01,2015,2,Canada,0,Part-time,24.00
1143425,2015-04-01,2015,2,Canada,00,Full-time,62.40
...,...,...,...,...,...,...,...
27309532,2021-04-01,2021,2,Nunavut,7,Part-time,20.00
27314611,2021-04-01,2021,2,Nunavut,73,Full-time,36.00
27315458,2021-04-01,2021,2,Nunavut,731,Full-time,37.30
27315579,2021-04-01,2021,2,Nunavut,7311,Full-time,34.70


In [37]:
# Write the average wage table to disk (the row index is written as well)
wage_output_path = 'Resources/AvgWagetable.csv'
avg_wage.to_csv(wage_output_path)