In [83]:
# Import our dependencies
import pandas as pd
import numpy as np
from pathlib import Path

# Send output to database
from sqlalchemy import create_engine
# Importing config file for pulling from database
from config import config

In [84]:
# Read a starting database
file_path = "Resources/14100328.csv"
data = pd.read_csv(file_path, error_bad_lines=False)
file_path2 = "Resources/ReferenceList_UUID.csv"
uuid_df = pd.read_csv(file_path2, error_bad_lines=False)
data.sample(n=3)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,REF_DATE,GEO,DGUID,National Occupational Classification,Job vacancy characteristics,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
22615216,2019-10,Nunavut,2016A000262,Cleaning supervisors [6315],Part-time,Average offered hourly wage,Dollars,81,units,0,v105735363,14.474.3.5,,x,,,2
15073787,2018-04,Nova Scotia,2016A000212,Conservation and fishery officers [2224],30 to 59 days,Job vacancies,Number,223,units,0,v104795854,4.330.26.1,,F,,,0
790409,2015-01,Alberta,2016A000248,Ironworkers [7236],"Minimum experience level sought, all levels",Job vacancies,Number,223,units,0,v105380455,10.528.17.1,,F,,,0


<h1><span style="color:red"> I. Removing Redundant Data</span></h1>

### Filter data quality: A - excellent, B - very good, C - good, acceptable - D
### Making Ref Date into actual Date

In [85]:
df = data.copy()
df = df.loc[(df['STATUS'] == 'A')|(df['STATUS'] == 'B')|(df['STATUS'] == 'C')|(df['STATUS'] == 'D')]

#appending date to allow for date time data type
df['REF_DATE'] = df['REF_DATE'] + '-01'

df['REF_DATE'].unique()

array(['2015-01-01', '2015-04-01', '2015-07-01', '2015-10-01',
       '2016-01-01', '2016-04-01', '2016-07-01', '2016-10-01',
       '2017-01-01', '2017-04-01', '2017-07-01', '2017-10-01',
       '2018-01-01', '2018-04-01', '2018-07-01', '2018-10-01',
       '2019-01-01', '2019-04-01', '2019-07-01', '2019-10-01',
       '2020-01-01', '2020-10-01', '2021-01-01', '2021-04-01'],
      dtype=object)

## Filtering for Vacancies and Average Wage only

In [86]:
#Filtering for Vacancies and Average Wage
df = df.loc[(df['Statistics'] =='Job vacancies') | (df['Statistics'] == 'Average offered hourly wage')]

In [87]:
#splitting NOC Description and Code
df[['NOCdesc','NOC']] = df['National Occupational Classification'].str.split("[",expand=True)

#removing junk from column
df[['NOC','junk']] = df['NOC'].str.split("]", expand=True)

#dropping columns not required, renaming and reordering columns
df = df.drop(columns=['DGUID','UOM','UOM_ID','SCALAR_FACTOR','SCALAR_ID','VECTOR','STATUS','SYMBOL','COORDINATE',
                 'TERMINATED','DECIMALS','junk','National Occupational Classification'])

df = df.rename(columns={"Job vacancy characteristics" :"JOB_CHAR"})
df = df[['REF_DATE','GEO','NOC','NOCdesc','JOB_CHAR','Statistics','VALUE']]

df.head()

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
2,2015-01-01,Canada,,"Total, all occupations","Type of work, all types",Average offered hourly wage,19.15
11,2015-01-01,Canada,,"Total, all occupations","Minimum level of education required, all levels",Average offered hourly wage,19.15
38,2015-01-01,Canada,,"Total, all occupations","Certification requirement, all types",Average offered hourly wage,19.15
47,2015-01-01,Canada,,"Total, all occupations","Minimum experience level sought, all levels",Average offered hourly wage,19.15
65,2015-01-01,Canada,,"Total, all occupations","Duration of job vacancy, all durations",Average offered hourly wage,19.15


In [88]:
#stripping trailing spaces
df["NOCdesc"] = df["NOCdesc"].str.rstrip()

In [89]:
df.isnull().sum()

REF_DATE          0
GEO               0
NOC           25599
NOCdesc           0
JOB_CHAR          0
Statistics        0
VALUE             0
dtype: int64

In [90]:
#checking NOC Descriptions of null NOCs
nullNoc = df[df['NOC'].isna()]
nullNoc['NOCdesc'].unique()

array(['Total, all occupations', 'Unclassified occupations'], dtype=object)

In [91]:
#replacing Null NOCs for Total,all occupations and Unclassified Occupations
temp = df['NOC'].isna()
df.loc[temp, 'NOC'] = np.where(df.loc[temp, 'NOCdesc'].eq('Total, all occupations'), '101', 'X')

In [92]:
#checking outcome
df[df['NOCdesc'] == 'Total, all occupations']
#df[df['NOCdesc'] == 'Unclassified occupations']

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
2,2015-01-01,Canada,101,"Total, all occupations","Type of work, all types",Average offered hourly wage,19.15
11,2015-01-01,Canada,101,"Total, all occupations","Minimum level of education required, all levels",Average offered hourly wage,19.15
38,2015-01-01,Canada,101,"Total, all occupations","Certification requirement, all types",Average offered hourly wage,19.15
47,2015-01-01,Canada,101,"Total, all occupations","Minimum experience level sought, all levels",Average offered hourly wage,19.15
65,2015-01-01,Canada,101,"Total, all occupations","Duration of job vacancy, all durations",Average offered hourly wage,19.15
...,...,...,...,...,...,...,...
27254446,2021-04-01,Nunavut,101,"Total, all occupations",Seasonal,Average offered hourly wage,32.95
27254447,2021-04-01,Nunavut,101,"Total, all occupations","Recruitment strategies, all types",Job vacancies,615.00
27254449,2021-04-01,Nunavut,101,"Total, all occupations","Personal contacts, referrals, informal networks",Job vacancies,445.00
27254455,2021-04-01,Nunavut,101,"Total, all occupations",Online job boards,Job vacancies,475.00



### Creating input table for Machine Learning, filtered on Broad NOC Code, Full/Part Time only

In [93]:
df_vac = df.copy()

# Filter job vacancies
df_vac = df_vac.loc[(df_vac['Statistics'] =='Job vacancies')]

# Filter Job vacancy characteristics
df_vac = df_vac.loc[(df_vac['JOB_CHAR'] == 'Full-time') | (df_vac['JOB_CHAR'] == 'Part-time') | (df_vac['JOB_CHAR'] == 'Type of work, all types')]

#drop column not required
df_vac = df_vac.drop(columns=['Statistics'])

df_vac

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,VALUE
408280,2015-01-01,Quebec,101,"Total, all occupations","Type of work, all types",60505.0
408283,2015-01-01,Quebec,101,"Total, all occupations",Full-time,45385.0
653248,2015-01-01,Saskatchewan,101,"Total, all occupations","Type of work, all types",14230.0
653251,2015-01-01,Saskatchewan,101,"Total, all occupations",Full-time,10375.0
1143184,2015-04-01,Canada,101,"Total, all occupations","Type of work, all types",451925.0
...,...,...,...,...,...,...
27263184,2021-04-01,Nunavut,1,"Business, finance and administration occupations",Full-time,70.0
27309524,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...","Type of work, all types",160.0
27309527,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...",Full-time,160.0
27314606,2021-04-01,Nunavut,73,Maintenance and equipment operation trades,"Type of work, all types",65.0


In [94]:
# Machine Learning only needs Broad NOC
inc_list = ['101','0','1', '2', '3', '4', '5', '6', '7', '8', '9']

#extract into new dataframe
df_ml = df_vac[df_vac.NOC.isin(inc_list)]

#adding null column to hold predicted vacancies
df_ml["Predicted_Vacancies"] = ''

df_ml

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,VALUE,Predicted_Vacancies
408280,2015-01-01,Quebec,101,"Total, all occupations","Type of work, all types",60505.0,
408283,2015-01-01,Quebec,101,"Total, all occupations",Full-time,45385.0,
653248,2015-01-01,Saskatchewan,101,"Total, all occupations","Type of work, all types",14230.0,
653251,2015-01-01,Saskatchewan,101,"Total, all occupations",Full-time,10375.0,
1143184,2015-04-01,Canada,101,"Total, all occupations","Type of work, all types",451925.0,
...,...,...,...,...,...,...,...
27254351,2021-04-01,Nunavut,101,"Total, all occupations",Full-time,515.0,
27263181,2021-04-01,Nunavut,1,"Business, finance and administration occupations","Type of work, all types",75.0,
27263184,2021-04-01,Nunavut,1,"Business, finance and administration occupations",Full-time,70.0,
27309524,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...","Type of work, all types",160.0,


In [96]:
#splitting date to year and month
df_ml[['Year','Quarter','Date']] = df_ml['REF_DATE'].str.split("-",expand=True)                  
#converting Month to Quarter
df_ml['Quarter'] = df_ml['Quarter'].replace(['01','04','07','10'],['1','2','3','4'])
#reordeing columns
df_ml["composition"]=df_ml["Year"] + df_ml["Quarter"] + df_ml["GEO"]+ df_ml["NOC"]+ df_ml["JOB_CHAR"]
df_ml

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,VALUE,Predicted_Vacancies,Year,Quarter,Date,composition
408280,2015-01-01,Quebec,101,"Total, all occupations","Type of work, all types",60505.0,,2015,1,01,"20151Quebec101Type of work, all types"
408283,2015-01-01,Quebec,101,"Total, all occupations",Full-time,45385.0,,2015,1,01,20151Quebec101Full-time
653248,2015-01-01,Saskatchewan,101,"Total, all occupations","Type of work, all types",14230.0,,2015,1,01,"20151Saskatchewan101Type of work, all types"
653251,2015-01-01,Saskatchewan,101,"Total, all occupations",Full-time,10375.0,,2015,1,01,20151Saskatchewan101Full-time
1143184,2015-04-01,Canada,101,"Total, all occupations","Type of work, all types",451925.0,,2015,2,01,"20152Canada101Type of work, all types"
...,...,...,...,...,...,...,...,...,...,...,...
27254351,2021-04-01,Nunavut,101,"Total, all occupations",Full-time,515.0,,2021,2,01,20212Nunavut101Full-time
27263181,2021-04-01,Nunavut,1,"Business, finance and administration occupations","Type of work, all types",75.0,,2021,2,01,"20212Nunavut1Type of work, all types"
27263184,2021-04-01,Nunavut,1,"Business, finance and administration occupations",Full-time,70.0,,2021,2,01,20212Nunavut1Full-time
27309524,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...","Type of work, all types",160.0,,2021,2,01,"20212Nunavut7Type of work, all types"


In [97]:
df_ml= pd.merge(df_ml, uuid_df, on= 'composition', how='left')
df_ml.fillna('')
df_ml.sample(n=3)

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,VALUE,Predicted_Vacancies,Year,Quarter,Date,composition,UUID,ref_date,year,quarter,geo,noc_code,job_details
3975,2018-04-01,Alberta,2,Natural and applied sciences and related occup...,Full-time,3160.0,,2018,2,1,20182Alberta2Full-time,15551773,2018-04-01,2018,2,Alberta,2,Full-time
5842,2019-10-01,Ontario,2,Natural and applied sciences and related occup...,Full-time,15365.0,,2019,4,1,20194Ontario2Full-time,22027988,2019-10-01,2019,4,Ontario,2,Full-time
2654,2017-04-01,Manitoba,5,"Occupations in art, culture, recreation and sport",Full-time,150.0,,2017,2,1,20172Manitoba5Full-time,10899191,2017-04-01,2017,2,Manitoba,5,Full-time


In [99]:
df_ml=df_ml[["UUID", "ref_date", "geo", "noc_code", "NOCdesc", "JOB_CHAR","VALUE","Predicted_Vacancies"]]
df_ml=df_ml.rename(columns={"NOCdesc":"noc_desc","VALUE":"total_vacancies"})
df_ml.fillna('')

Unnamed: 0,UUID,ref_date,geo,noc_code,noc_desc,JOB_CHAR,total_vacancies,Predicted_Vacancies
0,408280,2015-01-01,Quebec,101,"Total, all occupations","Type of work, all types",60505.0,
1,408283,2015-01-01,Quebec,101,"Total, all occupations",Full-time,45385.0,
2,653248,2015-01-01,Saskatchewan,101,"Total, all occupations","Type of work, all types",14230.0,
3,653251,2015-01-01,Saskatchewan,101,"Total, all occupations",Full-time,10375.0,
4,1143184,2015-04-01,Canada,101,"Total, all occupations","Type of work, all types",451925.0,
...,...,...,...,...,...,...,...,...
7225,27254351,2021-04-01,Nunavut,101,"Total, all occupations",Full-time,515.0,
7226,27263181,2021-04-01,Nunavut,1,"Business, finance and administration occupations","Type of work, all types",75.0,
7227,27263184,2021-04-01,Nunavut,1,"Business, finance and administration occupations",Full-time,70.0,
7228,27309524,2021-04-01,Nunavut,7,"Trades, transport and equipment operators and ...","Type of work, all types",160.0,


In [21]:
df_ml.to_csv('Resources/MachineLearningTable.csv', index = False)

In [100]:
engine = create_engine(f"postgresql://{config['user']}:{config['password']}@{config['host']}:5432/{config['dbname']}")
conn=engine.connect()
df_ml.to_sql(name="machinelearning", con=conn, index=False, if_exists="replace")
conn.close()

## Creating Vacancies Table data

In [13]:
#adding YEAR and Quarter for Vacancies table

#splitting date to year and month
df_vac[['Year','Quarter','Date']] = df_vac['REF_DATE'].str.split("-",expand=True)                  

#converting Month to Quarter
df_vac['Quarter'] = df_vac['Quarter'].replace(['01','04','07','10'],['1','2','3','4'])

#reordeing columns
df_vac = df_vac[['REF_DATE','Year', 'Quarter', 'GEO','NOC','JOB_CHAR','VALUE']]
df_vac

Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE
408280,2015-01-01,2015,1,Quebec,101,"Type of work, all types",60505.0
408283,2015-01-01,2015,1,Quebec,101,Full-time,45385.0
653248,2015-01-01,2015,1,Saskatchewan,101,"Type of work, all types",14230.0
653251,2015-01-01,2015,1,Saskatchewan,101,Full-time,10375.0
1143184,2015-04-01,2015,2,Canada,101,"Type of work, all types",451925.0
...,...,...,...,...,...,...,...
27263184,2021-04-01,2021,2,Nunavut,1,Full-time,70.0
27309524,2021-04-01,2021,2,Nunavut,7,"Type of work, all types",160.0
27309527,2021-04-01,2021,2,Nunavut,7,Full-time,160.0
27314606,2021-04-01,2021,2,Nunavut,73,"Type of work, all types",65.0


In [54]:
df_vac["composition"]=df_vac["Year"] + df_vac["Quarter"] + df_vac["GEO"]+ df_vac["NOC"]+ df_vac["JOB_CHAR"]
df_vac

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE,composition
408280,2015-01-01,2015,1,Quebec,101,"Type of work, all types",60505.0,"20151Quebec101Type of work, all types"
408283,2015-01-01,2015,1,Quebec,101,Full-time,45385.0,20151Quebec101Full-time
653248,2015-01-01,2015,1,Saskatchewan,101,"Type of work, all types",14230.0,"20151Saskatchewan101Type of work, all types"
653251,2015-01-01,2015,1,Saskatchewan,101,Full-time,10375.0,20151Saskatchewan101Full-time
1143184,2015-04-01,2015,2,Canada,101,"Type of work, all types",451925.0,"20152Canada101Type of work, all types"
...,...,...,...,...,...,...,...,...
27263184,2021-04-01,2021,2,Nunavut,1,Full-time,70.0,20212Nunavut1Full-time
27309524,2021-04-01,2021,2,Nunavut,7,"Type of work, all types",160.0,"20212Nunavut7Type of work, all types"
27309527,2021-04-01,2021,2,Nunavut,7,Full-time,160.0,20212Nunavut7Full-time
27314606,2021-04-01,2021,2,Nunavut,73,"Type of work, all types",65.0,"20212Nunavut73Type of work, all types"


In [69]:
df_vacan= pd.merge(df_vac, uuid_df, on= 'composition', how='right')
df_vacan.fillna('')
df_vacan.sample(n=3)

Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE,composition,UUID,ref_date,year,quarter,geo,noc_code,job_details
3284,,,,,,,,20152Canada9416Full-time,1217765,2015-04-01,2015,2,Canada,9416,Full-time
153119,2019-10-01,2019.0,4.0,Ontario,632.0,Full-time,2850.0,20194Ontario632Full-time,22017139,2019-10-01,2019,4,Ontario,632,Full-time
146834,2019-07-01,2019.0,3.0,Saskatchewan,623.0,"Type of work, all types",75.0,"20193Saskatchewan623Type of work, all types",21097046,2019-07-01,2019,3,Saskatchewan,623,"Type of work, all types"


In [75]:
df_vacan=df_vacan[["UUID", "ref_date", "year", "quarter", "geo", "noc_code", "job_details","VALUE"]]
df_vacan.fillna('')

Unnamed: 0,UUID,ref_date,year,quarter,geo,noc_code,job_details,VALUE
0,2,2015-01-01,2015,1,Canada,101,"Type of work, all types",
1,8616,2015-01-01,2015,1,Canada,1,"Type of work, all types",
2,45432,2015-01-01,2015,1,Canada,63,"Type of work, all types",
3,48500,2015-01-01,2015,1,Canada,65,"Type of work, all types",
4,52276,2015-01-01,2015,1,Canada,67,"Type of work, all types",
...,...,...,...,...,...,...,...,...
189856,27316293,2025-10-01,2025,4,Saskatchewan,6,Full-time,
189857,27316294,2025-10-01,2025,4,Saskatchewan,6,Part-time,
189858,27316592,2025-10-01,2025,4,Saskatchewan,7,Full-time,
189859,27314997,2025-10-01,2025,4,Yukon,1,Full-time,


In [76]:
#Vacancies table output , index_label="UUID"
# df_vacan.to_csv('Resources/AllVacanciesUUID.csv', index = False)

In [81]:
engine2 = create_engine(f"postgresql://{config['user']}:{config['password']}@{config['host']}:5432/{config['dbname']}")
conn=engine2.connect()
df_vacan.to_sql(name="vacancies", con=conn, index=False, if_exists="replace")
conn.close()

## NOC Table

In [17]:
## creating NOC dataframe
NOC = df[['NOC','NOCdesc']]

#dropping duplicate rows
NOC = NOC.drop_duplicates()

NOC

Unnamed: 0,NOC,NOCdesc
2,101,"Total, all occupations"
8616,1,"Business, finance and administration occupations"
44134,6,Sales and service occupations
45432,63,Service supervisors and specialized service oc...
48500,65,Service representatives and other customer and...
...,...,...
14866806,7234,Boilermakers
15972060,5135,Actors and comedians
15999422,826,Fishing vessel masters and fishermen/women
22720601,844,Other workers in fishing and trapping and hunt...


In [None]:
engine3 = create_engine(f"postgresql://{config['user']}:{config['password']}@{config['host']}:5432/{config['dbname']}")
conn=engine3.connect()
df_ml.to_sql(name="noc", con=conn, index=False, if_exists="replace")
conn.close()

In [18]:
#exporting to csv
# NOC.to_csv('Resources/NOCtable.csv', index=False)

## Creating Data for Average Wage Table

In [35]:
avg_wage = df.copy()

In [36]:
# Filter job vacancies
avg_wage = avg_wage.loc[(avg_wage['Statistics'] =='Average offered hourly wage')]

# Filter number of Job vacancies
avg_wage = avg_wage.loc[(avg_wage['JOB_CHAR'] == 'Full-time') | (avg_wage['JOB_CHAR'] == 'Part-time') | (avg_wage['JOB_CHAR'] == 'Type of work, all types')]

avg_wage

Unnamed: 0,REF_DATE,GEO,NOC,NOCdesc,JOB_CHAR,Statistics,VALUE
2,2015-01-01,Canada,101,"Total, all occupations","Type of work, all types",Average offered hourly wage,19.15
8616,2015-01-01,Canada,1,"Business, finance and administration occupations","Type of work, all types",Average offered hourly wage,21.35
44134,2015-01-01,Canada,6,Sales and service occupations,"Type of work, all types",Average offered hourly wage,13.00
45432,2015-01-01,Canada,63,Service supervisors and specialized service oc...,"Type of work, all types",Average offered hourly wage,13.75
48500,2015-01-01,Canada,65,Service representatives and other customer and...,"Type of work, all types",Average offered hourly wage,12.40
...,...,...,...,...,...,...,...
27315458,2021-04-01,Nunavut,731,Machinery and transportation equipment mechani...,Full-time,Average offered hourly wage,37.30
27315576,2021-04-01,Nunavut,7311,Construction millwrights and industrial mechanics,"Type of work, all types",Average offered hourly wage,34.70
27315579,2021-04-01,Nunavut,7311,Construction millwrights and industrial mechanics,Full-time,Average offered hourly wage,34.70
27319811,2021-04-01,Nunavut,75,Transport and heavy equipment operation and re...,"Type of work, all types",Average offered hourly wage,22.45


In [37]:
#splitting date to year and month
avg_wage[['Year','Quarter','Date']] = avg_wage['REF_DATE'].str.split("-",expand=True)                  

#converting Month to Quarter
avg_wage['Quarter'] = avg_wage['Quarter'].replace(['01','04','07','10'],['1','2','3','4'])

#drop column not required
avg_wage = avg_wage.drop(columns=['Date'])

#reordeing columns
avg_wage = avg_wage[['REF_DATE','Year', 'Quarter', 'GEO','NOC','JOB_CHAR','VALUE']]

avg_wage

Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE
2,2015-01-01,2015,1,Canada,101,"Type of work, all types",19.15
8616,2015-01-01,2015,1,Canada,1,"Type of work, all types",21.35
44134,2015-01-01,2015,1,Canada,6,"Type of work, all types",13.00
45432,2015-01-01,2015,1,Canada,63,"Type of work, all types",13.75
48500,2015-01-01,2015,1,Canada,65,"Type of work, all types",12.40
...,...,...,...,...,...,...,...
27315458,2021-04-01,2021,2,Nunavut,731,Full-time,37.30
27315576,2021-04-01,2021,2,Nunavut,7311,"Type of work, all types",34.70
27315579,2021-04-01,2021,2,Nunavut,7311,Full-time,34.70
27319811,2021-04-01,2021,2,Nunavut,75,"Type of work, all types",22.45


In [38]:
avg_wage.drop_duplicates(keep='first', ignore_index=False, inplace=True)
avg_wage

Unnamed: 0,REF_DATE,Year,Quarter,GEO,NOC,JOB_CHAR,VALUE
2,2015-01-01,2015,1,Canada,101,"Type of work, all types",19.15
8616,2015-01-01,2015,1,Canada,1,"Type of work, all types",21.35
44134,2015-01-01,2015,1,Canada,6,"Type of work, all types",13.00
45432,2015-01-01,2015,1,Canada,63,"Type of work, all types",13.75
48500,2015-01-01,2015,1,Canada,65,"Type of work, all types",12.40
...,...,...,...,...,...,...,...
27315458,2021-04-01,2021,2,Nunavut,731,Full-time,37.30
27315576,2021-04-01,2021,2,Nunavut,7311,"Type of work, all types",34.70
27315579,2021-04-01,2021,2,Nunavut,7311,Full-time,34.70
27319811,2021-04-01,2021,2,Nunavut,75,"Type of work, all types",22.45


In [None]:
avg_wage["composition"]=avg_wage["Year"] + avg_wage["Quarter"] + avg_wage["GEO"]+ avg_wage["NOC"]+ avg_wage["JOB_CHAR"]
avg_wage

In [None]:
avg_wage= pd.merge(avg_wage, uuid_df, on= 'composition', how='right')
avg_wage.fillna('')
avg_wage.sample(n=3)

In [None]:
avg_wage=avg_wage[["UUID", "ref_date", "year", "quarter", "geo", "noc_code", "job_details","VALUE"]]
avg_wage.fillna('')

In [39]:
#exporting to csv
avg_wage.to_csv('Resources/AvgWagetable.csv', index = False)

In [None]:
engine4 = create_engine(f"postgresql://{config['user']}:{config['password']}@{config['host']}:5432/{config['dbname']}")
conn=engine4.connect()
df_ml.to_sql(name="noc", con=conn, index=False, if_exists="replace")
conn.close()