In [1]:
import urllib.request
import ssl
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


plt.style.use('ggplot')
sns.set_theme(style="whitegrid")

In [2]:
url = (
    'https://open.canada.ca/data/en/api/3/action/datastore_search?'
    'resource_id=d16e10ea-77bf-4db8-bdb5-adc709e6cada&limit=100000'
)

context = ssl._create_unverified_context()
response = urllib.request.urlopen(url, context=context)
data = json.loads(response.read())

# Extract column headers from metadata
fields = data['result']['fields']
column_order = [field['id'] for field in fields]

# Extract records and reformat DataFrame
records = data['result']['records']
df = pd.DataFrame(records)

# Reorder columns if needed
df = df[column_order]

# View first few rows
pd.set_option('display.max_columns', None) 


In [3]:
df

Unnamed: 0,_id,NOC_CNP,NOC_Title_eng,NOC_Title_fra,prov,ER_Code_Code_RE,ER_Name,Nom_RE,Low_Wage_Salaire_Minium,Median_Wage_Salaire_Median,High_Wage_Salaire_Maximal,Average_Wage_Salaire_Moyen,Quartile1_Wage_Salaire_Quartile1,Quartile3_Wage_Salaire_Quartile3,Data_Source_E,Data_Source_F,Reference_Period,Revision_Date_Date_revision,Annual_Wage_Flag_Salaire_annuel,Wage_Comment_E,Wage_Comment_F,Non_WageBen_pct
0,1,NOC_00010,Legislators,Membres des corps législatifs,NAT,ER00,Canada,Canada,32360,84000,184000,97600,54400,132000,2021 Census,Recensement 2021,2021,2024-12-03,1,Wages for this occupation are presented at an ...,Pour cette profession sont présentés au taux a...,62.8
1,2,NOC_00010,Legislators,Membres des corps législatifs,NL,ER10,Newfoundland and Labrador,Terre-Neuve-et-Labrador,,99000,,90000,,,2021 Census,Recensement 2021,2021,2024-12-03,1,Wages for this occupation are presented at an ...,Pour cette profession sont présentés au taux a...,
2,3,NOC_00010,Legislators,Membres des corps législatifs,NL,ER1010,Avalon Peninsula,Avalon Peninsula,,,,,,,,,,2024-12-03,0,"Due to data limitations, the wage for this occ...","En raison des limites associées aux données, l...",
3,4,NOC_00010,Legislators,Membres des corps législatifs,NL,ER1020,South Coast--Burin Peninsula,Côte-sud--Burin Peninsula,,,,,,,,,,2024-12-03,0,"Due to data limitations, the wage for this occ...","En raison des limites associées aux données, l...",
4,5,NOC_00010,Legislators,Membres des corps législatifs,NL,ER1030,West Coast--Northern Peninsula--Labrador,Côte-ouest--Northern Peninsula--Labrador,,,,,,,,,,2024-12-03,0,"Due to data limitations, the wage for this occ...","En raison des limites associées aux données, l...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31995,31996,NOC_72422,Electrical mechanics,Électromécaniciens/électromécaniciennes,NL,ER1020,South Coast--Burin Peninsula,Côte-sud--Burin Peninsula,,,,,,,,,,2024-12-03,0,"Due to data limitations, the wage for this occ...","En raison des limites associées aux données, l...",
31996,31997,NOC_72422,Electrical mechanics,Électromécaniciens/électromécaniciennes,NL,ER1030,West Coast--Northern Peninsula--Labrador,Côte-ouest--Northern Peninsula--Labrador,,,,,,,,,,2024-12-03,0,"Due to data limitations, the wage for this occ...","En raison des limites associées aux données, l...",
31997,31998,NOC_72422,Electrical mechanics,Électromécaniciens/électromécaniciennes,NL,ER1040,Notre Dame--Central Bonavista Bay,Notre Dame--Central Bonavista Bay,,,,,,,,,,2024-12-03,0,"Due to data limitations, the wage for this occ...","En raison des limites associées aux données, l...",
31998,31999,NOC_72422,Electrical mechanics,Électromécaniciens/électromécaniciennes,PEI,ER1110,Prince Edward Island,Île-du-Prince-Édouard,,,,,,,,,,2024-12-03,0,"Due to data limitations, the wage for this occ...","En raison des limites associées aux données, l...",


In [4]:
columns_to_drop = [
    'Quartile1_Wage_Salaire_Quartile1',
    'Quartile3_Wage_Salaire_Quartile3',
    'Data_Source_E',
    'Data_Source_F',
    'Reference_Period',
    'Revision_Date_Date_revision',
    'Annual_Wage_Flag_Salaire_annuel',
    'Wage_Comment_E',
    'Wage_Comment_F',
    'Non_WageBen_pct',
    'ER_Code_Code_RE',
    'NOC_Title_fra',
    'Nom_RE'
]
df.drop(columns=columns_to_drop, inplace=True)

In [5]:
df.drop(columns='NOC_CNP', inplace=True)

In [6]:
df

Unnamed: 0,_id,NOC_Title_eng,prov,ER_Name,Low_Wage_Salaire_Minium,Median_Wage_Salaire_Median,High_Wage_Salaire_Maximal,Average_Wage_Salaire_Moyen
0,1,Legislators,NAT,Canada,32360,84000,184000,97600
1,2,Legislators,NL,Newfoundland and Labrador,,99000,,90000
2,3,Legislators,NL,Avalon Peninsula,,,,
3,4,Legislators,NL,South Coast--Burin Peninsula,,,,
4,5,Legislators,NL,West Coast--Northern Peninsula--Labrador,,,,
...,...,...,...,...,...,...,...,...
31995,31996,Electrical mechanics,NL,South Coast--Burin Peninsula,,,,
31996,31997,Electrical mechanics,NL,West Coast--Northern Peninsula--Labrador,,,,
31997,31998,Electrical mechanics,NL,Notre Dame--Central Bonavista Bay,,,,
31998,31999,Electrical mechanics,PEI,Prince Edward Island,,,,


In [7]:
has_missing =  df.notna()
has_missing

Unnamed: 0,_id,NOC_Title_eng,prov,ER_Name,Low_Wage_Salaire_Minium,Median_Wage_Salaire_Median,High_Wage_Salaire_Maximal,Average_Wage_Salaire_Moyen
0,True,True,True,True,True,True,True,True
1,True,True,True,True,False,True,False,True
2,True,True,True,True,False,False,False,False
3,True,True,True,True,False,False,False,False
4,True,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...
31995,True,True,True,True,False,False,False,False
31996,True,True,True,True,False,False,False,False
31997,True,True,True,True,False,False,False,False
31998,True,True,True,True,False,False,False,False


In [9]:
df_clean = df.dropna()

In [10]:
df_clean

Unnamed: 0,_id,NOC_Title_eng,prov,ER_Name,Low_Wage_Salaire_Minium,Median_Wage_Salaire_Median,High_Wage_Salaire_Maximal,Average_Wage_Salaire_Moyen
0,1,Legislators,NAT,Canada,32360,84000,184000,97600
7,8,Legislators,NS,Nova Scotia,32360,94000,158000,94000
13,14,Legislators,NB,New Brunswick,34800,68000,150000,78000
19,20,Legislators,QC,Quebec,41200,96000,184000,102800
26,27,Legislators,QC,Montérégie,38000,97000,200000,112000
...,...,...,...,...,...,...,...,...
31977,31978,Appliance servicers and repairers,AB,Edmonton,19,25,35.35,26.29
31980,31981,Appliance servicers and repairers,BC,British Columbia,18.2,26,37.3,27.24
31981,31982,Appliance servicers and repairers,BC,Vancouver Island and Coast,18.2,26,37.3,27.24
31982,31983,Appliance servicers and repairers,BC,Lower Mainland,18,26.25,36.5,27.1


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   _id                         32000 non-null  int64 
 1   NOC_Title_eng               32000 non-null  object
 2   prov                        32000 non-null  object
 3   ER_Name                     32000 non-null  object
 4   Low_Wage_Salaire_Minium     15035 non-null  object
 5   Median_Wage_Salaire_Median  15878 non-null  object
 6   High_Wage_Salaire_Maximal   15039 non-null  object
 7   Average_Wage_Salaire_Moyen  14743 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.0+ MB
