In [1]:
# Widen the notebook container so wide DataFrame previews are readable.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML('<style>.container { width:90% !important; }</style>'))

# BLS Employment Projection 2016

* In this notebook, we combine three data sources from the BLS (Bureau of Labor Statistics) Employment Projections:
    1. Table 1.2 Employment by detailed occupation, 2016 and projected 2026
    2. Table 1.7 Occupational projections, 2016–26, and worker characteristics, 2016
    3. Table 1.10 Occupational separations and openings, projected 2016–26
---


* The aim is to build one complete dataset that combines the related parts of the BLS Employment Projections.
* The (updated 2019) tables can be obtained from the [link](https://www.bls.gov/emp/data/occupational-data.htm).


    
    

In [2]:
import pandas as pd
import numpy as np

## Employment by Detailed Occupation

* Read the Employment by Detailed Occupation
* The BLS detailed occupations are the main source of O*NET structure. For this reason it is important to have the employment projection for those titles.

In [3]:
# Load sheet 'Table 1.2' of the BLS workbook. No header options are passed, so
# the sheet's title and multi-row header arrive as ordinary data rows.
df_employment = pd.read_excel('csv_files/occupation.xlsx', sheet_name='Table 1.2')
df_employment.head()

Unnamed: 0,"Table 1.2 Employment by detailed occupation, 2016 and projected 2026\n(Numbers in thousands)",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,2016 National Employment Matrix title and code,,Occupation type,Employment,,,,"Change, 2016-26",,"Occupational openings, 2016-26 annual average"
1,,,,Number,,Percent distribution,,,,
2,,,,2016,2026.0,2016,2026.0,Number,Percent,
3,"Total, all occupations",00-0000,Summary,156064,167582.3,100,100.0,11518.6,7.4,18742
4,Management occupations,11-0000,Summary,9533.1,10340.4,6.1,6.2,807.3,8.5,841.5


* `bls_title`: BLS Detailed Occupational Title
* `onetsoccode`: BLS Detailed Occupational Code
* `type`: Occupation Type
* `emp_number_2016`: Employment Number in 2016
* `emp_number_2026`: Projected Employment Number in 2026
* `per_distr_2016`: Employment Percent Distribution (ratio to all employment) in 2016
* `per_distr_2026`: Projected Employment Percent Distribution in 2026
* `chng_number`: Change in Employment between 2016-2026 in number
* `chng_perc`: Change in Employment between 2016-2026 in percentage
* `open_average`: Annual Average Occupational Openings 2016-2026

In [4]:
# Replace the auto-generated "Unnamed: N" headers with descriptive names,
# listed in the sheet's left-to-right column order.
columns = [
    'bls_title', 'onetsoccode', 'type',
    'emp_number_2016', 'emp_number_2026',
    'per_distr_2016', 'per_distr_2026',
    'chng_number', 'chng_perc',
    'open_average',
]
df_employment.columns = columns
df_employment.head(n=7)

Unnamed: 0,bls_title,onetsoccode,type,emp_number_2016,emp_number_2026,per_distr_2016,per_distr_2026,chng_number,chng_perc,open_average
0,2016 National Employment Matrix title and code,,Occupation type,Employment,,,,"Change, 2016-26",,"Occupational openings, 2016-26 annual average"
1,,,,Number,,Percent distribution,,,,
2,,,,2016,2026.0,2016,2026.0,Number,Percent,
3,"Total, all occupations",00-0000,Summary,156064,167582.3,100,100.0,11518.6,7.4,18742
4,Management occupations,11-0000,Summary,9533.1,10340.4,6.1,6.2,807.3,8.5,841.5
5,Top executives,11-1000,Summary,2627.5,2824.5,1.7,1.7,197,7.5,235
6,Chief executives,11-1011,Line item,308.9,296.8,0.2,0.2,-12.1,-3.9,20


* We only need the single (detailed) occupational titles, so we drop the first 4 rows (the header rows and the overall total)
* Moreover, drop all summary `type` occupations
* Finally, drop `type`

In [5]:
# Remove the leading non-data rows and every aggregate ('Summary') row, then
# discard the now-constant `type` column.
# Built as one non-mutating chain: dropping rows in place on an `iloc` slice
# triggers pandas' SettingWithCopyWarning.
# NOTE: this rebinds `df_employment`, so re-running the cell without re-running
# the load/rename cells above would strip four more rows.
df_employment = (
    df_employment
    .iloc[4:]  # rows 0-2 are sheet headers, row 3 is "Total, all occupations"
    .loc[lambda frame: frame['type'] != 'Summary']
    .drop(columns='type')
    .reset_index(drop=True)
)
df_employment.head(2)

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,per_distr_2016,per_distr_2026,chng_number,chng_perc,open_average
0,Chief executives,11-1011,308.9,296.8,0.2,0.2,-12.1,-3.9,20.0
1,General and operations managers,11-1021,2263.1,2468.3,1.5,1.5,205.2,9.1,210.7


In [6]:
# Check non-null counts and dtypes after filtering.
df_employment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bls_title        820 non-null    object 
 1   onetsoccode      819 non-null    object 
 2   emp_number_2016  819 non-null    object 
 3   emp_number_2026  819 non-null    float64
 4   per_distr_2016   819 non-null    object 
 5   per_distr_2026   819 non-null    float64
 6   chng_number      819 non-null    object 
 7   chng_perc        819 non-null    object 
 8   open_average     819 non-null    object 
dtypes: float64(2), object(7)
memory usage: 57.8+ KB


* One row has a missing value in `onetsoccode` and in all the features that follow it.
* Let's examine and drop it if necessary

In [7]:
# Show the rows that have no occupation code.
df_employment[df_employment.onetsoccode.isnull()]

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,per_distr_2016,per_distr_2026,chng_number,chng_perc,open_average
819,"Source: Employment Projections program, U.S. B...",,,,,,,,


In [8]:
# Drop only the rows that lack an occupation code (the trailing "Source: ..."
# footnote). Restricting `dropna` to `onetsoccode` means an occupation with a
# missing value in some other column is not silently removed.
df_employment = df_employment.dropna(subset=['onetsoccode'])
df_employment.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 819 entries, 0 to 818
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bls_title        819 non-null    object 
 1   onetsoccode      819 non-null    object 
 2   emp_number_2016  819 non-null    object 
 3   emp_number_2026  819 non-null    float64
 4   per_distr_2016   819 non-null    object 
 5   per_distr_2026   819 non-null    float64
 6   chng_number      819 non-null    object 
 7   chng_perc        819 non-null    object 
 8   open_average     819 non-null    object 
dtypes: float64(2), object(7)
memory usage: 64.0+ KB


* The missing value is not a detailed occupational title. For this reason, it is dropped from the data.
* The data set is now clear with no missing values.
* However, some features need type casting because their data types do not match the nature of the data.
* For example, `chng_number` is supposed to be `float64` however it has `object` datatype.

In [9]:
# Numeric columns that were loaded with `object` dtype; convert them all to
# float in one assignment.
float_columns = [
    'emp_number_2016',
    'per_distr_2016',
    'chng_number',
    'chng_perc',
    'open_average',
]
df_employment[float_columns] = df_employment[float_columns].astype(float)

In [10]:
# Confirm that every numeric column is now float64.
df_employment.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 819 entries, 0 to 818
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bls_title        819 non-null    object 
 1   onetsoccode      819 non-null    object 
 2   emp_number_2016  819 non-null    float64
 3   emp_number_2026  819 non-null    float64
 4   per_distr_2016   819 non-null    float64
 5   per_distr_2026   819 non-null    float64
 6   chng_number      819 non-null    float64
 7   chng_perc        819 non-null    float64
 8   open_average     819 non-null    float64
dtypes: float64(7), object(2)
memory usage: 64.0+ KB


* The data is ready, we can move to the next part.

## Occupational Projections 2016 - 2026

* `bls_title`: BLS Detailed Occupational Title
* `onetsoccode`: BLS Detailed Occupational Code
* `type`: Occupation Type
* `emp_number_2016`: Employment in 2016
* `emp_number_2026`: Employment in 2026
* `chng_number`: Change in Employment between 2016 and 2026 in number
* `chng_perc`: Change in Employment between 2016 and 2026 in percentage
* `self_employed`: Percent Self Employed in 2016
* `open_average`: Annual Average Occupational Openings 2016-2026
* `median_wage`: Median Annual Wage in 2017 (the sheet header reads "Median annual wage, 2017")
* `education`: Typical Education Needed for Entry
* `experience`: Work Experience in a Related Occupation
* `training`: Typical on-the-job Training Needed to Attain Competency in the Occupation

In [11]:
# Descriptive names for sheet 'Table 1.7', in the sheet's left-to-right order.
columns = [
    'bls_title', 'onetsoccode', 'type',
    'emp_number_2016', 'emp_number_2026',
    'chng_number', 'chng_perc',
    'self_employed', 'open_average', 'median_wage',
    'education', 'experience', 'training',
]
# The sheet is read without header handling, so its header rows come in as data.
df_projection = pd.read_excel('csv_files/occupation.xlsx', sheet_name='Table 1.7')
df_projection.columns = columns
df_projection.head()

Unnamed: 0,bls_title,onetsoccode,type,emp_number_2016,emp_number_2026,chng_number,chng_perc,self_employed,open_average,median_wage,education,experience,training
0,2016 National Employment Matrix title and code,,Occupation\ntype,Employment,,"Employment Change, 2016-26",,"Percent self employed, 2016","Occupational openings, 2016-26 annual average","Median annual wage, 2017(1)",Typical education needed for entry,Work experience in a related occupation,Typical on-the-job training needed to attain c...
1,,,,2016,2026.0,Number,Percent,,,,,,
2,"Total, all occupations",00-0000,Summary,156064,167582.3,11518.6,7.4,6.1,18742,37690,—,—,—
3,Management occupations,11-0000,Summary,9533.1,10340.4,807.3,8.5,19.8,841.5,102590,—,—,—
4,Top executives,11-1000,Summary,2627.5,2824.5,197,7.5,3.2,235,103120,—,—,—


* As in __Employment by Detailed Occupation__, we focus on single occupational titles
* Moreover, drop all summary `type` occupations
* Finally, drop `type`

In [12]:
# Remove the leading non-data rows and every aggregate ('Summary') row, then
# discard the now-constant `type` column.
# Built as one non-mutating chain: dropping rows in place on an `iloc` slice
# triggers pandas' SettingWithCopyWarning.
# NOTE: this rebinds `df_projection`, so re-running the cell without re-running
# the load cell above would strip three more rows.
df_projection = (
    df_projection
    .iloc[3:]  # rows 0-1 are sheet headers, row 2 is "Total, all occupations"
    .loc[lambda frame: frame['type'] != 'Summary']
    .drop(columns='type')
    .reset_index(drop=True)
)
df_projection.head(2)

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,chng_number,chng_perc,self_employed,open_average,median_wage,education,experience,training
0,Chief executives,11-1011,308.9,296.8,-12.1,-3.9,22.8,20.0,183270,Bachelor's degree,5 years or more,
1,General and operations managers,11-1021,2263.1,2468.3,205.2,9.1,0.6,210.7,100410,Bachelor's degree,5 years or more,


In [13]:
# Check non-null counts and dtypes after filtering.
df_projection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823 entries, 0 to 822
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bls_title        823 non-null    object 
 1   onetsoccode      819 non-null    object 
 2   emp_number_2016  819 non-null    object 
 3   emp_number_2026  819 non-null    float64
 4   chng_number      819 non-null    object 
 5   chng_perc        819 non-null    object 
 6   self_employed    819 non-null    object 
 7   open_average     819 non-null    object 
 8   median_wage      819 non-null    object 
 9   education        819 non-null    object 
 10  experience       819 non-null    object 
 11  training         819 non-null    object 
dtypes: float64(1), object(11)
memory usage: 77.3+ KB


* As in the above data, there are missing values and wrong data types

In [14]:
# Show the rows that have no occupation code.
df_projection[df_projection.onetsoccode.isnull()]

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,chng_number,chng_perc,self_employed,open_average,median_wage,education,experience,training
819,Footnotes:,,,,,,,,,,,
820,(1) Data are from the Occupational Employment ...,,,,,,,,,,,
821,Note: Data is unavailable for values denoted w...,,,,,,,,,,,
822,"Source: Employment Projections program, U.S. B...",,,,,,,,,,,


In [15]:
# Drop only the rows that lack an occupation code (the trailing footnote/source
# lines). Restricting `dropna` to `onetsoccode` keeps real occupations even if
# another column holds a missing value -- e.g. newer pandas versions may parse
# the literal text "None" in `experience`/`training` as NaN, and an unrestricted
# `dropna` would then discard those occupations.
df_projection = df_projection.dropna(subset=['onetsoccode'])
df_projection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 819 entries, 0 to 818
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bls_title        819 non-null    object 
 1   onetsoccode      819 non-null    object 
 2   emp_number_2016  819 non-null    object 
 3   emp_number_2026  819 non-null    float64
 4   chng_number      819 non-null    object 
 5   chng_perc        819 non-null    object 
 6   self_employed    819 non-null    object 
 7   open_average     819 non-null    object 
 8   median_wage      819 non-null    object 
 9   education        819 non-null    object 
 10  experience       819 non-null    object 
 11  training         819 non-null    object 
dtypes: float64(1), object(11)
memory usage: 83.2+ KB


* The missing values are footnotes. They are dropped from the data.
* The data set is now clear with no missing values.
* However, some features need type casting because their data types do not match the nature of the data.
* For example, `chng_number` is supposed to be `float64` however it has `object` datatype.
* There are some non-numeric values (`—`) representing missing values. They need to be replaced with `np.nan`.

In [16]:
# List the column names on one line for reference when choosing which to cast.
', '.join(df_projection.columns) # all columns

'bls_title, onetsoccode, emp_number_2016, emp_number_2026, chng_number, chng_perc, self_employed, open_average, median_wage, education, experience, training'

In [17]:
# The sheet uses an em dash as its "unavailable" marker; turn it into a real
# missing value so the numeric columns can be cast.
df_projection = df_projection.replace('—', np.nan)

# The wage column holds the text '>=$208,000' for some rows; store it as the
# numeric string '208000' so the float cast below succeeds.
is_top_coded_wage = df_projection['median_wage'] == '>=$208,000'
df_projection['median_wage'] = np.where(is_top_coded_wage, '208000',
                                        df_projection['median_wage'])

numeric_columns = [
    'emp_number_2016',
    'chng_number',
    'chng_perc',
    'self_employed',
    'open_average',
    'median_wage',
]
df_projection[numeric_columns] = df_projection[numeric_columns].astype(float)
df_projection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 819 entries, 0 to 818
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bls_title        819 non-null    object 
 1   onetsoccode      819 non-null    object 
 2   emp_number_2016  819 non-null    float64
 3   emp_number_2026  819 non-null    float64
 4   chng_number      819 non-null    float64
 5   chng_perc        819 non-null    float64
 6   self_employed    614 non-null    float64
 7   open_average     819 non-null    float64
 8   median_wage      794 non-null    float64
 9   education        819 non-null    object 
 10  experience       819 non-null    object 
 11  training         819 non-null    object 
dtypes: float64(7), object(5)
memory usage: 83.2+ KB


In [18]:
# Preview the cleaned projection table.
df_projection.head()

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,chng_number,chng_perc,self_employed,open_average,median_wage,education,experience,training
0,Chief executives,11-1011,308.9,296.8,-12.1,-3.9,22.8,20.0,183270.0,Bachelor's degree,5 years or more,
1,General and operations managers,11-1021,2263.1,2468.3,205.2,9.1,0.6,210.7,100410.0,Bachelor's degree,5 years or more,
2,Legislators,11-1031,55.5,59.4,3.9,7.1,,4.4,25630.0,Bachelor's degree,Less than 5 years,
3,Advertising and promotions managers,11-2011,31.3,33.0,1.7,5.5,5.2,3.4,106130.0,Bachelor's degree,Less than 5 years,
4,Marketing managers,11-2021,218.3,240.4,22.1,10.1,3.5,21.3,132230.0,Bachelor's degree,5 years or more,


__Encode Categorical Features__

* Because `education`, `experience`, and `training` are ordinal features, they need to be encoded manually.

In [19]:
# Labels in alphabetical order, paired position-by-position with their codes.
# NOTE(review): the pairing is purely positional, so it is only correct while
# the column contains exactly these eight labels -- re-check if the data changes.
# NOTE(review): this ranks "Postsecondary nondegree award" (6) above
# "Bachelor's degree" (5) and "Some college, no degree" (4) above
# "Associate's degree" (3) -- confirm that ordering is intended.
education_unique = sorted(df_projection.education.unique())
education_numeric = [3, 5, 8, 2, 7, 1, 6, 4]

print('Encoding education feature:')
print('-' * 79)
for label, code in zip(education_unique, education_numeric):
    print(f'{label}--> {code}')

Encoding education feature:
-------------------------------------------------------------------------------
Associate's degree--> 3
Bachelor's degree--> 5
Doctoral or professional degree--> 8
High school diploma or equivalent--> 2
Master's degree--> 7
No formal educational credential--> 1
Postsecondary nondegree award--> 6
Some college, no degree--> 4


In [20]:
# Apply the label -> code mapping built in the previous cell.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df_projection.education`: that form mutates
# an intermediate Series and leaves the DataFrame unchanged under pandas
# Copy-on-Write. The explicit cast pins an integer dtype whether or not
# `replace` downcasts, and raises if any label was left unmapped.
df_projection['education'] = (
    df_projection['education']
    .replace(to_replace=education_unique, value=education_numeric)
    .astype('int64')
)

In [21]:
# Labels in alphabetical order, paired position-by-position with their codes.
# NOTE(review): the pairing is purely positional, so it is only correct while
# the column contains exactly these three labels.
experience_unique = sorted(df_projection.experience.unique())
experience_numeric = [3, 2, 1]

print('Encoding experience feature:')
print('-' * 79)
for label, code in zip(experience_unique, experience_numeric):
    print(f'{label}--> {code}')

Encoding experience feature:
-------------------------------------------------------------------------------
5 years or more--> 3
Less than 5 years--> 2
None--> 1


In [22]:
# Apply the label -> code mapping built in the previous cell.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df_projection.experience`: that form mutates
# an intermediate Series and leaves the DataFrame unchanged under pandas
# Copy-on-Write. The explicit cast pins an integer dtype whether or not
# `replace` downcasts, and raises if any label was left unmapped.
df_projection['experience'] = (
    df_projection['experience']
    .replace(to_replace=experience_unique, value=experience_numeric)
    .astype('int64')
)

In [23]:
# Labels in alphabetical order, paired position-by-position with their codes.
# NOTE(review): the pairing is purely positional, so it is only correct while
# the column contains exactly these six labels.
training_unique = sorted(df_projection.training.unique())
training_numeric = [2, 3, 6, 5, 1, 4]

print('Encoding training feature:')
print('-' * 79)
for label, code in zip(training_unique, training_numeric):
    print(f'{label}--> {code}')

Encoding training feature:
-------------------------------------------------------------------------------
Apprenticeship--> 2
Internship/residency--> 3
Long-term on-the-job training--> 6
Moderate-term on-the-job training--> 5
None--> 1
Short-term on-the-job training--> 4


In [24]:
# Apply the label -> code mapping built in the previous cell.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df_projection.training`: that form mutates
# an intermediate Series and leaves the DataFrame unchanged under pandas
# Copy-on-Write. The explicit cast pins an integer dtype whether or not
# `replace` downcasts, and raises if any label was left unmapped.
df_projection['training'] = (
    df_projection['training']
    .replace(to_replace=training_unique, value=training_numeric)
    .astype('int64')
)

* Let's look at the overlapping features between `df_employment` and `df_projection`

In [25]:
# Render both previews, one after the other, to eyeball the shared columns.
display(df_employment.head(), df_projection.head())

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,per_distr_2016,per_distr_2026,chng_number,chng_perc,open_average
0,Chief executives,11-1011,308.9,296.8,0.2,0.2,-12.1,-3.9,20.0
1,General and operations managers,11-1021,2263.1,2468.3,1.5,1.5,205.2,9.1,210.7
2,Legislators,11-1031,55.5,59.4,0.0,0.0,3.9,7.1,4.4
3,Advertising and promotions managers,11-2011,31.3,33.0,0.0,0.0,1.7,5.5,3.4
4,Marketing managers,11-2021,218.3,240.4,0.1,0.1,22.1,10.1,21.3


Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,chng_number,chng_perc,self_employed,open_average,median_wage,education,experience,training
0,Chief executives,11-1011,308.9,296.8,-12.1,-3.9,22.8,20.0,183270.0,5,3,1
1,General and operations managers,11-1021,2263.1,2468.3,205.2,9.1,0.6,210.7,100410.0,5,3,1
2,Legislators,11-1031,55.5,59.4,3.9,7.1,,4.4,25630.0,5,2,1
3,Advertising and promotions managers,11-2011,31.3,33.0,1.7,5.5,5.2,3.4,106130.0,5,2,1
4,Marketing managers,11-2021,218.3,240.4,22.1,10.1,3.5,21.3,132230.0,5,3,1


In [26]:
# For each column present in both frames, show whether the two Series are
# identical (`Series.equals` compares values and index together).
shared_columns = ['bls_title', 'onetsoccode', 'emp_number_2016',
                  'emp_number_2026', 'chng_number', 'chng_perc', 'open_average']
for column_name in shared_columns:
    display(df_employment[column_name].equals(df_projection[column_name]))

True

True

True

True

True

True

True

* All overlapping features are equal, so we drop them from `df_projection` before merging
* We will merge dataframes based on `onetsoccode`.

In [27]:
# These columns are already present in df_employment; keep only the merge key
# (`onetsoccode`) and the projection-specific columns.
duplicated_in_employment = [
    'bls_title',
    'emp_number_2016',
    'emp_number_2026',
    'chng_number',
    'chng_perc',
    'open_average',
]
df_projection = df_projection.drop(columns=duplicated_in_employment)
df_projection.head()

Unnamed: 0,onetsoccode,self_employed,median_wage,education,experience,training
0,11-1011,22.8,183270.0,5,3,1
1,11-1021,0.6,100410.0,5,3,1
2,11-1031,,25630.0,5,2,1
3,11-2011,5.2,106130.0,5,2,1
4,11-2021,3.5,132230.0,5,3,1


## Occupational Separations and Openings

* `bls_title`: BLS Detailed Occupational Title
* `onetsoccode`: BLS Detailed Occupational Code
* `type`: Occupation Type
* `emp_number_2016`: Employment in 2016
* `emp_number_2026`: Employment in 2026
* `chng_number`: Change in Employment between 2016 and 2026 in number
* `chng_perc`: Change in Employment between 2016 and 2026 in percentage
* `lab_force_exit_rate`: Labor Force Exit Percentage Annual Average between 2016-2026
* `occ_transfer_rate`: Occupational Transfers Percentage Annual Average between 2016-2026
* `total_rate`: Total (Labor Force Exit + Occupational Transfer) Percentage Annual Average between 2016-2026
* `lab_force_exit_number`: Labor Force Exit Annual Average between 2016-2026
* `occ_transfer_number`: Occupational Transfers Annual Average between 2016-2026
* `total_number`: Total (Labor Force Exit + Occupational Transfer) Annual Average between 2016-2026
* `open_average`: Annual Average Occupational Openings 2016-2026

In [28]:
# Descriptive names for sheet 'Table 1.10', in the sheet's left-to-right order.
columns = [
    'bls_title', 'onetsoccode', 'type',
    'emp_number_2016', 'emp_number_2026',
    'chng_number', 'chng_perc',
    'lab_force_exit_rate', 'occ_transfer_rate', 'total_rate',
    'lab_force_exit_number', 'occ_transfer_number', 'total_number',
    'open_average',
]
# The sheet is read without header handling, so its header rows come in as data.
df_openings = pd.read_excel('csv_files/occupation.xlsx', sheet_name='Table 1.10')
df_openings.columns = columns
df_openings.head()

Unnamed: 0,bls_title,onetsoccode,type,emp_number_2016,emp_number_2026,chng_number,chng_perc,lab_force_exit_rate,occ_transfer_rate,total_rate,lab_force_exit_number,occ_transfer_number,total_number,open_average
0,2016 National Employment Matrix title and code,,Occupation\ntype,Employment,,"Employment Change, 2016-26",,"Occupational separations rate, 2016-26 annual ...",,,"Occupational separations, 2016-26 annual average",,,"Occupational openings, 2016-26 annual average"
1,,,,2016,2026.0,Number,Percent,Labor force exits,Occupational transfers,Total,Labor force exits,Occupational transfers,Total,
2,"Total, all occupations",00-0000,Summary,156064,167582.3,11518.6,7.4,4.7,6.2,10.9,7548.6,10041.5,17590.2,18742
3,Management occupations,11-0000,Summary,9533.1,10340.4,807.3,8.5,2.8,4.8,7.7,281.6,479.1,760.8,841.5
4,Top executives,11-1000,Summary,2627.5,2824.5,197,7.5,2.2,5.7,7.9,59.3,156,215.3,235


* As in above datasets, we focus on single occupational titles
* Moreover, drop all summary `type` occupations
* Finally, drop `type`

In [29]:
# Remove the leading non-data rows and every aggregate ('Summary') row, then
# discard the now-constant `type` column.
# Built as one non-mutating chain: dropping rows in place on an `iloc` slice
# triggers pandas' SettingWithCopyWarning.
# NOTE: this rebinds `df_openings`, so re-running the cell without re-running
# the load cell above would strip three more rows.
df_openings = (
    df_openings
    .iloc[3:]  # rows 0-1 are sheet headers, row 2 is "Total, all occupations"
    .loc[lambda frame: frame['type'] != 'Summary']
    .drop(columns='type')
    .reset_index(drop=True)
)
df_openings.head(2)

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,chng_number,chng_perc,lab_force_exit_rate,occ_transfer_rate,total_rate,lab_force_exit_number,occ_transfer_number,total_number,open_average
0,Chief executives,11-1011,308.9,296.8,-12.1,-3.9,2.9,4.1,7,8.7,12.4,21.2,20.0
1,General and operations managers,11-1021,2263.1,2468.3,205.2,9.1,2.1,6.0,8,49.0,141.1,190.2,210.7


In [30]:
# Check non-null counts and dtypes after filtering.
df_openings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   bls_title              820 non-null    object 
 1   onetsoccode            819 non-null    object 
 2   emp_number_2016        819 non-null    object 
 3   emp_number_2026        819 non-null    float64
 4   chng_number            819 non-null    object 
 5   chng_perc              819 non-null    object 
 6   lab_force_exit_rate    819 non-null    object 
 7   occ_transfer_rate      819 non-null    object 
 8   total_rate             819 non-null    object 
 9   lab_force_exit_number  819 non-null    object 
 10  occ_transfer_number    819 non-null    object 
 11  total_number           819 non-null    object 
 12  open_average           819 non-null    object 
dtypes: float64(1), object(12)
memory usage: 83.4+ KB


* Let's look at the missing value

In [31]:
# Show the rows that have no occupation code.
df_openings[df_openings.onetsoccode.isnull()]

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,chng_number,chng_perc,lab_force_exit_rate,occ_transfer_rate,total_rate,lab_force_exit_number,occ_transfer_number,total_number,open_average
819,"Source: Employment Projections program, U.S. B...",,,,,,,,,,,,


In [32]:
# Drop only the rows that lack an occupation code (the trailing "Source: ..."
# footnote). Restricting `dropna` to `onetsoccode` means an occupation with a
# missing value in some other column is not silently removed.
df_openings = df_openings.dropna(subset=['onetsoccode'])
df_openings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 819 entries, 0 to 818
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   bls_title              819 non-null    object 
 1   onetsoccode            819 non-null    object 
 2   emp_number_2016        819 non-null    object 
 3   emp_number_2026        819 non-null    float64
 4   chng_number            819 non-null    object 
 5   chng_perc              819 non-null    object 
 6   lab_force_exit_rate    819 non-null    object 
 7   occ_transfer_rate      819 non-null    object 
 8   total_rate             819 non-null    object 
 9   lab_force_exit_number  819 non-null    object 
 10  occ_transfer_number    819 non-null    object 
 11  total_number           819 non-null    object 
 12  open_average           819 non-null    object 
dtypes: float64(1), object(12)
memory usage: 89.6+ KB


* Let's correct the data types

In [33]:
# Numeric columns that were loaded with `object` dtype; convert them all to
# float in one assignment.
float_columns = [
    'emp_number_2016', 'chng_number', 'chng_perc',
    'lab_force_exit_rate', 'occ_transfer_rate', 'total_rate',
    'lab_force_exit_number', 'occ_transfer_number', 'total_number',
    'open_average',
]
df_openings[float_columns] = df_openings[float_columns].astype(float)
df_openings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 819 entries, 0 to 818
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   bls_title              819 non-null    object 
 1   onetsoccode            819 non-null    object 
 2   emp_number_2016        819 non-null    float64
 3   emp_number_2026        819 non-null    float64
 4   chng_number            819 non-null    float64
 5   chng_perc              819 non-null    float64
 6   lab_force_exit_rate    819 non-null    float64
 7   occ_transfer_rate      819 non-null    float64
 8   total_rate             819 non-null    float64
 9   lab_force_exit_number  819 non-null    float64
 10  occ_transfer_number    819 non-null    float64
 11  total_number           819 non-null    float64
 12  open_average           819 non-null    float64
dtypes: float64(11), object(2)
memory usage: 89.6+ KB


* Let's check the overlapping features

In [34]:
# Render both previews, one after the other, to eyeball the shared columns.
display(df_employment.head(), df_openings.head())

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,per_distr_2016,per_distr_2026,chng_number,chng_perc,open_average
0,Chief executives,11-1011,308.9,296.8,0.2,0.2,-12.1,-3.9,20.0
1,General and operations managers,11-1021,2263.1,2468.3,1.5,1.5,205.2,9.1,210.7
2,Legislators,11-1031,55.5,59.4,0.0,0.0,3.9,7.1,4.4
3,Advertising and promotions managers,11-2011,31.3,33.0,0.0,0.0,1.7,5.5,3.4
4,Marketing managers,11-2021,218.3,240.4,0.1,0.1,22.1,10.1,21.3


Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,chng_number,chng_perc,lab_force_exit_rate,occ_transfer_rate,total_rate,lab_force_exit_number,occ_transfer_number,total_number,open_average
0,Chief executives,11-1011,308.9,296.8,-12.1,-3.9,2.9,4.1,7.0,8.7,12.4,21.2,20.0
1,General and operations managers,11-1021,2263.1,2468.3,205.2,9.1,2.1,6.0,8.0,49.0,141.1,190.2,210.7
2,Legislators,11-1031,55.5,59.4,3.9,7.1,2.7,4.2,6.9,1.6,2.4,4.0,4.4
3,Advertising and promotions managers,11-2011,31.3,33.0,1.7,5.5,2.6,7.5,10.1,0.8,2.4,3.2,3.4
4,Marketing managers,11-2021,218.3,240.4,22.1,10.1,2.3,6.1,8.3,5.2,13.9,19.1,21.3


In [35]:
# Columns present in both df_employment and df_openings. The list is kept in
# `t` because the drop cell that follows reads it.
t = [
    'bls_title', 'onetsoccode',
    'emp_number_2016', 'emp_number_2026',
    'chng_number', 'chng_perc',
    'open_average',
]
# Show, column by column, whether the two Series are identical.
for column_name in t:
    columns_match = df_employment[column_name].equals(df_openings[column_name])
    display(columns_match)

True

True

True

True

True

True

True

* Drop overlapping features

In [36]:
# Drop every column compared in the previous cell except the merge key.
# The list is filtered into a new one instead of calling `t.remove(...)`:
# mutating `t` in place made a second run of this cell fail with
# "ValueError: list.remove(x): x not in list".
redundant_columns = [name for name in t if name != 'onetsoccode']
df_openings = df_openings.drop(columns=redundant_columns)
df_openings.head()

Unnamed: 0,onetsoccode,lab_force_exit_rate,occ_transfer_rate,total_rate,lab_force_exit_number,occ_transfer_number,total_number
0,11-1011,2.9,4.1,7.0,8.7,12.4,21.2
1,11-1021,2.1,6.0,8.0,49.0,141.1,190.2
2,11-1031,2.7,4.2,6.9,1.6,2.4,4.0
3,11-2011,2.6,7.5,10.1,0.8,2.4,3.2
4,11-2021,2.3,6.1,8.3,5.2,13.9,19.1


## Merge all files

In [37]:
# Join the three cleaned tables on the occupation code (default inner join).
# `validate='one_to_one'` makes pandas raise if the key is duplicated on either
# side of a merge.
employment_projection = (
    df_employment
    .merge(df_projection, on='onetsoccode', validate='one_to_one')
    .merge(df_openings, on='onetsoccode', validate='one_to_one')
)
employment_projection.head()

Unnamed: 0,bls_title,onetsoccode,emp_number_2016,emp_number_2026,per_distr_2016,per_distr_2026,chng_number,chng_perc,open_average,self_employed,median_wage,education,experience,training,lab_force_exit_rate,occ_transfer_rate,total_rate,lab_force_exit_number,occ_transfer_number,total_number
0,Chief executives,11-1011,308.9,296.8,0.2,0.2,-12.1,-3.9,20.0,22.8,183270.0,5,3,1,2.9,4.1,7.0,8.7,12.4,21.2
1,General and operations managers,11-1021,2263.1,2468.3,1.5,1.5,205.2,9.1,210.7,0.6,100410.0,5,3,1,2.1,6.0,8.0,49.0,141.1,190.2
2,Legislators,11-1031,55.5,59.4,0.0,0.0,3.9,7.1,4.4,,25630.0,5,2,1,2.7,4.2,6.9,1.6,2.4,4.0
3,Advertising and promotions managers,11-2011,31.3,33.0,0.0,0.0,1.7,5.5,3.4,5.2,106130.0,5,2,1,2.6,7.5,10.1,0.8,2.4,3.2
4,Marketing managers,11-2021,218.3,240.4,0.1,0.1,22.1,10.1,21.3,3.5,132230.0,5,3,1,2.3,6.1,8.3,5.2,13.9,19.1


In [38]:
# Write the merged table to the working directory, without the row index.
employment_projection.to_csv('employment_projection_final.csv', index=False)