BLS Industry Employment Data Exploration and Cleaning

In [1]:
# Import libraries
# pandas is the only dependency of this notebook: it is used below to read the
# raw CSV, reshape the data, and write the cleaned CSV.
import pandas as pd

# Confirmation message shown once the import above has succeeded
print('Libraries imported successfully!')

Libraries imported successfully!


In [2]:
# Read in bls employment data csv file
# The location is kept in a named variable so the read call itself stays short.
raw_csv_path = '../../../workspace/nf-insights-project/data_raw/bls_employ_data_raw.csv'
blsemp_df = pd.read_csv(raw_csv_path)

print('CSV read successfully!')

# Preview the first rows of the freshly loaded frame
blsemp_df.head()

CSV read successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value
0,SMU54000000000000001,2009,M01,2009 Jan,706.5
1,SMU54000000000000001,2009,M02,2009 Feb,706.5
2,SMU54000000000000001,2009,M03,2009 Mar,710.6
3,SMU54000000000000001,2009,M04,2009 Apr,712.6
4,SMU54000000000000001,2009,M05,2009 May,716.5


In [3]:
# Data check: report the frame's dimensions, per-column dtypes / non-null counts,
# and the column labels
print('Data check:')
print(blsemp_df.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line after the report.
blsemp_df.info()
blsemp_df.columns

Data check:
(2856, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2856 entries, 0 to 2855
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Series ID  2856 non-null   object 
 1   Year       2856 non-null   int64  
 2   Period     2856 non-null   object 
 3   Label      2856 non-null   object 
 4   Value      2856 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 111.7+ KB
None


Index(['Series ID', 'Year', 'Period', 'Label', 'Value'], dtype='object')

In [4]:
# Pull unique Series ID's
# Grab the identifier column first, then list its distinct values as the cell output.
series_id_column = blsemp_df['Series ID']
print("Unique Series ID's pulled successfully!")
series_id_column.unique()

Unique Series ID's pulled successfully!


array(['SMU54000000000000001', 'SMU54000001000000001',
       'SMU54000002000000001', 'SMU54000003000000001',
       'SMU54000004000000001', 'SMU54000005000000001',
       'SMU54000005552000001', 'SMU54000005553000001',
       'SMU54000006000000001', 'SMU54000006561000001',
       'SMU54000006562000001', 'SMU54000007000000001',
       'SMU54000008000000001', 'SMU54000009000000001'], dtype=object)

In [5]:
# Create dictionary of Series ID's to readable industry names
series_map = {
    'SMU54000000000000001': 'Total Nonfarm',
    'SMU54000001000000001': 'Mining and Logging',
    'SMU54000002000000001': 'Construction',
    'SMU54000003000000001': 'Manufacturing',
    'SMU54000004000000001': 'Trade, Transportation, and Utilities',
    'SMU54000005000000001': 'Information',
    'SMU54000005552000001': 'Finance and Insurance',
    'SMU54000005553000001': 'Real Estate and Rental and Leasing',
    'SMU54000006000000001': 'Professional and Business Services',
    'SMU54000006561000001': 'Private Educational Services',
    'SMU54000006562000001': 'Health Care and Social Assistance',
    'SMU54000007000000001': 'Leisure and Hospitality',
    'SMU54000008000000001': 'Other Services',
    'SMU54000009000000001': 'Government'
}

# Map Series ID's to industry names
blsemp_df['Industry'] = blsemp_df['Series ID'].map(series_map)

# .map() yields NaN for any Series ID that has no entry in series_map. Fail loudly
# here rather than letting unlabeled rows flow silently into the later steps.
unmapped_ids = blsemp_df.loc[blsemp_df['Industry'].isna(), 'Series ID'].unique()
if len(unmapped_ids) > 0:
    raise ValueError(f'Series IDs missing from series_map: {list(unmapped_ids)}')

print('Series ID mapped successfully!')

# Preview the frame with the new Industry column
blsemp_df.head()

Series ID mapped successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value,Industry
0,SMU54000000000000001,2009,M01,2009 Jan,706.5,Total Nonfarm
1,SMU54000000000000001,2009,M02,2009 Feb,706.5,Total Nonfarm
2,SMU54000000000000001,2009,M03,2009 Mar,710.6,Total Nonfarm
3,SMU54000000000000001,2009,M04,2009 Apr,712.6,Total Nonfarm
4,SMU54000000000000001,2009,M05,2009 May,716.5,Total Nonfarm


In [6]:
# Create a proper date column
# Period values look like 'M01'; dropping the first character leaves the month
# digits, which are joined to the year as 'YYYY-MM' and parsed with that format.
year_text = blsemp_df['Year'].astype(str)
month_text = blsemp_df['Period'].str[1:]
blsemp_df['Date'] = pd.to_datetime(year_text + '-' + month_text, format='%Y-%m')

print('Date column created successfully!')

# Preview the frame with the new Date column
blsemp_df.head()

Date column created successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value,Industry,Date
0,SMU54000000000000001,2009,M01,2009 Jan,706.5,Total Nonfarm,2009-01-01
1,SMU54000000000000001,2009,M02,2009 Feb,706.5,Total Nonfarm,2009-02-01
2,SMU54000000000000001,2009,M03,2009 Mar,710.6,Total Nonfarm,2009-03-01
3,SMU54000000000000001,2009,M04,2009 Apr,712.6,Total Nonfarm,2009-04-01
4,SMU54000000000000001,2009,M05,2009 May,716.5,Total Nonfarm,2009-05-01


In [7]:
# Drop redundant columns
# These four are superseded by the Industry and Date columns added earlier.
redundant_columns = ['Series ID', 'Year', 'Period', 'Label']
blsemp_df = blsemp_df.drop(columns=redundant_columns)

print('Redundant columns dropped successfully!')

# Preview the slimmed-down frame
blsemp_df.head()

Redundant columns dropped successfully!


Unnamed: 0,Value,Industry,Date
0,706.5,Total Nonfarm,2009-01-01
1,706.5,Total Nonfarm,2009-02-01
2,710.6,Total Nonfarm,2009-03-01
3,712.6,Total Nonfarm,2009-04-01
4,716.5,Total Nonfarm,2009-05-01


In [8]:
# Pivot the dataframe long -> wide: one row per Date, one column per Industry,
# cells filled from Value
blsemp_wide = (
    blsemp_df
    .pivot(index='Date', columns='Industry', values='Value')
    # Turn the Date index back into an ordinary column
    .reset_index()
    # pivot() leaves the columns axis named 'Industry'; clear that name so it does
    # not show up as a stray label in displays and summaries of the wide frame
    .rename_axis(None, axis='columns')
)

print('Data pivoted successfully!')

# Preview the wide frame
blsemp_wide.head()

Data pivoted successfully!


Industry,Date,Construction,Finance and Insurance,Government,Health Care and Social Assistance,Information,Leisure and Hospitality,Manufacturing,Mining and Logging,Other Services,Private Educational Services,Professional and Business Services,Real Estate and Rental and Leasing,Total Nonfarm,"Trade, Transportation, and Utilities"
0,2009-01-01,32.6,20.1,146.5,106.9,10.9,68.8,53.6,32.7,25.3,5.4,60.1,7.1,706.5,136.5
1,2009-02-01,32.6,20.1,149.0,107.0,10.7,68.7,52.2,32.0,25.3,7.3,60.0,7.0,706.5,134.6
2,2009-03-01,33.9,20.1,151.4,107.9,10.7,70.3,51.1,31.8,25.2,7.1,59.7,6.9,710.6,134.5
3,2009-04-01,33.8,20.0,151.0,108.3,10.6,71.3,51.7,30.9,25.4,7.5,60.3,6.9,712.6,134.9
4,2009-05-01,35.1,19.9,152.3,108.8,10.5,73.4,51.0,29.9,25.5,7.4,60.0,7.1,716.5,135.6


In [9]:
# Extract year and month from date column
# Both parts come from the same datetime accessor and are stored as nullable Int64.
date_parts = blsemp_wide['Date'].dt
blsemp_wide['Year'] = date_parts.year.astype('Int64')
blsemp_wide['Month'] = date_parts.month.astype('Int64')

print('Date columns extracted successfully!')

# Preview the frame with the new Year and Month columns
blsemp_wide.head()

Date columns extracted successfully!


Industry,Date,Construction,Finance and Insurance,Government,Health Care and Social Assistance,Information,Leisure and Hospitality,Manufacturing,Mining and Logging,Other Services,Private Educational Services,Professional and Business Services,Real Estate and Rental and Leasing,Total Nonfarm,"Trade, Transportation, and Utilities",Year,Month
0,2009-01-01,32.6,20.1,146.5,106.9,10.9,68.8,53.6,32.7,25.3,5.4,60.1,7.1,706.5,136.5,2009,1
1,2009-02-01,32.6,20.1,149.0,107.0,10.7,68.7,52.2,32.0,25.3,7.3,60.0,7.0,706.5,134.6,2009,2
2,2009-03-01,33.9,20.1,151.4,107.9,10.7,70.3,51.1,31.8,25.2,7.1,59.7,6.9,710.6,134.5,2009,3
3,2009-04-01,33.8,20.0,151.0,108.3,10.6,71.3,51.7,30.9,25.4,7.5,60.3,6.9,712.6,134.9,2009,4
4,2009-05-01,35.1,19.9,152.3,108.8,10.5,73.4,51.0,29.9,25.5,7.4,60.0,7.1,716.5,135.6,2009,5


In [10]:
# Reorder columns
# Layout: date fields first, then the individual industries, with the
# 'Total Nonfarm' aggregate last.
date_columns = ['Date', 'Month', 'Year']
industry_columns = [
    'Construction',
    'Finance and Insurance',
    'Government',
    'Health Care and Social Assistance',
    'Information',
    'Leisure and Hospitality',
    'Manufacturing',
    'Mining and Logging',
    'Other Services',
    'Private Educational Services',
    'Professional and Business Services',
    'Real Estate and Rental and Leasing',
    'Trade, Transportation, and Utilities',
]
total_columns = ['Total Nonfarm']
blsemp_wide = blsemp_wide[date_columns + industry_columns + total_columns]

print('Columns reordered successfully!')

# Show the resulting column order, dtypes and non-null counts
blsemp_wide.info()

Columns reordered successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 17 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   Date                                  204 non-null    datetime64[ns]
 1   Month                                 204 non-null    Int64         
 2   Year                                  204 non-null    Int64         
 3   Construction                          204 non-null    float64       
 4   Finance and Insurance                 204 non-null    float64       
 5   Government                            204 non-null    float64       
 6   Health Care and Social Assistance     204 non-null    float64       
 7   Information                           204 non-null    float64       
 8   Leisure and Hospitality               204 non-null    float64       
 9   Manufacturing                         204 no

In [11]:
# Check for duplicates
# duplicated() flags each row that exactly repeats an earlier row; summing the
# boolean flags gives the number of such rows.
duplicate_flags = blsemp_wide.duplicated()
duplicates = duplicate_flags.sum()
print('Number of duplicate rows:', duplicates)

Number of duplicate rows: 0


In [12]:
# Check for missing values
# isna().sum() gives one count of missing cells per column.
missing_per_column = blsemp_wide.isna().sum()
print('Number of missing values:', missing_per_column)

Number of missing values: Industry
Date                                    0
Month                                   0
Year                                    0
Construction                            0
Finance and Insurance                   0
Government                              0
Health Care and Social Assistance       0
Information                             0
Leisure and Hospitality                 0
Manufacturing                           0
Mining and Logging                      0
Other Services                          0
Private Educational Services            0
Professional and Business Services      0
Real Estate and Rental and Leasing      0
Trade, Transportation, and Utilities    0
Total Nonfarm                           0
dtype: int64


In [13]:
# Save cleaned data
# index=False: the row index is only the default 0..n-1 counter left behind by
# reset_index(), so writing it would add a meaningless unnamed first column to
# the cleaned file.
blsemp_wide.to_csv('../../../workspace/nf-insights-project/data_clean/bls_employment_clean.csv', index=False)

print('CSV saved successfully!')

CSV saved successfully!
