BLS Industry Employment Data Exploration and Cleaning

In [135]:
# Import libraries
import pandas as pd

print('Libraries imported successfully!')

Libraries imported successfully!


In [136]:
# Load the raw BLS employment CSV into a DataFrame
raw_csv_path = '../../../workspace/nf-insights-project/data_raw/bls_employ_data_raw.csv'
blsemp_df = pd.read_csv(raw_csv_path)

print('CSV read successfully!')

# Preview the first five rows
blsemp_df.head(5)

CSV read successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value
0,SMU54000000000000001,2006,M01,2006 Jan,706.1
1,SMU54000000000000001,2006,M02,2006 Feb,709.0
2,SMU54000000000000001,2006,M03,2006 Mar,718.2
3,SMU54000000000000001,2006,M04,2006 Apr,720.9
4,SMU54000000000000001,2006,M05,2006 May,736.4


In [137]:
# Data check: shape, per-column dtypes / non-null counts, and column names
print('Data check:')
print('Data shape:', blsemp_df.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
blsemp_df.info()
blsemp_df.columns

Data check:
Data shape: (3360, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3360 entries, 0 to 3359
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Series ID  3360 non-null   object 
 1   Year       3360 non-null   int64  
 2   Period     3360 non-null   object 
 3   Label      3360 non-null   object 
 4   Value      3360 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 131.4+ KB
None


Index(['Series ID', 'Year', 'Period', 'Label', 'Value'], dtype='object')

In [138]:
# List the distinct Series IDs present in the raw data (in order of first appearance)
series_id_column = blsemp_df['Series ID']
print('Unique Series IDs pulled successfully!')
series_id_column.unique()

Unique Series IDs pulled successfully!


array(['SMU54000000000000001', 'SMU54000001000000001',
       'SMU54000002000000001', 'SMU54000003000000001',
       'SMU54000004000000001', 'SMU54000005000000001',
       'SMU54000005552000001', 'SMU54000005553000001',
       'SMU54000006000000001', 'SMU54000006561000001',
       'SMU54000006562000001', 'SMU54000007000000001',
       'SMU54000008000000001', 'SMU54000009000000001'], dtype=object)

In [139]:
# Lookup table: BLS Series ID -> industry name, written as (id, name) pairs
series_pairs = [
    ('SMU54000000000000001', 'Total Nonfarm'),
    ('SMU54000001000000001', 'Mining and Logging'),
    ('SMU54000002000000001', 'Construction'),
    ('SMU54000003000000001', 'Manufacturing'),
    ('SMU54000004000000001', 'Trade, Transportation, and Utilities'),
    ('SMU54000005000000001', 'Information'),
    ('SMU54000005552000001', 'Finance and Insurance'),
    ('SMU54000005553000001', 'Real Estate and Rental and Leasing'),
    ('SMU54000006000000001', 'Professional and Business Services'),
    ('SMU54000006561000001', 'Private Educational Services'),
    ('SMU54000006562000001', 'Health Care and Social Assistance'),
    ('SMU54000007000000001', 'Leisure and Hospitality'),
    ('SMU54000008000000001', 'Other Services'),
    ('SMU54000009000000001', 'Government'),
]
series_map = dict(series_pairs)

# Translate every Series ID into its industry name; an ID that is missing
# from series_map becomes NaN in the new column
industry_names = blsemp_df['Series ID'].map(series_map)
blsemp_df['Industry'] = industry_names

print('Series IDs mapped successfully!')

blsemp_df.head()

Series IDs mapped successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value,Industry
0,SMU54000000000000001,2006,M01,2006 Jan,706.1,Total Nonfarm
1,SMU54000000000000001,2006,M02,2006 Feb,709.0,Total Nonfarm
2,SMU54000000000000001,2006,M03,2006 Mar,718.2,Total Nonfarm
3,SMU54000000000000001,2006,M04,2006 Apr,720.9,Total Nonfarm
4,SMU54000000000000001,2006,M05,2006 May,736.4,Total Nonfarm


In [140]:
# # Create a proper date column
# blsemp_df['Date'] = pd.to_datetime(blsemp_df['Year'].astype(str) + '-' + blsemp_df['Period'].str[1:], format='%Y-%m')

# print('Date column created successfully!')

# print('Data check:')
# blsemp_df.info()

In [141]:
# Drop the identifier/label columns that are no longer needed now that
# 'Industry' has been derived from 'Series ID'
blsemp_df = blsemp_df.drop(columns=['Series ID', 'Period', 'Label'])

print('Unnecessary columns dropped successfully!')

print('Data check:')
print('Data shape:', blsemp_df.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
blsemp_df.info()

Unnecessary columns dropped successfully!
Data check:
Data shape: (3360, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3360 entries, 0 to 3359
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      3360 non-null   int64  
 1   Value     3360 non-null   float64
 2   Industry  3360 non-null   object 
dtypes: float64(1), int64(1), object(1)
memory usage: 78.9+ KB
None


In [142]:
# Remove the Total Nonfarm rows before calculating metrics, so only the
# individual industry series remain.
# .copy() makes the filtered result an independent DataFrame; without it,
# later column assignments on blsemp_df can trigger SettingWithCopyWarning.
blsemp_df = blsemp_df[blsemp_df["Industry"] != "Total Nonfarm"].copy()

print('Total Nonfarm row removed successfully!')

print('Data check:')
print('Data shape:', blsemp_df.shape)

Total Nonfarm row removed successfully!
Data check:
Data shape: (3120, 3)


In [143]:
# Convert 'Value' from thousands of jobs (the unit noted for the source data)
# to a plain count of jobs by multiplying by 1,000.
# NOTE: this overwrites 'Value' in place, so re-running the cell multiplies by
# 1,000 again; re-run from the load cell if it needs to be executed twice.

# Work on an explicit copy: blsemp_df comes from a boolean filter, and
# assigning a column on such a frame can raise SettingWithCopyWarning.
blsemp_df = blsemp_df.copy()

# Scale, then round to 2 decimals to remove floating-point noise from the multiplication
blsemp_df['Value'] = (blsemp_df['Value'] * 1000).round(2)

print('Data check:')
blsemp_df.head()

Data check:


Unnamed: 0,Year,Value,Industry
240,2006,27200.0,Mining and Logging
241,2006,27400.0,Mining and Logging
242,2006,27700.0,Mining and Logging
243,2006,27800.0,Mining and Logging
244,2006,27900.0,Mining and Logging


In [144]:
# Yearly employment per industry: mean of 'Value' over all rows that share
# the same (Industry, Year) pair, stored under the clearer name 'Employment'
blsemp_yearly = (
    blsemp_df
        .groupby(['Industry', 'Year'], as_index=False)['Value']
        .mean()
        .rename(columns={'Value': 'Employment'})
)

# One decimal place is enough for reading the averages
blsemp_yearly['Employment'] = blsemp_yearly['Employment'].round(1)

print('Data check:')
print('Data shape:', blsemp_yearly.shape)
blsemp_yearly.info()
blsemp_yearly.head()

Data check:
Data shape: (260, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Industry    260 non-null    object 
 1   Year        260 non-null    int64  
 2   Employment  260 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.2+ KB


Unnamed: 0,Industry,Year,Employment
0,Construction,2006,39425.0
1,Construction,2007,38933.3
2,Construction,2008,38908.3
3,Construction,2009,34075.0
4,Construction,2010,32791.7


In [145]:
# Duplicate check: count rows that are an exact repeat of an earlier row
duplicates = blsemp_yearly.duplicated().sum()
print(f'Number of duplicate rows: {duplicates}')

# Missing-value check: NaN count for each column
missing_per_column = blsemp_yearly.isna().sum()
print(f'Number of missing values: {missing_per_column}')

# Order rows by Industry, then Year, so each industry's years are consecutive
blsemp_yearly = blsemp_yearly.sort_values(by=['Industry', 'Year'])

print('Data sorted successfully!')

Number of duplicate rows: 0
Number of missing values: Industry      0
Year          0
Employment    0
dtype: int64
Data sorted successfully!


In [146]:
# Year-over-Year growth in percent: change in Employment relative to the
# previous row of the same Industry (the first row of each Industry is NaN)
employment_by_industry = blsemp_yearly.groupby('Industry')['Employment']
yoy_percent = employment_by_industry.pct_change() * 100

# Keep two decimals for readability
blsemp_yearly['YoY_Growth'] = yoy_percent.round(2)

# Data check
print('Data check:')
print('Data shape:', blsemp_yearly.shape)
blsemp_yearly.head()

Data check:
Data shape: (260, 4)


Unnamed: 0,Industry,Year,Employment,YoY_Growth
0,Construction,2006,39425.0,
1,Construction,2007,38933.3,-1.25
2,Construction,2008,38908.3,-0.06
3,Construction,2009,34075.0,-12.42
4,Construction,2010,32791.7,-3.77


In [147]:
# Absolute yearly change: Employment minus the previous row's Employment
# within the same Industry (the first row of each Industry is NaN),
# rounded to two decimals for readability
per_industry_employment = blsemp_yearly.groupby('Industry')['Employment']
blsemp_yearly['Yearly_Change'] = per_industry_employment.diff().round(2)

print('Absolute yearly employment change calculated successfully!')

# Data check
print('Data check:')
print('Data shape:', blsemp_yearly.shape)
blsemp_yearly.head()

Absolute yearly employment change calculated successfully!
Data check:
Data shape: (260, 5)


Unnamed: 0,Industry,Year,Employment,YoY_Growth,Yearly_Change
0,Construction,2006,39425.0,,
1,Construction,2007,38933.3,-1.25,-491.7
2,Construction,2008,38908.3,-0.06,-25.0
3,Construction,2009,34075.0,-12.42,-4833.3
4,Construction,2010,32791.7,-3.77,-1283.3


In [149]:
# Build the final table with a fresh 0..n-1 index; drop=True discards the
# old index instead of keeping it as a column
blsemp_final = blsemp_yearly.reset_index(drop=True, inplace=False)

print('Index reset successfully!')

Index reset successfully!


In [150]:
# Save cleaned data.
# index=False: blsemp_final carries only the positional 0..n-1 index from
# reset_index, so writing it would just add an unnamed leading column that
# shows up as 'Unnamed: 0' when the file is read back with pd.read_csv.
# NOTE(review): any consumer that reads this file with index_col=0 must be
# updated to match.
blsemp_final.to_csv(
    '../../../workspace/nf-insights-project/data_clean/bls_employ_clean.csv',
    index=False,
)

print('CSV saved successfully!')

CSV saved successfully!
