BLS Industry Employment Data Exploration and Cleaning

In [107]:
# Import libraries
import pandas as pd

print('Libraries imported successfully!')

Libraries imported successfully!


In [108]:
# Read in bls employment data csv file
blsemp_df = pd.read_csv('../../../workspace/nf-insights-project/data_raw/bls_employ_data_raw.csv')

print('CSV read successfully!')

blsemp_df.head()

CSV read successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value
0,SMU54000000000000001,2009,M01,2009 Jan,706.5
1,SMU54000000000000001,2009,M02,2009 Feb,706.5
2,SMU54000000000000001,2009,M03,2009 Mar,710.6
3,SMU54000000000000001,2009,M04,2009 Apr,712.6
4,SMU54000000000000001,2009,M05,2009 May,716.5


In [109]:
# Data check
print('Data check:')
print('Data shape:', blsemp_df.shape)
print(blsemp_df.info())
blsemp_df.columns

Data check:
Data shape: (2856, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2856 entries, 0 to 2855
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Series ID  2856 non-null   object 
 1   Year       2856 non-null   int64  
 2   Period     2856 non-null   object 
 3   Label      2856 non-null   object 
 4   Value      2856 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 111.7+ KB
None


Index(['Series ID', 'Year', 'Period', 'Label', 'Value'], dtype='object')

In [110]:
# Pull unique Series ID's
print("Unique Series ID's pulled successfully!")
blsemp_df['Series ID'].unique()

Unique Series ID's pulled successfully!


array(['SMU54000000000000001', 'SMU54000001000000001',
       'SMU54000002000000001', 'SMU54000003000000001',
       'SMU54000004000000001', 'SMU54000005000000001',
       'SMU54000005552000001', 'SMU54000005553000001',
       'SMU54000006000000001', 'SMU54000006561000001',
       'SMU54000006562000001', 'SMU54000007000000001',
       'SMU54000008000000001', 'SMU54000009000000001'], dtype=object)

In [111]:
# Create dictionary of Series ID's to industry names
series_map = {
    'SMU54000000000000001': 'Total Nonfarm',
    'SMU54000001000000001': 'Mining and Logging',
    'SMU54000002000000001': 'Construction',
    'SMU54000003000000001': 'Manufacturing',
    'SMU54000004000000001': 'Trade, Transportation, and Utilities',
    'SMU54000005000000001': 'Information',
    'SMU54000005552000001': 'Finance and Insurance',
    'SMU54000005553000001': 'Real Estate and Rental and Leasing',
    'SMU54000006000000001': 'Professional and Business Services',
    'SMU54000006561000001': 'Private Educational Services',
    'SMU54000006562000001': 'Health Care and Social Assistance',
    'SMU54000007000000001': 'Leisure and Hospitality',
    'SMU54000008000000001': 'Other Services',
    'SMU54000009000000001': 'Government'
}

# Map Series ID's to industry names
blsemp_df['Industry'] = blsemp_df['Series ID'].map(series_map)

print('Series ID mapped successfully!')

blsemp_df.head()

Series ID mapped successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value,Industry
0,SMU54000000000000001,2009,M01,2009 Jan,706.5,Total Nonfarm
1,SMU54000000000000001,2009,M02,2009 Feb,706.5,Total Nonfarm
2,SMU54000000000000001,2009,M03,2009 Mar,710.6,Total Nonfarm
3,SMU54000000000000001,2009,M04,2009 Apr,712.6,Total Nonfarm
4,SMU54000000000000001,2009,M05,2009 May,716.5,Total Nonfarm


In [112]:
# Create a proper date column
blsemp_df['Date'] = pd.to_datetime(blsemp_df['Year'].astype(str) + '-' + blsemp_df['Period'].str[1:], format='%Y-%m')

print('Date column created successfully!')

print('Data check:')
blsemp_df.info()

Date column created successfully!
Data check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2856 entries, 0 to 2855
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Series ID  2856 non-null   object        
 1   Year       2856 non-null   int64         
 2   Period     2856 non-null   object        
 3   Label      2856 non-null   object        
 4   Value      2856 non-null   float64       
 5   Industry   2856 non-null   object        
 6   Date       2856 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 156.3+ KB


In [113]:
# Drop unnecessary columns
blsemp_df = blsemp_df.drop(columns=['Series ID', 'Year', 'Period', 'Label'])

print('Unnecessary columns dropped successfully!')

print('Data shape:', blsemp_df.shape)

Unnecessary columns dropped successfully!
Data shape: (2856, 3)


In [114]:
# Extract year and month from date column
blsemp_df['Year'] = blsemp_df['Date'].dt.year.astype('Int64')
blsemp_df['Month'] = blsemp_df['Date'].dt.month.astype('Int64')

print('Date columns extracted successfully!')

print('Data check:')
blsemp_df.info()

Date columns extracted successfully!
Data check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2856 entries, 0 to 2855
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Value     2856 non-null   float64       
 1   Industry  2856 non-null   object        
 2   Date      2856 non-null   datetime64[ns]
 3   Year      2856 non-null   Int64         
 4   Month     2856 non-null   Int64         
dtypes: Int64(2), datetime64[ns](1), float64(1), object(1)
memory usage: 117.3+ KB


In [115]:
# Change measure of values into correct format (1,000s of jobs)
blsemp_df['Value'] = blsemp_df['Value'] * 1000

# Round for readability
blsemp_df['Value'] = blsemp_df['Value'].round(2)

print('Data check:')
blsemp_df.head()

Data check:


Unnamed: 0,Value,Industry,Date,Year,Month
0,706500.0,Total Nonfarm,2009-01-01,2009,1
1,706500.0,Total Nonfarm,2009-02-01,2009,2
2,710600.0,Total Nonfarm,2009-03-01,2009,3
3,712600.0,Total Nonfarm,2009-04-01,2009,4
4,716500.0,Total Nonfarm,2009-05-01,2009,5


In [116]:
# Check for duplicates
duplicates = blsemp_df.duplicated().sum()
print('Number of duplicate rows:', duplicates)

# Check for missing values
print('Number of missing values:', blsemp_df.isna().sum())

# Sort by Industry and Date
blsemp_df = blsemp_df.sort_values(['Industry', 'Date'])

print('Data sorted successfully!')

Number of duplicate rows: 0
Number of missing values: Value       0
Industry    0
Date        0
Year        0
Month       0
dtype: int64
Data sorted successfully!


In [117]:
# Create a column for last year's employment
blsemp_df['Employment_Last_Year'] = blsemp_df.groupby('Industry')['Value'].shift(12)

# Calculate YoY growth
blsemp_df['YoY_Growth'] = ((blsemp_df['Value'] - blsemp_df['Employment_Last_Year']) 
                           / blsemp_df['Employment_Last_Year']) * 100

# Round for readability
blsemp_df['YoY_Growth'] = blsemp_df['YoY_Growth'].round(2)

print('Data check:')
blsemp_df.head()

Data check:


Unnamed: 0,Value,Industry,Date,Year,Month,Employment_Last_Year,YoY_Growth
408,32600.0,Construction,2009-01-01,2009,1,,
409,32600.0,Construction,2009-02-01,2009,2,,
410,33900.0,Construction,2009-03-01,2009,3,,
411,33800.0,Construction,2009-04-01,2009,4,,
412,35100.0,Construction,2009-05-01,2009,5,,


In [118]:
# Calculate total employment by month
total_employment = blsemp_df.groupby('Date')['Value'].sum().reset_index()
total_employment = total_employment.rename(columns={'Value': 'Total_Employment'})

# Merge total employment back into blsemp_df
blsemp_df = blsemp_df.merge(total_employment, on='Date')

# Calculate employment share
blsemp_df['Employment_Share'] = (blsemp_df['Value'] / blsemp_df['Total_Employment']) * 100

# Round for readability
blsemp_df['Employment_Share'] = blsemp_df['Employment_Share'].round(2)

print('Data check:')
blsemp_df.head()

Data check:


Unnamed: 0,Value,Industry,Date,Year,Month,Employment_Last_Year,YoY_Growth,Total_Employment,Employment_Share
0,32600.0,Construction,2009-01-01,2009,1,,,1413000.0,2.31
1,32600.0,Construction,2009-02-01,2009,2,,,1413000.0,2.31
2,33900.0,Construction,2009-03-01,2009,3,,,1421200.0,2.39
3,33800.0,Construction,2009-04-01,2009,4,,,1425200.0,2.37
4,35100.0,Construction,2009-05-01,2009,5,,,1433000.0,2.45


In [119]:
# Sort by Industry and Date
blsemp_df = blsemp_df.sort_values(['Industry', 'Date'])

# Create a column for last month's employment
blsemp_df['Employment_Last_Month'] = blsemp_df.groupby('Industry')['Value'].shift(1)

# Calculate month-to-month job change
blsemp_df['Job_Change_MoM'] = blsemp_df['Value'] - blsemp_df['Employment_Last_Month']

# Year-over-Year Job Change
blsemp_df['Job_Change_YoY'] = blsemp_df['Value'] - blsemp_df['Employment_Last_Year']

# Round for readability
blsemp_df['Job_Change_MoM'] = blsemp_df['Job_Change_MoM'].round(4)
blsemp_df['Job_Change_YoY'] = blsemp_df['Job_Change_YoY'].round(4)

# Data check
print('Data check:')
blsemp_df.head()

Data check:


Unnamed: 0,Value,Industry,Date,Year,Month,Employment_Last_Year,YoY_Growth,Total_Employment,Employment_Share,Employment_Last_Month,Job_Change_MoM,Job_Change_YoY
0,32600.0,Construction,2009-01-01,2009,1,,,1413000.0,2.31,,,
1,32600.0,Construction,2009-02-01,2009,2,,,1413000.0,2.31,32600.0,0.0,
2,33900.0,Construction,2009-03-01,2009,3,,,1421200.0,2.39,32600.0,1300.0,
3,33800.0,Construction,2009-04-01,2009,4,,,1425200.0,2.37,33900.0,-100.0,
4,35100.0,Construction,2009-05-01,2009,5,,,1433000.0,2.45,33800.0,1300.0,


In [120]:
# Save cleaned data
blsemp_df.to_csv('../../../workspace/nf-insights-project/data_clean/bls_employ_clean.csv')

print('CSV saved successfully!')

CSV saved successfully!
