BLS Industry Employment Data Exploration and Cleaning

In [1]:
# Import libraries
# pandas supplies the DataFrame, CSV read/write, and datetime parsing used in
# the cells below.
import pandas as pd

print('Libraries imported successfully!')

Libraries imported successfully!


In [2]:
# Load the raw BLS employment extract from disk into a DataFrame
raw_csv_path = '../../../workspace/nf-insights-project/data_raw/bls_employ_data_raw.csv'
blsemp_df = pd.read_csv(raw_csv_path)

print('CSV read successfully!')

# Preview the first rows as the cell's rich output
blsemp_df.head()

CSV read successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value
0,SMU54000000000000001,2009,M01,2009 Jan,706.5
1,SMU54000000000000001,2009,M02,2009 Feb,706.5
2,SMU54000000000000001,2009,M03,2009 Mar,710.6
3,SMU54000000000000001,2009,M04,2009 Apr,712.6
4,SMU54000000000000001,2009,M05,2009 May,716.5


In [3]:
# Data check: dimensions, per-column dtype / non-null summary, and column labels
print('Data check:')
print('Data shape:', blsemp_df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
blsemp_df.info()
# Last expression: displayed as the cell output
blsemp_df.columns

Data check:
Data shape: (2856, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2856 entries, 0 to 2855
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Series ID  2856 non-null   object 
 1   Year       2856 non-null   int64  
 2   Period     2856 non-null   object 
 3   Label      2856 non-null   object 
 4   Value      2856 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 111.7+ KB
None


Index(['Series ID', 'Year', 'Period', 'Label', 'Value'], dtype='object')

In [4]:
# Pull unique Series IDs
# Evaluate the lookup first so the success message is only printed once it has
# actually succeeded (a missing 'Series ID' column now raises before the print).
unique_series_ids = blsemp_df['Series ID'].unique()

print("Unique Series ID's pulled successfully!")

# Last expression: displayed as the cell output
unique_series_ids

Unique Series ID's pulled successfully!


array(['SMU54000000000000001', 'SMU54000001000000001',
       'SMU54000002000000001', 'SMU54000003000000001',
       'SMU54000004000000001', 'SMU54000005000000001',
       'SMU54000005552000001', 'SMU54000005553000001',
       'SMU54000006000000001', 'SMU54000006561000001',
       'SMU54000006562000001', 'SMU54000007000000001',
       'SMU54000008000000001', 'SMU54000009000000001'], dtype=object)

In [None]:
# Create dictionary of Series IDs to industry names
series_map = {
    'SMU54000000000000001': 'Total Nonfarm',
    'SMU54000001000000001': 'Mining and Logging',
    'SMU54000002000000001': 'Construction',
    'SMU54000003000000001': 'Manufacturing',
    'SMU54000004000000001': 'Trade, Transportation, and Utilities',
    'SMU54000005000000001': 'Information',
    'SMU54000005552000001': 'Finance and Insurance',
    'SMU54000005553000001': 'Real Estate and Rental and Leasing',
    'SMU54000006000000001': 'Professional and Business Services',
    'SMU54000006561000001': 'Private Educational Services',
    'SMU54000006562000001': 'Health Care and Social Assistance',
    'SMU54000007000000001': 'Leisure and Hospitality',
    'SMU54000008000000001': 'Other Services',
    'SMU54000009000000001': 'Government'
}

# Fail loudly if the data contains a Series ID that series_map does not cover:
# Series.map() would otherwise leave NaN in 'Industry' for those rows without
# any error.
is_known_id = blsemp_df['Series ID'].isin(list(series_map))
unmapped_ids = blsemp_df.loc[~is_known_id, 'Series ID'].unique()
if len(unmapped_ids) > 0:
    raise ValueError(f'Series IDs missing from series_map: {list(unmapped_ids)}')

# Map Series IDs to industry names
blsemp_df['Industry'] = blsemp_df['Series ID'].map(series_map)

print('Series ID mapped successfully!')

blsemp_df.head()

Series ID mapped successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value,Industry
0,SMU54000000000000001,2009,M01,2009 Jan,706.5,Total Nonfarm
1,SMU54000000000000001,2009,M02,2009 Feb,706.5,Total Nonfarm
2,SMU54000000000000001,2009,M03,2009 Mar,710.6,Total Nonfarm
3,SMU54000000000000001,2009,M04,2009 Apr,712.6,Total Nonfarm
4,SMU54000000000000001,2009,M05,2009 May,716.5,Total Nonfarm


In [6]:
# Build a datetime 'Date' column from the year and the period code
year_text = blsemp_df['Year'].astype(str)
# Drop the first character of the period code (e.g. 'M01' -> '01')
month_text = blsemp_df['Period'].str[1:]
year_month_text = year_text + '-' + month_text
blsemp_df['Date'] = pd.to_datetime(year_month_text, format='%Y-%m')

print('Date column created successfully!')

blsemp_df.head()

Date column created successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value,Industry,Date
0,SMU54000000000000001,2009,M01,2009 Jan,706.5,Total Nonfarm,2009-01-01
1,SMU54000000000000001,2009,M02,2009 Feb,706.5,Total Nonfarm,2009-02-01
2,SMU54000000000000001,2009,M03,2009 Mar,710.6,Total Nonfarm,2009-03-01
3,SMU54000000000000001,2009,M04,2009 Apr,712.6,Total Nonfarm,2009-04-01
4,SMU54000000000000001,2009,M05,2009 May,716.5,Total Nonfarm,2009-05-01


In [7]:
# Remove the source columns that 'Industry' and 'Date' were derived from,
# plus the text label
redundant_cols = ['Series ID', 'Year', 'Period', 'Label']
blsemp_df = blsemp_df.drop(redundant_cols, axis='columns')

print('Unnecessary columns dropped successfully!')

blsemp_df.head()

Unnecessary columns dropped successfully!


Unnamed: 0,Value,Industry,Date
0,706.5,Total Nonfarm,2009-01-01
1,706.5,Total Nonfarm,2009-02-01
2,710.6,Total Nonfarm,2009-03-01
3,712.6,Total Nonfarm,2009-04-01
4,716.5,Total Nonfarm,2009-05-01


In [8]:
# Derive nullable-integer Year and Month columns from the Date column
date_parts = blsemp_df['Date'].dt
blsemp_df['Year'] = date_parts.year.astype('Int64')
blsemp_df['Month'] = date_parts.month.astype('Int64')

print('Date columns extracted successfully!')

blsemp_df.head()

Date columns extracted successfully!


Unnamed: 0,Value,Industry,Date,Year,Month
0,706.5,Total Nonfarm,2009-01-01,2009,1
1,706.5,Total Nonfarm,2009-02-01,2009,2
2,710.6,Total Nonfarm,2009-03-01,2009,3
3,712.6,Total Nonfarm,2009-04-01,2009,4
4,716.5,Total Nonfarm,2009-05-01,2009,5


In [9]:
# Count rows that are exact repeats of an earlier row (all columns compared)
duplicate_mask = blsemp_df.duplicated()
duplicates = duplicate_mask.sum()
print('Number of duplicate rows:', duplicates)

Number of duplicate rows: 0


In [10]:
# Count missing (NaN/NA) entries in each column
missing_counts = blsemp_df.isna().sum()
print('Number of missing values:', missing_counts)

Number of missing values: Value       0
Industry    0
Date        0
Year        0
Month       0
dtype: int64


In [11]:
# Save cleaned data
# index=False keeps the DataFrame's positional index out of the file; without
# it the index is written as an extra unnamed first column, which comes back
# as 'Unnamed: 0' when the CSV is read again.
# NOTE(review): any downstream reader that relied on that extra column
# (e.g. index_col=0) should be updated accordingly.
blsemp_df.to_csv(
    '../../../workspace/nf-insights-project/data_clean/bls_employ_clean.csv',
    index=False,
)

print('CSV saved successfully!')

CSV saved successfully!
