BLS Unemployment Data Exploration and Cleaning

In [231]:
# Import libraries
import pandas as pd

print('Libraries imported successfully!')

Libraries imported successfully!


In [232]:
# Read in bls unemployment data csv file
blsunemp_df = pd.read_csv('../../../workspace/nf-insights-project/data_raw/bls_unemploy_data_raw.csv')

print('CSV read successfully!')

blsunemp_df.head()

CSV read successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value
0,LASST540000000000003,2006,M01,2006 Jan,4.7
1,LASST540000000000003,2006,M02,2006 Feb,4.6
2,LASST540000000000003,2006,M03,2006 Mar,4.6
3,LASST540000000000003,2006,M04,2006 Apr,4.7
4,LASST540000000000003,2006,M05,2006 May,4.8


In [233]:
# Data check
print('Data check:')
print('Data shape:', blsunemp_df.shape)
print(blsunemp_df.info())
blsunemp_df.columns

Data check:
Data shape: (720, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Series ID  720 non-null    object
 1   Year       720 non-null    int64 
 2   Period     720 non-null    object
 3   Label      720 non-null    object
 4   Value      720 non-null    object
dtypes: int64(1), object(4)
memory usage: 28.3+ KB
None


Index(['Series ID', 'Year', 'Period', 'Label', 'Value'], dtype='object')

In [234]:
# Pull unique Series ID's
print('Unique Series IDs pulled successfully!')
blsunemp_df['Series ID'].unique()

Unique Series IDs pulled successfully!


array(['LASST540000000000003', 'LASST540000000000008',
       'LAUST540000000000006'], dtype=object)

In [235]:
# Create dictionary of Series ID's to category names
series_map = {
    'LASST540000000000003': 'Unemployment Rate',
    'LASST540000000000008': 'Labor Force Participation Rate',
    'LAUST540000000000006': 'Labor Force'
}

# Map Series ID's to industry names
blsunemp_df['Unemployment Statistic'] = blsunemp_df['Series ID'].map(series_map)

print('Series IDs mapped successfully!')

# Data check
print('Data check:')
print('Data shape:', blsunemp_df.shape)
print(blsunemp_df.info())

Series IDs mapped successfully!
Data check:
Data shape: (720, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Series ID               720 non-null    object
 1   Year                    720 non-null    int64 
 2   Period                  720 non-null    object
 3   Label                   720 non-null    object
 4   Value                   720 non-null    object
 5   Unemployment Statistic  720 non-null    object
dtypes: int64(1), object(5)
memory usage: 33.9+ KB
None


In [236]:
# Create a proper date column
blsunemp_df['Date'] = pd.to_datetime(blsunemp_df['Year'].astype(str) + '-' + blsunemp_df['Period'].str[1:], format='%Y-%m')

print('Date column created successfully!')

# Data check
print('Data check:')
blsunemp_df.info()

Date column created successfully!
Data check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Series ID               720 non-null    object        
 1   Year                    720 non-null    int64         
 2   Period                  720 non-null    object        
 3   Label                   720 non-null    object        
 4   Value                   720 non-null    object        
 5   Unemployment Statistic  720 non-null    object        
 6   Date                    720 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 39.5+ KB


In [237]:
# Drop unnecessary columns
blsunemp_df = blsunemp_df.drop(columns=['Series ID', 'Period', 'Label'])

print('Unnecessary columns dropped successfully!')

# Data check
print('Data check:')
print('Data shape:', blsunemp_df.shape)
print(blsunemp_df.info())

Unnecessary columns dropped successfully!
Data check:
Data shape: (720, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Year                    720 non-null    int64         
 1   Value                   720 non-null    object        
 2   Unemployment Statistic  720 non-null    object        
 3   Date                    720 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 22.6+ KB
None


In [238]:
# Convert Value column from object to numeric
blsunemp_df["Value"] = pd.to_numeric(blsunemp_df["Value"], errors="coerce")

print('Value column converted successfully!')

# Data check
print('Data check:')
print('Data shape:', blsunemp_df.shape)
blsunemp_df.info()

Value column converted successfully!
Data check:
Data shape: (720, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Year                    720 non-null    int64         
 1   Value                   717 non-null    float64       
 2   Unemployment Statistic  720 non-null    object        
 3   Date                    720 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 22.6+ KB


In [239]:
# Check for missing values
print('Number of missing values:', blsunemp_df.isna().sum())

Number of missing values: Year                      0
Value                     3
Unemployment Statistic    0
Date                      0
dtype: int64


In [240]:
# Identify missing values
blsunemp_df[blsunemp_df["Value"].isna()]

Unnamed: 0,Year,Value,Unemployment Statistic,Date
237,2025,,Unemployment Rate,2025-10-01
477,2025,,Labor Force Participation Rate,2025-10-01
717,2025,,Labor Force,2025-10-01


In [241]:
# Group by Year and Unemployment Statistic, then take the mean of Value
blsunemp_yearly = (
    blsunemp_df
    .groupby(['Year', 'Unemployment Statistic'], as_index=False)
    .agg({'Value': 'mean'})
)

print('Data grouped successfully!')

# Data check
print('Data check:')
print('Data shape:', blsunemp_yearly.shape)
blsunemp_yearly.info()

Data grouped successfully!
Data check:
Data shape: (60, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    60 non-null     int64  
 1   Unemployment Statistic  60 non-null     object 
 2   Value                   60 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.5+ KB


In [242]:
# Pivot data from long -> wide
blsunemp_wide = blsunemp_yearly.pivot(
    index='Year',
    columns='Unemployment Statistic',
    values='Value'
).reset_index()

print('Data pivoted successfully!')

# Data check
print('Data check:')
print('Data shape:', blsunemp_wide.shape)
blsunemp_wide.info()
blsunemp_wide.head()

Data pivoted successfully!
Data check:
Data shape: (20, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Year                            20 non-null     int64  
 1   Labor Force                     20 non-null     float64
 2   Labor Force Participation Rate  20 non-null     float64
 3   Unemployment Rate               20 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 772.0 bytes


Unemployment Statistic,Year,Labor Force,Labor Force Participation Rate,Unemployment Rate
0,2006,807644.75,55.491667,4.775
1,2007,808237.083333,55.291667,4.475
2,2008,810940.333333,55.233333,4.525
3,2009,815892.583333,55.283333,7.975
4,2010,810913.583333,54.691667,8.4


In [243]:
# Round for readability
blsunemp_wide["Labor Force Participation Rate"] = blsunemp_wide["Labor Force Participation Rate"].round(2)
blsunemp_wide["Labor Force"] = blsunemp_wide["Labor Force"].astype(int).round(0)
blsunemp_wide["Unemployment Rate"] = blsunemp_wide["Unemployment Rate"].round(2)

# Data check
print('Data check:')
print('Data shape:', blsunemp_wide.shape)
blsunemp_wide.head()

Data check:
Data shape: (20, 4)


Unemployment Statistic,Year,Labor Force,Labor Force Participation Rate,Unemployment Rate
0,2006,807644,55.49,4.77
1,2007,808237,55.29,4.48
2,2008,810940,55.23,4.52
3,2009,815892,55.28,7.98
4,2010,810913,54.69,8.4


In [244]:
# Calculate YoY % change for each column except the Year column
for col in ['Unemployment Rate', 'Labor Force', 'Labor Force Participation Rate']:
    blsunemp_wide[col + ' YoY Change'] = blsunemp_wide[col].pct_change() * 100

# Round for readability
blsunemp_wide['Unemployment Rate YoY Change'] = blsunemp_wide['Unemployment Rate YoY Change'].round(2)
blsunemp_wide['Labor Force Participation Rate YoY Change'] = blsunemp_wide['Labor Force Participation Rate YoY Change'].round(2)
blsunemp_wide['Labor Force YoY Change'] = blsunemp_wide['Labor Force YoY Change'].round(0)

print('YoY % change calculated successfully!')

# Data check
print('Data check:')
print('Data shape:', blsunemp_wide.shape)
print(blsunemp_wide.info())
blsunemp_wide.head()

YoY % change calculated successfully!
Data check:
Data shape: (20, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Year                                       20 non-null     int64  
 1   Labor Force                                20 non-null     int64  
 2   Labor Force Participation Rate             20 non-null     float64
 3   Unemployment Rate                          20 non-null     float64
 4   Unemployment Rate YoY Change               19 non-null     float64
 5   Labor Force YoY Change                     19 non-null     float64
 6   Labor Force Participation Rate YoY Change  19 non-null     float64
dtypes: float64(5), int64(2)
memory usage: 1.2 KB
None


Unemployment Statistic,Year,Labor Force,Labor Force Participation Rate,Unemployment Rate,Unemployment Rate YoY Change,Labor Force YoY Change,Labor Force Participation Rate YoY Change
0,2006,807644,55.49,4.77,,,
1,2007,808237,55.29,4.48,-6.08,0.0,-0.36
2,2008,810940,55.23,4.52,0.89,0.0,-0.11
3,2009,815892,55.28,7.98,76.55,1.0,0.09
4,2010,810913,54.69,8.4,5.26,-1.0,-1.07


In [245]:
# Check for duplicates
duplicates = blsunemp_wide.duplicated().sum()
print('Number of duplicate rows:', duplicates)

# Reset index
blsunemp_final = blsunemp_wide.reset_index(drop=True)

print('Index reset successfully!')

Number of duplicate rows: 0
Index reset successfully!


In [246]:
# Save cleaned data
blsunemp_final.to_csv('../../../workspace/nf-insights-project/data_clean/bls_unemploy_clean.csv')

print('CSV saved successfully!')

CSV saved successfully!
