BLS Unemployment Data Exploration and Cleaning

In [1]:
# Import libraries
# pandas (aliased as pd) is the only import in this cell; the cells below use it
# to read, reshape, and write the BLS data.
import pandas as pd

# Confirmation message so the cell's output shows the import ran without error
print('Libraries imported successfully!')

Libraries imported successfully!


In [2]:
# Load the raw BLS unemployment extract into a DataFrame
raw_csv_path = '../../../workspace/nf-insights-project/data_raw/bls_unemploy_data_raw.csv'
blsunemp_df = pd.read_csv(raw_csv_path)

print('CSV read successfully!')

# Preview the first rows of the freshly loaded frame
blsunemp_df.head()

CSV read successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value
0,LASST540000000000003,2009,M01,2009 Jan,6.2
1,LASST540000000000003,2009,M02,2009 Feb,6.8
2,LASST540000000000003,2009,M03,2009 Mar,7.3
3,LASST540000000000003,2009,M04,2009 Apr,7.8
4,LASST540000000000003,2009,M05,2009 May,8.1


In [None]:
# Data check: dimensions, per-column dtypes / non-null counts, and column names
print('Data check:')
print('Data shape:', blsunemp_df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
blsunemp_df.info()
# NOTE(review): the recorded info() output lists `Value` as dtype object rather than a
# numeric dtype — confirm what the non-numeric entries are before analysing the values.
blsunemp_df.columns

Data check:
(612, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Series ID  612 non-null    object
 1   Year       612 non-null    int64 
 2   Period     612 non-null    object
 3   Label      612 non-null    object
 4   Value      612 non-null    object
dtypes: int64(1), object(4)
memory usage: 24.0+ KB
None


Index(['Series ID', 'Year', 'Period', 'Label', 'Value'], dtype='object')

In [4]:
# Pull unique Series IDs
# Run the lookup first so the success message is only printed once it has actually worked
# (previously the message was emitted before the lookup was attempted).
unique_series_ids = blsunemp_df['Series ID'].unique()
print('Unique Series IDs pulled successfully!')
unique_series_ids

Unique Series ID's pulled successfully!


array(['LASST540000000000003', 'LASST540000000000008',
       'LAUST540000000000006'], dtype=object)

In [5]:
# Create dictionary translating each BLS Series ID into a readable statistic name
series_map = {
    'LASST540000000000003': 'Unemployment Rate',
    'LASST540000000000008': 'Labor Force Participation Rate',
    'LAUST540000000000006': 'Labor Force'
}

# Map Series IDs to statistic names.
# Series.map leaves NaN for any ID that is not a key of series_map.
blsunemp_df['Unemployment Statistic'] = blsunemp_df['Series ID'].map(series_map)

# Fail loudly instead of reporting success when some rows could not be labelled
unmapped_ids = blsunemp_df.loc[blsunemp_df['Unemployment Statistic'].isna(), 'Series ID'].unique()
if len(unmapped_ids) > 0:
    raise ValueError('Series IDs missing from series_map: {}'.format(list(unmapped_ids)))

print('Series IDs mapped successfully!')

blsunemp_df.head()

Series ID's mapped successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value,Unemployment Statistic
0,LASST540000000000003,2009,M01,2009 Jan,6.2,Unemployment Rate
1,LASST540000000000003,2009,M02,2009 Feb,6.8,Unemployment Rate
2,LASST540000000000003,2009,M03,2009 Mar,7.3,Unemployment Rate
3,LASST540000000000003,2009,M04,2009 Apr,7.8,Unemployment Rate
4,LASST540000000000003,2009,M05,2009 May,8.1,Unemployment Rate


In [6]:
# Build a datetime column from the Year and Period columns
# Period codes look like 'M01'; dropping the first character leaves the month digits
month_part = blsunemp_df['Period'].str[1:]
year_part = blsunemp_df['Year'].astype(str)
blsunemp_df['Date'] = pd.to_datetime(year_part + '-' + month_part, format='%Y-%m')

print('Date column created successfully!')

blsunemp_df.head()

Date column created successfully!


Unnamed: 0,Series ID,Year,Period,Label,Value,Unemployment Statistic,Date
0,LASST540000000000003,2009,M01,2009 Jan,6.2,Unemployment Rate,2009-01-01
1,LASST540000000000003,2009,M02,2009 Feb,6.8,Unemployment Rate,2009-02-01
2,LASST540000000000003,2009,M03,2009 Mar,7.3,Unemployment Rate,2009-03-01
3,LASST540000000000003,2009,M04,2009 Apr,7.8,Unemployment Rate,2009-04-01
4,LASST540000000000003,2009,M05,2009 May,8.1,Unemployment Rate,2009-05-01


In [7]:
# Remove the raw identifier/period columns that the cleaned frame no longer needs
cols_to_drop = ['Series ID', 'Year', 'Period', 'Label']
blsunemp_df = blsunemp_df.drop(cols_to_drop, axis=1)

print('Unnecessary columns dropped successfully!')

blsunemp_df.head()

Unnecessary columns dropped successfully!


Unnamed: 0,Value,Unemployment Statistic,Date
0,6.2,Unemployment Rate,2009-01-01
1,6.8,Unemployment Rate,2009-02-01
2,7.3,Unemployment Rate,2009-03-01
3,7.8,Unemployment Rate,2009-04-01
4,8.1,Unemployment Rate,2009-05-01


In [8]:
# Derive integer Year and Month columns from the Date column
# (stored with pandas' nullable 'Int64' dtype)
date_parts = blsunemp_df['Date'].dt
blsunemp_df['Year'] = date_parts.year.astype('Int64')
blsunemp_df['Month'] = date_parts.month.astype('Int64')

print('Date columns extracted successfully!')

blsunemp_df.head()

Date columns extracted successfully!


Unnamed: 0,Value,Unemployment Statistic,Date,Year,Month
0,6.2,Unemployment Rate,2009-01-01,2009,1
1,6.8,Unemployment Rate,2009-02-01,2009,2
2,7.3,Unemployment Rate,2009-03-01,2009,3
3,7.8,Unemployment Rate,2009-04-01,2009,4
4,8.1,Unemployment Rate,2009-05-01,2009,5


In [9]:
# Count rows that are exact repeats of an earlier row (all columns compared)
duplicate_mask = blsunemp_df.duplicated()
duplicates = duplicate_mask.sum()
print('Number of duplicate rows:', duplicates)

Number of duplicate rows: 0


In [10]:
# Check for missing values (count of NaN/NA entries per column)
# sep='\n' starts the per-column table on its own line; with the default separator the
# label shared a line with the first row and pushed it out of alignment.
print('Number of missing values:', blsunemp_df.isna().sum(), sep='\n')

Number of missing values: Value                     0
Unemployment Statistic    0
Date                      0
Year                      0
Month                     0
dtype: int64


In [11]:
# Save cleaned data
# index=False: the row index is only the default 0..N-1 counter, so writing it would add a
# meaningless unnamed first column that comes back as 'Unnamed: 0' when the file is re-read.
blsunemp_df.to_csv('../../../workspace/nf-insights-project/data_clean/bls_unemploy_clean.csv', index=False)

print('CSV saved successfully!')

CSV saved successfully!
