# Exploratory Data Analysis (EDA)

This notebook is for analyzing the cleaned locum tenens job data. I will explore trends in pay rates by specialty, location, and seasonality to answer our core research questions.


In [None]:
# Load Data & Initial Setup
import os
import pandas as pd
import plotly.express as px

# Set plotly to a clean template
px.defaults.template = "plotly_white"

# Location of the cleaned dataset (Parquet), relative to this notebook
PROCESSED_DATA_PATH = os.path.join('..', 'data', 'processed', 'jobs.parquet')

# Report a missing file instead of letting the read raise
if not os.path.exists(PROCESSED_DATA_PATH):
    print(f"Error: Processed data file not found at {PROCESSED_DATA_PATH}")
    print("Please run the data cleaning script first (`src/data_cleaning.py`).")
    df = pd.DataFrame()  # empty placeholder so `df` is always defined
else:
    df = pd.read_parquet(PROCESSED_DATA_PATH)
    print("Successfully loaded the cleaned dataset.")
    print(f"Dataset shape: {df.shape}")



Successfully loaded the cleaned dataset.
Dataset shape: (2019, 16)


In [None]:
# Data Overview
# Skipped entirely when `df` is empty
if not df.empty:
    # Peek at the first rows to get a feel for the records
    display(df.head())

    # Column names, non-null counts and dtypes
    print("\n--- Data Info ---")
    df.info()

    # Summary statistics restricted to the two pay-rate columns
    print("\n--- Descriptive Statistics (Pay Rate Data) ---")
    pay_rate_columns = ['rate_hourly', 'rate_daily']
    display(df[pay_rate_columns].describe())


Unnamed: 0,job_id,job_title,specialty,state,state_id,city,description_html,posted_date,start_date,end_date,source,job_id_string,scrape_timestamp_utc,rate_hourly,rate_daily,job_url
0,8278,Infectious Disease Locum Opportunity in Indiana,Infectious Disease,Indiana,IN,Indianapolis,"<h1><strong style=""background-color: transpare...",2025-08-15 05:57:47.363,2025-12-01,2026-06-30,LocumSmart,JB-IN-INFE-081525-8278,2025-08-15T14:01:31.995865,,,https://www.prolocums.com/job-detail/JB-IN-INF...
1,1103,Internal Medicine Locum Opportunity in California,Internal Medicine,California,CA,Fortuna,"<p><span style=""background-color: rgb(255, 255...",NaT,NaT,NaT,LocumSmart,JB-CA-INME-032123-1103,2025-08-15T14:01:31.995865,150.0,1200.0,https://www.prolocums.com/job-detail/JB-CA-INM...
2,1085,Hospitalist Locum Opportunity in Montana,Hospitalist,Montana,MT,Missoula,"<p><span style=""background-color: rgb(255, 255...",NaT,NaT,NaT,LocumSmart,JB-MT-HOSP-031623-1085,2025-08-15T14:01:31.995865,200.0,1600.0,https://www.prolocums.com/job-detail/JB-MT-HOS...
3,1086,Hospitalist Locum Opportunity in Wyoming,Hospitalist,Wyoming,WY,Riverton,"<ul><li><span style=""background-color: rgb(255...",NaT,NaT,NaT,LocumSmart,JB-WY-HOSP-031623-1086,2025-08-15T14:01:31.995865,220.0,1760.0,https://www.prolocums.com/job-detail/JB-WY-HOS...
4,1087,Psychiatry Locum Opportunity in Kentucky,Psychiatry,Kentucky,KY,Harlan,"<p><strong style=""background-color: transparen...",NaT,NaT,NaT,LocumSmart,JB-KY-PSYC-031623-1087,2025-08-15T14:01:31.995865,200.0,1600.0,https://www.prolocums.com/job-detail/JB-KY-PSY...



--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   job_id                2019 non-null   int64         
 1   job_title             2019 non-null   object        
 2   specialty             2019 non-null   object        
 3   state                 2019 non-null   object        
 4   state_id              2019 non-null   object        
 5   city                  2019 non-null   object        
 6   description_html      2019 non-null   object        
 7   posted_date           12 non-null     datetime64[ns]
 8   start_date            12 non-null     datetime64[ns]
 9   end_date              2 non-null      datetime64[ns]
 10  source                1802 non-null   object        
 11  job_id_string         2019 non-null   object        
 12  scrape_timestamp_utc  2019 non-null   object        
 13 

Unnamed: 0,rate_hourly,rate_daily
count,1162.0,1162.0
mean,273.43395,2187.471601
std,99.187467,793.499734
min,1.3,10.4
25%,200.0,1600.0
50%,250.0,2000.0
75%,350.0,2800.0
max,550.0,4400.0


# 3. Pay Rate Analysis by Specialty

Now, I will analyze which medical specialties have the highest pay rates. I will filter for jobs that have pay rate data, group them by specialty, and visualize the median hourly rate together with the number of job postings in a treemap.


In [None]:
# Filter for jobs where an hourly rate is available.
# The load cell substitutes a column-less `pd.DataFrame()` when the parquet file
# is missing, and `dropna(subset=...)` raises KeyError for a column that does not
# exist, so fall back to a zero-row selection instead of crashing here.
if 'rate_hourly' in df.columns:
    df_with_rates = df.dropna(subset=['rate_hourly'])
else:
    df_with_rates = df.iloc[0:0]

if not df_with_rates.empty:
    # Group by specialty and calculate the median hourly rate and job count
    specialty_pay = df_with_rates.groupby('specialty').agg(
        median_hourly_rate=('rate_hourly', 'median'),
        job_count=('job_id', 'count')
    ).sort_values(by='median_hourly_rate', ascending=False).reset_index()

    print(f"\nFound {len(specialty_pay)} unique specialties with pay data.")

    # Visualization: treemap of all specialties.
    # Tile size comes from the job count, tile colour from the median hourly rate.

    # Prepare the data by assigning one common parent to every specialty
    treemap_data = specialty_pay.copy()
    treemap_data['parent'] = 'All Specialties'

    # Create the figure using the 'names' and 'parents' arguments
    fig = px.treemap(
        treemap_data,
        names='specialty',
        parents='parent',
        values='job_count',
        color='median_hourly_rate',
        color_continuous_scale='YlGnBu',
        title='Treemap of Specialties by Job Count and Median Pay Rate'
    )

    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    fig.update_traces(
        # Customize the hover text for clarity
        hovertemplate='<b>%{label}</b><br>Job Count: %{value}<br>Median Hourly Rate: $%{color:.2f}<extra></extra>'
    )
    fig.show()

else:
    print("No jobs with pay rate data found to analyze.")




Found 65 unique specialties with pay data.


# 4. Pay Rate Analysis by Location

Next, I'll investigate the geographical trends in pay rates. I'll analyze which states offer the highest median hourly rates for locum tenens positions. A bar chart will show the direct comparison, and a choropleth map will provide an intuitive geographical visualization.


In [None]:
# 4. Pay Rate Analysis by Location

# Six-way grouping of US state / district postal codes, used to colour the charts
regions = {
    'New England': ['CT', 'ME', 'MA', 'NH', 'RI', 'VT'],
    'Mid-Atlantic': ['DC', 'DE', 'MD', 'NJ', 'NY', 'PA'],
    'South': ['AL', 'AR', 'FL', 'GA', 'KY', 'LA', 'MS', 'NC', 'SC', 'TN', 'VA', 'WV'],
    'Midwest': ['IL', 'IN', 'IA', 'KS', 'MI', 'MN', 'MO', 'NE', 'ND', 'OH', 'SD', 'WI'],
    'Southwest': ['AZ', 'NM', 'OK', 'TX'],
    'West': ['AK', 'CA', 'CO', 'HI', 'ID', 'MT', 'NV', 'OR', 'UT', 'WA', 'WY']
}

def get_region(state_id):
    """Return the name of the first region whose list contains `state_id`.

    Falls back to 'Other' when no region lists the given code. Matching is an
    exact membership test, so codes must be spelled as they appear in `regions`.
    """
    matching_regions = (
        region_name
        for region_name, member_codes in regions.items()
        if state_id in member_codes
    )
    return next(matching_regions, 'Other')

# Colour map for the six regions, plus an explicit neutral grey for 'Other':
# `get_region` returns 'Other' for any code outside the six lists, and without
# an entry here that group would get a colour from the default sequence.
color_map = {
    'New England': '#a6cee3',
    'Mid-Atlantic': '#1f78b4',
    'South': '#b2df8a',
    'Midwest': '#33a02c',
    'Southwest': '#fb9a99',
    'West': '#e31a1c',
    'Other': '#bdbdbd'
}

# Same guard as the specialty analysis: only aggregate and plot when there is
# pay data to work with.
if df_with_rates.empty:
    print("No jobs with pay rate data found to analyze.")
    # Keep `state_pay` defined, with the same columns as the populated result
    state_pay = pd.DataFrame(
        columns=['state', 'state_id', 'median_hourly_rate', 'job_count', 'region']
    )
else:
    # Group by state and calculate the median hourly rate and job count
    state_pay = df_with_rates.groupby(['state', 'state_id']).agg(
        median_hourly_rate=('rate_hourly', 'median'),
        job_count=('job_id', 'count')
    ).sort_values(by='median_hourly_rate', ascending=False).reset_index()

    # Add region information
    state_pay['region'] = state_pay['state_id'].apply(get_region)

    print("Median Hourly Rate by State")
    display(state_pay)

    # Horizontal Bar Chart for States
    fig_bar = px.bar(
        state_pay,
        y='state',
        x='median_hourly_rate',
        orientation='h',
        title='Median Hourly Pay Rate by State, Colored by Region',
        labels={'state': '', 'median_hourly_rate': 'Median Hourly Rate (USD)'},
        color='region',
        color_discrete_map=color_map
    )

    fig_bar.update_layout(
        showlegend=True,
        # Order the bars by value rather than by trace (region) order
        yaxis={'categoryorder': 'total ascending'},
        height=800,  # Increase height to accommodate all states
        legend_title_text='Region'
    )
    fig_bar.show()

    # Choropleth Map
    fig_map = px.choropleth(
        state_pay,
        locations='state_id',
        locationmode="USA-states",
        color='median_hourly_rate',
        scope="usa",
        hover_name='state',
        # Hide the raw code, format the rate to two decimals, and show the job count
        hover_data={'state_id': False, 'median_hourly_rate': ':.2f', 'job_count': True},
        title='Median Hourly Pay Rate Across the United States',
        color_continuous_scale=px.colors.sequential.Blues,
        labels={'median_hourly_rate': 'Median Hourly Rate', 'job_count': 'Job Count'}
    )
    fig_map.update_layout(title_x=0.5)
    fig_map.show()



--- Median Hourly Rate by State ---


Unnamed: 0,state,state_id,median_hourly_rate,job_count,region
0,Florida,FL,400.0,10,South
1,Alabama,AL,350.0,12,South
2,Maryland,MD,325.0,24,Mid-Atlantic
3,Minnesota,MN,312.5,6,Midwest
4,Arkansas,AR,300.0,17,South
5,Michigan,MI,300.0,54,Midwest
6,North Dakota,ND,300.0,8,Midwest
7,Nebraska,NE,300.0,3,Midwest
8,Missouri,MO,300.0,50,Midwest
9,Ohio,OH,300.0,49,Midwest
