# Analysis of wage and hour case data from US state labor agencies

## Data import and preparation

In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.float_format', lambda x: '%.3f' % x)


In [2]:
df = pd.read_csv(
    "input/state_complaints.csv.gz",
    low_memory=False,
    parse_dates=["date_opened", "date_closed", "date_paid"],
)


### Removal of open, dismissed, withdrawn and other cases

Some states provided case statuses that indicated the outcome of the case. Some states provided case statuses that indicated only "closed" or "open" and no more specifics. Others provided no case status at all. In the transform task, I converted the statuses provided into the categories below. Each agency provided definitions of their statuses prior to this, unless otherwise documented. 

Below I will remove case statuses that indicate the case was rejected or withdrawn, or is still open. For states that didn't provide provide statuses, it is my understanding that they only contain closed cases and that open or rejected cases were not provided, so this should make this data as consistent across agencies as possible.

In [3]:
print(df.case_status.unique())


[nan 'closed' 'overturned' 'pending appeal' 'affirmed' 'open' 'dismissed'
 'pending enforcement' 'withdrawn' 'amount exceeds statutory limit']


In [4]:
states_with_status = df.query("case_status.notna()").state_name.unique()
orig_len = len(df)
df = df[
     # from a state that provided a status and has a status indicating
    # the case is concluded
    (
        (df.state_name.isin(states_with_status))
        & (
            df.case_status.isin(
                [
                    "closed",
                    "pending enforcement",
                    "affirmed",
                    "amount exceeds statutory limit",
                    np.NaN,
                ]
            )
        )
    )
    # or from a state that did not provide a status
    | ~(df.state_name.isin(states_with_status))
]
new_len = len(df)
print(f"Removed {orig_len - new_len} rows")
df.fillna("NaN").pipe(
    lambda df: pd.crosstab(index=df.state_name, columns=df.case_status)
)


Removed 29530 rows


case_status,NaN,affirmed,amount exceeds statutory limit,closed,pending enforcement
state_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California,190227,0,0,0,0
Colorado,454,0,0,0,0
Illinois,15,0,0,11126,769
Indiana,17544,0,325,0,1814
Iowa,7370,0,0,0,0
Kansas,10021,0,0,3,0
Kentucky,7137,0,0,0,0
Maine,792,0,0,0,0
Maryland,5051,0,0,0,0
Massachusetts,2525,0,0,0,0


# Analysis

## Case duration

In [5]:
df = df.assign(
    case_duration=lambda df: (
        df.apply(
            lambda row: row.date_paid - row.date_opened
            if pd.notna(row.date_paid)
            else row.date_closed - row.date_opened,
            axis=1,
        ).apply(lambda val: val.days)
    )
)
cd_df = df.query("case_duration.notna()").copy()
print(cd_df.case_duration.describe())


count   145502.000
mean       232.791
std        369.323
min          0.000
25%         41.000
50%        119.000
75%        268.000
max       4585.000
Name: case_duration, dtype: float64


In [6]:
bins = pd.cut(bins=[0, 7, 28, 180, 365, 730, 999999999], x=cd_df.case_duration)
cd_df.groupby(bins).size().to_frame("count").rename(
    index={
        pd.Interval(0, 7, closed="right"): "0-7 days",
        pd.Interval(7, 28, closed="right"): "7-28 days",
        pd.Interval(28, 180, closed="right"): "28-180 days",
        pd.Interval(180, 365, closed="right"): "180-365 days",
        pd.Interval(365, 730, closed="right"): "365-730 days",
        pd.Interval(730, 999999999, closed="right"): "730+ days",
    }
)


Unnamed: 0_level_0,count
case_duration,Unnamed: 1_level_1
0-7 days,4923
7-28 days,22485
28-180 days,62956
180-365 days,31617
365-730 days,14020
730+ days,9160


In [7]:
cd_df.groupby("state_name").case_duration.describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
state_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
California,14480.0,529.207,305.746,26.0,306.0,446.0,683.0,1985.0
Indiana,3436.0,29.839,28.236,0.0,14.0,24.0,38.0,389.0
Kansas,9105.0,213.001,401.861,0.0,69.0,127.0,208.0,4105.0
South Carolina,14290.0,62.379,54.661,0.0,28.0,53.0,81.0,1203.0
Texas,60683.0,332.816,450.52,29.0,133.0,208.0,323.0,4585.0
Washington,37646.0,72.916,165.037,0.0,18.0,29.0,59.0,1238.0
West Virginia,3770.0,33.124,42.204,0.0,8.0,20.0,40.0,444.0
Wyoming,2092.0,100.001,137.623,0.0,24.0,48.0,124.0,1114.0
