# Pulling and cleaning the state minimum wage data

In [69]:
# load packages
import requests
import pandas as pd
import numpy as np

# State minimum wage data
url = 'https://www.dol.gov/agencies/whd/mw-consolidated'
html = requests.get(url).content
df_list = pd.read_html(html)
original = df_list[0]
print(original)

        Greater than federal MW Equals federal MW of $7.25 No MW Required
0                     AK $10.85                       CNMI             AL
1                     AR $11.00                         GA             LA
2                     AZ $13.85                         IA             MS
3                     CA $15.50                         ID             SC
4                     CO $13.65                         IN             TN
5                     CT $14.00                         KS            NaN
6                     DC $16.50                         KY            NaN
7                     DE $11.75                         NC            NaN
8                     FL $11.00                         ND            NaN
9                     HI $12.00                         NH            NaN
10                    IL $13.00                         OK            NaN
11                    MA $15.00                         PA            NaN
12                    MD $13.25       

In [70]:
# Rename column headers
original.columns = ['state_specific', 'federal', 'none']
original.head()

Unnamed: 0,state_specific,federal,none
0,AK $10.85,CNMI,AL
1,AR $11.00,GA,LA
2,AZ $13.85,IA,MS
3,CA $15.50,ID,SC
4,CO $13.65,IN,TN


In [71]:
# Split first column into two new columns
original[['state', 'specific']] = original.state_specific.str.split("$", expand = True)

# Move state to first column
original = original[['state', 'specific', 'federal', 'none']]

# Drop the last row that had the subtotals
original = original[:-1]

print(original)

   state    specific federal none
0    AK        10.85    CNMI   AL
1    AR        11.00      GA   LA
2    AZ        13.85      IA   MS
3    CA        15.50      ID   SC
4    CO        13.65      IN   TN
5    CT        14.00      KS  NaN
6    DC        16.50      KY  NaN
7    DE        11.75      NC  NaN
8    FL        11.00      ND  NaN
9    HI        12.00      NH  NaN
10   IL        13.00      OK  NaN
11   MA        15.00      PA  NaN
12   MD        13.25      TX  NaN
13   ME        13.80      UT  NaN
14   MI        10.10      WI  NaN
15   MN        10.59      WY  NaN
16   MO        12.00     NaN  NaN
17   MT         9.95     NaN  NaN
18   NE        10.50     NaN  NaN
19   NJ        14.13     NaN  NaN
20   NM        12.00     NaN  NaN
21   NV   10.50/9.50     NaN  NaN
22   NY        14.20     NaN  NaN
23   OH        10.10     NaN  NaN
24   OR        13.50     NaN  NaN
25   PR         8.50     NaN  NaN
26   RI        13.00     NaN  NaN
27   SD        10.80     NaN  NaN
28   VA       

In [72]:
# Create three data frames to later append to each other

# Create data frame with states with state-specific mws
df1 = original[['state','specific']]
df1.columns = ['state','wage']

# Create data frame with states with federal mws
df2 = original[['federal']]
df2.columns = ['state']
df2['wage']='7.25'

# Create data frame with states with no mws
df3 = original[['none']]
df3.columns = ['state']
df3['wage']='7.25'

   state  wage
0     AL  7.25
1     LA  7.25
2     MS  7.25
3     SC  7.25
4     TN  7.25
5    NaN  7.25
6    NaN  7.25
7    NaN  7.25
8    NaN  7.25
9    NaN  7.25
10   NaN  7.25
11   NaN  7.25
12   NaN  7.25
13   NaN  7.25
14   NaN  7.25
15   NaN  7.25
16   NaN  7.25
17   NaN  7.25
18   NaN  7.25
19   NaN  7.25
20   NaN  7.25
21   NaN  7.25
22   NaN  7.25
23   NaN  7.25
24   NaN  7.25
25   NaN  7.25
26   NaN  7.25
27   NaN  7.25
28   NaN  7.25
29   NaN  7.25
30   NaN  7.25
31   NaN  7.25
32   NaN  7.25
33   NaN  7.25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['wage']='7.25'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['wage']='7.25'


In [74]:
# Append the data frames into one 
df = pd.concat([df1, df2, df3])
df = df.dropna()
print(df)

   state        wage
0    AK        10.85
1    AR        11.00
2    AZ        13.85
3    CA        15.50
4    CO        13.65
5    CT        14.00
6    DC        16.50
7    DE        11.75
8    FL        11.00
9    HI        12.00
10   IL        13.00
11   MA        15.00
12   MD        13.25
13   ME        13.80
14   MI        10.10
15   MN        10.59
16   MO        12.00
17   MT         9.95
18   NE        10.50
19   NJ        14.13
20   NM        12.00
21   NV   10.50/9.50
22   NY        14.20
23   OH        10.10
24   OR        13.50
25   PR         8.50
26   RI        13.00
27   SD        10.80
28   VA        12.00
29   VT        13.18
30   WA        15.74
31   WV         8.75
32   VI        10.50
33   GU         9.25
0   CNMI        7.25
1     GA        7.25
2     IA        7.25
3     ID        7.25
4     IN        7.25
5     KS        7.25
6     KY        7.25
7     NC        7.25
8     ND        7.25
9     NH        7.25
10    OK        7.25
11    PA        7.25
12    TX     