In [137]:
import pandas as pd
import os

In [138]:
# =========
# H1B
# =========

# Dictionary
year_to_df = {}

# Load all files
for year in range(2009, 2025):
    year_str = str(year)
    filepath = f"./data/h1b/{year_str}.csv"
    yearly_df = pd.read_csv(filepath, encoding="utf-16", sep="\t")
    year_to_df[year] = yearly_df

  yearly_df = pd.read_csv(filepath, encoding="utf-16", sep="\t")


In [139]:
# Re-index
for year in range(2009, 2025):
    df = year_to_df[2009]

    df["Industry (NAICS) Code"].fillna("Unknown", inplace=True)

    for col in ["Initial Approval", "Continuing Approval"]:
        df[col] = (
            df[col]
            .astype(str)  # Ensure it's a string for replacement
            .str.replace(",", "", regex=True)  # Remove commas
            .astype(float)  # Convert to float (handles NaNs)
            .fillna(0)  # Replace NaNs with 0
            .astype(int)  # Convert to integer
        )

    # Compute total approvals
    df["Total Approvals"] = df["Initial Approval"] + df["Continuing Approval"]

    # Ensure 'Total Approvals' is properly numeric
    df["Total Approvals"] = (
        pd.to_numeric(df["Total Approvals"], errors="coerce").fillna(0).astype(int)
    )

    # Re-index with (State, Industry) and sum approvals
    df_grouped = (
        df.groupby(["Petitioner State", "Industry (NAICS) Code"])["Total Approvals"]
        .sum()
        .reset_index()
    )

    # Set as multi-index
    df_grouped.set_index(["Petitioner State", "Industry (NAICS) Code"], inplace=True)
    df_grouped.to_csv(f"data/h1b_processed/{str(year)}.csv")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Industry (NAICS) Code"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Industry (NAICS) Code"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [140]:
# =========
# Labor
# =========

# Dictionary
quarter_to_df = {}

# Load all files
for year in range(2003, 2021):
    year_str = str(year)
    filepath = f"./data/labor/{year_str}/"
    for month in {"jan", "apr", "jul", "oct"}:
        quarterly_filepath = filepath + month + ".xlsx"
        quarterly_df = pd.read_excel(quarterly_filepath)
        quarter_to_df[year_str + month] = quarterly_df

In [141]:
months = ["jan", "apr", "jul", "oct"]
for year in range(2011, 2021):
    df1 = quarter_to_df["2011oct"]

    for i, month in enumerate(months):
        # Check between year
        df2 = quarter_to_df[str(year) + month]
        print(df1.shape == df2.shape)

# 2003 jan to 2005 apr (inclusive)
# 2005 apr to 2005 jul (inclusive)
# 2005 oct
# 2006 jan to 2010 jul (inclusive)
# 2010 jul
# 2010 oct
# 2011 jan to 2011 jul (inclusive)
# 2011 oct to end (except 2020 jul)
    # 2020 jul remove last row




False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True


In [156]:
import re
import collections
new_labor_df = collections.defaultdict(int)
def labor_helper_1(year, month, col1, col2, col3, col4, range1, range2, range3):
    cols = [col1, col2, col3, col4]
    for i, industry in enumerate(
        ["Total", "Natural Res. and Mining", "Construction", "Manufacturing"]
    ):
        for row in range1:
            state_raw = df.loc[row, "State"]

            state_trimmed = re.sub(r"\s*\.*$", "", state_raw.strip())
            k = tuple([year, month, state_trimmed, industry])
            new_labor_df[k] = df.loc[row, cols[i]]

    for i, industry in enumerate(
        ["Trade, Transport, Util.", "Info", "Finance", "Prof. Bus. Services"]
    ):
        # cols = [col1, "Unnamed: 12", col3, col4] # For 2005 jul
        # cols = [col1, "Unnamed: 13", col3, col4] # For 2005 oct to 2010 apr
        # cols = [ "Unnamed: 6", "Unnamed: 15", "Unnamed: 23", "Unnamed: 31"]  # For 2010 jul
        cols = [ col1, "Unnamed: 14", "Unnamed: 22", "Unnamed: 30"]  # For 2010 oct to 2011 jul
        for row in range2:
            state_raw = df.loc[row, "State"]
            state_trimmed = re.sub(r"\s*\.*$", "", state_raw.strip())
            k = tuple([year, month, state_trimmed, industry])
            if industry == "Info":
                print(df.loc[row, cols[i]])
            new_labor_df[k] = df.loc[row, cols[i]]

    for i, industry in enumerate(
        ["Education and health", "Leisure and hospitality", "Other", "Gov"]
    ):
        for row in range3:
            state_raw = df.loc[row, "State"]
            state_trimmed = re.sub(r"\s*\.*$", "", state_raw.strip())
            k = tuple([year, month, state_trimmed, industry])
            new_labor_df[k] = df.loc[row, cols[i]]


def labor_helper_2(year, month, col1, col2, col3, range1, range2, range3, range4):
    cols = [col1, col2, col3]
    for i, industry in enumerate(
        ["Total", "Natural Res. and Mining", "Construction"]
    ):
        for row in range1:
            state_raw = df.iloc[row, 0]
            state_trimmed = re.sub(r"\d+\s*|\.+", "", state_raw.strip()).strip()
            k = tuple([year, month, state_trimmed, industry])
            print(df.loc[row, cols[i]])
            new_labor_df[k] = df.loc[row, cols[i]]

    for i, industry in enumerate(
        ["Manufacturing", "Trade, Transport, Util.", "Info"]
    ):
        for row in range2:
            state_raw = df.iloc[row, 0]
            state_trimmed = re.sub(r"\d+\s*|\.+", "", state_raw.strip()).strip()
            k = tuple([year, month, state_trimmed, industry])
            print(df.loc[row, cols[i]])
            new_labor_df[k] = df.loc[row, cols[i]]

    for i, industry in enumerate(
        ["Finance", "Prof. Bus. Services", "Education and health"]
    ):
        for row in range3:
            state_raw = df.iloc[row, 0]
            state_trimmed = re.sub(r"\d+\s*|\.+", "", state_raw.strip()).strip()
            k = tuple([year, month, state_trimmed, industry])
            print(df.loc[row, cols[i]])
            new_labor_df[k] = df.loc[row, cols[i]]

    for i, industry in enumerate(
        ["Leisure and hospitality", "Other", "Gov"]
    ):
        for row in range4:
            state_raw = df.iloc[row, 0]
            state_trimmed = re.sub(r"\d+\s*|\.+", "", state_raw.strip()).strip()
            k = tuple([year, month, state_trimmed, industry])
            print(df.loc[row, cols[i]])
            new_labor_df[k] = df.loc[row, cols[i]]


# 2003 to 2004 (inclusive)
# for year in range(2003, 2005):
#     for i, month in enumerate(months):
#         if year == 2006 and month in {"jul", "oct"}:
#             continue
#         df = quarter_to_df[str(year) + month]
#         labor_helper_1(
#             year,
#             month,
#             "Unnamed: 4",
#             "Unnamed: 9",
#             "Unnamed: 13",
#             "Unnamed: 17",
#             range(2, 54),
#             range(57, 109),
#             range(112, 164),
#         )

# 2005 jan
# df = quarter_to_df["2005jan"]
# labor_helper_1(
#     2005,
#     "jan",
#     "Unnamed: 4",
#     "Unnamed: 9",
#     "Unnamed: 13",
#     "Unnamed: 17",
#     range(2, 54),
#     range(57, 109),
#     range(112, 164),
# )

# 2005 apr
# df = quarter_to_df["2005apr"]
# labor_helper_1(
#     2005,
#     "apr",
#     "Unnamed: 4",
#     "Unnamed: 9",
#     "Unnamed: 13",
#     "Unnamed: 17",
#     range(2, 54),
#     range(57, 109),
#     range(112, 164),
# )

# 2005 jul
# df = quarter_to_df["2005jul"]
# labor_helper_1(
#     2005,
#     "jul",
#     "Unnamed: 6",
#     "Unnamed: 13",
#     "Unnamed: 17",
#     "Unnamed: 21",
#     range(2, 54),
#     range(57, 109),
#     range(112, 164),
# )

# 2005 oct
# df = quarter_to_df["2005oct"]
# labor_helper_1(
#     2005,
#     "oct",
#     "Unnamed: 6",
#     "Unnamed: 14",
#     "Unnamed: 18",
#     "Unnamed: 22",
#     range(2, 54),
#     range(57, 109),
#     range(112, 164),
# )

# 2006 to 2010 apr (inclusive)
# for year in range(2006, 2011):
#     for i, month in enumerate(months):
#         if year == 2010 and month == "oct" or month == "jul":
#             continue
#         df = quarter_to_df[str(year) + month]
#         labor_helper_1(
#             year,
#             month,
#             "Unnamed: 6",
#             "Unnamed: 14",
#             "Unnamed: 18",
#             "Unnamed: 22",
#             range(2, 54),
#             range(57, 109),
#             range(112, 164),
#         )

# 2010 jul
# df = quarter_to_df["2010jul"]
# labor_helper_1(
#     year,
#     month,
#     "Unnamed: 7",
#     "Unnamed: 16",
#     "Unnamed: 24",
#     "Unnamed: 32",
#     range(2, 54),
#     range(57, 109),
#     range(112, 164),
# )

# 2010 oct
# df = quarter_to_df["2010oct"]
# labor_helper_1(
#     year,
#     month,
#     "Unnamed: 6",
#     "Unnamed: 15",
#     "Unnamed: 23",
#     "Unnamed: 31",
#     range(2, 54),
#     range(57, 109),
#     range(112, 164),
# )

# 2011 jan to 2011 jul (inclusive)
# for i, month in enumerate(months):
#     if month == "oct":
#         continue
#     df = quarter_to_df["2011" + month]
#     labor_helper_1(
#         2011,
#         month,
#         "Unnamed: 6",
#         "Unnamed: 15",
#         "Unnamed: 23",
#         "Unnamed: 31",
#         range(2, 55),
#         range(58, 111),
#         range(114, 167),
#     )

# 2011 oct to 2020 oct (end)
for year in range(2011, 2021):
    for i, month in enumerate(months):
        if year == 2011 and month != "oct":
            continue
        df = quarter_to_df[str(year) + month]
        labor_helper_2(
            year,
            month,
            "Unnamed: 4",
            "Unnamed: 8",
            "Unnamed: 12",
            range(3, 56),
            range(60, 113),
            range(116, 169),
            range(172, 225)
        )

1880.7
327.1
2433.5
1180.5
14227.7
2261.2
1640.3
415.3
716.4
7261.9
3818.8
597
617.4
5743
2828.7
1504.8
1340.9
1811.8
1939.9
605.7
2547.7
3274.9
3987.6
2701.9
1104.4
2671
440.4
967.3
1130
636.6
3894.7
811.8
8727
3909
405.9
5166.6
1584.2
1645.5
5748.4
466.3
1832.5
410
2667.9
10681.9
1226.8
308.6
3681.6
2848.8
762.5
2779.4
292.5
916
42.2
12.7
16.8
11.1
10.9
28.6
28.8
0.6
–
–
5.7
9
–
4
9.6
6.8
2.2
9.3
23
55.7
2.8
–
1.1
8
6.6
9.1
4.4
8.3
–
12.8
0.9
1.5
20.7
5.7
5.7
17.5
12.2
49.5
7
34.1
0.2
4
–
–
256.8
11.7
0.8
10.5
6.1
33.1
3.3
28.3
–
–
81.9
17.5
118.6
49
582
113.7
53.1
19.7
11.1
327.2
134.4
28.7
32.6
218.8
129.1
68.2
56.6
67.1
125.4
26.4
143.1
114.3
139.8
95.9
47.5
105.8
24.7
44.2
56.8
21.6
135.8
41.4
325.7
174.3
28
187.4
74.2
74.4
230.8
16.8
76.4
22.7
113.2
598.1
70.7
14.9
184.5
146
35.7
94
24.7
29.6
2
237.7
9.5
151.8
154.6
1256.2
125.9
166.4
26.3
1.1
306.9
345.6
12.8
55.7
572.5
457.3
208.9
164.1
218.1
146.9
50.9
112.8
260
506.4
299.4
133.1
255.3
16.3
94.5
35.9
66.1
250.3
29.7
452.4
433

In [157]:
print(new_labor_df)

defaultdict(<class 'int'>, {(2011, 'oct', 'Alabama', 'Total'): 1880.7, (2011, 'oct', 'Alaska', 'Total'): 327.1, (2011, 'oct', 'Arizona', 'Total'): 2433.5, (2011, 'oct', 'Arkansas', 'Total'): 1180.5, (2011, 'oct', 'California', 'Total'): 14227.7, (2011, 'oct', 'Colorado', 'Total'): 2261.2, (2011, 'oct', 'Connecticut', 'Total'): 1640.3, (2011, 'oct', 'Delaware', 'Total'): 415.3, (2011, 'oct', 'District of Columbia', 'Total'): 716.4, (2011, 'oct', 'Florida', 'Total'): 7261.9, (2011, 'oct', 'Georgia', 'Total'): 3818.8, (2011, 'oct', 'Hawaii', 'Total'): 597, (2011, 'oct', 'Idaho', 'Total'): 617.4, (2011, 'oct', 'Illinois', 'Total'): 5743, (2011, 'oct', 'Indiana', 'Total'): 2828.7, (2011, 'oct', 'Iowa', 'Total'): 1504.8, (2011, 'oct', 'Kansas', 'Total'): 1340.9, (2011, 'oct', 'Kentucky', 'Total'): 1811.8, (2011, 'oct', 'Louisiana', 'Total'): 1939.9, (2011, 'oct', 'Maine', 'Total'): 605.7, (2011, 'oct', 'Maryland', 'Total'): 2547.7, (2011, 'oct', 'Massachusetts', 'Total'): 3274.9, (2011, 'oct

In [136]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
quarter_to_df["2011oct"]

Unnamed: 0,"Table 6. Employees on nonfarm payrolls by state and selected industry sector, not seasonally adjusted\n[In thousands]",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,State,Total,,,,Mining and logging,,,,Construction,,,
1,,September,,October,,September,,October,,September,,October,
2,,2010,2011.0,2010,2011p,2010,2011,2010,2011p,2010,2011.0,2010,2011p
3,Alabama. . . . . . . . . . . . . . . . . . . ....,1871.1,1871.0,1876.9,1880.7,12,12.7,12.1,12.7,88.2,83.8,88,81.9
4,Alaska. . . . . . . . . . . . . . . . . . . . ...,339.2,343.3,324.8,327.1,16.3,17.1,16.3,16.8,19,18.6,17.6,17.5
5,Arizona. . . . . . . . . . . . . . . . . . . ....,2363.5,2418.0,2388.8,2433.5,10.7,10.9,10.8,11.1,110.8,116.3,114,118.6
6,Arkansas. . . . . . . . . . . . . . . . . . . ...,1170.1,1177.5,1181.7,1180.5,10.7,11,10.6,10.9,49.8,50.4,49,49
7,California. . . . . . . . . . . . . . . . . . ...,13865.5,14126.5,13987.2,14227.7,28.1,28.7,28.2,28.6,566.7,584.7,565,582
8,Colorado. . . . . . . . . . . . . . . . . . . ...,2228.3,2255.0,2229.2,2261.2,25.1,28.4,25.5,28.8,118.1,112.7,117.5,113.7
9,Connecticut. . . . . . . . . . . . . . . . . ....,1616.3,1622.9,1630.1,1640.3,0.6,0.6,0.6,0.6,53.1,52.2,52,53.1


In [64]:
quarter_to_df["2005jul"].shape

(166, 22)

In [66]:
quarter_to_df["2005jul"]

Unnamed: 0,State,Total,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Natural resources and mining,Unnamed: 9,...,Unnamed: 12,Unnamed: 13,Construction,Unnamed: 15,Unnamed: 16,Unnamed: 17,Manufacturing,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,,June,,,,July,,,June,,...,,,June,,July,,June,,July,
1,,2004,,2005.0,,2004,2005p,,2004,,...,,2005p,2004,2005.0,2004,2005p,2004,2005.0,2004,2005p
2,Alabama ...................,1907.3,,1936.3,,1903.7,1924,,12.4,,...,,12.6,102,105.6,104.5,105.6,292,298.2,291.7,297.1
3,Alaska .......................,319,,323.5,,326.7,331.6,,10.4,,...,,10.7,20.1,20.9,21.3,22.2,14.3,14.7,21.2,21.6
4,Arizona .....................,2348.4,,2437.3,,2321.4,2419.9,,8.4,,...,,8.3,191.1,212.8,191.9,215.4,177.2,176.3,176.9,176.2
5,Arkansas ...................,1162.2,,1174.7,,1149.9,1161,,7.1,,...,,7.5,53.2,54.2,53.6,54.4,204.9,203.0,204.7,202.5
6,California ..................,14600.9,,14836.4,,14515.4,14727.4,,23.3,,...,,23.6,856.2,912.2,870.9,928.1,1537.9,1539.6,1551.1,1547.5
7,Colorado ...................,2200,,2246.9,,2188.7,2234.6,,14.5,,...,,16.8,156.5,165.6,158.3,167.4,155.5,154.2,156.1,153.8
8,Connecticut ...............,1666.6,,1687.3,,1644.4,1665.8,,0.8,,...,,0.8,68.6,74.1,69.7,75.3,198.9,198.4,196.7,196.5
9,Delaware .................,430.4,,439.1,,427.8,436.9,,( 1 ),,...,,( 1 ),27,28.7,27.4,28.8,35.5,33.3,35.1,33.6


In [67]:
quarter_to_df["2005jan"]

Unnamed: 0,State,Total,Unnamed: 2,Unnamed: 3,Unnamed: 4,Natural resources and mining,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Construction,Unnamed: 11,Unnamed: 12,Unnamed: 13,Manufacturing,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,,December,,January,,December,,January,,,December,,January,,December,,January,
1,,2003,2004.0,2004,2005p,2003,2004,2004,,2005p,2003,2004.0,2004,2005p,2003,2004.0,2004,2005p
2,Alabama ...............................,1891.8,1927.5,1862.7,1904.9,12.5,12.8,12.2,,12.8,101.7,106.4,99.6,104.7,290.1,293.8,286.5,292.3
3,Alaska ...................................,288.5,293.2,283.3,288.6,9.7,10,9.4,,9.8,15.5,15.9,13.7,14.2,5.8,5.8,10.6,10.5
4,Arizona .................................,2363.2,2451.0,2308.5,2408,8,8.8,7.8,,8.8,181.7,198.7,178.3,196.4,175.4,176.4,174.5,175.9
5,Arkansas ...............................,1158.4,1171.6,1135.2,1151.6,7.2,7.5,7,,7.2,50.2,51.7,47.8,49.7,204.3,204.3,202.7,203.1
6,California ..............................,14538.5,14755.9,14231.3,14507.4,22.7,22.9,21.5,,21.8,814.7,863.5,792,840.7,1527.6,1528.6,1508.4,1519.4
7,Colorado ...............................,2175.5,2223.1,2121.6,2174.8,13.5,15.3,13.2,,15,147.2,153.4,140.1,146.7,154,155.7,152.3,154.2
8,Connecticut ...........................,1665.7,1686.3,1616.6,1638.1,0.7,0.7,0.6,,0.6,63.3,68.0,57.5,61.8,198.4,199.0,197,197.7
9,Delaware .............................,422.9,432.6,410.7,419.8,( 1 ),( 1 ),( 1 ),,( 1 ),25.6,27.2,24.4,26.4,34.9,34.9,34.3,33.2
