In [1]:
# Specification 1: 
# Normal - Z_(t - 5); industry composition 5 years before
    # 1.1 - ignore unknown industries
        # 1.1.1 - Real estate => Prof business
            # 1.1.1.1 - Without time fixed effect or state fixed effect
            # 1.1.1.2 - With Time fixed effect
        # 1.1.2 - Real estate => finance
            # 1.1.2.1 - Without time fixed effect or state fixed effect
            # 1.1.2.2 - With Time fixed effect
    # 1.2 - even out unknown industries
        # 1.2.1 - Real estate => Prof business
            # 1.2.1.1 - Without time fixed effect or state fixed effect
            # 1.2.1.2 - With Time fixed effect
        # 1.2.2 - Real estate => finance
            # 1.2.2.1 - Without time fixed effect or state fixed effect
            # 1.2.2.2 - With Time fixed effect

# a) ( 1 ) values replaced with half of construction
# b) ( 1 ) values replaced with 0


# Specification 2:
# Lagged, Y_(t + 1)
    # 1.1 - ignore unknown industries
        # 1.1.1 - Real estate => Prof business
            # 1.1.1.1 - Without time fixed efect or state fixed effect
            # 1.1.1.2 - With Time fixed effect
        # 1.1.2 - Real estate => finance
            # 1.1.2.1 - Without time fixed effect or state fixed effect
            # 1.1.2.2 - With Time fixed effect
    # 1.2 - even out unknown industries
        # 1.2.1 - Real estate => Prof business
            # 1.2.1.1 - Without time fixed effect or state fixed effect
            # 1.2.1.2 - With Time fixed effect
        # 1.2.2 - Real estate => finance
            # 1.2.2.1 - Without time fixed effect or state fixed effect
            # 1.2.2.2 - With Time fixed effect

# Specification 3:
# Normal but use growth in GDP and growth of employment

# Specification 4:
# Like specification 2 but use growth

# Specification 5:
# Study specific states - (Geography, immigration law activity, political preference, immigration policy)

In [2]:
import pandas as pd

# Crafting the main dataframe
# Columns = State, Year, Industry, GDP, H-1B, Labor

us_states = [
    "alabama",
    "alaska",
    "arizona",
    "arkansas",
    "california",
    "colorado",
    "connecticut",
    "delaware",
    "florida",
    "georgia",
    "hawaii",
    "idaho",
    "illinois",
    "indiana",
    "iowa",
    "kansas",
    "kentucky",
    "louisiana",
    "maine",
    "maryland",
    "massachusetts",
    "michigan",
    "minnesota",
    "mississippi",
    "missouri",
    "montana",
    "nebraska",
    "nevada",
    "newhampshire",
    "newjersey",
    "newmexico",
    "newyork",
    "northcarolina",
    "northdakota",
    "ohio",
    "oklahoma",
    "oregon",
    "pennsylvania",
    "rhodeisland",
    "southcarolina",
    "southdakota",
    "tennessee",
    "texas",
    "utah",
    "vermont",
    "virginia",
    "washington",
    "westvirginia",
    "wisconsin",
    "wyoming",
]

us_state_abbreviations = [
    "AL",
    "AK",
    "AZ",
    "AR",
    "CA",
    "CO",
    "CT",
    "DE",
    "FL",
    "GA",
    "HI",
    "ID",
    "IL",
    "IN",
    "IA",
    "KS",
    "KY",
    "LA",
    "ME",
    "MD",
    "MA",
    "MI",
    "MN",
    "MS",
    "MO",
    "MT",
    "NE",
    "NV",
    "NH",
    "NJ",
    "NM",
    "NY",
    "NC",
    "ND",
    "OH",
    "OK",
    "OR",
    "PA",
    "RI",
    "SC",
    "SD",
    "TN",
    "TX",
    "UT",
    "VT",
    "VA",
    "WA",
    "WV",
    "WI",
    "WY",
]

# Getting all industries (there are 20 industries + 1 Unknown)
h1b_data_industries = [
    "11 - Agriculture, Forestry, Fishing and Hunting",
    "21 - Mining, Quarrying, and Oil and Gas Extraction",
    "22 - Utilities",
    "23 - Construction",
    "31-33 - Manufacturing",
    "42 - Wholesale Trade",
    "44-45 - Retail Trade",
    "48-49 - Transportation and Warehousing",
    "51 - Information",
    "52 - Finance and Insurance",
    "53 - Real Estate and Rental and Leasing",
    "54 - Professional, Scientific, and Technical Services",
    "55 - Management of Companies and Enterprises",
    "56 - Administrative and Support and Waste Management and Remediation Services",
    "61 - Educational Services",
    "62 - Health Care and Social Assistance",
    "71 - Arts, Entertainment, and Recreation",
    "72 - Accommodation and Food Services",
    "81 - Other Services (except Public Administration)",
    "92 - Public Administration",
    "Unknown",
]

labor_data_industries = [
    "Total",
    "Natural Res. and Mining",
    "Construction",
    "Manufacturing",
    "Trade, Transport, Util.",
    "Info",
    "Finance",
    "Prof. Bus. Services",
    "Education and health",
    "Leisure and hospitality",
    "Other",
    "Gov",
]

industry_mapping = {
    "11 - Agriculture, Forestry, Fishing and Hunting": "Natural Res. and Mining",
    "21 - Mining, Quarrying, and Oil and Gas Extraction": "Natural Res. and Mining",
    "22 - Utilities": "Trade, Transport, Util.",
    "23 - Construction": "Construction",
    "31-33 - Manufacturing": "Manufacturing",
    "42 - Wholesale Trade": "Trade, Transport, Util.",
    "44-45 - Retail Trade": "Trade, Transport, Util.",
    "48-49 - Transportation and Warehousing": "Trade, Transport, Util.",
    "51 - Information": "Info",
    "52 - Finance and Insurance": "Finance",
    "53 - Real Estate and Rental and Leasing": "Prof. Bus. Services",  # Could be finance or prof bus
    "54 - Professional, Scientific, and Technical Services": "Prof. Bus. Services",
    "55 - Management of Companies and Enterprises": "Prof. Bus. Services",
    "56 - Administrative and Support and Waste Management and Remediation Services": "Prof. Bus. Services",
    "61 - Educational Services": "Education and health",
    "62 - Health Care and Social Assistance": "Education and health",
    "71 - Arts, Entertainment, and Recreation": "Leisure and hospitality",
    "72 - Accommodation and Food Services": "Leisure and hospitality",
    "81 - Other Services (except Public Administration)": "Other",
    "92 - Public Administration": "Gov",
    # "Unknown": "Total",  # Catch-all category
}

industry_mapping_counts = {
    "11 - Agriculture, Forestry, Fishing and Hunting": 2,
    "21 - Mining, Quarrying, and Oil and Gas Extraction": 2,
    "22 - Utilities": 4,
    "23 - Construction": 1,
    "31-33 - Manufacturing": 1,
    "42 - Wholesale Trade": 4,
    "44-45 - Retail Trade": 4,
    "48-49 - Transportation and Warehousing": 4,
    "51 - Information": 1,
    "52 - Finance and Insurance": 2,
    "53 - Real Estate and Rental and Leasing": 1,  # Could be finance
    "54 - Professional, Scientific, and Technical Services": 4,
    "55 - Management of Companies and Enterprises": 4,
    "56 - Administrative and Support and Waste Management and Remediation Services": 4,
    "61 - Educational Services": 2,
    "62 - Health Care and Social Assistance": 2,
    "71 - Arts, Entertainment, and Recreation": 2,
    "72 - Accommodation and Food Services": 2,
    "81 - Other Services (except Public Administration)": 1,
    "92 - Public Administration": 1,
    "Unknown": 20,  # Catch-all category
}

def create_labor_average(option):
    # Calculate annual average of labor for each year for each industry (create labor_averaged.csv)
    labor_df = pd.read_csv("./data/labor_processed/labor.csv")
    labor_df["Year"] = labor_df["Year"].astype(int)
    labor_df["Value"] = pd.to_numeric(labor_df["Value"], errors="coerce").fillna(0)

    # Filter for the four selected months
    selected_months = {"jan", "apr", "jul", "oct"}
    labor_df_filtered = labor_df[labor_df["Month"].isin(selected_months)]

    # Group by Year, State, and Category, then compute the mean
    labor_avg_df = (
        labor_df_filtered.groupby(["Year", "State", "Category"])["Value"]
        .mean()
        .reset_index()
    )

    # Save to CSV
    labor_avg_df.to_csv("./data/labor_processed/labor_averaged.csv", index=False)

def create_realgdp_average(option):
    # Process GDP (get averages)
    all_gdp = []
    for state in us_states:
        gdp_df = pd.read_csv(f"./data/realgdp/{state}.csv")
        gdp_df["Year"] = pd.to_datetime(gdp_df["observation_date"]).dt.year
        
        # Compute annual average GDP
        df_avg = gdp_df.groupby("Year").mean().reset_index()
        
        # Rename GDP column to state-specific
        df_avg.rename(columns={gdp_df.columns[1]: state}, inplace=True)
        
        # Append to list
        all_gdp.append(df_avg)

    result_df = all_gdp[0]
    for df in all_gdp[1:]:
        result_df = pd.merge(result_df, df, on="Year", how="outer")

    # Save to CSV
    result_df.to_csv("./data/realgdp_processed/compiled_annual_gdp.csv", index=False)

def get_total_approvals(h1b_df, state_abbrv, industry, add_unknown):
    result = h1b_df.loc[
        (h1b_df["Petitioner State"] == state_abbrv) & (h1b_df["Industry (NAICS) Code"] == industry), 
        "Total Approvals"
    ].values
    result = result[0] if len(result) > 0 else 0
    
    if add_unknown:
        unknown = h1b_df.loc[
        (h1b_df["Petitioner State"] == state_abbrv) & (h1b_df["Industry (NAICS) Code"] == "Unknown"), 
        "Total Approvals"
         ].values
        unknown = unknown[0] if len(unknown) > 0 else 0
        
        return result + (unknown / 20)

    return result

In [16]:
# Create full dataframe
data = {
    "State": [],
    "Year": [],
    "Industry": [],
    "GDP": [],
    "H-1B": [],
    "Labor": [],
}

year_low = 2003 # Lowest for labor
h1b_gdp_low = 2009 # Lowest for H1-B and realgdp
year_high = 2024 # Highest for realgdp and H1-B

gdp_df = pd.read_csv("./data/realgdp_processed/compiled_annual_gdp.csv")
labor_df = pd.read_csv("./data/labor_processed/labor_averaged.csv")
# & (labor_df["State"].str.lower() == "alabama") & (labor_df["Category"] == "11 - Agriculture, Forestry, Fishing and Hunting")
#print(labor_df.loc[(labor_df["Year"] == 2003) & (labor_df["Category"] == industry_mapping["11 - Agriculture, Forestry, Fishing and Hunting"]) & (labor_df["State"].str.lower() == "alabama"), "Value"].values[0])
for year in range(year_low, year_high + 1):
    if year >= h1b_gdp_low:
        h1b_df = pd.read_csv(f"./data/h1b_processed/{year}.csv")
    for idx, state in enumerate(us_states):
        state_abbrv = us_state_abbreviations[idx]
        for industry in h1b_data_industries:
            data["State"].append(state)
            data["Year"].append(year)
            data["Industry"].append(industry)

            print("State: ", state)
            print("Year: ", year)
            print("Industry: ", industry)

            # Accessing dataframes
            if year < h1b_gdp_low:
                data["GDP"].append(None)
                data["H-1B"].append(None)
            else:
                data["GDP"].append(gdp_df.loc[gdp_df["Year"] == year, state].values[0])
                curr_h1b = get_total_approvals(h1b_df, state_abbrv, industry, True)
                data["H-1B"].append(curr_h1b)

            # Ignoring unknown category 
            if industry != "Unknown" and year <= 2020:
                curr_labor = labor_df.loc[(labor_df["Year"] == year) & (labor_df["State"].str.replace(" ", "").str.lower() == state) & (labor_df["Category"] == industry_mapping[industry]), "Value"].values[0] / industry_mapping_counts[industry]
                data["Labor"].append(curr_labor)
            else:
                data["Labor"].append(None)

            

State:  alabama
Year:  2003
Industry:  11 - Agriculture, Forestry, Fishing and Hunting
State:  alabama
Year:  2003
Industry:  21 - Mining, Quarrying, and Oil and Gas Extraction
State:  alabama
Year:  2003
Industry:  22 - Utilities
State:  alabama
Year:  2003
Industry:  23 - Construction
State:  alabama
Year:  2003
Industry:  31-33 - Manufacturing
State:  alabama
Year:  2003
Industry:  42 - Wholesale Trade
State:  alabama
Year:  2003
Industry:  44-45 - Retail Trade
State:  alabama
Year:  2003
Industry:  48-49 - Transportation and Warehousing
State:  alabama
Year:  2003
Industry:  51 - Information
State:  alabama
Year:  2003
Industry:  52 - Finance and Insurance
State:  alabama
Year:  2003
Industry:  53 - Real Estate and Rental and Leasing
State:  alabama
Year:  2003
Industry:  54 - Professional, Scientific, and Technical Services
State:  alabama
Year:  2003
Industry:  55 - Management of Companies and Enterprises
State:  alabama
Year:  2003
Industry:  56 - Administrative and Support and 

In [36]:
df_1 = pd.DataFrame.from_dict(data)
pd.set_option('display.max_rows', 100)
df_1


Unnamed: 0,State,Year,Industry,GDP,H-1B,Labor
0,alabama,2003,"11 - Agriculture, Forestry, Fishing and Hunting",,,6.57500
1,alabama,2003,"21 - Mining, Quarrying, and Oil and Gas Extrac...",,,6.57500
2,alabama,2003,22 - Utilities,,,92.39375
3,alabama,2003,23 - Construction,,,100.40000
4,alabama,2003,31-33 - Manufacturing,,,296.92500
...,...,...,...,...,...,...
23095,wyoming,2024,"71 - Arts, Entertainment, and Recreation",40362.3,0.15,
23096,wyoming,2024,72 - Accommodation and Food Services,40362.3,0.15,
23097,wyoming,2024,81 - Other Services (except Public Administrat...,40362.3,0.15,
23098,wyoming,2024,92 - Public Administration,40362.3,1.15,


In [37]:
# Load your DataFrame (assuming it's named df)
df_1['Labor'] = df_1.groupby(['State', 'Industry'])['Labor'].shift(5)

In [38]:
df_1 = df_1[(df_1['Year'] > 2007) & (df_1['Industry'] != "Unknown")]

# Rename the Labor column
df_1 = df_1.rename(columns={'Labor': 'Labor (t - 5)'})

df_1['Labor_Growth'] = df_1.groupby(['State', 'Industry'])['Labor (t - 5)'].diff()

# Calculate GDP Growth: current year GDP - previous year GDP
df_1['GDP_Growth'] = df_1.groupby(['State', 'Industry'])['GDP'].diff()

df_1 = df_1[(df_1['Year'] > 2009)]
df_1
# # Display the result
df_1.to_csv("./data/full_dataframes/df_3_2_1.csv")
# df_1

In [40]:
import pandas as pd

# Load dataset
df = pd.read_csv("./data/full_dataframes/df_1_2_1.csv")  # Replace with actual path

# Ensure 'State' and 'Year' are categorical
df['State'] = df['State'].astype('category')
df['Year'] = df['Year'].astype(int)

# Compute total labor for each (State, Year)
df['Total_Labor_t5'] = df.groupby(['State', 'Year'])['Labor (t - 5)'].transform('sum')

# Compute labor proportion
df['Labor_Prop_t5'] = df['Labor (t - 5)'] / df['Total_Labor_t5']

# Drop unnecessary column
df = df.drop(columns=['Total_Labor_t5'])

In [56]:
import pandas as pd
from linearmodels.panel import PanelOLS

df['H1B_Labor_t5'] = df['H-1B'] * df['Labor_Prop_t5']
df_agg = df.groupby(['State', 'Year']).agg({
    'GDP': 'first',  # GDP is already at State-Year level
    'H1B_Labor_t5': 'sum',  # Sum H1B × Labor across industries
}).reset_index()

# Ensure 'State' is categorical
df_agg['State'] = df_agg['State'].astype('category')

# Add a constant term
# df_agg['constant'] = 1

# Set the panel data index
df_agg = df_agg.set_index(['State', 'Year'])

# Define the panel regression model with fixed effects
model = PanelOLS(
    df_agg['GDP'], 
    df_agg['H1B_Labor_t5'],  # Independent variables
    entity_effects=True,  # State fixed effects
    time_effects=True     # Year fixed effects
)

# Fit the model with clustered standard errors
results = model.fit(cov_type="clustered", cluster_entity=True)
print(results.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                    GDP   R-squared:                        0.4331
Estimator:                   PanelOLS   R-squared (Between):              0.5154
No. Observations:                 800   R-squared (Within):               0.4771
Date:                Mon, Mar 31 2025   R-squared (Overall):              0.5148
Time:                        15:33:43   Log-likelihood                   -9825.0
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      560.66
Entities:                          50   P-value                           0.0000
Avg Obs:                       16.000   Distribution:                   F(1,734)
Min Obs:                       16.000                                           
Max Obs:                       16.000   F-statistic (robust):             18.145
                            

In [52]:
import numpy as np
import pandas as pd
from linearmodels.panel import PanelOLS

# Compute the interaction term
df['H1B_Labor_t5'] = df['H-1B'] * df['Labor_Prop_t5']

# Aggregate data to State-Year level
df_agg = df.groupby(['State', 'Year']).agg({
    'GDP': 'first',  # GDP is already at State-Year level
    'H1B_Labor_t5': 'sum',  # Sum H1B × Labor across industries
}).reset_index()

# Ensure 'State' is categorical
df_agg['State'] = df_agg['State'].astype('category')

# Take the natural log of GDP (avoiding log(0) errors)
df_agg['log_GDP'] = np.log(df_agg['GDP'])

# Add a constant term
df_agg['constant'] = 1

# Set the panel data index
df_agg = df_agg.set_index(['State', 'Year'])

# Define the panel regression model with fixed effects
model = PanelOLS(
    df_agg['log_GDP'],  # Dependent variable: log(GDP)
    df_agg['H1B_Labor_t5'],  # Independent variables
    entity_effects=True,  # State fixed effects
    time_effects=True      # Year fixed effects
)

# Fit the model with clustered standard errors
results = model.fit()
print(results.summary)


                          PanelOLS Estimation Summary                           
Dep. Variable:                log_GDP   R-squared:                        0.0687
Estimator:                   PanelOLS   R-squared (Between):              0.0053
No. Observations:                 800   R-squared (Within):               0.0994
Date:                Mon, Mar 31 2025   R-squared (Overall):              0.0054
Time:                        15:30:23   Log-likelihood                    1271.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      54.117
Entities:                          50   P-value                           0.0000
Avg Obs:                       16.000   Distribution:                   F(1,734)
Min Obs:                       16.000                                           
Max Obs:                       16.000   F-statistic (robust):             54.117
                            