In [26]:
# Standard imports
import pandas as pd
import numpy as np

Let's load in our cleaned data from the CSV we generated in DataCleaning.ipynb.

In [27]:
# Read the cleaned dataset written by the data-cleaning notebook.
# NOTE(review): the rendered preview includes an "Unnamed: 0" column, which
# looks like a row index saved into the CSV; consider index_col=0 if no
# downstream step relies on that column -- confirm before changing.
file_name = 'H-2B_Cleaned_data.csv'
df = pd.read_csv(filepath_or_buffer=file_name)

# Report the (rows, columns) shape, then preview the first ten rows.
print(df.shape)
df.head(n=10)

(6540, 22)


Unnamed: 0,CASE_STATUS,AGENT_POC_EMP_REP_BY_AGENT,SOC_CODE,NAICS_CODE,NBR_WORKERS_REQUESTED,FULL_TIME_POSITION,NATURE_OF_TEMPORARY_NEED,BASIC_NUMBER_OF_HOURS,BASIC_RATE_OF_PAY,SUPERVISE_HOW_MANY,...,EMP_EXP_NUM_MONTHS,SWA_NAME,AGENT_ATTORNEY_CITY,WORKSITE_CITY,AGENT_ATTORNEY_STATE,WORKSITE_STATE,HOURLY_WORK_SCHEDULE_AM,HOURLY_WORK_SCHEDULE_PM,OVERTIME_RATE_FROM,OVERTIME_RATE_TO
0,1,1,37,56,5,1,peakload,40.0,13.68,0.0,...,0.0,0,brooklyn,sandy,new york,utah,07:00:00,16:00:00,20.52,20.52
1,1,1,45,11,49,1,seasonal,36.0,14.27,0.0,...,0.0,0,lake park,hermitage,georgia,arkansas,06:00:00,13:00:00,21.41,22.5
2,1,1,35,71,3,1,peakload,40.0,15.12,0.0,...,12.0,0,framingham,naples,massachusetts,florida,09:00:00,17:00:00,22.68,22.68
3,1,1,37,56,10,1,seasonal,35.0,16.36,0.0,...,0.0,0,charlottesville,teton village,virginia,wyoming,05:00:00,13:00:00,24.54,24.54
4,1,1,37,72,12,1,peakload,35.0,12.66,0.0,...,6.0,0,framingham,west dover,massachusetts,vermont,08:00:00,15:00:00,18.99,18.99
5,1,1,45,11,35,1,seasonal,40.0,10.25,0.0,...,3.0,0,coeur d' alene,saluda,idaho,south carolina,08:00:00,17:00:00,15.38,21.29
6,1,1,45,11,16,1,seasonal,40.0,14.97,0.0,...,0.0,0,coeur d' alene,buena vista,idaho,georgia,07:00:00,17:00:00,22.46,22.46
7,1,1,37,56,20,1,seasonal,40.0,14.24,0.0,...,0.0,0,coeur d' alene,atlanta,idaho,georgia,08:00:00,17:00:00,21.36,21.36
8,1,1,45,11,60,1,seasonal,40.0,9.79,0.0,...,0.0,0,coeur d' alene,lagrange,idaho,georgia,08:00:00,16:00:00,14.69,29.67
9,1,0,37,56,20,1,seasonal,35.0,10.91,0.0,...,1.0,0,no attorney representation,clearwater beach,no attorney representation,florida,08:00:00,15:00:00,16.37,16.37


Let's perform some feature engineering on the data.
Our intention is to create new features from existing features that might be useful in classifying cases as certified or denied.

Our first engineered features will be:
- [STATE_MATCH]: WORKSITE_STATE matches AGENT_ATTORNEY_STATE.
- [CITY_MATCH]: WORKSITE_CITY matches AGENT_ATTORNEY_CITY.

These columns can have 3 distinct values:
- "no_rep"   : AGENT_ATTORNEY_STATE or AGENT_ATTORNEY_CITY is "no attorney representation". This keeps unrepresented cases distinct, rather than labeling them "different" when, in reality, the case simply has no agent or attorney.
- "same"    : WORKSITE_STATE matches AGENT_ATTORNEY_STATE, or WORKSITE_CITY matches AGENT_ATTORNEY_CITY.
- "different" : WORKSITE_STATE does not match AGENT_ATTORNEY_STATE, or WORKSITE_CITY does not match AGENT_ATTORNEY_CITY.

This may lend some insight into whether companies that can afford to hire attorneys located in a different city or state have an advantage,
or whether using local attorneys is just as effective (or is even a predictor of worse outcomes).

Once complete, we also drop the columns [WORKSITE_STATE], [WORKSITE_CITY], [AGENT_ATTORNEY_STATE], [AGENT_ATTORNEY_CITY].

In [28]:
no_attorney_representation = "no attorney representation"

def generate_matches(row):
    """
    Build the CITY_MATCH and STATE_MATCH labels for a single row.

    Each label is one of:
      - "no_rep":    the agent/attorney field holds the
                     no-representation sentinel value.
      - "same":      the worksite value equals the agent/attorney value.
      - "different": anything else.

    Parameters
    ----------
    row : mapping indexable by column name
        Must provide WORKSITE_STATE, AGENT_ATTORNEY_STATE, WORKSITE_CITY
        and AGENT_ATTORNEY_CITY.

    Returns
    -------
    tuple
        (city_match, state_match) -- note the city label comes first.
    """
    def _label(worksite_value, attorney_value):
        # The sentinel check comes first so unrepresented cases are never
        # reported as "different".
        if attorney_value == no_attorney_representation:
            return "no_rep"
        return "same" if worksite_value == attorney_value else "different"

    state_site = row["WORKSITE_STATE"]
    state_attorney = row["AGENT_ATTORNEY_STATE"]
    city_site = row["WORKSITE_CITY"]
    city_attorney = row["AGENT_ATTORNEY_CITY"]

    state_label = _label(state_site, state_attorney)
    city_label = _label(city_site, city_attorney)

    return (city_label, state_label)

# Label every row, expanding the returned (city, state) tuple into two columns.
df[["CITY_MATCH", "STATE_MATCH"]] = df.apply(
    generate_matches,
    axis="columns",
    result_type='expand',
)

# The raw location columns are no longer needed once the labels exist.
df.drop(
    labels=['WORKSITE_STATE', 'WORKSITE_CITY', 'AGENT_ATTORNEY_STATE', 'AGENT_ATTORNEY_CITY'],
    axis="columns",
    inplace=True,
)

Our next engineered features will be:

- [WORK_DAY_LENGTH]: How many hours the job will entail, as indicated by the difference between ["HOURLY_WORK_SCHEDULE_AM"] and ["HOURLY_WORK_SCHEDULE_PM"].
- [DAYTIME_WORK]: Whether or not the job takes place during "daytime" hours, meaning that the shift starts between 5AM and 10AM (inclusive) and lasts 12 hours or less.

Only one row has abnormal entries "9am" and "5pm", so we'll explicitly convert those values to 09:00:00 and 17:00:00.

This should tell us whether abnormally long (or abnormally short) work days, and more "normal" daytime hours, affect the likelihood that the job is successfully certified.

Once complete, we also drop the columns [HOURLY_WORK_SCHEDULE_AM], [HOURLY_WORK_SCHEDULE_PM].

In [29]:
def generate_workday_columns(row):
    """
    Compute the WORK_DAY_LENGTH and DAYTIME_WORK values for one row.

    Parameters
    ----------
    row : mapping indexable by column name
        Must provide HOURLY_WORK_SCHEDULE_AM and HOURLY_WORK_SCHEDULE_PM
        as "HH:MM:SS"-style strings (the literal pair "9am"/"5pm" is also
        accepted).

    Returns
    -------
    tuple
        (work_length, is_daytime): the shift length in hours as a float,
        and 1 if the shift starts between 5 and 10 (inclusive) and lasts
        at most 12 hours, otherwise 0.
    """
    def _clock_to_hours(clock):
        # Characters 0-1 hold the hour, characters 3-4 hold the minutes.
        return float(clock[0:2]) + (float(clock[3:5]) / 60)

    shift_start = row["HOURLY_WORK_SCHEDULE_AM"]
    shift_end = row["HOURLY_WORK_SCHEDULE_PM"]

    # Normalize the single known free-text entry pair to clock format.
    if (shift_start, shift_end) == ("9am", "5pm"):
        shift_start, shift_end = "09:00:00", "17:00:00"

    start_hours = _clock_to_hours(shift_start)
    end_hours = _clock_to_hours(shift_end)

    # An end time at or before the start time means the shift crosses
    # midnight, so push the end into the following day.
    if end_hours <= start_hours:
        end_hours = end_hours + 24

    work_length = end_hours - start_hours
    is_daytime = int(5 <= start_hours <= 10 and work_length <= 12)

    return (work_length, is_daytime)

# Derive the shift length and daytime flag for every row, expanding the
# returned tuple into two columns.
df[["WORK_DAY_LENGTH", "DAYTIME_WORK"]] = df.apply(
    generate_workday_columns,
    axis="columns",
    result_type='expand',
)

# The raw schedule strings are redundant once the derived columns exist.
df.drop(
    labels=['HOURLY_WORK_SCHEDULE_AM', 'HOURLY_WORK_SCHEDULE_PM'],
    axis="columns",
    inplace=True,
)

The final feature we're seeking to engineer is:

- [HAS_OVERTIME]: Whether or not the job offers overtime pay.

We're uncertain if there's a significant relationship between a job paying overtime pay and the job being certified, but we want to standardize overtime pay as a categorical feature and simplify from the two columns [OVERTIME_RATE_FROM] and [OVERTIME_RATE_TO].

Once complete, we also drop the columns [OVERTIME_RATE_FROM], [OVERTIME_RATE_TO].

In [30]:
def has_overtime_pay(row):
    """
    Flag whether a row offers overtime pay.

    A row counts as offering overtime when either OVERTIME_RATE_FROM or
    OVERTIME_RATE_TO is present, nonzero, and different from
    BASIC_RATE_OF_PAY.

    Parameters
    ----------
    row : mapping indexable by column name
        Must provide BASIC_RATE_OF_PAY, OVERTIME_RATE_FROM and
        OVERTIME_RATE_TO as values convertible with float().

    Returns
    -------
    int
        1 if the row offers overtime pay, otherwise 0.

    Raises
    ------
    ValueError, TypeError
        If any of the three values cannot be converted to float.
    """
    base_pay = float(row["BASIC_RATE_OF_PAY"])
    # Convert both rates up front so a malformed value is always reported,
    # regardless of which rate would have decided the result.
    overtime_rates = (
        float(row["OVERTIME_RATE_FROM"]),
        float(row["OVERTIME_RATE_TO"]),
    )

    for rate in overtime_rates:
        # NaN compares unequal to every number, so without this guard a
        # missing overtime rate would be reported as "has overtime".
        if np.isnan(rate):
            continue
        if rate != 0 and rate != base_pay:
            return 1

    return 0

# Collapse the two overtime-rate columns into a single 0/1 flag per row.
df["HAS_OVERTIME"] = df.apply(has_overtime_pay, axis="columns")

# The raw overtime rates are dropped now that the flag summarizes them.
df.drop(
    labels=['OVERTIME_RATE_FROM', 'OVERTIME_RATE_TO'],
    axis="columns",
    inplace=True,
)

We're done feature engineering, so let's save this data to a csv.

In [31]:
# Persist the engineered dataset; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='H-2B_Engineered_Data.csv', index=False)