In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): 'ignore' with no category suppresses every warning from this
# point on (including pandas dtype/deprecation warnings) — consider narrowing.
warnings.filterwarnings('ignore')
# Use seaborn's 'darkgrid' style for subsequent plots.
sns.set_style('darkgrid')

# The Data Information (from Kaggle, https://www.kaggle.com/jboysen/us-perm-visas)

## <i>"A permanent labor certification issued by the Department of Labor (DOL) allows an employer to hire a foreign worker to work permanently in the United States. In most instances, before the U.S. employer can submit an immigration petition to the Department of Homeland Security's U.S. Citizenship and Immigration Services (USCIS), the employer must obtain a certified labor certification application from the DOL's Employment and Training Administration (ETA). The DOL must certify to the USCIS that there are not sufficient U.S. workers able, willing, qualified and available to accept the job opportunity in the area of intended employment and that employment of the foreign worker will not adversely affect the wages and working conditions of similarly employed U.S. workers."</i>

In [2]:
# Load the PERM visa CSV from the working directory and preview the first rows.
DATA_FILE = 'us_perm_visas.csv'
df = pd.read_csv(DATA_FILE)
df.head(n=5)

Unnamed: 0,add_these_pw_job_title_9089,agent_city,agent_firm_name,agent_state,application_type,case_no,case_number,case_received_date,case_status,class_of_admission,...,ri_pvt_employment_firm_to,ri_us_workers_considered,schd_a_sheepherder,us_economic_sector,wage_offer_from_9089,wage_offer_to_9089,wage_offer_unit_of_pay_9089,wage_offered_from_9089,wage_offered_to_9089,wage_offered_unit_of_pay_9089
0,,,,,PERM,A-07323-97014,,,Certified,J-1,...,,,,IT,75629.0,,yr,,,
1,,,,,PERM,A-07332-99439,,,Denied,B-2,...,,,,Other Economic Sector,37024.0,,yr,,,
2,,,,,PERM,A-07333-99643,,,Certified,H-1B,...,,,,Aerospace,47923.0,,yr,,,
3,,,,,PERM,A-07339-01930,,,Certified,B-2,...,,,,Other Economic Sector,10.97,,hr,,,
4,,,,,PERM,A-07345-03565,,,Certified,L-1,...,,,,Advanced Mfg,100000.0,,yr,,,


## The first 5 rows already contain many null values, so knowing how many values are missing in each feature is essential. It is also noticeable that there are 154 columns/features in the dataset; dropping some of these features may be beneficial for certain uses of the dataset.

## For the purpose of showcasing foundational skills, the dataset will be used for a descriptive analysis: getting insights into which specific features the data shows bias in.

In [3]:
# Fraction (0.0-1.0, not a percentage) of null values in each feature of the dataset.
null_fraction_by_column = df.isna().mean()
null_fraction_by_column

add_these_pw_job_title_9089               0.887291
agent_city                                0.435162
agent_firm_name                           0.447067
agent_state                               0.443750
application_type                          0.638668
case_no                                   0.638668
case_number                               0.361332
case_received_date                        0.361337
case_status                               0.000000
class_of_admission                        0.061024
country_of_citizenship                    0.055115
country_of_citzenship                     0.945042
decision_date                             0.000000
employer_address_1                        0.000112
employer_address_2                        0.398526
employer_city                             0.000037
employer_country                          0.361530
employer_decl_info_title                  0.361436
employer_name                             0.000032
employer_num_employees         

## From the results above, we can see that there are numerous features with close to 90% null values. Hence, it won't be beneficial to fill/clean features with this much missing information; doing so may introduce bias into the data and the subsequent analysis. Therefore, dropping the features with 40% or more null values (an arbitrary cutoff, assuming that all features are and would be normally distributed) is a feasible approach. It avoids creating a false mean, which filling the null values would do, and avoids introducing bias.

In [4]:
# Drop every feature whose share of null values is at or above the cutoff.
# The null fractions are computed once and the sparse columns are removed in a
# single call, instead of mutating `df` in place while iterating over it; the
# resulting set of columns is the same, and the cell is safe to re-run.
MAX_NULL_FRACTION = 0.4  # features with >= 40% nulls are discarded
null_fraction = df.isna().mean()
sparse_columns = null_fraction[null_fraction >= MAX_NULL_FRACTION].index
df = df.drop(columns=sparse_columns)

In [5]:
# Re-check the fraction of null values per feature after the sparse columns were dropped.
remaining_null_fraction = df.isna().mean()
remaining_null_fraction

case_number                       0.361332
case_received_date                0.361337
case_status                       0.000000
class_of_admission                0.061024
country_of_citizenship            0.055115
decision_date                     0.000000
employer_address_1                0.000112
employer_address_2                0.398526
employer_city                     0.000037
employer_country                  0.361530
employer_decl_info_title          0.361436
employer_name                     0.000032
employer_num_employees            0.361546
employer_phone                    0.361428
employer_postal_code              0.000099
employer_state                    0.000112
employer_yr_estab                 0.361693
foreign_worker_info_city          0.361407
foreign_worker_info_education     0.361439
ji_live_in_domestic_service       0.361367
job_info_alt_combo_ed_exp         0.361426
job_info_alt_field                0.361423
job_info_combo_occupation         0.361359
job_info_ed

In [7]:
# Import from the public IPython.display module (IPython.core.display is an
# internal path) and bring in `display` explicitly rather than relying on the
# name the notebook kernel injects.
from IPython.display import HTML, display

# Render the first 10 rows through to_html() so the table is shown with all of
# its columns rather than the elided ("...") default preview.
display(HTML(df.head(10).to_html()))

Unnamed: 0,case_number,case_received_date,case_status,class_of_admission,country_of_citizenship,decision_date,employer_address_1,employer_address_2,employer_city,employer_country,employer_decl_info_title,employer_name,employer_num_employees,employer_phone,employer_postal_code,employer_state,employer_yr_estab,foreign_worker_info_city,foreign_worker_info_education,ji_live_in_domestic_service,job_info_alt_combo_ed_exp,job_info_alt_field,job_info_combo_occupation,job_info_education,job_info_experience,job_info_foreign_ed,job_info_foreign_lang_req,job_info_job_req_normal,job_info_job_title,job_info_training,job_info_work_city,job_info_work_postal_code,job_info_work_state,preparer_info_emp_completed,pw_amount_9089,pw_determ_date,pw_expire_date,pw_level_9089,pw_soc_code,pw_soc_title,pw_source_name_9089,pw_track_num,pw_unit_of_pay_9089,recr_info_coll_univ_teacher,recr_info_employer_rec_payment,recr_info_first_ad_start,recr_info_professional_occ,recr_info_second_ad_start,recr_info_sunday_newspaper,recr_info_swa_job_order_end,recr_info_swa_job_order_start,ri_1st_ad_newspaper_name,ri_2nd_ad_newspaper_name,ri_2nd_ad_newspaper_or_journal,ri_layoff_in_past_six_months,ri_posted_notice_at_worksite,schd_a_sheepherder,wage_offer_from_9089,wage_offer_unit_of_pay_9089
0,,,Certified,J-1,,2012-02-01,24 WEST 25 STREET,5TH FLOOR,NEW YORK,,,NETSOFT USA INC.,,,10010,NY,,,,,,,,,,,,,,,New York,,NY,,75629.0,,,Level II,15-1031.00,"Computer Software Engineers, Applications",OES,,yr,,,,,,,,,,,,,,,75629.0,yr
1,,,Denied,B-2,,2011-12-21,200 BROAD STREET,,CARLSTADT,,,PINNACLE ENVIRONEMNTAL CORP,,,7072,NY,,,,,,,,,,,,,,,New York,,NY,,37024.0,,,Level I,47-4041.00,Hazardous Materials Removal Workers,OES,,yr,,,,,,,,,,,,,,,37024.0,yr
2,,,Certified,H-1B,,2011-12-01,1054 TECHNOLOGY PARK DRIVE,,GLEN ALLEN,,,"SCHNABEL ENGINEERING, INC.",,,23059,VA,,,,,,,,,,,,,,,Lutherville,,MD,,47923.0,,,Level I,17-2051.00,Civil Engineers,OES,,yr,,,,,,,,,,,,,,,47923.0,yr
3,,,Certified,B-2,,2011-12-01,33-17 PRINCE ST. 2ND FL,,FLUSHING,,,EBENEZER MISSION CHURCH,,,11354,NY,,,,,,,,,,,,,,,Flushing,,NY,,10.97,,,Level II,43-4071.00,File Clerks,OES,,hr,,,,,,,,,,,,,,,10.97,hr
4,,,Certified,L-1,,2012-01-26,1373 BROADWAY,,ALBANY,,,ALBANY INTERNATIONAL CORP.,,,12204,NY,,,,,,,,,,,,,,,Albany,,NY,,94890.0,,,Level IV,41-9031.00,Sales Engineers,OES,,yr,,,,,,,,,,,,,,,100000.0,yr
5,,,Denied,EWI,,2012-01-26,200 BROAD STREET,,CARLSTADT,,,PINNACLE ENVIRONEMNTAL CORP,,,7072,NY,,,,,,,,,,,,,,,New York,,NY,,37024.0,,,Level I,47-4041.00,Hazardous Materials Removal Workers,OES,,yr,,,,,,,,,,,,,,,37024.0,yr
6,,,Certified-Expired,H-1B,,2011-10-07,285 PAWLING AVE,,TROY,,,EMMA WILLARD SCHOOL,,,12180,NY,,,,,,,,,,,,,,,Troy,,NY,,47083.3,,,Level II,25-2031.00,"Secondary School Teachers, Except Special and ...",OES,,yr,,,,,,,,,,,,,,,47084.0,yr
7,,,Denied,E-2,,2012-02-06,1831 - A BLOUNT ROAD,,POMPANO BEACH,,,FDS ALUMINUM LLC,,,33069,FL,,,,,,,,,,,,,,,POMPANO BEACH,,FL,,36733.0,,,Level I,13-1051.00,Cost Estimators,OES,,yr,,,,,,,,,,,,,,,36733.0,yr
8,,,Certified,H-1B,,2012-02-29,2711 CENTERVILLE ROAD,,WILMINGTON,,,ELECTRONIC DATA SYSTEMS CORPORATION,,,19808,DE,,,,,,,,,,,,,,,Fort Worth,,TX,,44824.0,,,Level I,15-1051.00,Computer Systems Analysts,OES,,yr,,,,,,,,,,,,,,,44824.0,yr
9,,,Denied,E-2,,2012-03-30,3180 FULTON STREET,,BROOKLYN,,,"AMER BROTHERS INTERNATIONAL, INC.",,,11208,NY,,,,,,,,,,,,,,,Brooklyn,,NY,,12.86,,,Level I,41-1011.00,First-Line Supervisors/Managers of Retail Sale...,OES,,hr,,,,,,,,,,,,,,,12.86,hr


In [8]:
# List the column labels that remain after dropping the sparse features.
remaining_columns = df.columns
remaining_columns

Index(['case_number', 'case_received_date', 'case_status',
       'class_of_admission', 'country_of_citizenship', 'decision_date',
       'employer_address_1', 'employer_address_2', 'employer_city',
       'employer_country', 'employer_decl_info_title', 'employer_name',
       'employer_num_employees', 'employer_phone', 'employer_postal_code',
       'employer_state', 'employer_yr_estab', 'foreign_worker_info_city',
       'foreign_worker_info_education', 'ji_live_in_domestic_service',
       'job_info_alt_combo_ed_exp', 'job_info_alt_field',
       'job_info_combo_occupation', 'job_info_education',
       'job_info_experience', 'job_info_foreign_ed',
       'job_info_foreign_lang_req', 'job_info_job_req_normal',
       'job_info_job_title', 'job_info_training', 'job_info_work_city',
       'job_info_work_postal_code', 'job_info_work_state',
       'preparer_info_emp_completed', 'pw_amount_9089', 'pw_determ_date',
       'pw_expire_date', 'pw_level_9089', 'pw_soc_code', 'pw_soc_title',

# From the information above, we can identify relevant information regarding the permanent visa issuance. 

## Relevant features:
'case_received_date'
'case_status'
'class_of_admission'
'country_of_citizenship'
'employer_country'
'employer_city'
'employer_state'
'employer_yr_estab'
'foreign_worker_info_city'
'foreign_worker_info_education'
'job_info_education',
'job_info_experience'
'job_info_foreign_ed',
'job_info_foreign_lang_req'
'job_info_job_req_normal'
'job_info_job_title'
'job_info_training'
'job_info_work_city'
'job_info_work_state',
'preparer_info_emp_completed'
'pw_determ_date',
'pw_expire_date', 'pw_level_9089', 'pw_soc_code', 'pw_soc_title',
'pw_source_name_9089', 'pw_track_num', 'pw_unit_of_pay_9089',
'recr_info_coll_univ_teacher', 'recr_info_employer_rec_payment',
'recr_info_first_ad_start', 'recr_info_professional_occ',
'recr_info_second_ad_start', 'recr_info_sunday_newspaper',
'recr_info_swa_job_order_end', 'ri_layoff_in_past_six_months',
'ri_posted_notice_at_worksite', 'schd_a_sheepherder',
'wage_offer_from_9089', 'wage_offer_unit_of_pay_9089'