In [1]:
import os
import pandas as pd

df = pd.read_csv("source-data/PERM_2017.csv")
df = df.append(pd.read_csv("source-data/PERM_2018.csv"),sort=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

# Transform YES/No --> 1/0.
def to_binary_numeric(series):
    le = preprocessing.LabelEncoder()
    series = series.astype(str).apply(le.fit_transform)
    df = pd.DataFrame.from_records(series)
    return df

# Transform categorical data column to one hot encoding columns.
def to_one_hot_encoding(series):
    series = to_binary_numeric(series)
    enc = preprocessing.OneHotEncoder()
    enc.fit(series)
    series = enc.transform(series).toarray()
    df = pd.DataFrame.from_records(series)
    return df

# Normalize a column.
def normalize(column):
    min_max_scaler = preprocessing.MinMaxScaler()
    column_reshaped = column.values.reshape(-1, 1)
    column_scaled = min_max_scaler.fit_transform(column_reshaped)
    return pd.DataFrame(column_scaled)


# Make a copy of the data we just imported.
data = df.copy()


''' ----- pre-process lable y (CASE_STATUS) ----- '''

# In order to train a model we need to reduce the amount of data we use.
percentile = 0.03
print("We are using", percentile, "of orginal data for our model.")
data = data.sample(frac=percentile, random_state=250)
data = data.fillna(0)

# CASE_STATUS is our predicting lable. We need to transfrom its string format into numerical data.
data["CASE_STATUS"] = data["CASE_STATUS"].map({'Certified':1,'Certified-Expired':1,'Withdrawn':0, 'Denied':-1})

# Number of people get accepted is much bigger than number of people get rejected.
# In order to make our model unbiased we need to increase number of people get rejected.
rows_passed = data.loc[data["CASE_STATUS"] == 1]
rows_denied = data.loc[data["CASE_STATUS"] == -1]
print("Original data: pass ->",len(rows_passed),"Denied ->",len(rows_denied))
for i in range(0,10):
    data = data.append(rows_denied,ignore_index=True)
rows_passed = data.loc[data["CASE_STATUS"] == 1]
rows_denied = data.loc[data["CASE_STATUS"] == -1]
print("Expanded data: pass ->",len(rows_passed),"Denied ->",len(rows_denied))



''' ----- pre-process Attributes X ----- '''

# Process REFILE
data['REFILE'] = to_binary_numeric(data[['REFILE']])
data_for_classifier = pd.concat((data['REFILE'], pd.DataFrame(data["CASE_STATUS"])), axis=1)

# Process EMPLOYER_STATE
EMPLOYER_STATE_columns = to_one_hot_encoding(data[['EMPLOYER_STATE']])
data_for_classifier = pd.concat((EMPLOYER_STATE_columns, data_for_classifier), axis=1)

# Process EMPLOYER_NUM_EMPLOYEES
data['EMPLOYER_NUM_EMPLOYEES'] = normalize(data['EMPLOYER_NUM_EMPLOYEES'])
data_for_classifier = pd.concat((data['EMPLOYER_NUM_EMPLOYEES'], data_for_classifier), axis=1)

# Process FW_OWNERSHIP_INTEREST
data['FW_OWNERSHIP_INTEREST'] = to_binary_numeric(data[['FW_OWNERSHIP_INTEREST']])
data_for_classifier = pd.concat((data['FW_OWNERSHIP_INTEREST'], data_for_classifier), axis=1)

# Process PW_SOC_CODE
data['PW_SOC_CODE'] = [elem[:2] for elem in list(map(str, list(data['PW_SOC_CODE'])))]
PW_SOC_CODE_columns = to_one_hot_encoding(data[['PW_SOC_CODE']])
data_for_classifier = pd.concat((PW_SOC_CODE_columns, data_for_classifier), axis=1)

# Process PW_LEVEL_9089
data["PW_LEVEL_9089"] = data["PW_LEVEL_9089"].map({'Level I':1,'Level II':2,'Level IV':3, 'Level III':4})
data_for_classifier = pd.concat((data["PW_LEVEL_9089"], data_for_classifier), axis=1)

# Process JOB_INFO_WORK_STATE
JOB_INFO_WORK_STATE_columns = to_one_hot_encoding(data[['JOB_INFO_WORK_STATE']])
data_for_classifier = pd.concat((JOB_INFO_WORK_STATE_columns, data_for_classifier), axis=1)

# Process JOB_INFO_EDUCATION
data["JOB_INFO_EDUCATION"] = data["JOB_INFO_EDUCATION"].map({"None":1,'Other':1,'High School':2, "Associate's":3, "Bachelor's":4, "Master's":5, 'Doctorate':6})
data_for_classifier = pd.concat((data['JOB_INFO_EDUCATION'], data_for_classifier), axis=1)

# Process JI_OFFERED_TO_SEC_J_FW
data['JI_OFFERED_TO_SEC_J_FW'] = to_binary_numeric(data[['JI_OFFERED_TO_SEC_J_FW']])
data_for_classifier = pd.concat((data['JI_OFFERED_TO_SEC_J_FW'], data_for_classifier), axis=1)

# Process NAICS_US_CODE
data['NAICS_US_CODE'] = [elem[:2] for elem in list(map(str, list(data['NAICS_US_CODE'])))]
NAICS_US_CODE_columns = to_one_hot_encoding(data[['NAICS_US_CODE']])
data_for_classifier = pd.concat((NAICS_US_CODE_columns, data_for_classifier), axis=1)

# Process COUNTRY_OF_CITIZENSHIP
COUNTRY_OF_CITIZENSHIP_columns = to_one_hot_encoding(data[['COUNTRY_OF_CITIZENSHIP']])
data_for_classifier = pd.concat((COUNTRY_OF_CITIZENSHIP_columns, data_for_classifier), axis=1)

# Process CLASS_OF_ADMISSION
CLASS_OF_ADMISSION_columns = to_one_hot_encoding(data[['CLASS_OF_ADMISSION']])
data_for_classifier = pd.concat((CLASS_OF_ADMISSION_columns, data_for_classifier), axis=1)

# Process FOREIGN_WORKER_INFO_EDUCATION
data["FOREIGN_WORKER_INFO_EDUCATION"] = data["FOREIGN_WORKER_INFO_EDUCATION"].map({"0":1,"None":1,'Other':1,'High School':2, "Associate's":3, "Bachelor's":4, "Master's":5, 'Doctorate':6})
FOREIGN_WORKER_INFO_EDUCATION_columns = to_one_hot_encoding(data[['FOREIGN_WORKER_INFO_EDUCATION']])
data_for_classifier = pd.concat((FOREIGN_WORKER_INFO_EDUCATION_columns, data_for_classifier), axis=1)

# Process FW_INFO_TRAINING_COMP
FW_INFO_TRAINING_COMP_columns = to_one_hot_encoding(data[['FW_INFO_TRAINING_COMP']])
data_for_classifier = pd.concat((FW_INFO_TRAINING_COMP_columns, data_for_classifier), axis=1)

# Process NAICS_US_CODE
NAICS_US_CODE_columns = to_one_hot_encoding(data[['NAICS_US_CODE']])
data_for_classifier = pd.concat((NAICS_US_CODE_columns, data_for_classifier), axis=1)

# Process WAGE_OFFER_FROM_9089 and WAGE_OFFER_UNIT_OF_PAY_9089
data["WAGE_OFFER_UNIT_OF_PAY_9089"] = data["WAGE_OFFER_UNIT_OF_PAY_9089"].map({"Hour":2085,"Week":52,'Month':12,'Year':1})
data["WAGE_OFFER_FROM_9089"] = pd.to_numeric(data["WAGE_OFFER_FROM_9089"].astype(str).str.replace(',',''), errors='coerce').fillna(0).astype(int)
salary = data.WAGE_OFFER_UNIT_OF_PAY_9089 * data.WAGE_OFFER_FROM_9089
data_for_classifier = pd.concat((salary, data_for_classifier), axis=1)



''' ----- Build model with Support Vector Machine algorithm ----- '''

# Dropna
data_for_classifier = data_for_classifier.dropna()

# 分成train和test
y = data_for_classifier["CASE_STATUS"]
X = data_for_classifier.drop(columns=['CASE_STATUS'])

train_per = 0.3
print("Using", train_per,"of data as training data.")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("Buidling SVM Classifier model ......")

# train
clf = svm.SVC()
clf.fit(X_train, y_train)

# predict
acc = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy using test data: ",acc)






We are using 0.03 of orginal data for our model.
Original data: pass -> 5897 Denied -> 391
Expanded data: pass -> 5897 Denied -> 4301
Using 0.3 of data as training data.
Buidling SVM Classifier model ......
Accuracy using test data:  0.8982764447448462
