In [1]:
import sklearn as sk
import pandas as pd

In [2]:
# First, lets construct the different models that we will use.
from sklearn import tree
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import svm
from sklearn import neural_network

# read data engineered csv
# df = pd.read_csv('H-2B_Engineered_Data.csv')
df = pd.read_csv('H-2B_Engineered_Data_Both.csv')
# df = pd.read_csv('H-2B_Engineered_Data_Downsampling_Only.csv')
labels = df.loc[:,'CASE_STATUS']
features = pd.DataFrame(df.drop(labels = ['CASE_STATUS'], axis = 1))

classifiers = []

#classifiers
norm_tree = tree.DecisionTreeClassifier()
classifiers.append(('tree', norm_tree))
depth_tree = tree.DecisionTreeClassifier(max_depth = 10)
classifiers.append(('depth_tree', depth_tree))
knn = neighbors.KNeighborsClassifier(n_neighbors = 5)
classifiers.append(('knn', knn))
nb = naive_bayes.GaussianNB()
classifiers.append(('nb', nb))
sv = svm.SVC(gamma = 'auto')
classifiers.append(('svm', sv))
net = neural_network.MLPClassifier()
classifiers.append(('neural_network', net))

In [3]:
features["NAICS_CODE"].value_counts()

56    1151
23     680
72     511
71     272
11     103
32      73
33      63
48      61
81      58
54      53
21      51
31      47
44      42
42      40
53      21
62      20
61      19
45      14
22       9
51       7
55       4
52       2
49       1
92       1
Name: NAICS_CODE, dtype: int64

In [4]:
features["SOC_CODE"].value_counts()

37    1394
47     710
35     407
39     202
51     178
53     155
45      84
49      40
43      35
41      30
27      18
11      15
33      11
25       7
13       5
31       4
29       2
19       2
17       2
15       2
Name: SOC_CODE, dtype: int64

In [5]:
from sklearn import decomposition
from sklearn import preprocessing as pp
from sklearn import neighbors as knn
from sklearn import pipeline
from sklearn import model_selection
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


# Build a preprocessor to scale numeric features and one hot encode categorical features
numeric_features = ['NBR_WORKERS_REQUESTED', 
                    'BASIC_NUMBER_OF_HOURS', 
                    'BASIC_RATE_OF_PAY', 
                    'SUPERVISE_HOW_MANY', 
                    'NUM_OF_MONTHS_TRAINING',
                    'EMP_EXP_NUM_MONTHS',
                    'WORK_DAY_LENGTH']
numeric_transformer = pipeline.Pipeline(steps=[
    ('scaler', pp.StandardScaler())])

categorical_features = ['SOC_CODE',
                        'NAICS_CODE',
                       'NATURE_OF_TEMPORARY_NEED',
                       'EDUCATION_LEVEL',
                       'CITY_MATCH',
                       'STATE_MATCH']
categorical_transformer = pipeline.Pipeline(steps=[
    ('onehot', pp.OneHotEncoder(sparse = False, handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])



In [6]:
from sklearn import metrics

# The results from the individual classifiers

for name, cl in classifiers:
    pipe = pipeline.Pipeline(steps = [('preprocess', preprocessor), ('dim', decomposition.PCA()), ('classifier', cl)])
    pr = model_selection.cross_val_predict(pipe, features, labels, cv = 5)
    
    print(cl)
    print(metrics.confusion_matrix(labels, pr))
    print(metrics.classification_report(labels, pr))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
[[1452  126]
 [ 290 1435]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1578
           1       0.92      0.83      0.87      1725

    accuracy                           0.87      3303
   macro avg       0.88      0.88      0.87      3303
weighted avg       0.88      0.87      0.87      3303

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_sa



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
[[1287  291]
 [ 231 1494]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.83      1578
           1       0.84      0.87      0.85      1725

    accuracy                           0.84      3303
   macro avg       0.84      0.84      0.84      3303
weighted avg       0.84      0.84      0.84      3303





In [7]:
from sklearn import ensemble

voting = ensemble.VotingClassifier(classifiers)

pipe = pipeline.Pipeline(steps = [('preprocess', preprocessor), ('dim', decomposition.PCA()), ('classifier', voting)])
pr = model_selection.cross_val_predict(pipe, features, labels, cv = 5)
    
print(voting)
print(metrics.confusion_matrix(labels, pr))
print(metrics.classification_report(labels, pr))



VotingClassifier(estimators=[('tree',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best')),
    