In [1]:
import sklearn as sk
import pandas as pd

In [2]:
# First, let's construct the different models that we will use.
from sklearn import tree
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import svm
from sklearn import neural_network

# Load the engineered H-2B dataset. Other engineered files that can be
# swapped in here instead:
#   'H-2B_Engineered_Data.csv'
#   'H-2B_Engineered_Data_Downsampling_Only.csv'
df = pd.read_csv('H-2B_Engineered_Data_Both.csv')

# CASE_STATUS is the target passed as `y` to every model below; all
# remaining columns form the feature matrix.
labels = df['CASE_STATUS']
features = df.drop(columns=['CASE_STATUS'])

In [5]:
from sklearn import decomposition
from sklearn import preprocessing as pp
from sklearn import neighbors as knn
from sklearn import pipeline
from sklearn import model_selection
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV


# Preprocessing shared by every model pipeline below: numeric columns are
# standardized, categorical columns are one-hot encoded, and each group is
# routed to its transformer by column name.
numeric_features = [
    'NBR_WORKERS_REQUESTED',
    'BASIC_NUMBER_OF_HOURS',
    'BASIC_RATE_OF_PAY',
    'SUPERVISE_HOW_MANY',
    'NUM_OF_MONTHS_TRAINING',
    'EMP_EXP_NUM_MONTHS',
    'WORK_DAY_LENGTH',
]
numeric_transformer = pipeline.Pipeline([
    ('scaler', pp.StandardScaler()),
])

categorical_features = [
    'SOC_CODE',
    'NAICS_CODE',
    'NATURE_OF_TEMPORARY_NEED',
    'EDUCATION_LEVEL',
    'CITY_MATCH',
    'STATE_MATCH',
]
# The encoder is asked for dense output (sparse=False) and told to ignore
# categories it did not see while fitting rather than raise on them.
categorical_transformer = pipeline.Pipeline([
    ('onehot', pp.OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])



We want to figure out the best parameters for Decision Tree, K Nearest Neighbors, SVM, Random Forest Classifier, and MLP Classifier.

In [8]:
# Decision tree search: preprocess -> PCA -> tree, scored by 5-fold
# cross-validated accuracy.
pipe = pipeline.Pipeline([
    ('preprocess', preprocessor),
    ('dim', decomposition.PCA()),
    ('tree', tree.DecisionTreeClassifier()),
])

# Grid keys use the '<step name>__<parameter>' form, so every entry here
# targets the 'tree' step of the pipeline.
# NOTE(review): no random_state is fixed on the tree, so the selected
# parameters may vary between runs -- confirm whether that is intended.
tuned_parameters = dict(
    tree__max_depth=[5, 10, 15, 20],
    tree__min_samples_leaf=[5, 10, 15, 20],
    tree__max_features=[5, 10, 15],
    tree__criterion=['gini', 'entropy'],
)

tree_model = GridSearchCV(estimator=pipe, param_grid=tuned_parameters, scoring='accuracy', cv=5)
tree_model.fit(features, labels)
print(tree_model.best_params_, tree_model.best_score_, sep='\n')

{'tree__criterion': 'entropy', 'tree__max_depth': 20, 'tree__max_features': 15, 'tree__min_samples_leaf': 5}
0.8277323645171056


In [9]:
# K-nearest-neighbours search: preprocess -> PCA -> KNN, scored by 5-fold
# cross-validated accuracy.
pipe = pipeline.Pipeline([
    ('preprocess', preprocessor),
    ('dim', decomposition.PCA()),
    ('knn', neighbors.KNeighborsClassifier()),
])

# Try every neighbourhood size from 1 through 24 on the 'knn' step.
tuned_parameters = dict(knn__n_neighbors=list(range(1, 25)))

knn_model = GridSearchCV(estimator=pipe, param_grid=tuned_parameters, scoring='accuracy', cv=5)
knn_model.fit(features, labels)
print(knn_model.best_params_, knn_model.best_score_, sep='\n')

{'knn__n_neighbors': 1}
0.8870723584620043


In [12]:
# Support vector machine search: preprocess -> PCA -> SVC, scored by 5-fold
# cross-validated accuracy. gamma is pinned to 'auto' for every candidate.
pipe = pipeline.Pipeline([
    ('preprocess', preprocessor),
    ('dim', decomposition.PCA()),
    ('svm', svm.SVC(gamma='auto')),
])

# Sweep the regularisation strength C against each kernel on the 'svm' step.
tuned_parameters = dict(
    svm__C=[0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
    svm__kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

svm_model = GridSearchCV(estimator=pipe, param_grid=tuned_parameters, scoring='accuracy', cv=5)
svm_model.fit(features, labels)
print(svm_model.best_params_, svm_model.best_score_, sep='\n')

{'svm__C': 2, 'svm__kernel': 'rbf'}
0.7962458371177717


In [7]:
import warnings

# MLP search: preprocess -> PCA -> MLPClassifier, scored by 5-fold
# cross-validated accuracy over activation function and solver.
# NOTE(review): no random_state is fixed on the network, so results may vary
# between runs -- confirm whether that is intended.
tuned_parameters = {
    'net__activation': ['logistic', 'tanh', 'relu'],
    'net__solver': ['sgd', 'adam']
}
pipe = pipeline.Pipeline(steps = [('preprocess', preprocessor), ('dim', decomposition.PCA()), ('net', neural_network.MLPClassifier())])
net_model = GridSearchCV(pipe, param_grid=tuned_parameters, scoring='accuracy', cv=5)

# Suppress warnings only for the duration of the fit (presumably the MLP's
# convergence warnings). catch_warnings() restores the previous filter state
# on exit, even if fit() raises, instead of leaving warnings silenced or
# overwriting the kernel's filters with "default".
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    net_model.fit(features, labels)

print(net_model.best_params_)
print(net_model.best_score_)

{'net__activation': 'relu', 'net__solver': 'adam'}
0.8522555252800484


In [6]:
from sklearn import ensemble

# Random forest search: preprocess -> PCA -> 10-tree forest, scored by
# 5-fold cross-validated accuracy.
pipe = pipeline.Pipeline([
    ('preprocess', preprocessor),
    ('dim', decomposition.PCA()),
    ('rfc', ensemble.RandomForestClassifier(n_estimators=10)),
])

# Depths 35 through 55, leaf sizes 8/10/12, and both feature-subset rules,
# all applied to the 'rfc' step.
# NOTE(review): no random_state is fixed on the forest, so the selected
# parameters may vary between runs -- confirm whether that is intended.
tuned_parameters = dict(
    rfc__max_depth=list(range(35, 56)),
    rfc__min_samples_leaf=list(range(8, 13, 2)),
    rfc__max_features=['sqrt', 'log2'],
)

rfc_model = GridSearchCV(estimator=pipe, param_grid=tuned_parameters, scoring='accuracy', cv=5)
rfc_model.fit(features, labels)
print(rfc_model.best_params_, rfc_model.best_score_, sep='\n')

{'rfc__max_depth': 44, 'rfc__max_features': 'sqrt', 'rfc__min_samples_leaf': 8}
0.8483197093551317


Now, we make our classifiers based on the parameters that we have tuned.

In [9]:
# Instantiate each model with the best hyper-parameters printed by the grid
# searches above and collect them as (name, estimator) pairs. The list is
# consumed both by the per-model evaluation loop and by the voting ensemble.
classifiers = []

# The estimator is bound to `tree_clf`, not `tree`: rebinding `tree` would
# shadow the imported sklearn `tree` module, so re-running this cell (or the
# decision-tree grid search) would fail with AttributeError.
tree_clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=20, max_features=15, min_samples_leaf=5)
classifiers.append(('tree', tree_clf))

rfc = ensemble.RandomForestClassifier(n_estimators=10, max_depth=44, max_features='sqrt', min_samples_leaf=8)
classifiers.append(('rfc', rfc))

knn = neighbors.KNeighborsClassifier(n_neighbors=1)
classifiers.append(('knn', knn))

# Naive Bayes was not part of the grid searches; it is used with defaults.
nb = naive_bayes.GaussianNB()
classifiers.append(('nb', nb))

sv = svm.SVC(gamma='auto', C=2.0, kernel='rbf')
classifiers.append(('svm', sv))

net = neural_network.MLPClassifier(activation='relu', solver='adam')
classifiers.append(('neural_network', net))

In [10]:
from sklearn import metrics

# Evaluate each tuned classifier on its own: wrap it in the shared
# preprocess -> PCA pipeline, collect 5-fold cross-validated predictions,
# and print the estimator, its confusion matrix and classification report.

# Warnings are silenced only inside this block. catch_warnings() restores the
# previous filter state on exit, even if one of the models raises, instead of
# leaving warnings disabled or overwriting the filters with "default".
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for name, cl in classifiers:
        pipe = pipeline.Pipeline(steps = [('preprocess', preprocessor), ('dim', decomposition.PCA()), ('classifier', cl)])
        pr = model_selection.cross_val_predict(pipe, features, labels, cv=5)

        print(cl)
        print(metrics.confusion_matrix(labels, pr))
        print(metrics.classification_report(labels, pr))

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=20,
                       max_features=15, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
[[1319  259]
 [ 302 1423]]
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      1578
           1       0.85      0.82      0.84      1725

    accuracy                           0.83      3303
   macro avg       0.83      0.83      0.83      3303
weighted avg       0.83      0.83      0.83      3303

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=44, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
            

In [11]:
from sklearn import ensemble

# Combine all tuned classifiers into a single voting ensemble (default voting
# settings) and evaluate it the same way as the individual models.
voting = ensemble.VotingClassifier(classifiers)

pipe = pipeline.Pipeline(steps = [('preprocess', preprocessor), ('dim', decomposition.PCA()), ('classifier', voting)])

# Silence warnings only for the cross-validated prediction. catch_warnings()
# restores the previous filter state on exit, even if prediction raises.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pr = model_selection.cross_val_predict(pipe, features, labels, cv=5)

print(voting)
print(metrics.confusion_matrix(labels, pr))
print(metrics.classification_report(labels, pr))

VotingClassifier(estimators=[('tree',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='entropy',
                                                     max_depth=20,
                                                     max_features=15,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=5,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best')),
     