In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

adult_data = "data/adult.data"
model_folder = "."

In [2]:
import pandas as pd
from predictions import to_predictable

df = pd.read_csv(adult_data, header=0)

In [3]:
X = to_predictable(df)
y = df["label"]

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into train and test so that the training set can be
# used to compare performance of models
X_train, X_evaluate, y_train, y_evaluate = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=1)

In [5]:
from sklearn import model_selection
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC

models = [RandomForestClassifier(n_estimators=3, class_weight={">50K":i/(i+1), "<=50K":1/(i+1)}) for i in range(4, 7)] \
    + [KNeighborsClassifier()] \
    + [LinearDiscriminantAnalysis()] \
    + [GaussianNB()] \
    + [MultinomialNB()] \
    + [LogisticRegression(class_weight={">50K":i/(i+1), "<=50K":1/(i+1)}, solver="lbfgs", max_iter=10000) for i in range(4, 7)] \
    + [SVC(class_weight={">50K":i/(i+1), "<=50K":1/(i+1)}, probability=True, gamma="auto") for i in range(4, 7)]

best_loss = float("inf")
best_model = None

for model in models:
    
    fit_model = model.fit(X=X_train, y=y_train)
    predictions = fit_model.predict(X_evaluate)
    confusion = confusion_matrix(y_evaluate, predictions)
    loss = confusion[0][1]/5 + confusion[1][0]

    if loss < best_loss:
        best_loss = loss
        best_model = model
    print(loss, model)
        
print(best_loss, best_model)

654.2 RandomForestClassifier(bootstrap=True,
            class_weight={'>50K': 0.8, '<=50K': 0.2}, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
663.2 RandomForestClassifier(bootstrap=True,
            class_weight={'>50K': 0.8333333333333334, '<=50K': 0.16666666666666666},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=3, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False)
690.6 RandomForestClassifier(bootstrap=Tr



505.6 GaussianNB(priors=None, var_smoothing=1e-09)
1225.0 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
424.0 LogisticRegression(C=1.0, class_weight={'>50K': 0.8, '<=50K': 0.2},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=10000, multi_class='warn', n_jobs=None, penalty='l2',
          random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False)
415.8 LogisticRegression(C=1.0,
          class_weight={'>50K': 0.8333333333333334, '<=50K': 0.16666666666666666},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=10000, multi_class='warn', n_jobs=None, penalty='l2',
          random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False)
413.4 LogisticRegression(C=1.0,
          class_weight={'>50K': 0.8571428571428571, '<=50K': 0.14285714285714285},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=10000, multi_class='warn', n_jobs

In [6]:
import joblib
import os
joblib.dump(best_model, os.path.join(model_folder, "model.joblib"))

['./model.joblib']

In [7]:
# Reload the model to make sure it was writen properly
model = joblib.load(os.path.join(model_folder, "model.joblib"))

In [8]:
def get_class_probabilities(age=None,
                workclass=None,
                education_num=None,
                marital_status=None,
                occupation=None,
                relationship=None,
               race=None,
               sex=None,
               capital_gain=None,
               capital_loss=None,
               hours_per_week=None,
               native_country=None):

    example = pd.DataFrame.from_dict({
        "age":[age],
        "workclass":[workclass],
        "education-num":[education_num],
        "marital-status":[marital_status],
        "occupation":[occupation],
        "relationship":[relationship],
        "race":[race],
        "sex":[sex],
        "capital-gain":[capital_gain],
        "capital-loss":[capital_loss],
        "hours-per-week":[hours_per_week],
        "native-country":[native_country],
        "label":"?",
    })
    predictable = to_predictable(example)
    return model.predict_proba(predictable)

In [9]:
# Sanity check to make sure that two obvious samples from each class are properly classified
p0 = get_class_probabilities(40, "?", 14, "Never-married", "?", "Not-in-family", "White", "Male",       0, 0, 0, "Canada")
p1 = get_class_probabilities(40, "?", 14, "Never-married", "?", "Not-in-family", "White", "Male", 1000000, 0, 0, "Canada")
print(p0, p1)

[[0.92676447 0.07323553]] [[0. 1.]]
