In [1]:
# Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from IPython.display import display
import pickle, os.path
from sklearn.svm import SVC
from util import *
from sklearn.metrics import accuracy_score, fbeta_score



In [2]:
# Load the four pre-split datasets from the pickle/ directory.  The file stem
# of each pickle matches the variable it is bound to; the unpacking order below
# must stay in sync with the tuple of names.
X_train, X_test, y_train, y_test = (
    load_pickle('pickle/{}.pickle'.format(split_name))
    for split_name in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [None]:
# beta value handed to fbeta_score by the scoring wrapper below.
beta = 0.5


def f1_wrap(y_true, y_predict):
    """Score predictions with fbeta_score using the module-level ``beta``.

    Despite the name, this is an F-beta score; it is only the F1 score when
    ``beta`` equals 1.

    Parameters:
        y_true: ground-truth labels.
        y_predict: labels predicted by the estimator.

    Returns:
        Whatever ``fbeta_score`` returns for these inputs.
    """
    score = fbeta_score(y_true, y_predict, beta=beta)
    return score

# Tuned classifier: produced by the grid search below, or loaded from the
# cached pickle when the search is disabled.
best_clf = None

# Note for the reviewer: the full grid search is slow (it was left running
# overnight on the author's machine), so enable it with caution.  It is
# probably best to shrink the parameter grid before proceeding.
GRID_SEARCH_ENABLED = True
if GRID_SEARCH_ENABLED:
    # Initialize the classifier
    clf = SVC(random_state=0)

    # Create the parameters list you wish to tune.  `degree` is only read by
    # the 'poly' kernel, so it is varied for that kernel alone; crossing it
    # with the other kernels would refit identical models once per degree.
    c_values = range(1, 6)
    parameters = [
        {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': c_values},
        {'kernel': ['poly'], 'C': c_values, 'degree': range(1, 6)},
    ]

    # Make an fbeta_score scoring object
    scorer = make_scorer(f1_wrap)

    # Perform grid search on the classifier using 'scorer' as the scoring method
    grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit = grid_obj.fit(X_train, y_train)

    # Get the estimator
    best_clf = grid_fit.best_estimator_

    # Cache the tuned model.  Pickle protocol 2 is a binary format, so the
    # file must be opened in binary mode ("wb"), not text mode.
    with open("best_clf.pkl", "wb") as f:
        pickle.dump(best_clf, f, 2)
elif os.path.isfile("best_clf.pkl"):
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # best_clf.pkl that this notebook wrote itself.
    with open("best_clf.pkl", "rb") as f:
        best_clf = pickle.load(f)
else:
    raise Exception("No tuned model exists.  Please enable Grid search.")
    

# Make predictions using the unoptimized and the optimized model.
# `clf` is otherwise only bound inside the GRID_SEARCH_ENABLED branch, so the
# baseline classifier is built here; that keeps this section working when the
# tuned model was loaded from disk instead of being searched for.
clf = SVC(random_state=0)
predictions = clf.fit(X_train, y_train).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores.  The F-scores use the same `beta` as the
# grid-search scorer so the numbers are comparable.  The single-argument
# print(...) form gives the same output under Python 2 and Python 3.
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta=beta)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=beta)))

# Print the final parameters
if GRID_SEARCH_ENABLED:
    # grid_fit only exists when the search ran in this session; show the five
    # best-scoring parameter combinations (ascending, best last).
    # NOTE(review): `pd` is not imported explicitly in this notebook --
    # presumably it arrives via `from util import *`; confirm.
    df = pd.DataFrame(grid_fit.grid_scores_).sort_values('mean_validation_score').tail()
    display(df)
# Report the tuned estimator's parameters (best_clf), not the baseline's.
print("Parameters for the optimal model: {}".format(best_clf.get_params()))