# Homework 7
## Alex Pine, 2015-12-12

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from pystruct.models import ChainCRF
from pystruct.learners import OneSlackSSVM

In [17]:
################################################################################

def read_input(data_dir, dataset_type):
    """Load one split of the data set and one-hot encode its features.

    Reads the files ``<data_dir>/Data/<dataset_type>-<i>.txt`` for
    i = 1..N, where N is 5000 for 'train' and 1000 for 'test'. Each file is
    parsed as a headerless CSV with quoting disabled (quoting=3 is
    csv.QUOTE_NONE). Column 1 holds the tag; columns 2 onward hold five
    integer features.

    Args:
        data_dir: Directory that contains the ``Data`` sub-directory.
        dataset_type: Either 'train' or 'test'.

    Returns:
        A tuple ``(X, X_encoded, y)`` of lists with one entry per file:
        ``X`` holds the integer feature arrays (after shifting the 1-based
        columns to 0-based), ``X_encoded`` holds their one-hot encodings,
        and ``y`` holds the tag arrays shifted down by 1.

    Raises:
        AssertionError: If ``dataset_type`` is neither 'train' nor 'test'.
    """
    assert dataset_type == 'train' or dataset_type == 'test', dataset_type
    num_files = 5000 if dataset_type == 'train' else 1000

    # Feature columns 0, 3 and 4 start at 1 in the files; columns 1 and 2
    # already start at 0. Subtracting this row vector shifts every sample
    # at once instead of rewriting the array row by row.
    one_based_offsets = np.array([1, 0, 0, 1, 1])

    X, y = [], []
    for i in range(1, num_files + 1):
        path = data_dir + "/Data/" + dataset_type + "-%d.txt" % i
        data = pd.read_csv(path, header=None, quoting=3)
        # Tags are 1-based in the files; shift them so they start at 0.
        y.append(data[1].values - 1)
        # The builtin int replaces the deprecated np.int alias (same dtype).
        X.append(data.values[:, 2:].astype(int) - one_based_offsets)

    # One-of-K scheme: a feature that takes values in {0,...,K-1} becomes K
    # binary features, of which only the i-th is non-zero when the feature
    # takes value i. n_values gives K for each of the five feature columns.
    # See: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
    # NOTE(review): the n_values/sparse keywords belong to older scikit-learn
    # releases; newer releases use `categories` instead -- confirm against
    # the installed version if this line raises.
    encoder = OneHotEncoder(n_values=[1, 2, 2, 201, 201],
                            sparse=False).fit(np.vstack(X))
    X_encoded = [encoder.transform(x) for x in X]
    return X, X_encoded, y

In [24]:
# Load both splits from the 'ps7_data' directory. Each call yields the raw
# integer features, their one-hot encoding, and the tag sequences.
X_train_orig, X_train_enc, y_train = read_input(data_dir='ps7_data',
                                                dataset_type='train')
X_test_orig, X_test_enc, y_test = read_input(data_dir='ps7_data',
                                             dataset_type='test')

In [25]:
################################################################################

# Problem 1: Find the value of the regularization hyperparameter "C" that
# minimizes the loss function of the SSVM.
# To do this, we maximize the 'score' value of the SSVM.

# TODO Write a function that trains an SSVM and returns the weight vector, the 
#      test score, and the training score. Inputs should be the training and 
#      test sets.
# 
# TODO Write function that finds an optimal value of C as measured by the
#      SSVM's score function. Use grid search?
#
# TODO Find the optimal C for a model trained on the first 4500 training inputs.
#      Then report the training and test error.
#
# TODO Train a model on all the training data using the value of C you found
#      before, and report its test error on the data in the test-XXX.txt files.

def split_train_test(X, y, num_train):
    """Split samples into a leading training part and a trailing held-out part.

    Args:
        X: Sequence of inputs.
        y: Sequence of labels, parallel to ``X``.
        num_train: Number of leading samples to use for training.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` where the training
        part is the first ``num_train`` samples and the held-out part is the
        last 500 samples.

    Raises:
        AssertionError: If ``X`` or ``y`` has fewer than ``num_train + 500``
            elements, i.e. the two parts would overlap.
    """
    TEST_SET_SIZE = 500
    assert len(X) >= num_train + TEST_SET_SIZE, len(X)
    assert len(y) >= num_train + TEST_SET_SIZE, len(y)
    # Take the held-out samples from the END of the data. Slicing with
    # [:-TEST_SET_SIZE] would instead return everything except the last 500,
    # which overlaps the training samples.
    return (X[:num_train], y[:num_train],
            X[-TEST_SET_SIZE:], y[-TEST_SET_SIZE:])


# TODO what effect does the 'directed' param have? 
def learn_pos_weights(X_train, y_train, X_test, y_test, C):
    """Train a chain-structured SSVM with parameter C and score it.

    A directed ChainCRF with 10 states and max-product inference is wrapped
    in a OneSlackSSVM (at most 200 iterations) and fit on the training data.

    Args:
        X_train, y_train: Inputs and labels the model is fit on.
        X_test, y_test: Inputs and labels used for the held-out score.
        C: Value passed as the ``C`` argument of OneSlackSSVM.

    Returns:
        A tuple ``(w, train_score, test_score)``: the learnt weight vector
        and the model's ``score`` on the training and test data.
    """
    model = OneSlackSSVM(
        ChainCRF(n_states=10, inference_method='max-product', directed=True),
        max_iter=200,
        C=C)
    model.fit(X_train, y_train)
    learnt_weights = model.w
    score_on_train = model.score(X_train, y_train)
    score_on_test = model.score(X_test, y_test)
    return learnt_weights, score_on_train, score_on_test

# TODO consider using grid search from sklearn since it can be parallelized
def find_best_model(X_train, y_train, X_test, y_test, C_values=None):
    """Train one model per candidate C and return the best on the test data.

    Args:
        X_train, y_train: Inputs and labels each model is fit on.
        X_test, y_test: Inputs and labels used to rank the models.
        C_values: Optional iterable of candidate values for C. Defaults to
            ``[0.1, 1.0, 10]``.

    Returns:
        A tuple ``(C, w, train_score, test_score)`` for the candidate with
        the highest test score. On ties the earliest candidate wins.

    Raises:
        ValueError: If ``C_values`` is given but empty.
    """
    # TODO no idea what good values are, TODO expand this
    candidates = [0.1, 1.0, 10] if C_values is None else list(C_values)
    if not candidates:
        raise ValueError("C_values must contain at least one candidate")

    results = []
    for C in candidates:
        w, train_score, test_score = learn_pos_weights(X_train, y_train,
                                                       X_test, y_test, C)
        results.append((C, w, train_score, test_score))
    # TODO you should probably graph this
    # max() keeps the first maximal entry, so ties go to the earliest C.
    return max(results, key=lambda result: result[3])


In [None]:
################################################################################

#TODO 4500
# Fit on the first 4500 encoded training sequences; the last two values
# returned by split_train_test serve as the validation set for choosing C.
X, y, X_val, y_val = split_train_test(X_train_enc, y_train, 4500)

C, w, train_score, test_score = find_best_model(X, y, X_val, y_val)

# A single pre-formatted string prints the same space-separated line under
# Python 2 and is also valid Python 3 (the bare print statement is not).
print("%s %s %s %s" % (C, w, train_score, test_score))