In [45]:
import project1 as p1
import utils
import numpy as np

In [7]:
# Read the three review splits. utils.load_data is project-local, so the exact
# container it returns is not visible here; the code below only iterates it once
# per split and indexes each sample by the 'text' and 'sentiment' keys.
train_data = utils.load_data('reviews_train.tsv')
val_data = utils.load_data('reviews_val.tsv')
test_data = utils.load_data('reviews_test.tsv')

# Unzip every split into two parallel tuples: the review texts and their
# sentiment labels. zip(*...) needs at least one sample; an empty split cannot
# be unpacked into two names and raises ValueError.
# NOTE(review): labels are presumably +1 / -1 — confirm against utils.load_data.
train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data))
val_texts, val_labels = zip(*((sample['text'], sample['sentiment']) for sample in val_data))
test_texts, test_labels = zip(*((sample['text'], sample['sentiment']) for sample in test_data))

# The dictionary is built from the training texts only and then reused for all
# three splits, so validation/test reviews are encoded with the training vocabulary.
# NOTE(review): presumably maps each word to a feature index — confirm in project1.bag_of_words.
dictionary = p1.bag_of_words(train_texts)

# Encode each split's texts with that shared dictionary.
# NOTE(review): presumably one row per review and one column per dictionary entry — TODO confirm.
train_bow_features = p1.extract_bow_feature_vectors(train_texts, dictionary)
val_bow_features = p1.extract_bow_feature_vectors(val_texts, dictionary)
test_bow_features = p1.extract_bow_feature_vectors(test_texts, dictionary)

# Baseline Accuracy
Now, uncomment the relevant lines in main.py and report the training and validation accuracies of each algorithm with $T = 10$ and $\lambda = 0.01$ (the $\lambda$ value only applies to Pegasos).

In [47]:
# Hyperparameters for the baseline run: T passes over the data for every
# algorithm, and regularisation strength L (used by Pegasos only).
T = 10
L = 0.01

# Perceptron: fit on the training split, then score both splits.
pct_train_accuracy, pct_val_accuracy = classifier_accuracy(
    perceptron,
    train_bow_features, val_bow_features,
    train_labels, val_labels,
    T=T,
)
print("{:35} {:.4f}".format("Training accuracy for perceptron:", pct_train_accuracy))
print("{:35} {:.4f}".format("Validation accuracy for perceptron:", pct_val_accuracy))

# Average perceptron: same data, same number of passes.
avg_pct_train_accuracy, avg_pct_val_accuracy = classifier_accuracy(
    average_perceptron,
    train_bow_features, val_bow_features,
    train_labels, val_labels,
    T=T,
)
print("{:43} {:.4f}".format("Training accuracy for average perceptron:", avg_pct_train_accuracy))
print("{:43} {:.4f}".format("Validation accuracy for average perceptron:", avg_pct_val_accuracy))

# Pegasos: additionally receives the regularisation strength L.
# The `avg_peg_*` names are kept as-is in case other cells read them.
avg_peg_train_accuracy, avg_peg_val_accuracy = classifier_accuracy(
    pegasos,
    train_bow_features, val_bow_features,
    train_labels, val_labels,
    T=T, L=L,
)
print("{:50} {:.4f}".format("Training accuracy for Pegasos:", avg_peg_train_accuracy))
print("{:50} {:.4f}".format("Validation accuracy for Pegasos:", avg_peg_val_accuracy))

Training accuracy for perceptron:   0.8157
Validation accuracy for perceptron: 0.7160
Training accuracy for average perceptron:   0.9728
Validation accuracy for average perceptron: 0.7980
Training accuracy for Pegasos:                     0.9143
Validation accuracy for Pegasos:                   0.7900


In [38]:
def classify(feature_matrix, theta, theta_0):
    """
    Label every row of feature_matrix as +1 or -1 with a linear classifier.

    Args:
        feature_matrix: 2-D numpy array with one sample per row.
        theta: weight vector, multiplied element-wise against each row.
        theta_0: scalar offset added to every row's score.

    Returns:
        A numpy integer array with one entry per row: 1 where the score
        (row . theta + theta_0) is strictly positive and -1 otherwise, so a
        score of exactly zero is labelled -1.
    """
    scores = np.sum(feature_matrix * theta, axis=1) + theta_0
    return np.where(scores > 0, 1, -1)
#pragma: coderesponse end


#pragma: coderesponse template
def accuracy(preds, targets):
    """
    Return the fraction of predictions that match their targets.

    Args:
        preds: length-N sequence or numpy array of predicted labels.
        targets: length-N sequence or numpy array of true labels.

    Returns:
        A float in [0, 1]: the share of positions where preds equals targets.
        This is a fraction, not a percentage and not a count.
    """
    # Convert both sides so plain lists/tuples are compared element-wise;
    # `list == list` would otherwise collapse to a single Python bool that
    # has no `.mean()`.
    return np.mean(np.asarray(preds) == np.asarray(targets))


def classifier_accuracy(classifier, train_feature_matrix, val_feature_matrix, train_labels, val_labels, **kwargs):
    """
    Fit a linear classifier on the training split and score it on both splits.

    Args:
        classifier: callable invoked as
            classifier(train_feature_matrix, train_labels, **kwargs) and
            returning a (theta, theta_0) pair.
        train_feature_matrix: features used for fitting and for the training score.
        val_feature_matrix: features used for the validation score.
        train_labels: labels matching train_feature_matrix.
        val_labels: labels matching val_feature_matrix.
        **kwargs: forwarded unchanged to the classifier (e.g. T, L).

    Returns:
        A tuple (train_accuracy, val_accuracy) as computed by accuracy().
    """
    theta, theta_0 = classifier(train_feature_matrix, train_labels, **kwargs)

    def score(features, labels):
        # Predict with the fitted parameters, then compare against the labels.
        return accuracy(classify(features, theta, theta_0), labels)

    return (score(train_feature_matrix, train_labels),
            score(val_feature_matrix, val_labels))

def get_order(n_samples):
    """
    Return the order in which n_samples training rows should be visited.

    If a file named '<n_samples>.txt' exists in the working directory, its
    first line is parsed as a comma-separated list of integers and returned.
    Otherwise the indices 0..n_samples-1 are shuffled after seeding the
    module-level `random` generator with 1, so the fallback order is the same
    on every call.

    Args:
        n_samples: number of rows to order.

    Returns:
        A list of integer indices.

    Raises:
        ValueError: if the order file exists but its first line is not a
            comma-separated list of integers.

    Note:
        The fallback re-seeds the global `random` generator as a side effect.
    """
    # Imported locally so the fallback works even in a kernel where no cell
    # has imported `random` (the notebook's import cell does not).
    import random

    try:
        with open(str(n_samples) + '.txt') as fp:
            line = fp.readline()
            return list(map(int, line.split(',')))
    except FileNotFoundError:
        random.seed(1)
        indices = list(range(n_samples))
        random.shuffle(indices)
        return indices



def perceptron_single_step_update(
        feature_vector,
        label,
        current_theta,
        current_theta_0):
    """
    Apply one perceptron update for a single labelled sample.

    Args:
        feature_vector: the sample's feature values.
        label: the sample's label, used as a multiplier in the update.
        current_theta: weight vector before this step.
        current_theta_0: offset before this step.

    Returns:
        A tuple (theta, theta_0). When
        label * (feature_vector . current_theta + current_theta_0) <= 0 the
        sample is treated as a mistake and the result is
        (current_theta + label * feature_vector, current_theta_0 + label);
        otherwise the inputs are returned unchanged.
    """
    agreement = label * (np.dot(feature_vector, current_theta) + current_theta_0)
    if agreement <= 0:
        # Mistake (or exactly on the boundary): move the boundary towards the sample.
        return (current_theta + np.multiply(label, feature_vector),
                current_theta_0 + label)
    return (current_theta, current_theta_0)



def perceptron(feature_matrix, labels, T):
    """
    Run the perceptron algorithm for T passes over the data.

    Each pass visits the rows in the order given by
    get_order(feature_matrix.shape[0]) and applies
    perceptron_single_step_update to every visited sample, so the update rule
    lives in one place and is shared with average_perceptron.

    Args:
        feature_matrix: 2-D numpy array with one sample per row.
        labels: sequence indexable by row number giving each sample's label.
        T: number of full passes over the data.

    Returns:
        A tuple (theta, theta_0): the weight vector and offset after the last
        update. Both start at zero.
    """
    # Read the feature count from the shape, matching average_perceptron.
    theta = np.zeros(feature_matrix.shape[1])
    theta_0 = 0
    for _ in range(T):
        for i in get_order(feature_matrix.shape[0]):
            theta, theta_0 = perceptron_single_step_update(
                feature_matrix[i], labels[i], theta, theta_0)
    return (theta, theta_0)


def average_perceptron(feature_matrix, labels, T):
    """
    Run the average perceptron algorithm for T passes over the data.

    The perceptron update is applied sample by sample in the order given by
    get_order(feature_matrix.shape[0]); after every step (whether or not the
    parameters changed) the current theta and theta_0 are added to running
    sums.

    Args:
        feature_matrix: 2-D numpy array with one sample per row.
        labels: sequence indexable by row number giving each sample's label.
        T: number of full passes over the data.

    Returns:
        A tuple (avg_theta, avg_theta_0): the running sums divided by
        T * number_of_rows.
    """
    n_rows, n_features = feature_matrix.shape[0], feature_matrix.shape[1]

    theta = np.zeros((n_features,))
    theta_0 = 0
    theta_sum = np.zeros((n_features,))
    theta_0_sum = 0

    for _epoch in range(T):
        for idx in get_order(n_rows):
            theta, theta_0 = perceptron_single_step_update(
                feature_matrix[idx, :], labels[idx], theta, theta_0)
            theta_sum = theta_sum + theta
            theta_0_sum = theta_0_sum + theta_0

    total_steps = T * n_rows
    return theta_sum / total_steps, theta_0_sum / total_steps

def pegasos_single_step_update(
        feature_vector,
        label,
        L,
        eta,
        current_theta,
        current_theta_0):
    """
    Apply one Pegasos update for a single labelled sample.

    Args:
        feature_vector: the sample's feature values (numpy array).
        label: the sample's label, used as a multiplier in the update.
        L: regularisation strength.
        eta: learning rate for this step.
        current_theta: weight vector before this step.
        current_theta_0: offset before this step.

    Returns:
        A tuple (theta, theta_0). theta is always scaled by (1 - eta * L).
        When label * (feature_vector @ current_theta + current_theta_0) <= 1,
        eta * label * feature_vector is additionally added to theta and
        eta * label to theta_0; otherwise theta_0 is returned unchanged.
    """
    margin = label*(feature_vector@current_theta + current_theta_0)
    # The regularisation shrink is applied on every step, violated margin or not.
    decayed_theta = (1 - eta*L)*current_theta
    if margin <= 1:
        return (decayed_theta + eta*label*feature_vector,
                current_theta_0 + eta*label)
    return (decayed_theta, current_theta_0)
#pragma: coderesponse end


#pragma: coderesponse template
def pegasos(feature_matrix, labels, T, L):
    """
    Run the Pegasos algorithm for T passes over the data.

    Rows are visited in the order given by get_order(feature_matrix.shape[0]).
    A single step counter runs across all passes; step number k uses the
    learning rate eta = 1 / sqrt(k) and applies pegasos_single_step_update.

    Args:
        feature_matrix: 2-D numpy array with one sample per row.
        labels: sequence indexable by row number giving each sample's label.
        T: number of full passes over the data.
        L: regularisation strength passed to every single-step update.

    Returns:
        A tuple (theta, theta_0): the weight vector and offset after the last
        step. Both start at zero.
    """
    theta = np.zeros(len(feature_matrix[0]))
    theta_0 = 0
    step = 0
    for _ in range(T):
        for row in get_order(feature_matrix.shape[0]):
            # The learning rate decays with the total number of steps taken so far.
            step += 1
            eta = 1/(np.sqrt(step))
            theta, theta_0 = pegasos_single_step_update(
                feature_matrix[row], labels[row], L, eta, theta, theta_0)
    return (theta, theta_0)