Exercise question

In the second example, the data are not linearly separable.

In [1]:
import numpy as np
from sklearn.base import BaseEstimator

class LinearClassifier(BaseEstimator):
    """
    General class for binary linear classifiers. Implements the predict
    function, which is the same for all binary linear classifiers. There are
    also two utility functions.
    """

    def decision_function(self, X):
        """
        Computes the decision function for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """
        return X.dot(self.w)

    def predict(self, X):
        """
        Predicts the outputs for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """

        # First compute the output scores
        scores = self.decision_function(X)

        # Select the positive or negative class label, depending on whether
        # the score was positive or negative.
        out = np.select([scores >= 0.0, scores < 0.0],
                        [self.positive_class,
                         self.negative_class])
        return out

    def find_classes(self, Y):
        """
        Finds the set of output classes in the output part Y of the training set.
        If there are exactly two classes, one of them is associated to positive
        classifier scores, the other one to negative scores. If the number of
        classes is not 2, an error is raised.
        """
        classes = sorted(set(Y))
        if len(classes) != 2:
            raise Exception("this does not seem to be a 2-class problem")
        self.positive_class = classes[1]
        self.negative_class = classes[0]

    def encode_outputs(self, Y):
        """
        A helper function that converts all outputs to +1 or -1.
        """
        return np.array([1 if y == self.positive_class else -1 for y in Y])


In [119]:
class SVCImpl(LinearClassifier):
    def __init__(self, n_iter=20, lr=0.01, regularizer=0.0):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.lr = lr
        self.regularizer = regularizer

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)

        # training algorithm:
        for i in range(self.n_iter):
            for x, y in zip(X, Ye):

                # Compute the output score for this instance.
                score = x.dot(self.w)

                if y * score < 1:
                    self.w = (1 - self.regularizer * self.lr) * self.w + self.lr * y * x
                else:
                    self.w = (1 - self.regularizer * self.lr) * self.w

            print(f'epoch : {i}, train_loss : {self.score(X, Y)}')

    def score(self, X, Y):
        loss = 0.0
        min_weight = np.min(self.w.dot(self.w))
        for x, y in zip(X, self.encode_outputs(Y)):
            # loss for svc
            loss += max(0.0, 1.0 - y * x.dot(self.w))
        return - (self.regularizer / 2 * min_weight + loss / X.shape[0])

In [120]:
class LogisticRegressionImpl(LinearClassifier):
    def __init__(self, n_iter=20, lr=0.01, probability=False, regularizer=0.0):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.lr = lr
        self.probability = probability
        self.regularizer = regularizer

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)

       # training algorithm:
        for i in range(self.n_iter):
            for x, y in zip(X, Ye):
                score = y * x.dot(self.w)
                self.w = (1 - self.regularizer * self.lr) * self.w + self.lr * y * self.sigmoid(-score) * x

            print(f'epoch : {i}, train_loss : {self.score(X, Y)}')

    def score(self, X, Y):
        loss = 0.0
        min_weight = np.min(self.w.dot(self.w))
        for x, y in zip(X, self.encode_outputs(Y)):
            # loss for logistic regression
            loss += np.log(1 + np.exp(-y * x.dot(self.w)))
        return  - (self.regularizer / 2 * min_weight + loss / X.shape[0])

In [121]:

import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def read_data(corpus_file):
    X = []
    Y = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            _, y, _, x = line.split(maxsplit=3)
            X.append(x.strip())
            Y.append(y)
    return X, Y


# Read all the documents.
X, Y = read_data('data/all_sentiment_shuffled.txt')

# Split into training and test parts.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)


In [156]:

# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    Normalizer(),
    SelectKBest(k=1000),
    # NB that this is our LogisticRegression implementation
    SVCImpl()
)

# Train the classifier.
t0 = time.time()
pipeline.fit(Xtrain, Ytrain)
t1 = time.time()
print('Training time: {:.2f} sec.'.format(t1-t0))

# Evaluate on the test set.
Yguess = pipeline.predict(Xtest)
print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

epoch : 0, train_loss : -0.9161084720480489
epoch : 1, train_loss : -0.8323747923487272
epoch : 2, train_loss : -0.7563517502257893
epoch : 3, train_loss : -0.6954490381839772
epoch : 4, train_loss : -0.6490632396232281
epoch : 5, train_loss : -0.6137364339077674
epoch : 6, train_loss : -0.5867036713091098
epoch : 7, train_loss : -0.5653668836367755
epoch : 8, train_loss : -0.5480847914090229
epoch : 9, train_loss : -0.5336190564677653
epoch : 10, train_loss : -0.5212886038261907
epoch : 11, train_loss : -0.5105629241758506
epoch : 12, train_loss : -0.5010372926347447
epoch : 13, train_loss : -0.49250187475177776
epoch : 14, train_loss : -0.48466442220709893
epoch : 15, train_loss : -0.4774959138847924
epoch : 16, train_loss : -0.47096398003766143
epoch : 17, train_loss : -0.465021766651516
epoch : 18, train_loss : -0.45952541110089656
epoch : 19, train_loss : -0.45440266717489086
Training time: 3.37 sec.
Accuracy: 0.8019.


In [123]:

# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(),

    # NB that this is our LogisticRegression implementation
    LogisticRegressionImpl()
)

# Train the classifier.
t0 = time.time()
pipeline.fit(Xtrain, Ytrain)
t1 = time.time()
print('Training time: {:.2f} sec.'.format(t1-t0))

# Evaluate on the test set.
Yguess = pipeline.predict(Xtest)
print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

param_grid = {
    'classifier__n_iter': [10, 30, 100],
    'classifier__regularizer': [0.1, 1, 2]
}


pipeline = Pipeline(steps=[
    ("vect",TfidfVectorizer()),
    ("select", SelectKBest(k=1000)),
    ("norm", Normalizer()),
    ("classifier", SVCImpl())
])

grid_search = GridSearchCV(pipeline, param_grid, n_jobs=-1, return_train_score=True)

grid_search.fit(X, Y)

print("Best parameter (loss=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

In [168]:
def add_sparse_to_dense(x, w, factor):
    """
    Adds a sparse vector x, scaled by some factor, to a dense vector.
    This can be seen as the equivalent of w += factor * x when x is a dense
    vector.
    """
    w[x.indices] += factor * x.data

def sparse_dense_dot(x, w, a):
    """
    Computes the dot product between a sparse vector x and a dense vector w.
    """
    return a * np.dot(w[x.indices], x.data)


class SparseSVC(LinearClassifier):

    def __init__(self, n_iter=20, regularizer=1.0):
        self.n_iter = n_iter
        self.regularizer = regularizer

    def fit(self, X, Y):
        self.find_classes(Y)

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        Ye = self.encode_outputs(Y)

        # Initialize the weight vector to all zeros.
        self.w = np.zeros(X.shape[1])

        # Iteration through sparse matrices can be a bit slow, so we first
        # prepare this list to speed up iteration.
        XY = list(zip(X, Ye))

        # initialize vector scaling
        a = 1

        for i in range(self.n_iter):
            lr =  0.01#1.0 / (self.regularizer * (i+1))

            for x, y in XY:

                # Compute the output score for this instance.
                # (This corresponds to score = x.dot(self.w) above.)
                score = sparse_dense_dot(x, self.w, a)

                # If there was an error, update the weights.
                if y*score <= 0:
                    add_sparse_to_dense(x, self.w, (lr * y / a))



            # update vector scaling
            a = (1 - self.regularizer * lr) * a

        self.w[x.indices] =  a * self.w[x.indices]

In [169]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2)),
    Normalizer(),
    SparseSVC(regularizer=0.1)
)

# Train the classifier.
t0 = time.time()
pipeline.fit(Xtrain, Ytrain)
t1 = time.time()
print('Training time: {:.2f} sec.'.format(t1-t0))

# Evaluate on the test set.
Yguess = pipeline.predict(Xtest)
print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Training time: 5.61 sec.
Accuracy: 0.8636.
