# (B) Logistic Regression Model

In this second part of the lab, we will implement a language identifier trained on the same data, but using Logistic Regression instead of Naive Bayes.

In [5]:
import io, sys, math
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm
import random

This function is used to build the dictionary, or vocabulary, which is a mapping from strings (or words) to integers (or indices). This will allow us to build vector representations of documents.

In [7]:
def build_dict(filename, threshold=1):
    """Build the word and label vocabularies from a labelled text file.

    Each non-blank line is split on whitespace; the first token is the
    label and the remaining tokens are the words of the example.

    Args:
        filename: Path to a UTF-8 text file.
        threshold: A word is kept only if it occurs strictly more than
            ``threshold`` times over the whole file.

    Returns:
        A ``(word_dict, label_dict)`` pair. ``label_dict`` maps each label to
        an index in order of first appearance; ``word_dict`` maps each kept
        word to an index in order of first appearance.
    """
    word_dict, label_dict = {}, {}
    counts = defaultdict(int)

    # The context manager closes the file even if a line fails to parse.
    with io.open(filename, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no label to read, so skip it.
                continue

            label = tokens[0]
            if label not in label_dict:
                label_dict[label] = len(label_dict)

            for word in tokens[1:]:
                counts[word] += 1

    for word, count in counts.items():
        if count > threshold:
            word_dict[word] = len(word_dict)
    return word_dict, label_dict

This function is used to load the training dataset, and build vector representations of the training examples. In particular, a document or sentence is represented as a bag of words. Each example corresponds to a sparse vector `x` of dimension `V`, where `V` is the size of the vocabulary. The element `j` of the vector `x` is the number of times the word `j` appears in the document.

In [9]:
def load_data(filename, word_dict, label_dict):
    """Load a labelled text file as a list of bag-of-words examples.

    Each non-blank line is split on whitespace; the first token is the
    label and the remaining tokens are the words of the example.

    Args:
        filename: Path to a UTF-8 text file.
        word_dict: Mapping from word to column index; words missing from it
            are ignored.
        label_dict: Mapping from label string to integer index.

    Returns:
        A list of ``(yi, xi)`` pairs, where ``yi`` is the label index and
        ``xi`` is a NumPy vector of length ``len(word_dict)`` holding the
        count of each vocabulary word in the example.

    Raises:
        KeyError: If a line starts with a label that is not in ``label_dict``.
    """
    data = []
    dim = len(word_dict)

    # The context manager closes the file even if a lookup below raises.
    with io.open(filename, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no label to read, so skip it.
                continue

            yi = label_dict[tokens[0]]
            xi = np.zeros(dim)
            for word in tokens[1:]:
                wid = word_dict.get(word)
                if wid is not None:
                    xi[wid] += 1.0
            data.append((yi, xi))
    return data

In [10]:
# Build the vocabulary and label map from the training file, then use those
# same mappings to turn every training line into a (label, count-vector) pair.
word_dict, label_dict = build_dict("train1.txt")
train_data = load_data("train1.txt", word_dict, label_dict)

In [11]:
# Peek at the first five (label index, count vector) pairs.
train_data[0:5]

[(0, array([1., 1., 1., ..., 0., 0., 0.])),
 (0, array([0., 0., 0., ..., 0., 0., 0.])),
 (1, array([0., 0., 0., ..., 0., 0., 0.])),
 (0, array([0., 0., 0., ..., 0., 0., 0.])),
 (2, array([0., 0., 0., ..., 0., 0., 0.]))]

In [12]:
# Sanity-check the shapes that the training loop will work with.
y,x = train_data[0]
x = x.reshape(-1,1)  # view the count vector as a column vector
print(x.shape)

nlabels = len(label_dict)

dim = len(word_dict)
# Weight matrix: one row per label, one column per vocabulary word.
w = np.zeros([nlabels, dim])
print(w.shape)

(5826, 1)
(10, 5826)


First, let's implement the softmax function. Don't forget numerical stability!

In [14]:
def softmax(x):
    """Numerically stable softmax along the first axis.

    A 1-D input or an ``(n, 1)`` column vector is normalised as a single
    distribution. For a 2-D input each column is normalised independently.

    Args:
        x: Array-like of scores.

    Returns:
        A float array of the same shape as ``x`` whose entries are
        non-negative and sum to 1 along axis 0.
    """
    x = np.asarray(x, dtype=float)
    # Subtract the maximum of each column rather than the global maximum, so
    # that every column keeps at least one exp(0) = 1 term. With a global
    # shift, a column far below the global maximum underflows to all zeros
    # and the division produces 0/0 = nan.
    shifted = x - np.max(x, axis=0, keepdims=True)
    exp = np.exp(shifted)
    # np.sum reduces in C; the built-in sum() would iterate over rows in Python.
    return exp / np.sum(exp, axis=0, keepdims=True)

In [15]:
def one_hot(label, nlabels=10):
    """Return the one-hot column vector for ``label``.

    Args:
        label: Integer index of the active class.
        nlabels: Total number of classes. Defaults to 10, the size that was
            previously hard-coded.

    Returns:
        A float array of shape ``(nlabels, 1)`` that is 1 at row ``label``
        and 0 elsewhere.

    Raises:
        IndexError: If ``label`` is out of range for ``nlabels`` rows.
    """
    label_Encod = np.zeros((nlabels, 1))
    label_Encod[label] = 1
    return label_Encod

Now, let's implement the main training loop, by using stochastic gradient descent. The function will iterate over the examples of the training set. For each example, we will first compute the loss, before computing the gradient and performing the update.

In [17]:
def sgd(w, data, niter, lr=0.01):
    """Train a multinomial logistic regression with stochastic gradient descent.

    For every epoch the examples are visited in a fresh random order. For
    each example the class probabilities are ``softmax(w @ x)``, the
    cross-entropy loss is accumulated, and ``w`` is updated with the gradient
    ``(prediction - one_hot(label)) @ x.T``. The average loss of each epoch
    is printed.

    Args:
        w: Initial weight matrix of shape ``(nlabels, dim)``. It is not
            modified in place.
        data: Sequence of ``(label, x)`` pairs, where ``label`` is an integer
            row index into ``w`` and ``x`` is a NumPy vector with ``dim``
            entries. The sequence is not reordered.
        niter: Number of passes (epochs) over ``data``.
        lr: Learning rate used for every update.

    Returns:
        The trained weight matrix. If ``data`` is empty or ``niter`` is 0,
        ``w`` itself is returned unchanged.
    """
    nexamples = len(data)
    if nexamples == 0:
        # Nothing to learn from; this also avoids dividing the loss by zero.
        return w

    for epoch in range(niter):
        total_loss = 0.0
        # Draw a random visiting order instead of shuffling `data` itself, so
        # the caller's list keeps its original order.
        for idx in np.random.permutation(nexamples):
            label, x = data[idx]
            x = x.reshape(-1, 1)

            prediction = softmax(w @ x)

            # Cross-entropy only involves the probability of the true class.
            # Multiplying a full one-hot vector by log(prediction) would give
            # 0 * -inf = nan whenever another class underflows to 0.
            p_true = prediction[label, 0]
            total_loss += -np.log(max(p_true, np.finfo(float).tiny))

            # prediction - one_hot(label), built from the actual number of
            # rows of `w` rather than from a fixed class count.
            error = prediction.copy()
            error[label, 0] -= 1.0

            w = w - lr * (error @ x.T)

        print(f'Train loss: {total_loss / nexamples}')
    return w

The next function will predict the most probable label corresponding to example `x`, given the trained classifier `w`.

In [19]:
def predict(w, x):
    """Return the index of the most probable label for example ``x``.

    Softmax is strictly increasing in each score, so the most probable class
    is the one with the largest linear score ``w @ x``; the probabilities do
    not need to be computed.

    Args:
        w: Weight matrix of shape ``(nlabels, dim)``.
        x: Feature vector with ``dim`` entries. Any shape is accepted (flat,
            row or column) because it is flattened first.

    Returns:
        The integer row index of ``w`` with the highest score (the first one
        in case of ties).
    """
    # Flattening makes (dim,), (1, dim) and (dim, 1) inputs all behave alike.
    scores = w @ np.ravel(x)
    return np.argmax(scores)

Finally, this function will compute the accuracy of a trained classifier `w` on a validation set.

In [21]:
def compute_accuracy(w, valid_data):
    """Compute the fraction of examples that ``w`` classifies correctly.

    Args:
        w: Weight matrix passed through to ``predict``.
        valid_data: Sequence of ``(label, x)`` pairs.

    Returns:
        The accuracy as a float in ``[0, 1]``. An empty ``valid_data`` gives
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total = len(valid_data)
    if total == 0:
        return 0.0

    correct = sum(1 for label, x in valid_data if predict(w, x) == label)
    return correct / total

In [22]:
print("")
print("** Logistic Regression **")
print("")

# Build the vocabulary and label map from the training file, then vectorise
# both the training and validation files with those same mappings.
word_dict, label_dict = build_dict("train1.txt")
train_data = load_data("train1.txt", word_dict, label_dict)
valid_data = load_data("valid1.txt", word_dict, label_dict)

nlabels = len(label_dict)

dim = len(word_dict)
# Start from all-zero weights (one row per label, one column per word) and
# run 25 SGD passes over the training data.
w = np.zeros([nlabels, dim])
w = sgd(w, train_data, 25)
print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")


** Logistic Regression **

Train loss: 1.6426581044767619
Train loss: 1.1330757226126966
Train loss: 0.9304015185976311
Train loss: 0.8115719503720908
Train loss: 0.7304272428071185
Train loss: 0.6701326812742345
Train loss: 0.6228917524737739
Train loss: 0.5845687220607583
Train loss: 0.5525894166922549
Train loss: 0.5253933636237751
Train loss: 0.5018556784792395
Train loss: 0.48122307330479147
Train loss: 0.46295061903194523
Train loss: 0.44659260795727496
Train loss: 0.43186097508519355
Train loss: 0.41850014035652067
Train loss: 0.406285784018739
Train loss: 0.3950833122370528
Train loss: 0.3847559886652884
Train loss: 0.3751790452509432
Train loss: 0.3662836874336158
Train loss: 0.35798753819652096
Train loss: 0.350222305055243
Train loss: 0.3429359985789215
Train loss: 0.3360826486935626

Validation accuracy: 0.913



# Now, it is your turn, try to do it with train2.txt and valid2.txt.


In [25]:
#Write your code here.
print("")
print("** Logistic Regression **")
print("")

word_dict, label_dict = build_dict("train2.txt")
train_data = load_data("train2.txt", word_dict, label_dict)
valid_data = load_data("valid2.txt", word_dict, label_dict)

nlabels = len(label_dict)

dim = len(word_dict)
w = np.zeros([nlabels, dim])
w = sgd(w, train_data, 25)
print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")


** Logistic Regression **

Train loss: 0.8431747630480024
Train loss: 0.4677757238988976
Train loss: 0.3764932188370059
Train loss: 0.3270151174209174
Train loss: 0.29435619777211763
Train loss: 0.270525267094896
Train loss: 0.25206349603433087
Train loss: 0.23715646896254172
Train loss: 0.2247475866689033
Train loss: 0.2141914782597832
Train loss: 0.20505340546061165
Train loss: 0.19702682986086228
Train loss: 0.1898979125824302
Train loss: 0.18350446357980016
Train loss: 0.1777303938107671
Train loss: 0.17247353342480062
Train loss: 0.16764617560309159
Train loss: 0.1632193817267673
Train loss: 0.1591071030928158
Train loss: 0.1552925405264963
Train loss: 0.1517422478287347
Train loss: 0.14841702368544657
Train loss: 0.14529604131341795
Train loss: 0.14235557511535332
Train loss: 0.13958171219019622

Validation accuracy: 0.966

