In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix

# Reading the data and creating the TF matrix

In [5]:
vocab = open('20news-bydate/vocabulary.txt', 'r').read().split()
Y_train = np.array(list(map(int, open('20news-bydate/matlab/train.label', 'r').read().split()))) - 1
Y_test = np.array(list(map(int, open('20news-bydate/matlab/test.label', 'r').read().split()))) - 1
m_train = len(Y_train)
m_test = len(Y_test)
LEN_VOCAB = len(vocab)

In [6]:
X_train = pd.read_csv('20news-bydate/matlab/train.data', sep=' ', header=None, names=['doc', 'word', 'count'])
X_test = pd.read_csv('20news-bydate/matlab/test.data', sep=' ', header=None, names=['doc', 'word', 'count'])

In [7]:
X_train = X_train.groupby('doc').apply(lambda x: dict(zip(x['word'], x['count'])))
X_test = X_test.groupby('doc').apply(lambda x: dict(zip(x['word'], x['count'])))

In [8]:
def from_dic_to_array(dic):
    ar = np.zeros(LEN_VOCAB)
    for i, j in dic.items():
        ar[i-1] = j
    return ar

In [28]:
class Net(nn.Module):

    def __init__(self, input_size=LEN_VOCAB, output_size=20, hidden_layer_size=100):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_layer_size, bias=True)
        self.fc2 = nn.Linear(hidden_layer_size, output_size, bias=True)
        torch.nn.init.xavier_uniform(self.fc1.weight)
        torch.nn.init.xavier_uniform(self.fc2.weight)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.sigmoid(self.fc2(x))
        return x

net = Net()
print(net)
criterion = nn.CrossEntropyLoss()

import torch.optim as optim

Net(
  (fc1): Linear(in_features=61188, out_features=100)
  (fc2): Linear(in_features=100, out_features=20)
)


In [29]:
import math
def random_mini_batches_idx(m, mini_batch_size = 64, seed = 0):
    permutation = np.random.permutation(m)
    num_complete_minibatches = int(math.floor(m/mini_batch_size)) # number of mini batches of size mini_batch_size in your partitionning
    list_of_indices = []
    for k in range(0, num_complete_minibatches):
        list_of_indices.append(list(permutation[k * mini_batch_size: k * mini_batch_size + mini_batch_size]))
    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        list_of_indices.append(list(permutation[num_complete_minibatches * mini_batch_size : m]))
    return list_of_indices

In [30]:
def get_accuracy(X, Y):
    minibatches_indices = random_mini_batches_idx(len(X), mini_batch_size=10)
    total_correct = 0
    total_examples = 0
    for i, indices in enumerate(minibatches_indices):
        # get the inputs
        X_minibatch = X.iloc[indices]
        inputs = np.vstack(X_minibatch.apply(from_dic_to_array).tolist())
        labels = Y[indices]
        inputs = torch.from_numpy(inputs).float()
        true_labels = torch.from_numpy(labels).long()
        # wrap them in Variable
        inputs, labels = Variable(inputs), Variable(true_labels)
        # forward + backward + optimize
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total_correct += (predicted == true_labels).sum()
        total_examples += len(indices)
    return float(total_correct)/total_examples
print ("Accuracy on training set: {0}".format(get_accuracy(X_train, Y_train)))
print ("Accuracy on test set: {0}".format(get_accuracy(X_test, Y_test)))

Accuracy on training set: 0.04614428964415654
Accuracy on test set: 0.05116588940706196


In [32]:
def train(n_epochs=3, lr=0.01, momentum=.9, mini_batch_size=8, seed=0):
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)
    accuracies_train = []
    accuracies_train2 = [get_accuracy(X_train, Y_train)]
    accuracies_test = [get_accuracy(X_test, Y_test)]
    for epoch in range(n_epochs):  # loop over the dataset multiple times
        minibatches_indices = random_mini_batches_idx(m_train, mini_batch_size=mini_batch_size, seed=seed)
        running_loss = 0.0
        total_correct = 0
        total_examples = 0
        for i, indices in enumerate(minibatches_indices):
            # get the inputs
            X_minibatch = X_train.iloc[indices]
            inputs = np.vstack(X_minibatch.apply(from_dic_to_array).tolist())
            labels = Y_train[indices]
            inputs = torch.from_numpy(inputs).float()
            true_labels = torch.from_numpy(labels).long()
            # wrap them in Variable
            inputs, labels = Variable(inputs), Variable(true_labels)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            #evaluate accuracy on train set
            _, predicted = torch.max(outputs.data, 1)
            total_correct += (predicted == true_labels).sum()
            total_examples += len(indices)

            # print statistics
            running_loss += loss.data[0]
            if i % 700 == 699:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.4f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
        accuracies_train.append(total_correct/float(total_examples))
        accuracies_train2.append(get_accuracy(X_train, Y_train))
        accuracies_test.append(get_accuracy(X_test, Y_test))
    print('Finished Training')
    return accuracies_train, accuracies_train2, accuracies_test

In [33]:
accuracies_train, accuracies_train2, accuracies_test = train()

[1,   700] loss: 0.9812
[1,  1400] loss: 0.9175
[2,   700] loss: 0.8710
[2,  1400] loss: 0.8654
[3,   700] loss: 0.8458
[3,  1400] loss: 0.8278
Finished Training
