## Example. Simple NLP with PyTorch

In [15]:
import torch
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

## Download data

In [25]:
categories = ["comp.graphics","sci.space","rec.sport.baseball"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

# you can print all the available categories in this dataset
print(fetch_20newsgroups(subset='train').target_names)

print('total texts in train:',len(newsgroups_train.data))
print('total texts in test:',len(newsgroups_test.data))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
total texts in train: 1774
total texts in test: 1180


## Encoding the data for the NN

The dataset you will use consists of different texts in English and we need to manipulate this data to pass it to the neural network. To do that we do two things:
*   Create an index for each word
*   Create a matrix for each text, where the values are 1 if a word is in the text and 0 if not

In [38]:
vocab = Counter()

for text in newsgroups_train.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1
print(vocab['little'])       
for text in newsgroups_test.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1
        
total_words = len(vocab)

def get_word_2_index(vocab):
    word2index = {}
    for i,word in enumerate(vocab):
        word2index[word.lower()] = i
        
    return word2index

word2index = get_word_2_index(vocab)


181


In [18]:
def get_batch(df,i,batch_size):
    batches = []
    results = []
    texts = df.data[i*batch_size:i*batch_size+batch_size]
    categories = df.target[i*batch_size:i*batch_size+batch_size]
    for text in texts:
        layer = np.zeros(total_words,dtype=float)
        for word in text.split(' '):
            layer[word2index[word.lower()]] += 1
            
        batches.append(layer)
        
    for category in categories:
        index_y = -1
        if category == 0:
            index_y = 0
        elif category == 1:
            index_y = 1
        else:
            index_y = 2
        results.append(index_y)
            
     
    return np.array(batches),np.array(results)

## Creating a model

In [19]:
# Parameters
learning_rate = 0.01
num_epochs = 10
batch_size = 150
display_step = 1

# Network Parameters
hidden_size = 100      # 1st layer and 2nd layer number of features
input_size = total_words # Words in vocab
num_classes = 3         # Categories: graphics, sci.space and baseball

In [20]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [21]:
class OurNet(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
        super(OurNet, self).__init__()
        self.layer_1 = nn.Linear(input_size,hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)
 
     def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out

## Training

In [22]:
net = OurNet(input_size, hidden_size, num_classes)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)  

# Train the Model
for epoch in range(num_epochs):
    total_batch = int(len(newsgroups_train.data)/batch_size)
    # Loop over all batches
    for i in range(total_batch):
        batch_x,batch_y = get_batch(newsgroups_train,i,batch_size)
        articles = Variable(torch.FloatTensor(batch_x), requires_grad=True) # A PyTorch Variable is a wrapper around a PyTorch Tensor, and represents a node in a computational graph. If x is a Variable then x.data is a Tensor giving its value, and x.grad is another Variable holding the gradient of x with respect to some scalar value.
        labels = Variable(torch.LongTensor(batch_y))
        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(articles)
        loss = criterion(outputs, labels) # calculate cross entropy loss for each batch
        loss.backward() # calculates the gradient for the network and stores the gradient in the .grad attribute
        optimizer.step() # update the weights based on the gradients
        # print(articles.grad) # allow you to see the partial derivatives of the loss wrt each weight
        
        if (i+1) % 4 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' 
                   %(epoch+1, num_epochs, i+1, len(newsgroups_train.data)//batch_size, loss.data.item()))

Epoch [1/10], Step [4/11], Loss: 1.0247
Epoch [1/10], Step [8/11], Loss: 0.4093
Epoch [2/10], Step [4/11], Loss: 0.4197
Epoch [2/10], Step [8/11], Loss: 0.0040
Epoch [3/10], Step [4/11], Loss: 0.0000
Epoch [3/10], Step [8/11], Loss: 0.0000
Epoch [4/10], Step [4/11], Loss: 0.0002
Epoch [4/10], Step [8/11], Loss: 0.0000
Epoch [5/10], Step [4/11], Loss: 0.0000
Epoch [5/10], Step [8/11], Loss: 0.0000
Epoch [6/10], Step [4/11], Loss: 0.0000
Epoch [6/10], Step [8/11], Loss: 0.0000
Epoch [7/10], Step [4/11], Loss: 0.0000
Epoch [7/10], Step [8/11], Loss: 0.0000
Epoch [8/10], Step [4/11], Loss: 0.0012
Epoch [8/10], Step [8/11], Loss: 0.0144
Epoch [9/10], Step [4/11], Loss: 0.1411
Epoch [9/10], Step [8/11], Loss: 0.0000
Epoch [10/10], Step [4/11], Loss: 0.2130
Epoch [10/10], Step [8/11], Loss: 0.0091


## Testing

In [39]:
# Test the Model
correct = 0
total = 0
total_test_data = len(newsgroups_test.target)
batch_x_test,batch_y_test = get_batch(newsgroups_test,0,total_test_data)
articles = Variable(torch.FloatTensor(batch_x_test))
labels = torch.LongTensor(batch_y_test)
outputs = net(articles)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum()
    
print('Accuracy of the network on the 1180 test articles: %d %%' % (100 * correct / total))

Accuracy of the network on the 1180 test articles: 91 %


## Exercise

1) In the above example we play with 3 categories: graphics, sci.space and baseball. Try to use other categories and see the results (make sure that you also change **num_classes** parameter). 

2) Change **learning_rate**, **num_epochs**, **batch_size** and  **hidden_size** parameters. Explore the results.

3) Experiment with different:


- **criterion** (e.g. MSELoss, L1Loss)

- **optimizer** (e.g. SGD), more info:  [```torch.optim```](https://pytorch.org/docs/stable/optim.html)

