# Code for dataset loading

In [None]:
# Make necessary imports
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
from sklearn.datasets import fetch_20newsgroups

# Seed every random number generator used in this notebook so that repeated
# runs are comparable. torch has its own generator (used by torch.randn and
# by the initialisation of the nn.Linear layers), so it is seeded as well.
random_state = 0
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)

In [None]:
# Newsgroups to load. The position of a name in this list is the integer
# target that dict_categories below uses as its key.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Dictionary for merging similar classes together: original target index ->
# merged class id. Each range below is one merged class, in order:
#   0: alt.atheism, 1: comp.*, 2: misc.forsale, 3: rec.*, 4: sci.*,
#   5: soc.religion.christian, 6: talk.*
dict_categories = {
    original_index: merged_class
    for merged_class, original_indices in enumerate([
        range(0, 1),
        range(1, 6),
        range(6, 7),
        range(7, 11),
        range(11, 15),
        range(15, 16),
        range(16, 20),
    ])
    for original_index in original_indices
}

# Fetch the training subset first, then the testing subset, both restricted
# to the newsgroups listed in `categories`.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset, categories=categories)
    for subset in ('train', 'test')
)

# Report how many documents each subset contains.
print('Total samples in training data:', len(newsgroups_train.data))
print('Total samples in testing data:', len(newsgroups_test.data))

Total samples in training data: 11314
Total samples in testing data: 7532


In [None]:
# Show the distinct integer targets and the newsgroup names of the training
# subset, so the index -> name correspondence can be checked by eye.
for label, value in (
    ("Unique data targets: ", np.unique(newsgroups_train.target)),
    ("Unique data target names: \n", newsgroups_train.target_names),
):
    print(label, value)

Unique data targets:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
Unique data target names: 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [None]:
def clean(text):
  """ Function to clean the text

  Applies the following steps, in this order:
    1. remove every line that starts with an http(s) URL;
    2. decode a few HTML leftovers (<br />, &quot;, &#39;, &amp;);
    3. turn newlines / carriage returns into spaces, expand " u " to
       " you ", drop backticks, and collapse runs of spaces, "!" and "?";
    4. remove numbers (optional sign, optional decimal part);
    5. drop non-ASCII characters and anything that looks like an HTML tag.

  Args:
    text: raw document string.

  Returns:
    The cleaned string (possibly empty).
  """
  # Lines beginning with a URL are removed together with their line break.
  text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  texter = re.sub(r"<br />", " ", text)
  texter = re.sub(r"&quot;", "\"", texter)
  # &#39; is the HTML entity for an apostrophe, not for a double quote.
  texter = re.sub('&#39;', "'", texter)
  texter = re.sub('\n', " ", texter)
  texter = re.sub(' u ', " you ", texter)
  texter = re.sub('`', "", texter)
  texter = re.sub(' +', ' ', texter)
  texter = re.sub(r"(!)\1+", r"!", texter)
  texter = re.sub(r"(\?)\1+", r"?", texter)
  texter = re.sub('&amp;', 'and', texter)
  texter = re.sub('\r', ' ', texter)
  # Remove numbers from string
  texter = re.sub(r"[+-]?\d+(?:\.\d+)?", "", texter)
  # Removing a number that stood between two spaces leaves a double space.
  texter = texter.replace("  ", " ")
  texter = texter.encode('ascii', 'ignore').decode('ascii')
  # Strip HTML-like tags (shortest "<...>" spans).
  texter = re.sub(r'<.*?>', '', texter)
  return texter

def get_word_2_index(vocab):
  """ Map each lower-cased word of `vocab` to its position in `vocab`.

  When two entries lower-case to the same word, the later position wins.
  """
  return {token.lower(): position for position, token in enumerate(vocab)}

def get_vocab_using_bow(newsgroups_train, newsgroups_test):
  """ Build the bag-of-words vocabulary over both data subsets.

  Every document in `newsgroups_train.data` and then `newsgroups_test.data`
  is passed through clean(), split on single spaces and lower-cased; the
  resulting tokens are counted.

  Returns:
    (vocab, word2index): the Counter of token frequencies and the
    token -> index mapping produced by get_word_2_index(vocab).
  """
  vocab = Counter()

  # Training documents are counted first, then testing documents.
  for dataset in (newsgroups_train, newsgroups_test):
    for document in dataset.data:
      vocab.update(token.lower() for token in clean(document).split(' '))

  word2index = get_word_2_index(vocab)
  print("Vocabulary size [Bag-of-words]: ", len(vocab))
  return vocab, word2index

def get_vocab_using_glove(dim, glove_path=None):
  """ Function to get vocabulary using GloVe Embeddings

  Args:
    dim: embedding dimension; can take a value from 50, 100, 200, 300.
      Only used to pick the default file name.
    glove_path: optional path of the GloVe text file to read. When omitted,
      the original Google Drive location of glove.6B.<dim>d.txt is used.

  Returns:
    (vocab, None): vocab maps each word (first token of a line) to a
    float32 numpy vector built from the remaining tokens of that line. The
    second element is None because no word -> index mapping is built here.
  """
  if glove_path is None:
    glove_path = "/content/drive/MyDrive/Colab Notebooks/NNDL/Project/ARDL/glove.6B.{}d.txt".format(dim)

  vocab = {}
  # Read explicitly as UTF-8 so the result does not depend on the platform's
  # default encoding.
  with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
      values = line.split()
      if not values:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        continue
      vocab[values[0]] = np.asarray(values[1:], "float32")

  print("Vocabulary size [GloVe]: ", len(vocab))
  return vocab, None

def get_batch(df, i, batch_size, vocab, emb, word2index):
  """ Function to convert text into embeddings for a batch of data

  emb can take values "bow", "gloveGEN" and "gloveLSTM" corresponding
  to Bag of words model, GloVe embeddings (one vector per paragraph) and
  GloVe embeddings (a matrix corresponding per paragraph) respectively.

  Args:
    df: dataset object exposing `.data` (texts) and `.target` (labels).
    i: zero-based batch index.
    batch_size: number of samples per batch; the last batch may be shorter.
    vocab: for "bow", the vocabulary (only its length is used); for the
      GloVe modes, a mapping word -> vector that contains the key "a".
    emb: embedding type, see above.
    word2index: word -> feature index mapping; only used for "bow".

  Returns:
    (batches, results) as numpy arrays: one feature entry per text, and the
    merged class id (via dict_categories) of each sample.

  Raises:
    ValueError: if `emb` is not one of the supported embedding types.
    KeyError: in "bow" mode, if a word is missing from `word2index`.
  """
  if emb not in ("bow", "gloveGEN", "gloveLSTM"):
    raise ValueError("### Invalid embedding type ###: {!r}".format(emb))

  start = i * batch_size
  stop = start + batch_size
  texts = df.data[start:stop]
  targets = df.target[start:stop]

  batches = []
  for text in texts:
    words = clean(text).split(' ')

    if emb == "bow":
      # Computation for bag of words model: per-word occurrence counts.
      layer = np.zeros(len(vocab), dtype=float)
      for word in words:
        layer[word2index[word.lower()]] += 1
    else:
      # Try the word as written first, then its lower-cased form, so that
      # capitalised words are still found in a lower-case vocabulary.
      vectors = []
      for word in words:
        key = word if word in vocab else word.lower()
        if key in vocab:
          vectors.append(vocab[key])

      if emb == "gloveGEN":
        # Computation for GloVe embedding - general: sum of word vectors.
        layer = np.zeros(len(vocab["a"]), dtype=float)
        for vector in vectors:
          layer += vector
      else:
        # Computation for GloVe embedding - LSTM: sequence of word vectors.
        # NOTE(review): sequences have different lengths per text, so the
        # np.array() call below may not yield a regular array - confirm
        # how the caller pads/consumes this mode.
        layer = vectors

    batches.append(layer)

  # Map the original targets onto the merged classes.
  results = [dict_categories[target] for target in targets]

  return np.array(batches), np.array(results)

In [None]:
# Parameters
learning_rate = 0.01
num_epochs = 10
batch_size = 150
display_step = 1

# Network Parameters
hidden_size = 100      # Width of both hidden layers

#####
#vocab, word2index = get_vocab_using_bow(newsgroups_train, newsgroups_test)
#input_size = len(vocab) # One input feature per vocabulary word
#####
vocab, word2index = get_vocab_using_glove(dim=300) # dim = 50, 100, 200, 300
input_size = len(vocab["a"]) # Embedding dimension (length of one GloVe vector)
#####

# Number of merged classes. Derived from dict_categories (class ids start at
# 0) so the size of the output layer always matches the label mapping.
num_classes = max(dict_categories.values()) + 1

from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
class OurNet(nn.Module):
    """Feed-forward classifier: two ReLU-activated hidden layers of equal
    width followed by a linear output layer producing one score per class."""

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        """Return the unnormalised class scores for the input batch `x`."""
        hidden = self.relu(self.layer_1(x))
        hidden = self.relu(self.layer_2(hidden))
        return self.output_layer(hidden)
# Sanity check of nn.CrossEntropyLoss on random data.
# logits:  [batch_size, n_labels] scores (here 2 samples, 7 labels)
# target:  one class index per sample, in [0, n_labels)
# The name `sample_logits` is used instead of `input` so the builtin input()
# is not shadowed for the rest of the notebook.
loss = nn.CrossEntropyLoss()
sample_logits = torch.randn(2, 7, requires_grad=True)
print(sample_logits)
target = torch.LongTensor(2).random_(7)
output = loss(sample_logits, target)
# Back-propagate to confirm gradients reach the logits.
output.backward()

Vocabulary size [GloVe]:  400000
tensor([[-0.3591, -0.3044, -0.2761,  0.6093, -0.4150, -0.8868, -0.9012],
        [ 0.4209, -0.0140,  0.8190,  0.2695, -0.3682, -0.6611,  0.6439]],
       requires_grad=True)


In [None]:
net = OurNet(input_size, hidden_size, num_classes)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
#####
emb = "gloveGEN" #"bow", "gloveGEN" and "gloveLSTM"
#####

# Number of mini-batches per epoch, rounded up so that the final, shorter
# batch is trained on as well instead of being dropped. It does not change
# between epochs, so it is computed once.
total_batch = (len(newsgroups_train.data) + batch_size - 1) // batch_size

# Train the Model
for epoch in range(num_epochs):
    # Loop over all batches
    for i in range(total_batch):
        # Texts are converted to features batch by batch.
        batch_x, batch_y = get_batch(newsgroups_train, i, batch_size, vocab, emb, word2index)
        articles = torch.as_tensor(batch_x, dtype=torch.float32)
        labels = torch.as_tensor(batch_y, dtype=torch.long)

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the loss of every 4th batch.
        if (i+1) % 4 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch+1, num_epochs, i+1, total_batch, loss.item()))


        

Epoch [1/10], Step [4/75], Loss: 35.4270
Epoch [1/10], Step [8/75], Loss: 8.3492
Epoch [1/10], Step [12/75], Loss: 2.7281
Epoch [1/10], Step [16/75], Loss: 2.2970
Epoch [1/10], Step [20/75], Loss: 1.7194
Epoch [1/10], Step [24/75], Loss: 1.7143
Epoch [1/10], Step [28/75], Loss: 1.7290
Epoch [1/10], Step [32/75], Loss: 1.7897
Epoch [1/10], Step [36/75], Loss: 1.8303
Epoch [1/10], Step [40/75], Loss: 1.4828
Epoch [1/10], Step [44/75], Loss: 1.5176
Epoch [1/10], Step [48/75], Loss: 1.4640
Epoch [1/10], Step [52/75], Loss: 1.4455
Epoch [1/10], Step [56/75], Loss: 1.2459
Epoch [1/10], Step [60/75], Loss: 1.2882
Epoch [1/10], Step [64/75], Loss: 1.3257
Epoch [1/10], Step [68/75], Loss: 1.2401
Epoch [1/10], Step [72/75], Loss: 1.3542
Epoch [2/10], Step [4/75], Loss: 1.2921
Epoch [2/10], Step [8/75], Loss: 1.2329
Epoch [2/10], Step [12/75], Loss: 1.2873
Epoch [2/10], Step [16/75], Loss: 1.5232
Epoch [2/10], Step [20/75], Loss: 1.2279
Epoch [2/10], Step [24/75], Loss: 1.2392
Epoch [2/10], Step 

In [None]:
# Test the Model
correct = 0
total = 0
total_test_data = len(newsgroups_test.target)
# The whole test subset is embedded and scored as a single batch.
batch_x_test, batch_y_test = get_batch(newsgroups_test, 0, total_test_data, vocab, emb, word2index)
articles = torch.as_tensor(batch_x_test, dtype=torch.float32)
labels = torch.as_tensor(batch_y_test, dtype=torch.long)
# No gradients are needed for evaluation.
with torch.no_grad():
    outputs = net(articles)
# The predicted class is the index of the highest score in each row.
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()

# Report the actual number of evaluated articles rather than a fixed figure.
print('Accuracy of the network on the %d test articles: %d %%' % (total, 100 * correct / total))


Accuracy of the network on the 1180 test articles: 63 %
