In [1]:
# for generating the feature vector
import json
from nltk import ngrams
import nltk
from collections import Counter

# for NN using pyTorch
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
def extract_headline_category(category_list):
    headline_category = []
    try:
        input_file = open('./News_Category_Dataset.json')
        input_data = input_file.readlines()
        input_file.close()
        for json_object in input_data:
            data = json.loads(json_object)
            category = data['category'].upper()
            if category in category_list:
                headline_category.append((data['headline'], data['category']))
        return headline_category
    except IOError:
        print("ERROR : IO ERROR occurred while opening file")
        exit(0)

In [3]:
category_list = ['Business', 'Comedy','Sports', 'Crime', 'Religion']
category_list = [i.upper() for i in category_list]
headlines_and_category = extract_headline_category(category_list)

In [4]:
def get_n_grams(dataset:'headline, category', n:'n gram value', k:'return top k n-grams'):
    n_grams_list=[]
    for headline, category in dataset:
        tokenize = nltk.word_tokenize(headline)
        n_gram = nltk.ngrams(tokenize, n)
        n_grams_list.extend(n_gram)
    print(len(n_grams_list))
    top_k = Counter(n_grams_list).most_common(k)
    most_frequent=[i[0] for i in top_k]
    return most_frequent

In [5]:
unigrams_dict = get_n_grams(headlines_and_category, 1, 500)
bigrams_dict = get_n_grams(headlines_and_category, 2, 300)
trigrams_dict = get_n_grams(headlines_and_category, 3, 200)

187445




169606
151792


In [6]:
pos_list = list({'CD', 'CC', 'RP', 'NNPS', 'IN', ',', '$', 'FW', 'RBR', 'JJ', "''", ')', 'VBD', 'VBP', 'POS', ':', 'NNS', '#', 'PRP', '(', 'VBN', 'PDT', 'JJS', 'VBG', 'PRP$', 'RBS', 'LS', '.', 'EX', 'NN', '``', 'DT', 'RB', 'WDT', 'VB', 'UH', 'TO', 'JJR', 'VBZ', 'MD', 'NNP', 'WP', 'WRB'})

In [7]:
def generate_features(dataset, unigrams_dict, bigrams_dict, trigrams_dict, pos_list, category_list):
    X = []
    y = []
    for headline, category in dataset:
        text = nltk.word_tokenize(headline)
        unigrams = nltk.ngrams(text, 1)
        bigrams = nltk.ngrams(text, 2)
        trigrams = nltk.ngrams(text, 3)
#         list_of_ngrams_dict = [unigrams_dict]
#         list_of_ngrams_sentences = [unigrams]
        list_of_ngrams_dict = [unigrams_dict, bigrams_dict, trigrams_dict]
        list_of_ngrams_sentences = [unigrams, bigrams, trigrams]
        temp_sentence = []
        for i, ngram in enumerate(list_of_ngrams_dict):
            temp_ngram = [0]*len(ngram)
            for word in list_of_ngrams_sentences[i]:
                if word in ngram:
                    temp_ngram[ngram.index(word)]+=1
            temp_sentence.extend(temp_ngram)
        
    
#         temp_pos = [0]*len(pos_list)
#         for word, tag in nltk.pos_tag(text):
#             temp_pos[pos_list.index(tag)]+=1
#         temp_sentence.extend(temp_pos)
        X.append(temp_sentence)
        y.append(category_list.index(category))
    return (X,y)

In [8]:
X, y = generate_features(headlines_and_category, unigrams_dict, bigrams_dict, trigrams_dict, pos_list, category_list)



In [9]:
y_temp = []
for i in y:
    temp = [0, 0, 0, 0, 0]
    temp[i] = 1
    y_temp.append(temp)
y = y_temp

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

inputs = list(map(lambda s: Variable(torch.Tensor([s])), X_train))
targets = list(map(lambda s: Variable(torch.Tensor([s])), y_train))
in_features = len(X_train[0])
out_features = len(category_list)

In [10]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Sigmoid(),
            nn.Linear(64, out_features),
        )

    def forward(self, input):
        return self.main(input)

In [11]:
net = Net()
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

EPOCHS_TO_TRAIN = 50
best_model = net
best_loss = 1
print("Training loop:")
for idx in range(0, EPOCHS_TO_TRAIN):
    for input, target in zip(inputs, targets):
        optimizer.zero_grad()   # zero the gradient buffers
        output = net(input)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()    # Does the update
    if loss.data.numpy() < best_loss:
        best_model = net
    print("Epoch {: >8} Loss: {}".format(idx, loss.data.numpy()))

Training loop:
Epoch        0 Loss: 0.18248659372329712
Epoch        1 Loss: 0.191620334982872
Epoch        2 Loss: 0.19245700538158417
Epoch        3 Loss: 0.1787334680557251
Epoch        4 Loss: 0.15703199803829193
Epoch        5 Loss: 0.13386061787605286
Epoch        6 Loss: 0.11300627887248993
Epoch        7 Loss: 0.09596505761146545
Epoch        8 Loss: 0.08265608549118042
Epoch        9 Loss: 0.07235165685415268
Epoch       10 Loss: 0.06426673382520676
Epoch       11 Loss: 0.057776566594839096
Epoch       12 Loss: 0.052440106868743896
Epoch       13 Loss: 0.04795856773853302
Epoch       14 Loss: 0.044129833579063416
Epoch       15 Loss: 0.04081406071782112
Epoch       16 Loss: 0.03791188821196556
Epoch       17 Loss: 0.035350583493709564
Epoch       18 Loss: 0.033075083047151566
Epoch       19 Loss: 0.031043149530887604
Epoch       20 Loss: 0.029221152886748314
Epoch       21 Loss: 0.027582207694649696
Epoch       22 Loss: 0.026104064658284187
Epoch       23 Loss: 0.0247682444751

In [12]:
inputs = list(map(lambda s: Variable(torch.Tensor([s])), X_test))
targets = list(map(lambda s: Variable(torch.Tensor([s])), y_test))

In [13]:
print("")
print("Final results:")
total_accurate = 0
for inp, target in zip(inputs, targets):
    output = best_model(inp)
    temp = target[0].detach().numpy().tolist()
    temp_2 = output[0].detach().numpy().tolist()
    if temp_2.index(max(temp_2)) == temp.index(max(temp)):
        total_accurate += 1
print(total_accurate / len(inputs))


Final results:
0.6861866068926871
