# Import Libraries

In [None]:
import numpy as np 
import pandas as pd 

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, BatchNormalization, concatenate
from keras.layers import Reshape, merge, Concatenate, Lambda, Average
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers.merge import add

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import np_utils

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from nltk.stem import WordNetLemmatizer
import time

import tensorflow as tf
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (6,6)
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

import re
import nltk
import json
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Load data from Google Drive into Google Colab

In [None]:
# Mount Google Drive in this Colab runtime, then read the dataset from it.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/gdrive/"
DATASET_PATH = "/content/gdrive/My Drive/351/Final/News_Category_Dataset_v2.json"

drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

# lines=True: the file is parsed as one JSON record per line.
df = pd.read_json(DATASET_PATH, lines=True)

df.head()

# Data Preprocessing

In [None]:
# Group the rows by category once; the grouping is reused for both reports below.
cates = df.groupby('category')

# DataFrame.size is the total number of cells in the frame.
n_cells = df.size
print(n_cells)

print("total categories:", cates.ngroups)

# Number of rows in each category.
rows_per_category = cates.size()
print(rows_per_category)

In [None]:
# Fold the "THE WORLDPOST" label into "WORLDPOST"; every other label is left as is.
df['category'] = df['category'].replace({"THE WORLDPOST": "WORLDPOST"})

In [None]:
# Build one text field per article from the headline and the short description.
df['text'] = df.headline + " " + df.short_description
textArray = df['text'].to_numpy(dtype=str)

lemmatizer = WordNetLemmatizer()
# A set keeps the per-token stop-word check O(1) instead of scanning a list.
stopWords = set(stopwords.words('english'))
# Everything except ASCII letters, digits and spaces is stripped before tokenising.
non_alnum = re.compile('[^A-Za-z0-9 ]+')

cleaned_title = []
for raw_text in textArray:
    # Lowercase each token once, drop stop words, then lemmatize what is left.
    tokens = (tok.lower() for tok in nltk.word_tokenize(non_alnum.sub('', raw_text)))
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stopWords]
    cleaned_title.append(" ".join(lemmas))
cleaned_title = np.asarray(cleaned_title, dtype=object)

# Fit the Keras tokenizer on the cleaned text and encode every document as a
# list of integer word indices (lists have different lengths at this point).
tk = Tokenizer()
tk.fit_on_texts(cleaned_title)
X = tk.texts_to_sequences(cleaned_title)
df['words'] = X

df.head()

In [None]:
# Distinct category labels, taken from the index of the per-category row counts.
categories = df.groupby('category').size().index.tolist()

# Two-way lookup between a category label and its integer id (its position in `categories`).
categoryInt = {label: idx for idx, label in enumerate(categories)}
intCategory = {idx: label for idx, label in enumerate(categories)}

# Integer id of each row's category.
df['catUpdate'] = df['category'].apply(lambda label: categoryInt[label])

In [None]:
# Fixed sequence length fed to the neural models. The Embedding/Input layers and
# the Attention layer further down all read `maxlen`, so it must be defined here.
# NOTE(review): 50 is a chosen cut-off, not derived from the data - tune if needed.
maxlen = 50

# The tokenizer output is a ragged list of index lists; pad/truncate every
# sequence to `maxlen` so that X becomes a regular 2-D integer array.
X = sequence.pad_sequences(X, maxlen=maxlen)
# One-hot encode the integer category ids.
Y = np_utils.to_categorical(list(df.catUpdate))

seed = 29
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

# Naive Bayes

In [None]:

def normalize_text(n):
    """Lowercase a string and strip punctuation that touches whitespace.

    Steps, in order:
      1. lowercase the text;
      2. replace every "whitespace + non-word character" pair with one space;
      3. replace every "non-word character + whitespace" pair with one space;
      4. collapse each run of whitespace into a single space.

    Non-word characters that are not adjacent to whitespace (for example a
    trailing "!" at the very end of the string) are left in place.

    Args:
        n: The text to normalise.

    Returns:
        The normalised string.
    """
    n = n.lower()
    # Raw strings: "\s" and "\W" are regex classes, not Python string escapes.
    n = re.sub(r'\s\W', ' ', n)
    n = re.sub(r'\W\s', ' ', n)
    n = re.sub(r'\s+', ' ', n)

    return n

# Normalise every headline; the loop variable is the value handed to normalize_text.
df['normHead'] = [normalize_text(headline) for headline in df['headline']]

# Bag-of-words counts over the normalised headlines.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df['normHead'])

# Integer-encode the category labels.
encoder = LabelEncoder()
y = encoder.fit_transform(df['category'])

# Dedicated nb_* names: x_train / y_train / x_val / y_val hold the sequence split
# that the neural models are fitted on later and must not be overwritten here.
# `seed` is the value set together with that split, reused for reproducibility.
nb_x_train, nb_x_test, nb_y_train, nb_y_test = train_test_split(
    x, y, test_size=0.2, random_state=seed)

print(nb_x_train.shape)
print(nb_y_train.shape)
print(nb_x_test.shape)
print(nb_y_test.shape)


nb = MultinomialNB()
nb.fit(nb_x_train, nb_y_train)

# Mean accuracy on the held-out part, as a percentage.
print(nb.score(nb_x_test, nb_y_test)*100)

# Text CNN

In [None]:
# Word -> integer index mapping learned by the tokenizer.
wordIndex = tk.word_index

EMBEDDING_DIM = 100
GLOVE_PATH = '/content/gdrive/My Drive/351/Final/glove.6B.100d.txt'

# Parse the GloVe text file: each line is a word followed by its vector components.
embeddingsIndex = {}
# `with` closes the file even if parsing fails part-way through.
# NOTE(review): the file is presumably UTF-8 encoded - confirm; stating it
# explicitly avoids depending on the platform's default encoding.
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embeddingsIndex[word] = np.asarray(values[1:], dtype='float32')

print('Unique tokens: %s' % len(wordIndex))
print('Word vectors %s' % len(embeddingsIndex))

In [None]:
# Row i of the matrix receives the pre-trained vector of the word whose tokenizer
# index is i; rows for which no pre-trained vector exists stay all-zero.
vocab_size = len(wordIndex) + 1
embeddingMatrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in wordIndex.items():
    embeddingVector = embeddingsIndex.get(word)
    if embeddingVector is None:
        continue
    embeddingMatrix[i] = embeddingVector

# Embedding layer initialised from the matrix above and frozen (trainable=False).
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embeddingMatrix),
    input_length=maxlen,
    trainable=False,
)


In [None]:
# Text CNN: parallel 1-D convolutions of different kernel widths over the
# embedded sequence, concatenated and fed to a softmax classifier.
inp = Input(shape=(maxlen,), dtype='int32')
embedding = embedding_layer(inp)

# One conv -> max-pool -> dropout branch per kernel size. padding='same' keeps
# the sequence length equal across branches so they can be concatenated.
stacks = []
for kernel_size in [2, 3, 4]:
    conv = Conv1D(64, kernel_size, padding='same', activation='relu', strides=1)(embedding)
    pool = MaxPooling1D(pool_size=3)(conv)
    drop = Dropout(0.5)(pool)
    stacks.append(drop)

merged = Concatenate()(stacks)
flatten = Flatten()(merged)
drop = Dropout(0.5)(flatten)
# One output unit per category; the id -> label table is named `intCategory`.
outp = Dense(len(intCategory), activation='softmax')(drop)

TextCNN = Model(inputs=inp, outputs=outp)
TextCNN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

TextCNN.summary()

In [None]:
# Train the Text CNN, evaluating on the validation split after every epoch.
textcnn_fit_options = dict(
    batch_size=128,
    epochs=5,
    validation_data=(x_val, y_val),
)
textcnn_history = TextCNN.fit(x_train, y_train, **textcnn_fit_options)

In [None]:
# Per-epoch metrics recorded while fitting the Text CNN.
textcnn_metrics = textcnn_history.history
acc, val_acc = textcnn_metrics['accuracy'], textcnn_metrics['val_accuracy']
loss, val_loss = textcnn_metrics['loss'], textcnn_metrics['val_loss']
epochs = range(1, len(acc) + 1)

# First figure: accuracy curves.
acc_fig, acc_ax = plt.subplots()
acc_ax.set_title('Training and validation accuracy')
acc_ax.plot(epochs, acc, 'red', label='Training acc')
acc_ax.plot(epochs, val_acc, 'blue', label='Validation acc')
acc_ax.legend()

# Second figure: loss curves.
loss_fig, loss_ax = plt.subplots()
loss_ax.set_title('Training and validation loss')
loss_ax.plot(epochs, loss, 'red', label='Training loss')
loss_ax.plot(epochs, val_loss, 'blue', label='Validation loss')
loss_ax.legend()

plt.show()

# Attention LSTM

In [None]:
import keras as keras
class Attention(keras.layers.Layer):
    """Attention pooling over the time axis of a 3-D sequence tensor.

    Every timestep ``t`` is scored as ``tanh(x_t . W + b_t)``. The scores are
    exponentiated, optionally masked, normalised across timesteps, and used to
    form a weighted sum of the timesteps. An input of shape
    ``(batch, step_dim, features)`` therefore yields ``(batch, features)``.

    Args:
        step_dim: Number of timesteps of the input; ``call`` reshapes the
            scores to ``(-1, step_dim)``.
        W_regularizer: Regularizer identifier/instance for the scoring vector ``W``.
        b_regularizer: Regularizer identifier/instance for the per-timestep bias.
        W_constraint: Constraint identifier/instance for ``W``.
        b_constraint: Constraint identifier/instance for the bias.
        bias: Whether to add a per-timestep bias to the scores.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base layer before assigning any attribute.
        # NOTE(review): some Keras releases appear to assign `supports_masking`
        # inside Layer.__init__, which would undo a value set beforehand -
        # confirm for the Keras version in use.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Set to the size of the last input axis once build() has run.
        self.features_dim = 0

    def build(self, input_shape):
        """Create the scoring vector ``W`` and, if enabled, the per-timestep bias."""
        assert len(input_shape) == 3
        # One weight per feature (last input axis).
        self.W = self.add_weight(shape = (input_shape[-1],),
                                 initializer=self.init,
                                 name='kernel',
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            # One bias per timestep (second input axis).
            self.b = self.add_weight(shape = (input_shape[1],),
                                     initializer='zeros',
                                     name='bias',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to the layers that follow."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over its time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim
        # Score every timestep: flatten to (batch*steps, features), project onto
        # W, then fold the result back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        if mask is not None:
            # Zero the weight of masked timesteps before normalising.
            a *= K.cast(mask, K.floatx())
        # Normalise across timesteps; epsilon guards against division by zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, features)`` - the time axis is summed out."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be rebuilt from its config."""
        config = super(Attention, self).get_config()
        config.update({
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config
    

# Attention LSTM: embedded sequence -> LSTM emitting every timestep ->
# attention pooling over time -> dense softmax classifier.
lstm_layer = LSTM(300, dropout=0.25, recurrent_dropout=0.25, return_sequences=True)

inp = Input(shape=(maxlen,), dtype='int32')
embedding = embedding_layer(inp)
x = lstm_layer(embedding)
x = Dropout(0.25)(x)
# The attention layer needs the number of timesteps, which is maxlen here.
merged = Attention(maxlen)(x)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.25)(merged)
merged = BatchNormalization()(merged)
# One output unit per category; the id -> label table is named `intCategory`.
outp = Dense(len(intCategory), activation='softmax')(merged)

AttentionLSTM = Model(inputs=inp, outputs=outp)
AttentionLSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

AttentionLSTM.summary()

In [None]:
# Train the attention LSTM, evaluating on the validation split after every epoch.
attlstm_fit_options = dict(
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)
attlstm_history = AttentionLSTM.fit(x_train, y_train, **attlstm_fit_options)

In [None]:
# Per-epoch metrics recorded while fitting the attention LSTM.
acc = attlstm_history.history['acc']
val_acc = attlstm_history.history['val_acc']
loss = attlstm_history.history['loss']
val_loss = attlstm_history.history['val_loss']
epochs = range(1, len(acc) + 1)

plt.title('Training Over Time')
plt.plot(epochs, acc, 'red', label='Training Accuracy')
# val_acc is measured on the validation split passed to fit(), so label it as such.
plt.plot(epochs, val_acc, 'green', label='Validation Accuracy')
plt.legend()

# Loss curves go on a separate figure.
plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'red', label='Training loss')
plt.plot(epochs, val_loss, 'blue', label='Validation loss')
plt.legend()

plt.show()