### **Importing**

In [1]:
from tensorflow import keras 
import pandas as pd
import numpy as np
import csv

In [2]:
import tensorflow as tf
import string

In [3]:
import re
from tensorflow.keras import models,layers,Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
import tensorflow.keras.backend as K
from matplotlib import pyplot as plt
from keras.models import load_model

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Preprocessing**

In [6]:
df=pd.read_csv("/content/drive/MyDrive/Datasets/DL-NLP-A4/train.csv")

In [7]:
df.head()

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business


In [8]:
# swap labels with integers
df.Category=df.Category.replace("business",0)
df.Category=df.Category.replace("tech",1)
df.Category=df.Category.replace("entertainment",2)
df.Category=df.Category.replace("sport",3)
df.Category=df.Category.replace("politics",4)

In [9]:
df.head()

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,0
1,german business confidence slides german busin...,0
2,bbc poll indicates economic gloom citizens in ...,0
3,lifestyle governs mobile choice faster bett...,1
4,enron bosses in $168m payout eighteen former e...,0


In [32]:
def is_special(text):
    rem = ''
    for i in text: 
        if i.isalnum():
            rem = rem + i
        else:
            rem = rem + ' '
            rem=rem+i
            rem = rem + ' '
    return rem
df.Text=df.Text.apply(is_special)

In [33]:
def rem_extra(text):
    rem=re.sub(' +', ' ',text)
    return str(rem).strip()
df.Text=df.Text.apply(rem_extra)

In [10]:
def get_max_length(df):
    max_length = 0
    length=[]
    for row in df['Text']:
        length.append(len(row.split(" ")))
        if len(row.split(" ")) > max_length:
            max_length = len(row.split(" "))
    return max_length,length

In [11]:
df_test=pd.read_csv("/content/drive/MyDrive/Datasets/DL-NLP-A4/test.csv")

In [12]:
df_label=pd.read_csv("/content/drive/MyDrive/Datasets/DL-NLP-A4/TestData_Labels.csv")

In [13]:
df_label.Category=df_label.Category.replace("business",0)
df_label.Category=df_label.Category.replace("tech",1)
df_label.Category=df_label.Category.replace("entertainment",2)
df_label.Category=df_label.Category.replace("sport",3)
df_label.Category=df_label.Category.replace("politics",4)

In [14]:
df_label.head()

Unnamed: 0,ArticleId,Category
0,1018,3
1,1319,1
2,1138,0
3,459,2
4,1020,4


In [15]:
df_test.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


### **Preparing new test labels**

In [16]:
df_test = pd.read_csv("/content/drive/MyDrive/Datasets/DL-NLP-A4/Assignment4_TestLabels.csv")

In [17]:
df_test.columns = ['ArticleId', 'Category']

ValueError: ignored

In [None]:
df_test.head()

In [None]:
df_label = df_test.copy()

In [None]:
df_test = df_test[['ArticleId', 'Label - (business, tech, politics, sport, entertainment)']]

### **Preparing the data**

In [34]:
token=keras.preprocessing.text.Tokenizer(filters='"#$*+/:;<=>@[\\]^_{|}~\t\n')
token.fit_on_texts(df.Text)

In [35]:
size_of_vocabulary=len(token.word_index)+1
size_of_vocabulary

24792

In [36]:
max_len=1000

In [37]:
def prepare_text(text, token, max_len):
    text_seqs = token.texts_to_sequences(text)
    return tf.keras.preprocessing.sequence.pad_sequences(text_seqs, maxlen = max_len)

In [38]:
data_train = prepare_text(df.Text, token, max_len)
data_test = prepare_text(df_test.Text, token, max_len)

In [39]:
X_train, X_valid, y_train, y_valid = train_test_split(data_train, df.Category.values, random_state=42, test_size=0.15)

### **Pretrained word vectors** 

In [40]:
def load_emb(addrs,size_of_vocabulary,tk):
    embeddings_index = dict()
    f = open(addrs)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))
    
    embedding_matrix = np.zeros((size_of_vocabulary, 300))
    c=0
    for word, i in tk.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            c+=1
        else:
            pass
    print('No. of out of vocab word in train set= %s'%(size_of_vocabulary-c))
    
    return embedding_matrix

In [41]:
fasttext = load_emb("/content/drive/MyDrive/Embeddings/wiki-news-300d-1M-subword.vec",size_of_vocabulary,token)

Loaded 999995 word vectors.
No. of out of vocab word in train set= 3471


In [42]:
glove = load_emb("/content/drive/MyDrive/Embeddings/glove.6B.300d.txt",size_of_vocabulary,token)

Loaded 400000 word vectors.
No. of out of vocab word in train set= 1053


In [43]:
def w2v():
  from gensim import models
  w2v = models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Embeddings/GoogleNews-vectors-negative300.bin', binary=True)
  embd_w2v = np.zeros((size_of_vocabulary, 300))
  c=0
  for word, i in token.word_index.items():
      try:
        embd_w2v[i] =w2v[word]   
      except:
        c+=1
        
  print('No. of out of vocab word in train set= %s'%(c))
  return embd_w2v
    

In [44]:
word2vec = w2v()

No. of out of vocab word in train set= 5760


### **Dynamic meta embedding**

In [45]:
def Concat_Emb(list_emb, maxlen):
    inputs = []
    output = []
    for embedding in list_emb:
        inp = layers.Input(shape=(maxlen,))
        emb = layers.Embedding(size_of_vocabulary, 300, weights=[embedding], trainable=False)(inp)
        emb = layers.Reshape((-1,300,1))(emb)
        inputs.append(inp)
        output.append(emb)
    concat = layers.Concatenate(axis=-1)(output)
    return Model(inputs, concat)

In [46]:
def DME(maxlen):
    inp = layers.Input(shape=(maxlen, 300, 3))
    x = layers.Reshape((maxlen, -1))(inp)
    x = layers.LSTM(3, return_sequences=True)(x)
    x = layers.Activation('sigmoid')(x)
    x = layers.Reshape((maxlen, 1, 3))(x)
    x = layers.multiply([inp, x])
    out = layers.Lambda(lambda t: K.sum(t, axis=-1))(x)
    return Model(inp, out)

### **Without CNN**

In [None]:
concat_inputs = Concat_Emb([fasttext, glove, word2vec], maxlen=max_len)

dme = DME(max_len)
x = dme(concat_inputs.output)
x = layers.GRU(128, return_sequences=True)(x)
x = layers.GRU(32)(x)
out = layers.Dense(5, activation='softmax')(x)
basic_model = Model(concat_inputs.input, out)
basic_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, mode='min')

In [None]:
basic_model.fit([data_train]*3, df.Category.values, batch_size=64, validation_split=0.1, epochs=50,callbacks=early_stop)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.callbacks.History at 0x7f19762c2bd0>

### **With CNN, without self-attention**

In [53]:
concat_inputs = Concat_Emb([fasttext, glove, word2vec], maxlen=max_len)
dme = DME(max_len)

x = dme(concat_inputs.output)
x=layers.Conv1D(128, 3, activation='swish', input_shape=(1000,300), padding='same')(x)
x = layers.GRU(64)(x)
out = layers.Dense(5, activation='softmax')(x)

CLSTM_model = Model(concat_inputs.input, out)
CLSTM_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [51]:
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min')

In [54]:
CLSTM_model.fit([X_train]*3, y_train, validation_data=([X_valid]*3, y_valid) ,batch_size=16, epochs=50, callbacks = early_stop)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


<keras.callbacks.History at 0x7f3ad7933290>

### **With self-attention**

In [56]:
!pip install keras-self-attention

Collecting keras-self-attention
  Downloading keras-self-attention-0.50.0.tar.gz (12 kB)
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras-self-attention: filename=keras_self_attention-0.50.0-py3-none-any.whl size=19414 sha256=b077e0669758839be2c241556bf7e3f7c029e860ea6ff09497bff22978197f08
  Stored in directory: /root/.cache/pip/wheels/92/7a/a3/231bef5803298e7ec1815215bc0613239cb1e9c03c57b13c14
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.50.0


In [None]:
from keras_self_attention import SeqSelfAttention

concat_inputs = Concat_Emb([fasttext, glove, word2vec], maxlen=max_len)
dme = DME(max_len)

x = dme(concat_inputs.output)
x=layers.Conv1D(128, 3, activation='swish', input_shape=(1000,300),padding='same')(x)
x = layers.GRU(64, return_sequences=True)(x)
x = SeqSelfAttention(attention_activation='sigmoid')(x)
x = layers.GlobalAvgPool1D()(x)
out = layers.Dense(5, activation='softmax')(x)

attention_model = Model(concat_inputs.input, out)
attention_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
attention_model.fit([X_train]*3, y_train, validation_data=([X_valid]*3,y_valid), batch_size=16, epochs=100, callbacks=early_stop)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


<keras.callbacks.History at 0x7f194f09dc90>

### **Slightly different architecture**

In [58]:
from keras_self_attention import SeqSelfAttention

concat_inputs = Concat_Emb([fasttext, glove, word2vec], maxlen=max_len)
dme = DME(max_len)

x = dme(concat_inputs.output)
x=layers.Conv1D(128, 3, activation='swish', input_shape=(1000,300),padding='same')(x)
x = layers.GRU(64, return_sequences=True)(x)
x = SeqSelfAttention(attention_activation='sigmoid')(x)
x = layers.GlobalAvgPool1D()(x)
out = layers.Dense(5, activation='softmax')(x)

attention_model2 = Model(concat_inputs.input, out)
attention_model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [60]:
attention_model2.fit([X_train]*3, y_train, validation_data=([X_valid]*3,y_valid), batch_size=16, epochs=30, callbacks=early_stop)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


<keras.callbacks.History at 0x7f3ae005fa50>

In [None]:
model.save('/content/drive/MyDrive/Models/DL-NLP-A4/')



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Models/DL-NLP-A4/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Models/DL-NLP-A4/assets


In [None]:
y_pred=model.predict([data_test]*3)

In [None]:
print(classification_report(df_label.Category.values,np.argmax(y_pred,axis=1)))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       176
           1       0.96      0.95      0.96       135
           2       0.94      0.90      0.92       113
           3       1.00      0.98      0.99       168
           4       0.95      0.97      0.96       143

    accuracy                           0.96       735
   macro avg       0.96      0.96      0.96       735
weighted avg       0.96      0.96      0.96       735



In [None]:
np.argmax(y_pred,axis=1)

array([3, 1, 3, 0, 3, 3, 4, 4, 2, 0, 0, 1, 4, 1, 2, 3, 4, 1, 2, 2, 0, 4,
       3, 0, 4, 3, 0, 3, 3, 0, 4, 1, 0, 0, 3, 3, 3, 0, 2, 4, 1, 4, 2, 1,
       3, 1, 2, 0, 4, 0, 4, 0, 0, 0, 1, 0, 1, 2, 3, 1, 3, 2, 1, 4, 0, 2,
       3, 1, 3, 3, 0, 3, 0, 4, 1, 3, 1, 1, 1, 2, 4, 3, 2, 0, 0, 2, 0, 2,
       0, 1, 0, 4, 3, 1, 3, 3, 3, 3, 3, 3, 4, 3, 4, 2, 0, 3, 4, 3, 0, 2,
       3, 0, 2, 3, 4, 3, 4, 3, 4, 0, 1, 0, 2, 2, 1, 3, 0, 2, 0, 2, 0, 4,
       4, 1, 0, 0, 4, 1, 2, 3, 0, 1, 3, 2, 4, 3, 3, 2, 2, 1, 0, 1, 4, 1,
       3, 3, 3, 3, 2, 1, 0, 1, 0, 1, 0, 1, 2, 1, 1, 4, 0, 4, 0, 0, 2, 4,
       1, 0, 0, 1, 3, 4, 3, 4, 1, 1, 4, 0, 4, 2, 4, 0, 2, 3, 1, 1, 0, 1,
       4, 0, 3, 4, 0, 2, 0, 0, 3, 1, 0, 3, 2, 2, 3, 2, 3, 1, 4, 2, 3, 2,
       3, 2, 4, 0, 1, 2, 0, 4, 0, 1, 0, 3, 4, 4, 0, 4, 3, 0, 0, 4, 3, 4,
       0, 3, 1, 0, 4, 0, 4, 0, 0, 3, 1, 4, 2, 1, 2, 2, 3, 3, 1, 3, 3, 3,
       2, 3, 4, 1, 0, 3, 0, 3, 0, 3, 2, 0, 0, 2, 4, 0, 3, 3, 1, 3, 3, 2,
       0, 3, 1, 4, 2, 0, 0, 4, 3, 2, 4, 0, 3, 3, 1,

In [None]:
for i in range(len(labels)):
  index = np.argmax(labels[i])
  labels[i] = np.zeros(5)
  labels[index] = 1

In [None]:
temp = labels.argmax(axis = 1)

In [None]:
labels

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)

In [None]:
df_label.Category.values,np.argmax(y_pred,axis=1)

(array([3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1,
        0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2,
        4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3,
        1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0,
        2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4,
        3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1,
        0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2,
        4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3,
        1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0,
        2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4,
        3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1,
        0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2,
        4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3,
        1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 

In [None]:
preds = np.argmax(y_pred,axis=1)
preds

array([3, 1, 3, 0, 3, 3, 4, 4, 2, 0, 0, 1, 4, 1, 2, 3, 4, 1, 2, 2, 0, 4,
       3, 0, 4, 3, 0, 3, 3, 0, 4, 1, 0, 0, 3, 3, 3, 0, 2, 4, 1, 4, 2, 1,
       3, 1, 2, 0, 4, 0, 4, 0, 0, 0, 1, 0, 1, 2, 3, 1, 3, 2, 1, 4, 0, 2,
       3, 1, 3, 3, 0, 3, 0, 4, 1, 3, 1, 1, 1, 2, 4, 3, 2, 0, 0, 2, 0, 2,
       0, 1, 0, 4, 3, 1, 3, 3, 3, 3, 3, 3, 4, 3, 4, 2, 0, 3, 4, 3, 0, 2,
       3, 0, 2, 3, 4, 3, 4, 3, 4, 0, 1, 0, 2, 2, 1, 3, 0, 2, 0, 2, 0, 4,
       4, 1, 0, 0, 4, 1, 2, 3, 0, 1, 3, 2, 4, 3, 3, 2, 2, 1, 0, 1, 4, 1,
       3, 3, 3, 3, 2, 1, 0, 1, 0, 1, 0, 1, 2, 1, 1, 4, 0, 4, 0, 0, 2, 4,
       1, 0, 0, 1, 3, 4, 3, 4, 1, 1, 4, 0, 4, 2, 4, 0, 2, 3, 1, 1, 0, 1,
       4, 0, 3, 4, 0, 2, 0, 0, 3, 1, 0, 3, 2, 2, 3, 2, 3, 1, 4, 2, 3, 2,
       3, 2, 4, 0, 1, 2, 0, 4, 0, 1, 0, 3, 4, 4, 0, 4, 3, 0, 0, 4, 3, 4,
       0, 3, 1, 0, 4, 0, 4, 0, 0, 3, 1, 4, 2, 1, 2, 2, 3, 3, 1, 3, 3, 3,
       2, 3, 4, 1, 0, 3, 0, 3, 0, 3, 2, 0, 0, 2, 4, 0, 3, 3, 1, 3, 3, 2,
       0, 3, 1, 4, 2, 0, 0, 4, 3, 2, 4, 0, 3, 3, 1,

In [None]:
df_pred.Category=df_pred.Category.replace(0, "business")
df_pred.Category=df_pred.Category.replace(1, "tech")
df_pred.Category=df_pred.Category.replace(2, "entertainment")
df_pred.Category=df_pred.Category.replace(3, "sport")
df_pred.Category=df_pred.Category.replace(4, "politics")

In [None]:
df_label

Unnamed: 0,ArticleId,Category
0,1018,3
1,1319,1
2,1138,3
3,459,0
4,1020,3
...,...,...
730,1923,0
731,373,2
732,1704,3
733,206,0


In [None]:
df_pred = df_label.copy()

In [None]:
preds

In [None]:
df_pred.Category = preds

In [None]:
df_pred

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
...,...,...
730,1923,business
731,373,entertainment
732,1704,business
733,206,business


In [None]:
df_pred.to_csv('/content/drive/MyDrive/Datasets/DL-NLP-A4/solution.csv', index_label = False)

In [None]:
temp = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATASETS/assgn4/solution.csv')

In [None]:
temp

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
...,...,...
730,1923,business
731,373,entertainment
732,1704,business
733,206,business


### **Load Saved Model**



In [None]:
loaded_model = keras.models.load_model('/content/drive/MyDrive/Models/DL-NLP-A4/')
loaded_model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 1000)]       0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 1000)]       0                                            
__________________________________________________________________________________________________
input_15 (InputLayer)           [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1000, 300)    7437600     input_13[0][0]                   
___________________________________________________________________________________________

In [None]:
y_pred=loaded_model.predict([data_test]*3)

In [None]:
print(classification_report(df_label.Category.values,np.argmax(y_pred,axis=1)))