In [None]:
import numpy as np 
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize']=(6,6)

plt.style.use('fivethirtyeight')

from tqdm import tqdm 
import time
import re
import string
import nltk
from nltk.corpus import stopwords,wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer,WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,Dropout,GRU,LSTM,Dense,Bidirectional
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Read the news-category dataset; `lines=True` parses the file as JSON Lines
# (one JSON record per line).
df = pd.read_json(
    "News_Category_Dataset_v2.json",
    lines=True,
)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Fold the "THE WORLDPOST" label into "WORLDPOST" so both spellings share one class.
df["category"] = df["category"].replace("THE WORLDPOST", "WORLDPOST")


In [None]:
# Model input text: the headline followed by the short description,
# separated by a single space.
df["Text"] = df["headline"] + " " + df["short_description"]

In [None]:
def clean_text(text):
    """Replace every disallowed character in `text` with a single space.

    Allowed characters are ASCII letters, digits, parentheses, comma, `!`,
    `?`, apostrophe and backtick. Every other character (including
    whitespace and other punctuation) is replaced one-for-one by a space,
    so runs of disallowed characters are not collapsed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, the same length as the input.
    """
    return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", text)

In [None]:
# Clean every combined text element-wise with `clean_text`.
df["Text"] = df["Text"].map(clean_text)


In [None]:
# Distinct category labels in order of first appearance; a label's position
# in this array is used as its integer class id.
types = pd.unique(df["category"])

In [None]:
# Display the distinct category labels together with how many there are.
types,len(types)

In [None]:
def get_type_index(string):
    """Return the integer class id of a category label.

    The id is the position of `string` in the module-level `types`
    collection.

    Args:
        string: Category label to look up.

    Returns:
        Zero-based index of the label within `types`.

    Raises:
        ValueError: If the label is not present in `types`.
    """
    known_labels = list(types)
    return known_labels.index(string)

In [None]:
# Integer class id for every row, looked up from its category label.
df["type_index"] = df["category"].map(get_type_index)


In [None]:
# Fix the shuffle seed so the train/val/test partition is the same on every
# Restart & Run All; without it each run trains and evaluates on different
# rows and the reported metrics cannot be reproduced.
SPLIT_SEED = 42

# First hold out the test set, then carve a validation set out of what is left.
train, test = train_test_split(df, random_state=SPLIT_SEED)
train, val = train_test_split(train, random_state=SPLIT_SEED)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Settings for the Keras word-level tokenizer.
vocab_size=10000
trunc_type="post"
pad_type="post"
oov_tok="<OOV>"
tokenizer=Tokenizer(num_words=vocab_size,oov_token=oov_tok)
# Build the vocabulary from the training split only: fitting on the full
# frame would let validation/test text influence which words are in-vocabulary.
# NOTE(review): `tokenizer` is rebound to a Hugging Face tokenizer in a later
# cell before it is used, so this Keras tokenizer looks unused -- confirm
# whether this cell is still needed.
tokenizer.fit_on_texts(train.Text.values)

In [None]:
# One-hot encode the integer class ids of the training and validation splits
# (40 columns, one per class). Evaluated in order: train first, then val.
one_hot_lables, val_labels = (
    tf.keras.utils.to_categorical(split.type_index.values, num_classes=40)
    for split in (train, val)
)

In [None]:
import transformers

In [None]:
# Load the pretrained tokenizer for the 'bert-large-uncased' checkpoint.
# From here on `tokenizer` refers to this Hugging Face tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    'bert-large-uncased'
)

In [None]:
# Fixed sequence length (in tokens) used for every encoded text and for the
# model's input layer.
maxlen = 200

# Encode each training text to token ids, padded up to `maxlen`.
# NOTE(review): `pad_to_max_length` is deprecated in newer transformers
# releases in favour of `padding="max_length", truncation=True` -- confirm the
# installed version before switching.
train_input_ids = [
    tokenizer.encode(str(raw_text), max_length=maxlen, pad_to_max_length=True)
    for raw_text in train.Text.values
]


In [None]:
# Encode each validation text with the same settings as the training texts.
val_input_ids = [
    tokenizer.encode(str(raw_text), max_length=maxlen, pad_to_max_length=True)
    for raw_text in val.Text.values
]

In [None]:
# Sanity check: show the first five encoded training sequences.
train_input_ids[:5]

In [None]:
def create_model(num_classes=40, model_name='bert-large-uncased',
                 learning_rate=0.00001, pad_token_id=0):
    """Build and compile a BERT-based text classifier.

    The model takes a batch of token-id sequences of length `maxlen`
    (module-level constant), runs them through a pretrained BERT encoder and
    classifies the hidden state at position 0 with a softmax dense layer.

    Args:
        num_classes: Number of output classes (width of the softmax layer).
        model_name: Hugging Face checkpoint name passed to
            `TFBertModel.from_pretrained`.
        learning_rate: Learning rate for the Adam optimizer.
        pad_token_id: Token id used for padding; positions holding this id
            are excluded from attention. 0 is the `[PAD]` id of the BERT
            uncased vocabularies -- pass `tokenizer.pad_token_id` when using
            a different checkpoint.

    Returns:
        A compiled `tf.keras` model that maps `(batch, maxlen)` int32 token
        ids to `(batch, num_classes)` class probabilities. It is compiled
        with categorical cross-entropy, so it expects one-hot labels.
    """
    input_word_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32,
                                           name="input_word_ids")
    # Without an attention mask BERT attends to the padding tokens as if they
    # were real text. Derive the mask from the ids so the model keeps a single
    # input: 1 for real tokens, 0 for padding.
    attention_mask = tf.cast(tf.not_equal(input_word_ids, pad_token_id), tf.int32)

    bert_layer = transformers.TFBertModel.from_pretrained(model_name)
    # Element 0 of the BERT output is the per-token hidden-state sequence.
    bert_outputs = bert_layer(input_word_ids, attention_mask=attention_mask)[0]
    # Use the hidden state of the first token as the sequence summary.
    cls_embedding = bert_outputs[:, 0, :]
    pred = tf.keras.layers.Dense(num_classes, activation='softmax')(cls_embedding)

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    # The head already applies softmax and labels are one-hot, so the plain
    # (non-logits) categorical cross-entropy is the matching loss.
    model.compile(loss='categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])
    return model

In [None]:
# Build and compile the classifier; this loads the pretrained BERT weights.
model = create_model()


In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Number of samples per gradient update.
batch_size = 1

# Convert the token-id lists to NumPy arrays before handing them to Keras.
train_features = np.array(train_input_ids)
val_features = np.array(val_input_ids)

# Train for 4 epochs, validating on the held-out validation split after each.
history = model.fit(
    train_features,
    one_hot_lables,
    validation_data=(val_features, val_labels),
    epochs=4,
    batch_size=batch_size,
)

In [None]:
# Encode the held-out test texts with the same settings as train/val.
# The combined text column is named `Text` (capital T); the previous
# `test.text` referenced a column that does not exist and raised AttributeError.
test_input_ids = [tokenizer.encode(str(i), max_length = maxlen , pad_to_max_length = True) for i in test.Text.values]
# One-hot encode the test class ids to match the model's 40-way softmax output.
test_labels= tf.keras.utils.to_categorical(test.type_index.values, num_classes=40)

# Build the input array once and reuse it for both evaluation and prediction.
test_inputs = np.array(test_input_ids)
model.evaluate(test_inputs, test_labels)
# Per-class probabilities for every test example.
predict= model.predict(test_inputs)