# **News Category Classification using LSTM**
**News categories included in this dataset include business; science and technology; entertainment; and health.** 

**Different news articles that refer to the same news item (e.g., several articles about recently released employment statistics) are also categorized together.**

In [4]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import re
from tensorflow import keras
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.models import Sequential
from keras.utils.np_utils import to_categorical

In [5]:
#importing the dataset
dir = pd.read_csv("/uci-news-aggregator.csv")
pd.set_option('display.max_columns', None)
dir.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


**WE HAVE ONLY TWO FEATURES OF USE**

1. **TITLE**
2. **CATEGORY**

In [6]:
#creating a new dataset with only relevant features.
ds = dir[['TITLE','CATEGORY']]
ds.head()

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b


**HERE YOU CAN SEE THAT ALL CATEGORIES ARE IN ORDER(ALL B's TOGETHER AND SO ON), THEREFORE SHUFFLING THEM FOR OUR CONVENIENCE**

In [7]:
#shuffling rows with the help of sample, here (frac = 1) means return all rows
ds = ds.sample(frac=1).reset_index(drop=True)
ds.head()

Unnamed: 0,TITLE,CATEGORY
0,Teen Arrested After Tweeting Bizarre Terrorist...,e
1,AP Top News at 8:59 am EDT,e
2,WHO says fight against West Africa Ebola outbr...,m
3,FTC Accuses T-Mobile of Skimming Hundreds of M...,t
4,"Carrie Underwood and Miranda Lambert, plus mor...",e


**DATASET IS NOW SHUFFLED**

In [9]:
#checking for null values
ds.isnull().sum()

TITLE       0
CATEGORY    0
dtype: int64


**NO NULL VALUES FOUND**

In [10]:
#plotting graph for categories
sns.countplot(x = 'CATEGORY',data = ds)

<AxesSubplot:xlabel='CATEGORY', ylabel='count'>

**THERE ARE FOUR TYPES OF CATEGORIES-**
1. **b : business (~115000)**
2. **t : science and technology (~110000)**
3. **e : entertainment (~150000)**
4. **m : health (~40000)**


**NOW MOVING ONTO CLEANING AND PREPROCESSING OF THE TEXT DATA**

In [11]:
#cleaning and preprocessing the text

cleaned = []
for i in range(0,len(ds)):
    
    #removing any other words than (a-z) and (A-Z)
    msg = re.sub('[^a-zA-Z]',' ',ds['TITLE'][i])
    
    #converting all texts to lower case
    msg = msg.lower()
    
    #tokenizing
    msg = msg.split()
    
    #stemming and removing stopwords
    ps = PorterStemmer()
    msg = [ps.stem(words) for words in msg if not words in set(stopwords.words('english'))]
    msg = ' '.join(msg)
    cleaned.append(msg)

In [12]:
#cleaned data with no punctuations,stopwords and all texts in lowercase.
cleaned[:5]

['teen arrest tweet bizarr terrorist threat american airlin',
 'ap top news edt',
 'say fight west africa ebola outbreak begin',
 'ftc accus mobil skim hundr million bogu charg',
 'carri underwood miranda lambert plu cmt nomine face']

In [13]:
#taking dictionary size 5000
dict_size = 5000

#one hot encoding
one_hot_mat = [one_hot(words,dict_size) for words in cleaned]

#now for input as an embedding layer length of all rows should be equal therefore applying padding
#this will make size of all rows equal by adding 0 at starting of the shorter rows
#size of each row will be equal to length of longest row.
embedded_layer = pad_sequences(one_hot_mat,padding = 'pre',maxlen = 150)
embedded_layer

array([[   0,    0,    0, ..., 1106, 4366,  692],
       [   0,    0,    0, ..., 4618,  925, 3209],
       [   0,    0,    0, ..., 2960,  806, 2055],
       ...,
       [   0,    0,    0, ..., 2241, 1731, 3986],
       [   0,    0,    0, ..., 4816, 2174,  631],
       [   0,    0,    0, ..., 1336,  668, 2535]])

In [14]:
#now creating independent and dependent features
x = embedded_layer
y = np.array(ds['CATEGORY'])

In [15]:
#converting categorical values of y using OneHotEncoding
le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y,4)

In [16]:
y[:10]

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [17]:
#splitting the Dataset into Train and Test set
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(337935, 150) (337935, 4)
(84484, 150) (84484, 4)


In [18]:
#creating model using LSTM
model = Sequential()

#taking number features as 50
model.add(Embedding(dict_size,50,input_length = len(x[0])))
model.add(Dropout(0.2))

#adding LSTM layers with 100 neurons
model.add(LSTM(100))

#adding output layer 
model.add(Dense(4,activation="softmax"))

#compiling the model
model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=["accuracy"])

#summary of model
model.summary()

#training the model
model.fit(x_train, y_train, validation_data = (x_test,y_test), epochs = 10, batch_size = 256)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 50)           250000    
_________________________________________________________________
dropout (Dropout)            (None, 150, 50)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               60400     
_________________________________________________________________
dense (Dense)                (None, 4)                 404       
Total params: 310,804
Trainable params: 310,804
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d75517e940>

In [19]:
#evaluating our model
model.evaluate(x_test,y_test)



[0.24101004004478455, 0.9181738495826721]

In [20]:
#making predictions
pred = model.predict(x_test)

#saving index of maximum value of pred in preds (because in pred probabilities will come)
preds = []
for i in range(0,len(pred)):
    preds.append(pred[i].argmax())

#saving index of maximum value of y_test in actual
actual = []
for i in range(0,len(y_test)):
    actual.append(y_test[i].argmax())


In [21]:
#classification report
from sklearn import metrics
report = metrics.classification_report(actual, preds, target_names = ['b','t','e','m'])
print(report)

              precision    recall  f1-score   support

           b       0.90      0.90      0.90     23124
           t       0.95      0.95      0.95     30536
           e       0.91      0.88      0.89      9134
           m       0.90      0.91      0.90     21690

    accuracy                           0.92     84484
   macro avg       0.91      0.91      0.91     84484
weighted avg       0.92      0.92      0.92     84484



In [22]:
#checking category of a text
txt = ["A soccer was eaten by Elvis Presley."]

#cleaning and preprocessing the text
cleaned = []
for i in range(0,len(txt)):
    msg = re.sub('[^a-zA-Z]',' ',txt[i])
    msg = msg.lower()
    msg = msg.split()
    ps = PorterStemmer()
    msg = [ps.stem(words) for words in msg if not words in set(stopwords.words('english'))]
    msg = ' '.join(msg)
    cleaned.append(msg)

#one hot encoding and embedding layer
one_hot_mat = [one_hot(words,dict_size) for words in cleaned]
embedded_layer = pad_sequences(one_hot_mat,padding = 'pre',maxlen = 150)
embedded_layer

#prediction
pred = model.predict(embedded_layer)
cat = ['Business','Science','Entertainment','Health']
print(pred, cat[np.argmax(pred)])

[[4.3531038e-02 1.0842958e-03 2.0914743e-04 9.5517546e-01]] Health


In [25]:
from tensorflow.keras.models import load_model

model.save('my_model.h5')

In [19]:
import pickle
pickle_out = open("model.pkl","wb")
pickle.dump(model, pickle_out)
pickle_out.close()

TypeError: cannot pickle 'weakref' object