# Sentiment Analysis of News Headlines using Deep Neural Networks

### Installing necessary packages, importing libraries and modules

In [11]:
#install needed packages
!pip install snorkel
!pip install spacy
!pip install textblob
!pip install tensorflow

#import libraries and modules
import io
import pandas as pd
#Snorkel
from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function
#NLP packages
import spacy
from nltk.corpus import stopwords
import string
import nltk
import nltk.tokenize
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Error loading stopwords: <urlopen error [WinError 10061]
[nltk_data]     No connection could be made because the target machine
[nltk_data]     actively refused it>


### Getting News Titles and Article from BBC API

In [155]:
# importing requests package
import requests    
 
def NewsFromBBC():
     
    # BBC news api
    # following query parameters are used
    # source, sortBy and apiKey
    query_params = {
      "source": "bbc-news",
      "sortBy": "top",
      "apiKey": "4dbc17e007ab436fb66416009dfb59a8"
    }
    main_url = " https://newsapi.org/v1/articles"
 
    # fetching data in json format
    res = requests.get(main_url, params=query_params)
    open_bbc_page = res.json()
 
    # getting all articles in a string article
    article = open_bbc_page["articles"]
    # empty list which will contain all trending news
    results = []
     
    for ar in article:
        results.append(ar["title"])
         
    for i in range(len(results)):
         
        # printing all trending news
        print(i + 1, results[i])
    
    return results, article
                  
 
# Driver Code
if __name__ == '__main__':
     
    # function call.
    title, article = NewsFromBBC()
    

1 Winter storm's icy blast hits 200 million in US
2 US winter storm: Stranded Native Americans burn clothes for warmth
3 Iran protests: Activist Narges Mohammadi details 'abuse' of detained women
4 Russia-Ukraine war: Strikes on Kherson kill seven
5 South Africa: Fuel tanker explosion kills several near hospital
6 Scuba diving Santa makes waves in Florida
7 Megan Thee Stallion: US jury finds Tory Lanez guilty of shooting hip-hop star
8 Bethlehem sees Christmas tourism boost after two-year Covid hiatus
9 Afghanistan protests: Taliban use water cannon on women opposing university ban
10 The Sun apologises over Jeremy Clarkson's Meghan column


### Loading local data set for training

#### The dataset which is used in this project is called the ‘ Million News Headlines’ dataset and it is available on Kaggle. 

In [14]:
#uplaod the data from your local directory
# store the dataset as a Pandas Dataframe

file=('abcnews-date1-text.csv')
df = pd.read_csv(file)
#conduct some data cleaning
df = df.drop(['publish_date'], axis=1)
df = df.rename(columns = {'headline_text': 'text'})
df['text'] = df['text'].astype(str)
#check the data info
df.head()

Unnamed: 0,text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


Shape of the Data Set

In [158]:
df.shape

(6284, 2)

### Snorkel Labeling Technique

Because the dataset is unlabelled, we will employ Snorkel, to come up with heuristics and programmatic rules using functions which assign labels of two classes that differentiate if the headline is positive (1) or negative (0).

In [116]:
#define constants to represent the class labels :positive, negative, and abstain
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1
#define function which looks into the input words to represent a proper label
def keyword_lookup(x, keywords, label):  
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN
#define function which assigns a correct label
def make_keyword_lf(keywords, label=POSITIVE):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))

"""positive news might contain the following words' """
keyword_positive = make_keyword_lf(keywords=['boosts','tribute','legacy','christmas','faith','opportunity','undaunted', 'great', 'develops', 'promising', 'ambitious', 'delighted', 'record', 'win', 'breakthrough', 'recover', 'achievement', 'peace', 'party', 'hope', 'unhappy','flourish', 'respect', 'partnership', 'champion', 'positive', 'happy', 'bright', 'confident', 'encouraged', 'perfect', 'complete', 'assured' ])
"""negative news might contain the following words"""
keyword_negative = make_keyword_lf(keywords=['war','blast','dead','illegal','blast','burn','guilty','dead','storm','solidiers','fire', 'turmoil', 'injured','trouble', 'aggressive', 'killed', 'coup', 'evasion', 'strike', 'troops', 'dismisses', 'attacks', 'defeat', 'damage', 'dishonest', 'dead', 'fear', 'foul', 'fails', 'hostile', 'cuts', 'accusations', 'victims',  'death', 'unrest', 'fraud', 'dispute', 'destruction', 'battle', 'unhappy', 'bad', 'alarming', 'angry', 'anxious', 'dirty', 'pain', 'poison', 'unfair', 'unhealthy'
                                              ], label=NEGATIVE)

### Textblob Labeling Technique

Another set of labelling functions were implemented through TextBlob tool, a pretrained sentiment analyzer. We will create a Pre-processor that runs TextBlob on our headlines, then extracts the polarity and subjectivity scores

In [117]:
#set up a preprocessor function to determine polarity & subjectivity using textlob pretrained classifier 
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x
#find polarity
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return POSITIVE if x.polarity > 0.6 else ABSTAIN
#find subjectivity 
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return POSITIVE if x.subjectivity >= 0.5 else ABSTAIN

 The next step is to **combine all the labelling functions** and apply it on our dataset. Then, we fit the label_model to predict and generate the positive and negative classes.

In [118]:
#combine all the labeling functions 
lfs = [keyword_positive, keyword_negative, textblob_polarity, textblob_subjectivity ]
#apply the lfs on the dataframe
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)
#apply the label model
label_model = LabelModel(cardinality=2, verbose=True)
#fit on the data
label_model.fit(L_snorkel)
#predict and create the labels
df["label"] = label_model.predict(L=L_snorkel)

100%|██████████| 6284/6284 [00:11<00:00, 551.19it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.091]
INFO:root:[10 epochs]: TRAIN:[loss=0.027]
INFO:root:[20 epochs]: TRAIN:[loss=0.018]
INFO:root:[30 epochs]: TRAIN:[loss=0.015]
INFO:root:[40 epochs]: TRAIN:[loss=0.012]
INFO:root:[50 epochs]: TRAIN:[loss=0.012]
INFO:root:[60 epochs]: TRAIN:[loss=0.011]
 61%|██████    | 61/100 [00:00<00:00, 608.07epoch/s]INFO:root:[70 epochs]: TRAIN:[loss=0.011]
INFO:root:[80 epochs]: TRAIN:[loss=0.010]
INFO:root:[90 epochs]: TRAIN:[loss=0.010]
100%|██████████| 100/100 [00:00<00:00, 628.72epoch/s]
INFO:root:Finished Training


In [119]:
#Filtering out unlabeled data points
df= df.loc[df.label.isin([0,1]), :]
#find the label counts 
df['label'].value_counts()

1    3387
0    2897
Name: label, dtype: int64

We can notice that after dropping the unlabelled data points (as shown above), we have around 3387 positive labels and 2897 negative which will be sufficient to build our sentiment classifier.

## Train and test split

In [120]:
##store headlines and labels in respective lists
text = list(df['text'])
labels = list(df['label'])
##sentences
training_text = text[0:10000]
testing_text = text[10000:]
##labels
training_labels = labels[0:10000]
testing_labels = labels[10000:]

### Set up the tokenizer from Tensor to pre-process the data.

In this cell, we use the **word tokenizer from tensorflow.keras** to create word encodings (dictionary with key-value pairs) and sequences using **texs_to_sequences instance**, and then we pad these sequences to make it of equal length using the **pad_sequences instance**.

In [121]:
#preprocess 
tokenizer = Tokenizer(num_words=10000, oov_token= "<OOV>")
tokenizer.fit_on_texts(training_text)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, maxlen=120, padding='post', truncating='post')
testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, maxlen=120, padding='post', truncating='post')
# convert lists into numpy arrays to make it work with TensorFlow 
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In this step, we use the **word tokenizer from tensorflow.keras** to create word encodings (dictionary with key-value pairs) and sequences using **texs_to_sequences instance**, and then we pad these sequences to make it of equal length using the **pad_sequences instance.**

### Define & train the Sequential model

We build the model with an **embedding layer** of a vocab size, embedding dimension, and input length. We also add a **dense layer RelU** which asks the model to classify the instances into two classes as positive or negative and other final **sigmoid layer** which outputs probabilities between 0 or 1. We can simply play with the hyperparameters within each layer to increase model performance. Then, we compile the model with an optimizer and metric performance and we train it on our dataset.

In [122]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=120),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
##compile the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
 
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 120, 16)           160000    
                                                                 
 global_average_pooling1d_5   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_10 (Dense)            (None, 24)                408       
                                                                 
 dense_11 (Dense)            (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [123]:
num_epochs = 15
history = model.fit(training_padded, 
                    training_labels, 
                    epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels), 
                    verbose=2)

Epoch 1/15
197/197 - 2s - loss: 0.6898 - accuracy: 0.5390 - 2s/epoch - 12ms/step
Epoch 2/15
197/197 - 1s - loss: 0.6832 - accuracy: 0.5404 - 722ms/epoch - 4ms/step
Epoch 3/15
197/197 - 1s - loss: 0.6378 - accuracy: 0.6809 - 715ms/epoch - 4ms/step
Epoch 4/15
197/197 - 1s - loss: 0.5031 - accuracy: 0.8488 - 718ms/epoch - 4ms/step
Epoch 5/15
197/197 - 1s - loss: 0.3472 - accuracy: 0.9182 - 737ms/epoch - 4ms/step
Epoch 6/15
197/197 - 1s - loss: 0.2386 - accuracy: 0.9476 - 755ms/epoch - 4ms/step
Epoch 7/15
197/197 - 1s - loss: 0.1725 - accuracy: 0.9631 - 746ms/epoch - 4ms/step
Epoch 8/15
197/197 - 1s - loss: 0.1257 - accuracy: 0.9760 - 748ms/epoch - 4ms/step
Epoch 9/15
197/197 - 1s - loss: 0.0970 - accuracy: 0.9820 - 755ms/epoch - 4ms/step
Epoch 10/15
197/197 - 1s - loss: 0.0756 - accuracy: 0.9860 - 721ms/epoch - 4ms/step
Epoch 11/15
197/197 - 1s - loss: 0.0607 - accuracy: 0.9893 - 719ms/epoch - 4ms/step
Epoch 12/15
197/197 - 1s - loss: 0.0493 - accuracy: 0.9919 - 732ms/epoch - 4ms/step
Epo

We can further check that **our built neural network model** with **15 running epochs** has a **very good accuracy of 99.57%** , decreasing validation loss and increasing validation accuracy **which assure a powerful predictive performance** and a low risk of (generalisation) overfitting error.

## Using the model to predict on the News Headline taken from the BBC API

In [154]:
for i in range(10):
    new_headline = [title[i]]
    #print(new_headline)
    #prepare the sequences of the sentences in question
    sequences = tokenizer.texts_to_sequences(new_headline)
    padded_seqs = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')
    #print(model.predict(padded_seqs))
    sent_value = model.predict(padded_seqs)
    if sent_value > 0.5:
     sentiment="Positive"
    else:
     sentiment="Negative"
    print(new_headline,str(": "), sentiment)

["Winter storm's icy blast hits 200 million in US"] :  Negative
['US winter storm: Stranded Native Americans burn clothes for warmth'] :  Negative
['Megan Thee Stallion: US jury finds Tory Lanez guilty of shooting hip-hop star'] :  Negative
['Wind, snow and floods combine for historic US storm'] :  Negative
['Winter storm strands Canadian family in RV in Texas'] :  Negative
['Boiling water turns to snow in frigid Montana'] :  Positive
["King's Christmas message to pay tribute to Queen's legacy"] :  Positive
['At least 20 dead in Russia illegal care home fire'] :  Negative
["The Sun apologises over Jeremy Clarkson's Meghan column"] :  Positive
['Paris shooting: Two dead and several injured in attack'] :  Negative


# You can see that our model is working very good.