In [5]:
import pandas as pd
df = pd.read_csv(r'C:\Users\Sandeep Roy\3D Objects\sentimental_analysis\million news headlines dataset\abcnews-date-text.csv')
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [19]:
#install needed packages
!pip install snorkel
!pip install textblob
!pip install spacy
!pip install tensorflow
#import libraries and modules
import io
import pandas as pd
from textblob import TextBlob
#Snorkel
from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function
#NLP packages

# import spacy
from nltk.corpus import stopwords
import string
import nltk
import nltk.tokenize
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.11.0-cp39-cp39-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.11.0
  Downloading tensorflow_intel-2.11.0-cp39-cp39-win_amd64.whl (266.3 MB)
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
Collecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting flatbuffers>=2.0
  Downloading flatbuffers-23.1.4-py2.py3-none-any.whl (26 kB)
Collecting libclang>=13.0.0
  Downloading libclang-14.0.6-py2.py3-none-win_amd64.whl (14.2 MB)
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting termcolor>=1.1.0
  Downloading termcolor-2.2.0-py3-none-any.w

[nltk_data]     Roy\AppData\Roaming\nltk_data...

[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#conduct some data cleaning
df = df.drop(['publish_date'], axis=1)
df = df.rename(columns = {'headline_text': 'text'})
df['text'] = df['text'].astype(str)
#check the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244184 entries, 0 to 1244183
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1244184 non-null  object
dtypes: object(1)
memory usage: 9.5+ MB


In [9]:
#define constants to represent the class labels :positive, negative, and abstain
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1

#define function which looks into the input words to represent a proper label
def keyword_lookup(x, keywords, label):  
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN
#define function which assigns a correct label
def make_keyword_lf(keywords, label=POSITIVE):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))
#resource: https://www.snorkel.org/use-cases/01-spam-tutorial#3-writing-more-labeling-functions
#these two lists can be further extended 

"""positive news might contain the following words' """
keyword_positive = make_keyword_lf(keywords=['boosts', 'great', 'develops', 'promising', 'ambitious', 'delighted', 'record', 'win', 'breakthrough', 'recover', 'achievement', 'peace', 'party', 'hope', 'flourish', 'respect', 'partnership', 'champion', 'positive', 'happy', 'bright', 'confident', 'encouraged', 'perfect', 'complete', 'assured' ])
"""negative news might contain the following words"""
keyword_negative = make_keyword_lf(keywords=['war','solidiers', 'turmoil', 'injur','trouble', 'aggressive', 'killed', 'coup', 'evasion', 'strike', 'troops', 'dismisses', 'attacks', 'defeat', 'damage', 'dishonest', 'dead', 'fear', 'foul', 'fails', 'hostile', 'cuts', 'accusations', 'victims',  'death', 'unrest', 'fraud', 'dispute', 'destruction', 'battle', 'unhappy', 'bad', 'alarming', 'angry', 'anxious', 'dirty', 'pain', 'poison', 'unfair', 'unhealthy'
                                              ], label=NEGATIVE)


In [10]:
# Another set of labelling functions implemented using TextBlob: polarity and subjectivity
# Subjectivity is the output that lies within [0,1] and refers to personal opinions and judgments. 
#set up a preprocessor function to determine polarity & subjectivity using textlob pretrained classifier 
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x
#find polarity
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return POSITIVE if x.polarity > 0.6 else ABSTAIN
#find subjectivity 
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return POSITIVE if x.subjectivity >= 0.5 else ABSTAIN

In [11]:
# combining all the labeling functions and apply to dataset
#combine all the labeling functions 
lfs = [keyword_positive, keyword_negative, textblob_polarity, textblob_subjectivity ]
#apply the lfs on the dataframe
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)
#apply the label model
label_model = LabelModel(cardinality=2, verbose=True)
#fit on the data
label_model.fit(L_snorkel)
#predict and create the labels
df["label"] = label_model.predict(L=L_snorkel)

100%|███████████████████████████████████████████████████████████████████████| 1244184/1244184 [23:09<00:00, 895.24it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                                                       | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.018]
INFO:root:[10 epochs]: TRAIN:[loss=0.007]
INFO:root:[20 epochs]: TRAIN:[loss=0.000]
INFO:root:[30 epochs]: TRAIN:[loss=0.001]
INFO:root:[40 epochs]: TRAIN:[loss=0.001]
INFO:root:[50 epochs]: TRAIN:[loss=0.000]
 55%|██████████████████████████████████████████▎                                  | 55/100 [00:00<00:00, 549.82epoch/s]INFO:root:[60 epochs]: TRAIN:[loss=0.000]
INFO:root:[70 epochs]: TRAIN:[loss=0.000]
INFO:root:[80 epochs]: TRAIN:[loss=0.000]
INFO:root:[90 epochs]: TRAIN:[loss=0.000]
100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 704.06epoch/s]
INFO:root:Finished Training


In [12]:
# dropping unlabelled datapoints
#Filtering out unlabeled data points
df= df.loc[df.label.isin([0,1]), :]
#find the label counts 
df['label'].value_counts()

1    238018
0    109987
Name: label, dtype: int64

In [26]:
# Train and test split
##store headlines and labels in respective lists
text = list(df['text'])
labels = list(df['label'])

##sentences
training_text = text[0:150000]
testing_text = text[150000:]

##labels
training_labels = labels[0:150000]
testing_labels = labels[150000:]

In [27]:
# using word tokenizer from tensorflow.keras to create word encodings and sequences, then we pad the sequences
#preprocess 
tokenizer = Tokenizer(num_words=10000, oov_token= "<OOV>")
tokenizer.fit_on_texts(training_text)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, maxlen=120, padding='post', truncating='post')
testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, maxlen=120, padding='post', truncating='post')
# convert lists into numpy arrays to make it work with TensorFlow 
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [28]:
# we build embedding layer, add dense layer RelU, and sigmoid layer
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=120),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
##compile the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
 
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 16)           160000    
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 24)                408       
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [29]:
num_epochs = 10
history = model.fit(training_padded, 
                    training_labels, 
                    epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels), 
                    verbose=2)

Epoch 1/10
4688/4688 - 19s - loss: 0.2514 - accuracy: 0.8873 - val_loss: 0.1027 - val_accuracy: 0.9638 - 19s/epoch - 4ms/step
Epoch 2/10
4688/4688 - 17s - loss: 0.0550 - accuracy: 0.9816 - val_loss: 0.0885 - val_accuracy: 0.9669 - 17s/epoch - 4ms/step
Epoch 3/10
4688/4688 - 18s - loss: 0.0378 - accuracy: 0.9875 - val_loss: 0.0646 - val_accuracy: 0.9776 - 18s/epoch - 4ms/step
Epoch 4/10
4688/4688 - 17s - loss: 0.0303 - accuracy: 0.9901 - val_loss: 0.0629 - val_accuracy: 0.9790 - 17s/epoch - 4ms/step
Epoch 5/10
4688/4688 - 17s - loss: 0.0255 - accuracy: 0.9918 - val_loss: 0.0689 - val_accuracy: 0.9765 - 17s/epoch - 4ms/step
Epoch 6/10
4688/4688 - 17s - loss: 0.0230 - accuracy: 0.9925 - val_loss: 0.0661 - val_accuracy: 0.9786 - 17s/epoch - 4ms/step
Epoch 7/10
4688/4688 - 18s - loss: 0.0211 - accuracy: 0.9933 - val_loss: 0.0844 - val_accuracy: 0.9767 - 18s/epoch - 4ms/step
Epoch 8/10
4688/4688 - 19s - loss: 0.0192 - accuracy: 0.9938 - val_loss: 0.0779 - val_accuracy: 0.9760 - 19s/epoch - 4

In [63]:
new_headline = ["Because of torrential rain, the city feels sad and gloomy"]
##prepare the sequences of the sentences in question
# sequences = tokenizer.texts_to_sequences(new_headline)
# padded_seqs = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')
# print(model.predict(padded_seqs))

# doc = tokenizer.fit_on_texts(new_headline)
# encoded_docs = tokenizer.texts_to_matrix(new_headline, mode='tfidf')
# padded_docs = pad_sequences(encoded_docs, maxlen=120, padding='post', truncating='post')
# print(model.predict(padded_docs))

import spacy

[[0.97830856]]
