**Stock Sentiment Analysis using Natural Language Processing and Machine Learning Predictions**


In [5]:
import pandas as pd
import numpy as np

import dateutil
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import xgboost, lightgbm

from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score

In [6]:
# Upload CSV file from local HDD to Google Colab-
from google.colab import files
uploaded = files.upload()

Saving Stock_Sentiment_Analysis_Data.csv to Stock_Sentiment_Analysis_Data (1).csv


In [8]:
import io

# Read in CSV file as Pandas DataFrame-
data = pd.read_csv(io.BytesIO(uploaded['Stock_Sentiment_Analysis_Data.csv']), encoding='ISO-8859-1')

In [9]:
# Get shape/dimensions of dataset-
data.shape

(4101, 27)

In [10]:
# Get feature/attribute/column names as a list-
data.columns.tolist()

['Date',
 'Label',
 'Top1',
 'Top2',
 'Top3',
 'Top4',
 'Top5',
 'Top6',
 'Top7',
 'Top8',
 'Top9',
 'Top10',
 'Top11',
 'Top12',
 'Top13',
 'Top14',
 'Top15',
 'Top16',
 'Top17',
 'Top18',
 'Top19',
 'Top20',
 'Top21',
 'Top22',
 'Top23',
 'Top24',
 'Top25']

In [11]:
# Check for missing values-
data.isnull().values.any()

True

In [12]:
# Get feature wise number of missing values (if any)-
data.isnull().sum()

Date     0
Label    0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    1
Top24    3
Top25    3
dtype: int64

In [13]:
# Convert 'time' attribute to 'datetime'-
data['Date'] = data['Date'].apply(dateutil.parser.parse, dayfirst = True)

# Sort according to 'time' attribute-
data_sorted = data.sort_values(by = 'Date', ascending = True)

# Reset index-
data_sorted.reset_index(inplace = True, drop = True)

In [14]:
# Get min and max values for 'Date' attribute-
data_sorted['Date'].min(), data_sorted['Date'].max()

(Timestamp('2000-01-02 00:00:00'), Timestamp('2016-12-05 00:00:00'))

In [15]:
# Get names of columns/features-
cols = data_sorted.columns.tolist()
cols.remove('Date')
cols.remove('Label')

In [17]:
# Remove punctuations with a blank space-

# Take only the features as the headline columns (removing 'Date' & 'Label')-
data_no_punc = data_sorted.loc[:, :]
data_no_punc.loc[:, cols].replace("[^a-zA-z]", " ", regex = True, inplace = True)

In [18]:
# Rename column names-
new_col_names = ['headline-{0}'.format(i) for i in range(1, 26)]

In [19]:
# Add these two column names in the first and second positions-
new_col_names.insert(0, 'Date')
new_col_names.insert(1, 'Label')

# Rename column names-
data_no_punc.columns = new_col_names

In [20]:
# Get column names for which text has to be converted to lower case-
cols_lower = data_no_punc.columns.tolist()
cols_lower.remove('Date')
cols_lower.remove('Label')

In [22]:
# Convert text in headlines to lower case-
for col_name in cols_lower:
	data_no_punc[f'{col_name}'] = data_no_punc[f'{col_name}'].str.lower()

In [23]:
# Split dataset into training and testing features based on 'Date' value-
X_train = data_no_punc.loc[data_no_punc['Date'] < '2015-01-01', cols_lower]
X_test = data_no_punc.loc[data_no_punc['Date'] >= '2015-01-01', cols_lower]

In [25]:
# Obtain training and testing labels based on a point in time-
y_train = data_no_punc.loc[data_no_punc['Date'] < '2015-01-01', 'Label']
y_test = data_no_punc.loc[data_no_punc['Date'] >= '2015-01-01', 'Label']

In [26]:
X_train.shape, y_train.shape

((3723, 25), (3723,))

In [27]:
X_test.shape, y_test.shape

((378, 25), (378,))

In [28]:
# Currently, for each day, the 25 headlines appear as separate feature/column.
# Combine all of the 25 headlines into a paragraph for each day.
headlines_train = []

for each_day in range(X_train.shape[0]):
	headlines_train.append(' '.join(str(x) for x in X_train.iloc[each_day, :25]))

In [29]:
headlines_test = []

for each_day in range(X_test.shape[0]):
	headlines_test.append(' '.join(str(x) for x in X_test.iloc[each_day, :25]))

In [30]:
# Sanity check-
headlines_train[0]

"double delight for shearer even ferguson wants a more peaceful accommodation owen in doubt for euro 2000 new sponsor for old trafford claridge in trouble over bet on pompey world cup a hit - official hussain backs floodlight age william hague's new cabinet 'in plain speaking you murdered her' blunt blair to tell farmers 'uncomfortable truth' portillo back as shadow chancellor computer problem delays endeavour launch for a week doctor's name joins ranks of world's worst murderers marking crisis hits a-levels 'a way of fighting poverty' ministers back universal schooling drive crash course in chaos shocks the minister economic crisis = empty desks internet promises salvation - or an even bigger knowledge gap a school triumphs amid the shacks fee-free zone where the young learn,all of society benefits a guide to valentine's day shopping on the net 'the lessons of the shipman case' government launches inquiry into shipman murders"

In [31]:
# Sanity check-
headlines_test[10]

'saudi arabia publicly beheads a woman in mecca boko haram appears to be using abducted girls as suicide bombers imam says "as it is clear that the cartoons are to be published again, muslims will be hurt and angered, but our reaction must be a reflection of the teachings of the one we love &amp; are angered for. enduring patience, tolerance, gentleness and mercy was the character of our beloved prophet." three deaths in anti-terror raid in belgium guantanamo guard: cia killed prisoners and made it look like suicide bps maximum fine for gulf of mexico oil spill is cut by billions woolly mammoth cloning attempt revives ethical debate protests flare in armenia after family massacred: "hundreds of armenians took to the streets on wednesday and thursday, demanding that a russian soldier who confessed to killing six members of an armenian family be turned over to the armenian authorities" top russian official \'ashamed\' of culture crackdown, quits ministry scientists: human activity has pu

In [32]:
len(headlines_train), len(headlines_test)

(3723, 378)

In [33]:
# Initialize a WordNet lemmatizer-
lemmatizer = WordNetLemmatizer()

class LemmatizedCountVectorizer(CountVectorizer):
	'''
	Python3 class to implement WordNet lemmatizer by inheriting
	from 'CountVectorizer' class
	'''
	def construct_analyzer(self):
		analyzer = super(LemmatizedCountVectorizer, self).construct_analyzer()
		return lambda doc: ([lemmatizer.lemmatize(wrd) for wrd in analyzer(doc)])


In [79]:
# Sanity check-
lm = LemmatizedCountVectorizer(lowercase = True, stop_words = 'english')

In [80]:
# Generate Bag of Words as output-
bow = lm.fit_transform(headlines_train)

In [81]:
type(bow), bow.shape

(scipy.sparse.csr.csr_matrix, (3723, 45831))

In [83]:
len(lm.vocabulary_), type(lm.vocabulary_)

(45831, dict)

In [88]:
print("All words in Bag-of-Words:\n{0}\n".format(lm.vocabulary_))

All words in Bag-of-Words:



In [87]:
print("\nFeatures/words are:\n{0}\n".format(lm.get_feature_names()))


Features/words are:



In [90]:
# Now just use a CountVectorizer without WordNet lemmatizer-
cv = CountVectorizer(lowercase = True, stop_words = 'english')

In [91]:
# Generate Bag of Words as output-
bow_cv = cv.fit_transform(headlines_train)

In [92]:
type(bow_cv), bow_cv.shape

(scipy.sparse.csr.csr_matrix, (3723, 45831))

In [93]:
print("All words in Bag-of-Words using CountVectorizer:\n{0}\n".format(cv.vocabulary_))

All words in Bag-of-Words using CountVectorizer:



In [95]:
print("\nFeatures/words using CountVectorizer are:\n{0}\n".format(cv.get_feature_names()))


Features/words using CountVectorizer are:



In [96]:
lm.vocabulary_ == cv.vocabulary_

True

**Initialize Machine Learning Pipeline for making NLP predictions:**

In [34]:
# Creating a pipeline for Multinomial Naive-Bayes classifier-
nb_lemmatizer_pipeline = Pipeline(
	[
		# ('countvectorizer', CountVectorizer(lowercase = True, stop_words = 'english')),
		('lm', LemmatizedCountVectorizer(lowercase = True, stop_words = 'english', ngram_range = (2, 2))),
		('tfidf', TfidfTransformer()),
		('nb_multinomial', MultinomialNB())
	]
	)

In [35]:
# Train pipeline on training data-
nb_lemmatizer_pipeline.fit(headlines_train, y_train)

Pipeline(memory=None,
         steps=[('lm',
                 LemmatizedCountVectorizer(analyzer='word', binary=False,
                                           decode_error='strict',
                                           dtype=<class 'numpy.int64'>,
                                           encoding='utf-8', input='content',
                                           lowercase=True, max_df=1.0,
                                           max_features=None, min_df=1,
                                           ngram_range=(2, 2),
                                           preprocessor=None,
                                           stop_words='english',
                                           strip_accents=None,
                                           token_pattern='(?u)\\b\\w\\w+\\b',
                                           tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                         

In [36]:
# Make predictions using trained pipeline-
y_pred_nb = nb_lemmatizer_pipeline.predict(headlines_test)

# Convert to np array-
y_test = np.array(y_test)

# Sanity check-
y_pred_nb.shape, y_test.shape

((378,), (378,))

In [37]:
# Compute trained pipeline metrics-
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb)
recall_nb = recall_score(y_test, y_pred_nb)

print("\nMultinomial Naive-Bayes classifier metrics:")
print("accuracy = {0:4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(
	accuracy_nb, precision_nb, recall_nb
	))


Multinomial Naive-Bayes classifier metrics:
accuracy = 0.505291, precision = 0.5066 & recall = 0.9948



**Perform hyper-parameter optimisation:**

In [38]:
# Perform 'RandomizedSearchCV' hyper-parameter tuning-

# Specify hyper-parameters to be searched-
randomsearch_params = {
	'lm__ngram_range': [(1, 1), (1, 2), (2, 2)],
	'tfidf__use_idf': (True, False),
	'nb_multinomial__alpha': (1e-1, 1e-3)
	}

In [40]:
# Initialize a RandomizedSearchCV object-
rs_clf = RandomizedSearchCV(
	estimator = nb_lemmatizer_pipeline,
	param_distributions = randomsearch_params,
	n_iter = 10
	)

In [41]:
# Train RandomizedSearchCV pipeline on training data-
rs_clf = rs_clf.fit(headlines_train, y_train)

In [42]:
# To see the best mean score and the best params-
rs_clf.best_score_

0.5235061701667026

In [43]:
rs_clf.best_params_

{'lm__ngram_range': (1, 2),
 'nb_multinomial__alpha': 0.1,
 'tfidf__use_idf': False}

In [44]:
# Hyper-parameter Tuning: Grid Search

# Specify hyper-parameters to be searched-
gridsearch_params = {
	'lm__ngram_range': [(1, 2), (1, 3), (2, 2)],
	'tfidf__use_idf': [False],
	'nb_multinomial__alpha': (0.1, 0.2, 0.01)
	}

In [45]:
# Initialize a GridSearchCV object-
gs_clf = GridSearchCV(
	estimator = nb_lemmatizer_pipeline,
	param_grid = gridsearch_params
	)

In [46]:
# Train GridSearchCV pipeline on training data-
gs_clf = gs_clf.fit(headlines_train, y_train)

In [47]:
# To see the best mean score and the params-
gs_clf.best_score_

0.5288752976834813

In [48]:
gs_clf.best_params_

{'lm__ngram_range': (1, 3),
 'nb_multinomial__alpha': 0.2,
 'tfidf__use_idf': False}

In [62]:
# Creating a pipeline for SGD classifier-
sgd_pipeline = Pipeline(
	[
		('countvectorizer', CountVectorizer(lowercase = True, stop_words = 'english')),
		# ('lm', LemmatizedCountVectorizer(lowercase = True, stop_words = 'english')),
		# ('tfidf', TfidfTransformer()),
    ('sgd_clf', SGDClassifier())
	]
	)

In [63]:
# Train pipeline on training data-
sgd_pipeline.fit(headlines_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=Non...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, 

In [64]:
# Make predictions using trained pipeline-
y_pred_sgd = sgd_pipeline.predict(headlines_test)

# Sanity check-
y_pred_sgd.shape, y_test.shape

((378,), (378,))

In [65]:
# Compute trained pipeline metrics-
accuracy_sgd = accuracy_score(y_test, y_pred_sgd)
precision_sgd = precision_score(y_test, y_pred_sgd)
recall_sgd = recall_score(y_test, y_pred_sgd)

print("\nSGD classifier metrics:")
print("accuracy = {0:4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(
	accuracy_sgd, precision_sgd, recall_sgd
	))


SGD classifier metrics:
accuracy = 0.478836, precision = 0.4894 & recall = 0.5990



In [66]:
# Creating a pipeline for SVM classifier-
svm_pipeline = Pipeline(
	[
		('countvectorizer', CountVectorizer(lowercase = True, stop_words = 'english')),
		# ('lm', LemmatizedCountVectorizer(lowercase = True, stop_words = 'english')),
		# ('tfidf', TfidfTransformer()),
    ('svm_clf', SVC())
	]
	)

In [67]:
# Train pipeline on training data-
svm_pipeline.fit(headlines_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('svm_clf',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,

In [68]:
# Make predictions using trained pipeline-
y_pred_svm = svm_pipeline.predict(headlines_test)

# Sanity check-
y_pred_svm.shape, y_test.shape

((378,), (378,))

In [69]:
# Compute trained pipeline metrics-
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)

print("\nSVM classifier metrics:")
print("accuracy = {0:4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(
	accuracy_svm, precision_svm, recall_svm
	))


SVM classifier metrics:
accuracy = 0.502646, precision = 0.5057 & recall = 0.9167



In [70]:
# Creating a pipeline for RandomForest classifier-
rf_pipeline = Pipeline(
	[
		('countvectorizer', CountVectorizer(lowercase = True, stop_words = 'english')),
		# ('lm', LemmatizedCountVectorizer(lowercase = True, stop_words = 'english')),
		# ('tfidf', TfidfTransformer()),
    ('rfc', RandomForestClassifier(n_estimators=200))
	]
	)

In [71]:
# Train pipeline on trainign data-
rf_pipeline.fit(headlines_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                 

In [72]:
# Make predictions using trained pipeline-
y_pred_rf = rf_pipeline.predict(headlines_test)

# Sanity check-
y_pred_rf.shape, y_test.shape

((378,), (378,))

In [73]:
# Compute trained pipeline metrics-
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)

print("\nRandomForest classifier metrics:")
print("accuracy = {0:4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(
	accuracy_rf, precision_rf, recall_rf
	))


RandomForest classifier metrics:
accuracy = 0.513228, precision = 0.5128 & recall = 0.8333

