### Training a Naive Bayes Classifier on Crude Oil News Article Sentiment

In [35]:
# import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB # for more than two class classification problems, for two we can use MultinomialNB
import pandas as pd
import pickle

In [36]:
# Read the crude-oil news CSV and build the working frame without the
# 'Date' and 'sentiment' columns (a new frame is returned; nothing is mutated in place)
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Quantitative Finance and Algo Trading/crudeoil_news_articles.csv"
).drop(columns=['Date', 'sentiment'])
data.head()

Unnamed: 0,Headline,News,Sentiment
0,US Gulf Of Mexico Can Help Fill Global Oil Sup...,One of the world’s most mature deepwater basin...,positive
1,Oil Jumps On EIA Inventory Data,Crude oil prices moved higher today after the ...,negative
2,OPEC Crude Oil Exports Trend Lower In September,Crude oil exports from all OPEC oil producers ...,negative
3,Oil Price Cap On Russian Crude Could Cause Tan...,With the EU embargo on imports of Russian oil ...,negative
4,Russia Likely To Propose Major Output Cut At N...,Russia is likely to propose at the next OPEC+ ...,negative


In [37]:
# Select the article body by column label rather than by position, so that a
# change in the CSV's column order cannot silently feed a different column
# (e.g. the headline) into the vectorizer.
X = data["News"]
X.head()

0    One of the world’s most mature deepwater basin...
1    Crude oil prices moved higher today after the ...
2    Crude oil exports from all OPEC oil producers ...
3    With the EU embargo on imports of Russian oil ...
4    Russia is likely to propose at the next OPEC+ ...
Name: News, dtype: object

In [38]:
# Tokenize the news text and encode it as a document-term count matrix.
# English stop words are excluded from the vocabulary; fit_transform both
# learns the vocabulary from X and returns the counts as a sparse matrix
# with one row per article and one column per vocabulary term.
vectorizer = CountVectorizer(stop_words = 'english')
X_vec = vectorizer.fit_transform(X)
X_vec

<40x3755 sparse matrix of type '<class 'numpy.int64'>'
	with 12222 stored elements in Compressed Sparse Row format>

In [39]:
# Print the stored (non-zero) entries of the sparse matrix: each line is
# (row index, column index) followed by the count stored at that position
print(X_vec)

  (0, 3727)	7
  (0, 2098)	1
  (0, 972)	4
  (0, 464)	5
  (0, 1589)	13
  (0, 2145)	12
  (0, 2033)	1
  (0, 2603)	9
  (0, 1975)	1
  (0, 1961)	1
  (0, 1639)	1
  (0, 566)	1
  (0, 2315)	13
  (0, 3323)	10
  (0, 1517)	2
  (0, 3703)	1
  (0, 2099)	2
  (0, 918)	1
  (0, 2601)	1
  (0, 387)	1
  (0, 1552)	2
  (0, 462)	2
  (0, 2656)	2
  (0, 25)	1
  (0, 904)	4
  :	:
  (39, 2471)	1
  (39, 2052)	1
  (39, 3681)	1
  (39, 941)	1
  (39, 910)	1
  (39, 513)	1
  (39, 1347)	1
  (39, 3422)	1
  (39, 732)	1
  (39, 3470)	1
  (39, 1206)	1
  (39, 331)	1
  (39, 952)	1
  (39, 916)	1
  (39, 3150)	2
  (39, 2918)	1
  (39, 281)	1
  (39, 3639)	1
  (39, 3586)	1
  (39, 200)	1
  (39, 1047)	1
  (39, 1377)	1
  (39, 3472)	1
  (39, 1506)	1
  (39, 1090)	1


In [40]:
# Save the fitted vectorizer for reuse. The context manager guarantees the
# file is flushed and closed even if pickling raises, instead of leaving an
# open handle behind for the garbage collector to clean up.
with open("vectorizer_crude_oil", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Convert the sparse count matrix into a dense np.matrix
X_vec = X_vec.todense()

In [41]:
# Inspect the dense count row of the first article together with its shape
X_vec[0], X_vec[0].shape

(matrix([[4, 0, 0, ..., 0, 2, 0]]), (1, 3755))

In [42]:
# Same inspection for the article stored at row index 34
X_vec[34], X_vec[34].shape

(matrix([[1, 0, 0, ..., 0, 0, 0]]), (1, 3755))

In [43]:
# Re-weight the raw term counts with TF-IDF (term frequency - inverse document frequency)
import numpy as np

# TfidfTransformer emits an np.matrix deprecation warning, so give it a plain ndarray
X_vec = np.asarray(X_vec)

tfidf = TfidfTransformer()  # "l2" normalization is applied by default
# Fit the IDF weights, transform the counts, and densify the result in one step
X_tfidf = tfidf.fit_transform(X_vec).todense()

In [44]:
X_tfidf.shape  # (rows, columns) = (articles, vocabulary terms): 40 articles, 3755 terms

(40, 3755)

### Applying the Naive Bayes Algorithm to the Training Data

In [52]:
# Extract the news-body features and the labels for training the classifier.
# The features are the first 40 TF-IDF rows, converted from np.matrix to
# np.ndarray to prevent the np.matrix deprecation warning.
X_train = np.asarray(X_tfidf[:40, :])

# Take exactly as many labels as there are feature rows. Previously every row
# of the label column was used, so the feature and label lengths would
# diverge as soon as the dataset held more than 40 articles.
y_train = data.iloc[:len(X_train), 2]

In [53]:
# Confirm that X_train is a plain numpy.ndarray rather than np.matrix
type(X_train)

numpy.ndarray

In [54]:
# Train the NB classifier on the dense TF-IDF features.
# NOTE(review): GaussianNB models each feature as a continuous Gaussian;
# MultinomialNB also supports any number of classes and is the variant aimed
# at term-count / TF-IDF text features - consider comparing the two.
clf = GaussianNB().fit(X_train, y_train)

# Save the trained classifier for reuse. The context manager guarantees the
# file is flushed and closed even if pickling raises.
with open("nb_clf_crude_oil", "wb") as clf_file:
    pickle.dump(clf, clf_file)