# NLP-9: Stock Sentiment Analysis using News Headlines
#### Credit
https://www.youtube.com/watch?v=h-LGjJ_oANs&list=PLZoTAELRMXVMdJ5sqbCK2LiM0HhQVWNzm&index=12

In [1]:
import pandas as pd

In [None]:
url='https://raw.githubusercontent.com/akdubey2k/NLP/main/Stock_Sentiment_Analysis/stock-data.csv'
df=pd.read_csv(url)
df.head()

#### Divide data into training and test set as per date-wise

In [None]:
train = df[df['Date'] < '20150101']
test = df[df['Date'] > '20141231'] 
train

#### Removing punctuations from the **training data set**

In [None]:
data = train.iloc[:, 2:27]  # all rows, and 2to26 columns included. Ideally removing 'Date' & 'Label' columns
data.replace("[^a-zA-Z]", " ", regex=True, inplace=True) # except alphabets everythinng has to be removed
data

#### Renaming column names for easiness of access

In [None]:
# list1 = [i for i in range(25)]      # generating number from 0 to 24 (total 25 in count) and storing in list
# print(list1)  
# new_index = [str(i) for i in list1] # converting number into string list

# ******************* can save above one line of code **************************
new_index = [str(i) for i in range(25)] # converting number into string list
print(new_index)
data.columns = new_index            # assigning new string list name to the columns
data.head()

#### Convertng headlines to lower case

In [None]:
for i in new_index:
  data[i] = data[i].str.lower()
data.head()

#### Combining all columns data into a **"single line row string"**

In [None]:
' '.join(str(x) for x in data.iloc[0, 0:25])  # 1 row and 0 to 24 columns (total 25 in count)

#### Create headlines by combining all columns data into row-wise string, as above code for a **"single line row string"**

In [None]:
headlines = []
for row in range(0, len(data.index)):
  headlines.append(' '.join(str(x) for x in data.iloc[row, 0:25]))

headlines[0]

#### Bag of words implementation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

cv = CountVectorizer(ngram_range=(2, 2))  # at least two entries should match
train_data = cv.fit_transform(headlines)
train_data

#### Random forest classifier implementation

In [None]:
rand_classifier = RandomForestClassifier(n_estimators=200, criterion='entropy')
rand_classifier.fit(train_data, train['Label']) # independent var., dependent var.

#### Prediction for the **Test Dataset**

In [11]:
test_transform = []
for row in range(0, len(test.index)):
    test_transform.append(' '.join(str(x) for x in test.iloc[row,2:27]))
test_dataset = cv.transform(test_transform)
predictions = rand_classifier.predict(test_dataset)
predictions

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,

#### Import library to check accuracy


In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [37]:
matrix = confusion_matrix(test['Label'], predictions)
print("\033[1m Matrix : \033[0m \n", matrix)
score = accuracy_score(test['Label'],predictions)
print("\n\033[1m Score : \033[0m \n", score)
report = classification_report(test['Label'],predictions)
print("\n\033[1m Classification report : \033[0m \n", report)

[1m Matrix : [0m 
 [[135  51]
 [  9 183]]

[1m Score : [0m 
 0.8412698412698413

[1m Classification report : [0m 
               precision    recall  f1-score   support

           0       0.94      0.73      0.82       186
           1       0.78      0.95      0.86       192

    accuracy                           0.84       378
   macro avg       0.86      0.84      0.84       378
weighted avg       0.86      0.84      0.84       378

