# Stock Price Movement Predictor based on News Headlines

### Import all the required libs

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Import the data

In [31]:
dataset = pd.read_csv('Data.csv')

In [32]:
# filling the null values with median 

dataset['Top23'].fillna(dataset['Top23'].median,inplace=True)
dataset['Top24'].fillna(dataset['Top24'].median,inplace=True)
dataset['Top25'].fillna(dataset['Top25'].median,inplace=True)

In [33]:
train = dataset[dataset['Date'] < '2015-01-01']
test = dataset[dataset['Date'] > '2014-12-31']

### Pre-processing steps

In [34]:
data = train.iloc[:, 2:]
data.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

In [35]:
#Make the data column headers easier to read
data.columns = [str(i) for i in range(25)]

In [36]:
for i in data.columns:
    
    data[i] = data[i].str.lower()

In [39]:
headlines = []
for row in range(0,len(data.index)):
    headlines.append(' '.join(str(x) for x in data.iloc[row,0:]))

In [40]:
## implement BAG OF WORDS
CV = CountVectorizer(ngram_range=(2,2), max_features=100009)
traindataset = CV.fit_transform(headlines)

In [41]:
## implement TF - IDF
TDF = TfidfVectorizer(ngram_range=(2,2), max_features=100009)
traindataset2 = TDF.fit_transform(headlines)

In [42]:
# Random Forest Implementation
RFC = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
RFC.fit(traindataset, train['Label'])

RandomForestClassifier(criterion='entropy', n_estimators=200)

In [43]:
# Logistic Regression Implementation
LR = LogisticRegression()
LR.fit(traindataset2, train['Label'])

LogisticRegression()

In [44]:
test_headlines = []
for i in range(0, len(test.index)):
    
    test_headlines.append(' '.join(str(x) for x in test.iloc[i, 2:27]))
test_dataset = CV.transform(test_headlines)
predictions = RFC.predict(test_dataset)

In [45]:
test_headlines2 = []
for i in range(0, len(test.index)):
    
    test_headlines2.append(' '.join(str(x) for x in test.iloc[i, 2:27]))
    
test_dataset2 = TDF.transform(test_headlines2)
predictions2 = LR.predict(test_dataset2)

### Import Metrics lib

In [46]:
## Import library to check accuracy
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [47]:
matrix=confusion_matrix(test['Label'],predictions)
print(matrix)
score=accuracy_score(test['Label'],predictions)
print(score)
report=classification_report(test['Label'],predictions)
print(report)

[[ 25 161]
 [ 27 165]]
0.5026455026455027
              precision    recall  f1-score   support

           0       0.48      0.13      0.21       186
           1       0.51      0.86      0.64       192

    accuracy                           0.50       378
   macro avg       0.49      0.50      0.42       378
weighted avg       0.49      0.50      0.43       378



In [49]:
matrix=confusion_matrix(test['Label'],predictions2)
print(matrix)
score=accuracy_score(test['Label'],predictions2)
print(score)
report=classification_report(test['Label'],predictions2)
print(report)

[[ 11 175]
 [  7 185]]
0.5185185185185185
              precision    recall  f1-score   support

           0       0.61      0.06      0.11       186
           1       0.51      0.96      0.67       192

    accuracy                           0.52       378
   macro avg       0.56      0.51      0.39       378
weighted avg       0.56      0.52      0.39       378

