In [17]:
import re
import nltk
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
english_stemmer=nltk.stem.SnowballStemmer('english')

from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import SGDClassifier, SGDRegressor,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import random
import itertools
from sklearn.metrics import confusion_matrix 

import sys
import os
import argparse
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
import six
from abc import ABCMeta
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer
from sklearn.svm import LinearSVC

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.preprocessing.text import Tokenizer
from collections import defaultdict
from keras.layers.convolutional import Convolution1D
from keras import backend as K
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')

# Analysis of Dow Jones Index Price Movements using bigrams consructed from Reddit News Headlines

In this section we look how combination of two words (so-called bigrams) today affect the dow jones price movement today. We find that logistic regression gives us the best accuracy of 57.14%. We additionally use decision tree classifier and random forest classifiers and get the accuracy of 54%.

In [18]:
news = pd.read_csv('/Users/nastyademina/Desktop/r studio/Combined_News_DJIA.csv')
train = news[news['Date'] < '2015-01-01']
test = news[news['Date'] > '2014-12-31']

In [19]:
#logistic regression
advancedvectorizer = TfidfVectorizer( min_df=0.03, max_df=0.97, max_features = 200000, ngram_range = (2, 2))

trainheadlines = []
for row in range(0,len(train.index)):
    trainheadlines.append(' '.join(str(x) for x in train.iloc[row,2:27]))

advancedtrain = advancedvectorizer.fit_transform(trainheadlines)
advancedmodel = LogisticRegression()
advancedmodel = advancedmodel.fit(advancedtrain, train["Label"])

testheadlines = []
for row in range(0,len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row,2:27]))
advancedtest = advancedvectorizer.transform(testheadlines)
preds_lr = advancedmodel.predict(advancedtest)
acc_lr=accuracy_score(test['Label'], preds_lr)

print('Logic Regression 2 accuracy: ', acc_lr)

results = pd.DataFrame(confusion_matrix(test['Label'], preds_lr))
print(results)
TPR=144/(48+144)
print(TPR)
FPR=114/(72+114)
print(FPR)

Logic Regression 2 accuracy:  0.5714285714285714
    0    1
0  72  114
1  48  144
0.75
0.6129032258064516


In [20]:
advwords = advancedvectorizer.get_feature_names()
advcoeffs = advancedmodel.coef_.tolist()[0]
advcoeffdf = pd.DataFrame({'Words' : advwords, 
                        'Coefficient' : advcoeffs})
advcoeffdf = advcoeffdf.sort_values(['Coefficient', 'Words'], ascending=[0, 1])
advcoeffdf[200:300]

Unnamed: 0,Words,Coefficient
471,the islamic,0.238449
480,the most,0.236327
457,the euro,0.232905
43,as an,0.231748
100,court has,0.231301
313,of being,0.230875
635,wikileaks founder,0.230030
520,they have,0.228450
61,based on,0.227673
273,last year,0.226998


In [21]:
#decision tree
tree = DecisionTreeClassifier()
tree= tree.fit(advancedtrain, train["Label"])

preds_tree = tree.predict(advancedtest)
acc_tree = accuracy_score(test['Label'], preds_tree)
print(acc_tree)

results = pd.DataFrame(confusion_matrix(test['Label'], preds_tree))
print(results)

TPR=102/(90+102)
print(TPR)
FPR=89/(89+97)
print(FPR)

0.5396825396825397
     0    1
0  104   82
1   92  100
0.53125
0.478494623655914


In [22]:
#random forest
rf = RandomForestClassifier()
rf = rf.fit(advancedtrain, train["Label"])

preds_rf = rf.predict(advancedtest)
acc_rf = accuracy_score(test['Label'], preds_rf)
print(acc_rf)

results = pd.DataFrame(confusion_matrix(test['Label'], preds_tree))
print(results)

TPR=97/(97+95)
print(TPR)
FPR=93/(93+93)
print(FPR)

0.5317460317460317
     0    1
0  104   82
1   92  100
0.5052083333333334
0.5


# Bigrams analysis with lagged news data and lagged Dow Jones Index price movements added as an independent variables starts here

* In this section we will test whether we can improve the prediction power of our model by using bigrams. We use python advanced functions for NLP-TfidfVectorizer to create bigrams. Additionally, we add one more independent variable to capture other effects which is equal to 1 if yesterday Dow Jones index price went up or stayed the same and is equal to 0 if Dow Jones Index went down. 
* We use lead of the Label variable as our dependent variable. Therefore, we try to predict price movements of Dow Jones tomorrow using news topics today and price movements of Dow Jones today.
* We run logistric regression, decision tree classifier and random forest classifier once again.
* From our results we can see that our models do not have high accuracy on the test set (Logic Regression 2 accuracy:  0.517, Decision Tree 1 accuracy:  0.509, Random Forest 1 accuracy:  0.516)

In [40]:
data = pd.read_csv('/Users/nastyademina/Desktop/r studio/Combined_News_DJIA.csv')
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
print(test.shape)
print(train.shape)

(378, 27)
(1611, 27)


In [41]:
#creating bigrams and running logistic regression
#gives us 52% accuracy 
test['Label_lead'] = test['Label'].shift(-1)
test.drop(test.tail(1).index,inplace=True)

train['Label_lead'] = train['Label'].shift(-1)
train.drop(train.tail(1).index,inplace=True)
train=pd.DataFrame(train)

trainheadlines = []
for row in range(0,len(train.index)):
    trainheadlines.append(' '.join(str(x) for x in train.iloc[row,2:27]))

#how to add it to the training set
advancedvectorizer = TfidfVectorizer( min_df=0.03, max_df=0.97, max_features = 200000, ngram_range = (2, 2))
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)
av = pd.DataFrame(advancedtrain.toarray(), columns=advancedvectorizer.get_feature_names())

label=pd.DataFrame(train['Label'])
advancedtrain= pd.concat([label, av], axis=1)

advancedmodel = LogisticRegression()
advancedmodel = advancedmodel.fit(advancedtrain, train["Label_lead"])

In [42]:
testheadlines = []
for row in range(0,len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row,2:27]))
advancedtest = advancedvectorizer.transform(testheadlines)


av_test = pd.DataFrame(advancedtest.toarray(), columns=advancedvectorizer.get_feature_names())
label_test=pd.DataFrame(test.iloc[:,1])

av_test.reset_index(drop=True, inplace=True)
label_test.reset_index(drop=True, inplace=True)

advancedtest= pd.concat([label_test, av_test], axis=1)

print(advancedtest.shape)

preds2 = advancedmodel.predict(advancedtest)
acc2=accuracy_score(test['Label_lead'], preds2)
print('Logic Regression 2 accuracy: ', acc2)

(377, 657)
Logic Regression 2 accuracy:  0.5172413793103449


In [59]:
#decision tree
advancedmodel = DecisionTreeClassifier(criterion = 'entropy',max_depth = 8, min_samples_leaf = 8, min_impurity_split = 0.5,random_state=123)
advancedmodel = advancedmodel.fit(advancedtrain, train["Label_lead"])
preds_dt = advancedmodel.predict(advancedtest)
acc_dt = accuracy_score(test['Label_lead'], preds_dt)
print('DT 1 accuracy: ', acc_dt)

DT 1 accuracy:  0.5092838196286472


In [55]:
results = pd.DataFrame(confusion_matrix(test['Label_lead'], preds_dt))
print(results)

    0    1
0  85  101
1  93   98


In [74]:
#random forest
advancedmodel = RandomForestClassifier(
    n_estimators = 20,
    criterion='entropy', 
    max_depth=8, 
    min_samples_leaf=8, 
    n_jobs=5, 
    random_state=13451)
advancedmodel = advancedmodel.fit(advancedtrain, train["Label_lead"])
preds_rf = advancedmodel.predict(advancedtest)
acc_rf = accuracy_score(test['Label_lead'], preds_rf)
print('RF 1 accuracy: ', acc_rf)

RF 1 accuracy:  0.5172413793103449


In [57]:
results = pd.DataFrame(confusion_matrix(test['Label_lead'], preds_rf))
print(results)

    0    1
0  28  158
1  24  167


# Baseline model

In our baseline model we have only one independent variable, which is equal to 1 if yesterday Dow Jones index price went up or stayed the same and is equal to 0 if Dow Jones Index went down. We can see that baseline model always predicts 1. Therefore, accuracy of logistic regression, decision tree and random forest models is the same and equals to 0.505.

In [64]:
data1 = pd.read_csv('/Users/nastyademina/Desktop/r studio/Combined_News_DJIA.csv')
train1 = data1[data1['Date'] < '2015-01-01']
test1 = data1[data1['Date'] > '2014-12-31']

In [67]:
test1['Label_lead'] = test1['Label'].shift(-1)
test1.drop(test1.tail(1).index,inplace=True)

train1['Label_lead'] = train1['Label'].shift(-1)
train1.drop(train1.tail(1).index,inplace=True)

x_train=pd.DataFrame(train1['Label'])
y_train=pd.DataFrame(train1['Label_lead'])

x_test=pd.DataFrame(test1['Label'])
y_test=pd.DataFrame(test1['Label_lead'])

In [76]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
print(accuracy_score(y_test, lr.predict(x_test)))
results = pd.DataFrame(confusion_matrix(y_test, lr.predict(x_test)))
print(results)

0.5053191489361702
   0    1
0  0  186
1  0  190


In [77]:
tree = DecisionTreeClassifier(criterion = 'entropy',max_depth = 8, min_samples_leaf = 8, min_impurity_split = 0.5, random_state=53)
tree.fit(x_train, y_train)
pred = tree.predict(x_test)
accuracy_score(pred, y_test)

results = pd.DataFrame(confusion_matrix(y_test, pred))
print(results)


   0    1
0  0  186
1  0  190


In [79]:
rf = RandomForestClassifier(
    n_estimators=20, 
    max_depth=8, 
    min_samples_leaf=8, 
    n_jobs=5, 
    random_state=123
)
rf.fit(x_train, y_train)
accuracy_score(y_test, rf.predict(x_test))

res = pd.DataFrame(confusion_matrix(x_test, rf.predict(x_test)))
print(res)

   0    1
0  0  186
1  0  190
