# Initial Downloads and Imports

In [0]:
# initial downloads
# Install (or upgrade to) the latest yfinance, bypassing pip's cache.
# NOTE(review): the version is not pinned, so a re-run may pick up a different
# yfinance release — consider pinning for reproducibility.
!pip install yfinance --upgrade --no-cache-dir

In [0]:
# initial imports
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

import yfinance as yf 
print(yf.__version__)

from datetime import datetime
from dateutil.parser import parse

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import gensim
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.simplefilter(action='ignore', category=FutureWarning)
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import sys
import csv

csv.field_size_limit(sys.maxsize)

import nltk
nltk.download('punkt')
from nltk import word_tokenize

In [0]:
# mount Google Drive so the CSV files under "My Drive" are reachable
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [0]:
# load the 2016 and 2017 news data frames
path = '/content/drive/My Drive/Colab Notebooks/csvData/Data2016.csv'
path2 = '/content/drive/My Drive/Colab Notebooks/csvData/Data2017.csv'

# Both files are read with the same options: Latin-1 decoding, the Python
# parsing engine, and malformed lines dropped instead of raising.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas in favour of
# `on_bad_lines='skip'` — confirm against the pandas version in use.
read_options = dict(encoding='ISO-8859-1', engine='python', error_bad_lines=False)
df, df2 = (pd.read_csv(csv_path, **read_options) for csv_path in (path, path2))

In [0]:
# stack the 2016 and 2017 frames into one frame with a fresh 0..n-1 index
# (re-running this cell alone appends df2 again, because df is overwritten)
df = pd.concat(objs=(df, df2), ignore_index=True)

# Data Cleaning, Pre-Processing and Data Set Creation

In [0]:
# cleaning method
import nltk
from nltk.corpus import stopwords

# fetch the stop-word corpus, then keep the English list as a set so that
# membership tests during filtering are fast
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_str(text):
  """
  Normalise raw article text into lower-case, space-separated tokens.

  Edited from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

  Steps, in order: strip and lower-case; replace mojibake; remove known site
  boilerplate sentences; replace every character that is not a letter, digit
  or one of ( ) , ! ? ' ` with a space; expand common English contractions;
  remove the remaining punctuation; collapse runs of whitespace.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned string with one space appended.
  """
  text = text.strip().lower()

  # NOTE(review): the first substitution consumes every "â", so the two longer
  # patterns after it can never match; whatever they were meant to remove is
  # blanked by the character whitelist further down. Order kept as-is.
  text = re.sub(r"â", "\'", text)
  text = re.sub(r"â€”", "", text)
  text = re.sub(r"â€", "", text)

  # Site boilerplate. The text is already lower-cased at this point, so the
  # phrases must be matched case-insensitively (they contain capitals), and
  # they are escaped so that "." only matches a literal full stop.
  boilerplate_phrases = (
    ", I want to receive updates from partners and sponsors.",
    "I want to receive updates from partners and sponsors.",
    "For us to continue writing great stories, we need to display ads.",
    "Please select the extension that is blocking ads.",
    "Please follow the steps below,",
    "This article is part of a feature we also send out via email as Politics",
  )
  for phrase in boilerplate_phrases:
    text = re.sub(re.escape(phrase), "", text, flags=re.IGNORECASE)

  # keep only letters, digits and the punctuation handled below
  text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", text)

  # expand contractions while the apostrophes are still present
  text = re.sub(r"\'ve", " have ", text)
  text = re.sub(r"n\'t", " not ", text)
  text = re.sub(r"\'re", " are ", text)
  text = re.sub(r"\'d", " would ", text)
  text = re.sub(r"\'ll", " will ", text)

  # commas and leftover apostrophes become separators; other punctuation is removed
  text = re.sub(r",", " ", text)
  text = re.sub(r"\'", " ", text)
  text = re.sub(r"\"", "", text)
  text = re.sub(r"!", "", text)
  text = re.sub(r"\(", "", text)
  text = re.sub(r"\)", "", text)
  text = re.sub(r"\?", "", text)

  text = re.sub(r"\s{2,}", " ", text)
  text = text + ' '
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# further cleaning
# Discard newsletter boilerplate articles and articles shorter than 40
# characters. A boolean mask filters all rows in one pass instead of dropping
# them one at a time while iterating over the frame.
NEWSLETTER_MARKER = 'This article is part of a feature we also send out via email'
MIN_CONTENT_CHARS = 40
keep_row = df['content'].apply(
  lambda body: NEWSLETTER_MARKER not in body and len(body) >= MIN_CONTENT_CHARS
).astype(bool)
df = df[keep_row].copy()

# dates arrive as day/month/year strings, possibly padded with whitespace
df['date'] = df['date'].apply(lambda raw: datetime.strptime(raw.strip(), '%d/%m/%Y'))


def _clean_without_stop_words(text):
  """Run clean_str on `text`, then drop stop words and re-join with single spaces."""
  return ' '.join(word for word in clean_str(text).split() if word not in stop_words)


# the split/join above also removes leading and trailing whitespace
df['content'] = df['content'].apply(_clean_without_stop_words)
df['title'] = df['title'].apply(_clean_without_stop_words)

In [0]:
# save news corpus data frame
# Written to the working directory first (the frame's index is included as
# the first CSV column), then copied into the Drive csvData folder. The
# relative `drive/...` path depends on the current working directory.
df.to_csv('title_content.csv')
!cp title_content.csv drive/My\ Drive/Colab\ Notebooks/csvData/

In [0]:
# concatenate news into one row per date
# The stop-word filtering leaves each title/content without surrounding
# whitespace, so plain string summation would fuse the last word of one
# article with the first word of the next. Join the articles with a space.
news_by_date = df.groupby('date')
result_df = news_by_date['title'].agg(' '.join).to_frame('Title')
result_df2 = news_by_date['content'].agg(' '.join).to_frame('Content')

# both frames share the same date index, so the assignment aligns row by row
result_df['Content'] = result_df2['Content']
result_df = result_df.reset_index().rename(columns={"date": "Date"})

In [0]:
# save compressed data frame
# One row per date with the 'Title' and 'Content' text; written locally
# (index included as the first CSV column) and then copied into the Drive
# csvData folder. The relative `drive/...` path depends on the current
# working directory.
result_df.to_csv('Text_DataSTOP.csv')
!cp Text_DataSTOP.csv drive/My\ Drive/Colab\ Notebooks/csvData

In [0]:
# load in sentiment data (one column of words per sentiment category)
sentiment_words_path = '/content/drive/My Drive/Colab Notebooks/csvData/LoughranMcDonald_SentimentWordLists_2018.csv'
LMcDSent = pd.read_csv(
  sentiment_words_path,
  encoding='ISO-8859-1',
  engine='python',
  error_bad_lines=False,
)
LMcDSent.head()


In [0]:
# convert each sentiment column to a plain Python list of words
negative = LMcDSent['Negative'].tolist()
positive = LMcDSent['Positive'].tolist()
uncertain = LMcDSent['Uncertainty'].tolist()
litigious = LMcDSent['Litigious'].tolist()
# NOTE(review): the column is spelled 'StrongModel' here while its sibling is
# 'WeakModal' — confirm this matches the CSV header.
strongmodal = LMcDSent['StrongModel'].tolist()
weakmodal = LMcDSent['WeakModal'].tolist()
constrain = LMcDSent['Constraining'].tolist()

In [0]:
# method to create sentiment scores
def sentiment_count(df, words):
  """
  Count sentiment-word occurrences in each row's 'Title' and 'Content'.

  Args:
    df: DataFrame with string columns 'Title' and 'Content'.
    words: sequence of strings. Scanning stops at the first entry that
      contains 'zzzzz' (presumably padding at the end of a word list —
      confirm against the CSV); that entry and everything after it is ignored.

  Returns:
    A list with one int per row, in row order: the total number of
    non-overlapping substring occurrences of the active words in the row's
    title plus its content. Matching is case-sensitive.
  """
  # The active word list does not depend on the row, so build it once.
  active_words = []
  for word in words:
    if 'zzzzz' in word:
      break
    active_words.append(word)

  # Read the text from the rows themselves rather than via `.iloc[index]`:
  # index labels are only valid positions for a default 0..n-1 index.
  return [
    sum(title.count(word) + content.count(word) for word in active_words)
    for title, content in zip(df['Title'], df['Content'])
  ]

In [0]:
# create sentiment scores for each sentiment category
# Each name is rebound from its word list to the per-date score list, so this
# cell cannot be re-run without first re-running the word-list cell.
negative, positive, uncertain, litigious, strongmodal, weakmodal, constrain = [
  sentiment_count(result_df, word_list)
  for word_list in (negative, positive, uncertain, litigious, strongmodal, weakmodal, constrain)
]

In [0]:
# create data frame to store all sentiment scores (one row per date, one column per category)
score_columns = [
  'Negative',
  'Positive',
  'Uncertainty',
  'Litigious',
  'Strong Modal',
  'Weak Modal',
  'Constraining',
]
zippedList = list(zip(negative, positive, uncertain, litigious, strongmodal, weakmodal, constrain))
LMcDSent = pd.DataFrame(data=zippedList, columns=score_columns)

In [0]:
# normalise sentiment scores: per-column min-max scaling
# (a column whose values are all equal has a zero span and becomes NaN)
score_min = LMcDSent.min()
score_span = LMcDSent.max() - score_min
LMcDSent = (LMcDSent - score_min) / score_span

In [0]:
# save sentiment scores
LMcDSent.to_csv('/content/drive/My Drive/Colab Notebooks/csvData/Emotion_Data.csv')
!cp Emotion_Data.csv drive/My\ Drive/Colab\ Notebooks/

In [0]:
# reload the saved data sets from Drive
# The paths point at the files the cells above write (csvData/Text_DataSTOP.csv,
# csvData/Emotion_Data.csv, csvData/title_content.csv) so that a fresh
# run-all finds them.
# NOTE(review): this cell previously read 'Colab Notebooks/Text_Data.csv' etc.
# If a separately prepared Text_Data.csv was intended, restore that path.
path = '/content/drive/My Drive/Colab Notebooks/csvData/Text_DataSTOP.csv'
path2 = '/content/drive/My Drive/Colab Notebooks/csvData/Emotion_Data.csv'
path3 = '/content/drive/My Drive/Colab Notebooks/csvData/title_content.csv'
final_text = pd.read_csv(path, encoding='ISO-8859-1', engine='python', error_bad_lines=False)
df = pd.read_csv(path2, encoding='ISO-8859-1', engine='python', error_bad_lines=False)
ungrouped_text = pd.read_csv(path3, encoding='ISO-8859-1', engine='python', error_bad_lines=False)

# 'Unnamed: 0' is the index column that to_csv wrote out; it carries no data
final_text.drop('Unnamed: 0', axis=1, inplace=True)
df.drop('Unnamed: 0', axis=1, inplace=True)
ungrouped_text.drop('Unnamed: 0', axis=1, inplace=True)

# The sentiment rows have no date of their own: they are assumed to be in the
# same row order as final_text, so the dates are copied across by position
# and then used as the join key.
df['Date'] = final_text['Date']
final_text = final_text.merge(df, how='inner', on='Date')

final_text.head()

In [0]:
# 'Unnamed: 0' has already been dropped from ungrouped_text in the previous
# cell, so a plain drop raises KeyError here; tolerate the missing label.
ungrouped_text.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')

# Finance Data Creation and Preprocessing

In [0]:
# load and prepare all financial data
djia_raw = yf.download("DJIA", start="2016-01-01", end="2017-06-30")
djia_raw['Volatility'] = djia_raw['High'] - djia_raw['Low']

# min-max normalise every column independently
col_min = djia_raw.min()
col_max = djia_raw.max()
djia_data = (djia_raw - col_min) / (col_max - col_min)

djia_data.reset_index(inplace=True)
djia_data['Direction'] = 0

# 'shift' holds the next row's Open and is compared with 'Close' when the
# labels are calculated. Close and Open are scaled with different min/max
# values, so comparing the two normalised columns is not equivalent to
# comparing the prices. Express the next raw Open on Close's scale instead,
# which makes `Close < shift` exactly the raw-price comparison.
next_open = djia_raw['Open'].shift(-1)
close_span = col_max['Close'] - col_min['Close']
# .values: djia_data now has a 0..n-1 index, so assign by position
djia_data['shift'] = ((next_open - col_min['Close']) / close_span).values

[*********************100%***********************]  1 of 1 downloaded


In [0]:
# calculate data labels
# Direction becomes 1 where the next row's Open ('shift') is above this row's
# Close; all other rows keep their current value. The last row has no next
# row: its 'shift' is NaN, the comparison is False and Direction is left
# untouched, so no hard-coded stop index is needed (the former
# `index is 375` check tested object identity rather than equality).
opens_higher = djia_data['Close'] < djia_data['shift']
djia_data.loc[opens_higher, 'Direction'] = 1

# the helper column is only needed for labelling
djia_data.drop('shift', inplace=True, axis=1)

In [0]:
# save finance data frame
# Written to the working directory first (index included as the first CSV
# column), then copied to the Colab Notebooks folder on Drive. The relative
# `drive/...` path depends on the current working directory.
djia_data.to_csv('DJIA_Data.csv')
!cp DJIA_Data.csv drive/My\ Drive/Colab\ Notebooks/