In [3]:
import requests      
from datetime import datetime
from bs4 import BeautifulSoup  
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import re, csv, string 
import gensim
import pandas as pd
from gensim import corpora, models
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np


#------------------------------------------------HELPER FUNCTIONS---------------------------------------------------------
## Separates headlines out from 'all_data' variable so it can be written to separate columns in csv
def get_headline_list(filtered_data, index):
    headlines_list = []
    for row in filtered_data:
        headlines_list.append(row[index])
    return headlines_list

## Separate dates out from 'all_data' variable so it can be written to separate columns in csv
def get_date_list(filtered_data, index):
    date_list = []
    for row in filtered_data:
        date_list.append(row[index])
    return date_list

## Handle cleaning of one specific date format
def tokenize_date(s):
    pattern=r'[a-zA-Z{3}]+[.\s]+[\d{1,2}\,\s]+[\d{4}]+'                        
    tokens=nltk.regexp_tokenize(s, pattern)
    date = tokens
    return date


#--------------------------------------------------WEB SCRAPERS---------------------------------------------------------
def get_bloomberg_headlines():
    headlines=[]  # list variable to store headlines
    dates = []    # list variable to store dates
    raw_headlines = [] 
    page_number = 1
    page_url="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2018-04-27T17:35:17.135Z&page="+str(page_number)
    # loop until page 80
    while page_url!="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2018-04-27T17:35:17.135Z&page=80":     
        page_url="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2018-04-27T17:35:17.135Z&page="+str(page_number)
        page = requests.get(page_url) 
        page_number += 1
        if page_number % 20 == 0:
            print('scraped %s of 80 pages' % page_number)
        if page.status_code!=200:  
            page_number = 80 #if page status code fails to equal 200, connection failed; set page_num to while loop condition
        else:                   
            soup = BeautifulSoup(page.content, 'html.parser')                        
            
#-----------scrape and clean all headlines and append to a list called headlines----------------------------------------------
            for header in soup.find_all('h1', class_ ='search-result-story__headline'):
                headline = header.get_text().lower()
                raw_headlines.append(headline)
                
#-----------scrape all dates and append to a list called dates---------------------------------------------------------------
            for date in soup.find_all('time', class_ = 'published-at'):
                date_published = date.get_text()
                date_published = date_published.lstrip()
                dates.append(date_published)
                 
#---join headlines list and dates list into a list of tuples called raw_data-----------------------------------------------------
    raw_data = zip(raw_headlines, dates)
    
    return raw_data

def marketwatch_parser():
    headlines=[]
    page_url= "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=500&mp=806&bd=false&bdv=&rs=false"
    while page_url!=None:
        page = requests.get(page_url) 
        if page.status_code!=200:
             page_url=None
        else:
            all_data = []
            all_data_raw = []
            page_url = "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=500&mp=806&bd=false&bdv=&rs=false"
            page = requests.get(page_url)
            soup = BeautifulSoup(page.content, "html.parser")
            divs = soup.find_all("div", class_ = "searchresult")
            titles_dictionary = []
            raw_headlines = []
            for idx, div in enumerate(divs):
                titles = div.select("a")
                if titles != []:
                    title = titles[0].get_text()
                    raw_headlines.append(title)
            divs_dates = soup.find_all("div", class_ = "deemphasized")
            dates_list = []
            for idx, div in enumerate(divs_dates):
                date = div.get_text().lower()
                date = str(tokenize_date(date)[0])
                date = datetime.strptime(date.replace("sept.","sep.").replace("july", "jul.").replace("june", "jun.").replace("may", "may.").replace("march", "mar.").replace("april", "apr."), '%b. %d, %Y')
                date = datetime.strftime(date,'%b %d, %Y')
                dates_list.append(date)
            raw_data = zip(raw_headlines, dates_list)
        page_url = None
    return raw_data

def get_wsj_headlines():
    f = open('wsj_data.txt', 'r',encoding='latin-1')
    lines = [line for line in f]  
    headline_info = []
    raw_headlines = []
    date_info = []
    left_side_info = []
    right_side_info = []
    
    for i in lines: 
        temp = i
        # filter out rows without keywords 'Wall Street Journal' and filter out rows with 'regulation newsletter'
        if ( temp.find("Wall Street Journal") !=-1 and temp.find("financial regulation newsletter") == -1):
            wsj_data = temp 
            date = re.findall(r'\d{4}[,]\s[a-zA-Z]+\s[\d]+', wsj_data)                              
            date_str = date[0]                                                # get the date as string format
            date_str = datetime.strptime(date_str, '%Y, %b %d')
            date_str = datetime.strftime(date_str,'%b %d, %Y')
            date_info.append(date_str)                                        # append the date into a list
            idx_r_p = wsj_data.find(")")                                      # get the index of right parenthesis
            idx_l_p = wsj_data.find("(")                                      # get the index of left parenthesis
            left_side_date = wsj_data[0:idx_l_p]                              # get the lefe part of date in each row
            left_side_info.append(left_side_date)                             # append it into a list 
            right_side_date = wsj_data[idx_r_p+3::]                           # get the right side of date in each row
            right_side_info.append(right_side_date)                           # append it into a list 
            idx_wsj = right_side_date.find("Wall Street Journal")             # get the index of 'Wall Street Journal' of right side of date

            if (idx_wsj == 0):
#               if 'Wall Street Journal' is at the beginning of right side of date, get the headline from the left side of date
                headline = left_side_date
                t_headline = headline.lower()
                raw_headlines.append(t_headline)
                
            else:
#          if 'Wall Street Journal' is not at the beginning of right side of date, get the headline between right parenthesis and Wall Street Journal        
                headline = right_side_date[0:idx_wsj]
                t_headline = headline.lower()     
                raw_headlines.append(t_headline)
            
            raw_data = zip(raw_headlines, date_info)
            
    return raw_data



if __name__ == "__main__":
    print("This web scraper takes about 10 minutes to run")
    print("scraping bloomberg data...")
    bloomberg_raw = get_bloomberg_headlines()
    print("scraping marketwatch data...")
    marketwatch_raw = marketwatch_parser()
    print("scraping WSJ data from txt file...")
    wsj_raw = get_wsj_headlineswsj_raw = get_wsj_headlines()
    print("done scraping all data - stored in list 'all_data_raw' ")

    # All raw data with headlines that don't include keywords filtered out
    all_data_raw = []
    all_data_raw.extend(list(bloomberg_raw) + list(marketwatch_raw) + list(wsj_raw))
    print(all_data_raw)

This web scraper takes about 10 minutes to run
scraping bloomberg data...
scraped 20 of 80 pages
scraped 40 of 80 pages
scraped 60 of 80 pages
scraped 80 of 80 pages
scraping marketwatch data...
scraping WSJ data from txt file...
done scraping all data - stored in list 'all_data_raw' 


In [4]:
df = pd.DataFrame(all_data_raw)
df

Unnamed: 0,0,1
0,blockchains warrant skepticism but keep an op...,"Apr 27, 2018"
1,"bitcoin: the past, present and future","Apr 26, 2018"
2,digital-banking startup revolut is worth $1.7...,"Apr 26, 2018"
3,pantera's morehead says bitcoin 'is a screami...,"Apr 26, 2018"
4,"pantera ceo says bitcoin is ‘screaming buy,’ ...","Apr 26, 2018"
5,crypto world gains even more options: five ne...,"Apr 26, 2018"
6,samsung's winning big from a boom in the bori...,"Apr 25, 2018"
7,struggling russians turn to day-trading to pa...,"Apr 25, 2018"
8,central bankers can’t agree on cryptocurrencies,"Apr 25, 2018"
9,a software bug in ethereum tokens is helping ...,"Apr 25, 2018"


In [5]:
df.to_csv('web scrape.csv')
pd.read_csv('web scrape.csv')

Unnamed: 0.1,Unnamed: 0,0,1
0,0,blockchains warrant skepticism but keep an op...,"Apr 27, 2018"
1,1,"bitcoin: the past, present and future","Apr 26, 2018"
2,2,digital-banking startup revolut is worth $1.7...,"Apr 26, 2018"
3,3,pantera's morehead says bitcoin 'is a screami...,"Apr 26, 2018"
4,4,"pantera ceo says bitcoin is ‘screaming buy,’ ...","Apr 26, 2018"
5,5,crypto world gains even more options: five ne...,"Apr 26, 2018"
6,6,samsung's winning big from a boom in the bori...,"Apr 25, 2018"
7,7,struggling russians turn to day-trading to pa...,"Apr 25, 2018"
8,8,central bankers can’t agree on cryptocurrencies,"Apr 25, 2018"
9,9,a software bug in ethereum tokens is helping ...,"Apr 25, 2018"


In [6]:
# f = open('web scrape', encoding="latin-1")

In [8]:
# Initialize Sentiment Analyzer 
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [9]:
data_with_sentiment_df=pd.DataFrame(data_with_sentiment)
data_with_sentiment_df

NameError: name 'data_with_sentiment' is not defined

In [None]:
data_with_sentiment_df.to_csv('data_with_sentiment.csv')

In [None]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import plotly.offline as py
import plotly.graph_objs as go
import numpy as np
import seaborn as sns
py.init_notebook_mode(connected=True)
%matplotlib inline

In [None]:
data1 = pd.read_csv("btc price.csv",encoding='latin-1')
data2 = pd.read_csv("data_with_sentiment.csv",encoding='latin-1')

In [None]:
data1.info()

In [None]:
data2.info()

In [None]:
data1

In [None]:
len(data2)

In [None]:
data = pd.merge(data1,data2, on='date', how='inner')

In [None]:
data.info()

In [None]:
data

In [None]:
data['date'] = pd.to_datetime(data['date'].apply(str))
#data = data.sort_values(by='date')
data=data.drop(['Unnamed: 0'], axis=1)

In [None]:
btc_trace = go.Scatter(x=data['date'], y=data['price(USD)'], name= 'Price')
py.iplot([btc_trace])

In [None]:
data['price(USD)'].replace(0, np.nan, inplace=True)
data['price(USD)'].fillna(method='ffill', inplace=True)

In [None]:
btc_trace = go.Scatter(x=data['date'], y=data['price(USD)'], name= 'Price')
py.iplot([btc_trace])

In [None]:
from sklearn.preprocessing import MinMaxScaler
values = data['price(USD)'].values.reshape(-1,1)
sentiment = data['sentiment'].values.reshape(-1,1)
values = values.astype('float32')
sentiment = values.astype('float32')
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

In [None]:
train_size = int(len(scaled) * 0.7)
test_size = len(scaled) - train_size
train, test = scaled[0:train_size,:], scaled[train_size:len(scaled),:]
print(len(train), len(test))
split = train_size

In [None]:
def create_dataset(dataset, look_back, sentiment):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        a = dataset[i:(i + look_back), 0]
        np.append(a,sentiment[i])
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    print(len(dataY))
    return np.array(dataX), np.array(dataY)

In [None]:
look_back = 1
trainX, trainY = create_dataset(train, look_back, sentiment[0:train_size])
testX, testY = create_dataset(test, look_back, sentiment[train_size:len(scaled)])

In [None]:
trainX.shape

In [None]:
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

In [None]:
model = Sequential()
model.add(LSTM(100, input_shape=(trainX.shape[1], trainX.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')
history = model.fit(trainX, trainY, epochs=900, batch_size=500, validation_data=(testX, testY), verbose=0, shuffle=False)

In [None]:
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
yhat = model.predict(testX)
pyplot.plot(yhat, label='predict')
pyplot.plot(testY, label='true')
pyplot.legend()
pyplot.show()

In [None]:
yhat_inverse = scaler.inverse_transform(yhat.reshape(-1, 1))
testY_inverse = scaler.inverse_transform(testY.reshape(-1, 1))

In [None]:
rmse = sqrt(mean_squared_error(testY_inverse, yhat_inverse))
print('Test RMSE: %.3f' % rmse)

In [None]:
pyplot.plot(yhat_inverse, label='predict')
pyplot.plot(testY_inverse, label='actual', alpha=0.5)
pyplot.legend()
pyplot.show()

In [None]:
predictDates = data.tail(len(testX)).date

In [None]:
testY_reshape = testY_inverse.reshape(len(testY_inverse))
yhat_reshape = yhat_inverse.reshape(len(yhat_inverse))

In [None]:
actual_chart = go.Scatter(x=predictDates, y=testY_reshape, name= 'Actual Price')
predict_chart = go.Scatter(x=predictDates, y=yhat_reshape, name= 'Predict Price')
py.iplot([predict_chart, actual_chart])

In [None]:
sns.heatmap(data.corr(), annot=True, cmap='RdYlGn', linewidths=0.1, vmin=0)