# Predicting Future Stock Prices Based on Current News
## or the effect of news sentiment on stock prices

In [1]:
from bs4 import BeautifulSoup
import httplib
import urllib2
import pandas as pd
import numpy as np

from datetime import date, timedelta, datetime
from time import sleep
import os.path
import re
import random
import matplotlib.pyplot as plt
%matplotlib inline

## Data
My capstone project studies the effect of news on stock prices. For that I need well-publicized stocks, and S&P 500 companies usually have the best news coverage. Since choosing a single stock can lead to bias and overfitting, I decided to use more than one, which raises the question of which stocks to choose. Each stock belongs to a particular industry, and news about one stock can affect its whole industry and vice versa, so I decided to study multiple stocks across multiple industries. With this approach we can even find out which industries are more sensitive to news.

There is a Wikipedia page listing all S&P 500 companies along with their information, so we start from there.

In [2]:
# Load the S&P 500 constituents table, scraping Wikipedia only when no
# local copy has been cached yet.
if os.path.exists('datasets/s_p_500.csv'):
    s_p_500 = pd.read_csv('datasets/s_p_500.csv', index_col=0)
else:
    wiki_tables = pd.read_html(
        'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
        attrs={"class": 'wikitable sortable'},
        header=0
    )
    s_p_500 = wiki_tables[0]
    s_p_500.to_csv('datasets/s_p_500.csv', encoding="utf-8")

# Make the column names attribute-friendly: trim them and turn spaces into
# underscores.
s_p_500.columns = [name.strip().replace(' ', '_') for name in s_p_500.columns]

# Remove '-' and then '.N' from every ticker symbol.
s_p_500['Ticker_symbol'] = s_p_500['Ticker_symbol'].apply(
    lambda ticker: ticker.replace('-', '').replace('.N', '')
)

Now we can identify all industries

In [3]:
# Distinct GICS sector names present in the constituents table.
industries = s_p_500['GICS_Sector'].unique()
industries

array(['Industrials', 'Health Care', 'Information Technology',
       'Consumer Discretionary', 'Utilities', 'Financials', 'Materials',
       'Consumer Staples', 'Real Estate', 'Energy',
       'Telecommunications Services'], dtype=object)

We are going to use the first 10 companies listed in each industry.

In [4]:
# Keep the first 10 rows of every sector and stack the per-sector slices
# into a single frame, one sector after another.
sector_samples = []
for industry in industries:
    in_sector = s_p_500['GICS_Sector'] == industry
    sector_samples.append(s_p_500[in_sector].head(10))
companies = pd.concat(sector_samples, axis=0)
companies.head()

Unnamed: 0,Ticker_symbol,Security,SEC_filings,GICS_Sector,GICS_Sub_Industry,Address_of_Headquarters,Date_first_added,CIK
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",,66740
5,AYI,Acuity Brands Inc,reports,Industrials,Electrical Components & Equipment,"Atlanta, Georgia",2016-05-03,1144215
15,ALK,Alaska Air Group Inc,reports,Industrials,Airlines,"Seattle, Washington",2016-05-13,766421
20,ALLE,Allegion,reports,Industrials,Building Products,"Dublin, Ireland",2013-12-02,1579241
28,AAL,American Airlines Group,reports,Industrials,Airlines,"Fort Worth, Texas",2015-03-23,6201


## News
For the news source I chose Reuters. Reuters is a well-known and prestigious news agency, and above all it has a great site which grants us access to historical company news. For a start I decided to get news from 2014 to the present; I may need to go further back later.

Reuters doesn't like `-` in symbols!!!

In [5]:
# Reuters symbols are used without dashes, so drop them from every ticker.
companies['Ticker_symbol'] = companies['Ticker_symbol'].apply(
    lambda ticker: ticker.replace('-', '')
)

The `get_news_for_symbol` function below goes through every day from `start_date` up to (but not including) `end_date` and grabs all the news for a symbol. It took me one day to get all the news!

In [6]:
def _fetch_soup(url):
    """Download `url` and parse it with lxml, always closing the HTTP response."""
    response = urllib2.urlopen(url)
    try:
        return BeautifulSoup(response, "lxml")
    finally:
        response.close()


def _first_span_text(soup, attrs):
    """Return the text of the first <span> matching `attrs`, or None if there is none."""
    node = soup.find('span', attrs)
    return node.text if node is not None else None


def get_article(href, text, date, symbol=None):
    """Download a single Reuters article and return it as a dict.

    Parameters
    ----------
    href : str
        Link target taken from the company-news listing; it is appended to
        "http://www.reuters.com/".
    text : str
        Link text, stored as the article title.
    date : datetime.date
        Day of the listing page the article was found on.
    symbol : str, optional
        Ticker the article belongs to. When omitted, the notebook-level
        variable `symbol` is used, which is what older callers relied on.

    Returns
    -------
    dict
        Keys 'Url', 'Title', 'Symbol', 'Date', 'Article' and 'Time'.
        'Article' (newlines replaced by spaces) and 'Time' are None when the
        page does not contain the corresponding <span>.

    Raises
    ------
    NameError
        If `symbol` is not passed and no notebook-level `symbol` exists.
    """
    if symbol is None:
        # Backward compatibility: the scraping loop used to communicate the
        # ticker through a global instead of an argument.
        if 'symbol' not in globals():
            raise NameError("get_article needs `symbol` (argument or notebook-level variable)")
        symbol = globals()['symbol']

    link = {}
    # NOTE(review): if href already starts with '/', this yields a double
    # slash; left as-is so stored URLs stay identical to earlier runs.
    link['Url'] = "http://www.reuters.com/"+href
    link['Title'] = text
    link['Symbol'] = symbol
    link['Date'] = date

    soup = _fetch_soup(link['Url'])
    # A page without the expected spans must not abort the whole scrape, so
    # missing pieces are recorded as None instead of raising IndexError.
    body = _first_span_text(soup, {"id": "article-text"})
    link['Article'] = body.replace('\n', ' ') if body is not None else None
    link['Time'] = _first_span_text(soup, {"class": "timestamp"})

    # Short random pause (0.1-0.5 s) between requests.
    sleep(random.randint(1,5)/10.0)
    return link


def symbol_news(url):
    """Fetch one company-news listing page and download every linked article.

    Parameters
    ----------
    url : str
        Listing URL. It must contain the day as eight digits (MMDDYYYY); a
        `symbol=` query parameter, when present, is forwarded to
        `get_article`.

    Returns
    -------
    list of dict
        One `get_article` result per headline link found in the first two
        "companyNews" sections of the page (fewer if the page has fewer).
    """
    # The day and the ticker depend only on the URL, so parse them once.
    day = datetime.strptime(re.search('[0-9]{8}', url).group(0), '%m%d%Y').date()
    symbol_match = re.search(r'[?&]symbol=([^&]*)', url)
    symbol = symbol_match.group(1) if symbol_match else None

    links = []
    sections = _fetch_soup(url).find_all('div', {"id": "companyNews"})
    # Slicing instead of indexing 0 and 1 directly: pages with fewer than two
    # sections simply contribute fewer headlines rather than raising.
    for section in sections[:2]:
        for feature in section.find_all('h2'):
            a = feature.find('a')
            if a is not None and a.has_attr('href'):
                links.append(get_article(a['href'], a.text, day, symbol))
    sleep(random.randint(1,5)/10.0)
    return links

def _record_failed_url(url, symbol, path='datasets/companies/failed_urls.csv'):
    """Append one failed listing URL and its symbol to the failure log at `path`."""
    folder = os.path.dirname(path)
    if folder and not os.path.isdir(folder):
        os.makedirs(folder)
    # The URL column is named '0' (a string) because that is the header the
    # log has always been written with and what read_csv returns for it;
    # using the same label keeps old and new rows in one column.
    failed = pd.DataFrame({'0': [url], 'Symbol': [symbol]}, columns=['0', 'Symbol'])
    if os.path.exists(path):
        previous = pd.read_csv(path, index_col=0)
        # Stack rows (axis=0); axis=1 would glue new entries on as columns.
        failed = pd.concat([previous, failed], axis=0, ignore_index=True)
    failed.to_csv(path, encoding='utf8')


def get_news_for_symbol(symbol, start_date = date(2014, 1, 1), end_date = date(2014, 2, 1)):
    """Collect Reuters company news for `symbol`, one listing page per day.

    Parameters
    ----------
    symbol : str
        Ticker inserted into the Reuters company-news URL.
    start_date : datetime.date
        First day to fetch (inclusive).
    end_date : datetime.date
        Day to stop at (exclusive); an empty or negative range fetches nothing.

    Returns
    -------
    pandas.DataFrame
        One row per article returned by `symbol_news`; empty when nothing was
        fetched.

    Notes
    -----
    Days whose download raises URLError, IOError or HTTPException are printed
    and appended to 'datasets/companies/failed_urls.csv'; the scrape then
    continues with the next day.
    """
    links = []
    for offset in range((end_date - start_date).days):
        day = start_date + timedelta(offset)
        url = "http://www.reuters.com/finance/stocks/companyNews?symbol={}&date={}".format(symbol,day.strftime('%m%d%Y'))
        try:
            links += symbol_news(url)
        except (urllib2.URLError, IOError, httplib.HTTPException) as e:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print(e.args)
            print(url)
            _record_failed_url(url, symbol)
    return pd.DataFrame(links)

In [7]:
# companies = companies.loc[companies.Ticker_symbol == 'MMM']

In [8]:
# Scrape news for every company in the full S&P 500 table rather than only
# the per-sector sample held in `companies`.
c = s_p_500

In [9]:
# Build one DataFrame of articles per ticker, scraping only when no cached
# CSV exists for that symbol.
# NOTE: keep the loop variable named `symbol`; get_article may read the
# notebook-level `symbol` to tag each article.
articles = []
for symbol in c.Ticker_symbol:
    print(symbol)
    cache_path = 'datasets/companies/{}.csv'.format(symbol)
    if os.path.exists(cache_path):
        article = pd.read_csv(cache_path, encoding='utf8', index_col=0)
    else:
        article = get_news_for_symbol(symbol, end_date = date.today())
        article.to_csv(cache_path, encoding='utf8')
    articles.append(article)

# Stack the per-symbol frames row-wise into a single table; without this
# `articles` is still a list and cannot be indexed by column name.
articles = pd.concat(articles, axis=0, ignore_index=True)
articles['Date'] = pd.to_datetime(articles['Date'], format='%Y-%m-%d')

MMM
ABT
ABBV
ACN
ATVI
AYI
ADBE
AAP
AES
AET
AFL
AMG
A
APD
AKAM
ALK
ALB
AGN
LNT
ALXN
ALLE
ADS
ALL
GOOGL
GOOG
MO
AMZN
AEE
AAL
AEP
AXP
AIG
AMT
AWK
AMP
ABC
AME
AMGN
APH
APC
ADI
ANTM
AON
APA
AIV
AAPL
AMAT
ADM
ARNC
AJG
AIZ
T
ADSK
ADP
AN
AZO
AVB
AVY
BHI
BLL
BAC
BK
BCR
BAX
BBT
BDX
BBBY
BRKB
BBY
BIIB
BLK
HRB
BA
BWA
BXP
BSX
BMY
AVGO
BFB
CHRW
CA
COG
CPB
COF
CAH
HSIC
KMX
CCL
CAT
CBG
CBS
CELG
CNC
CNP
CTL
CERN
CF
SCHW
CHTR
CHK
CVX
CMG
CB
CHD
CI
XEC
CINF
CTAS
CSCO
C
CFG
CTXS
CLX
CME
CMS
COH
KO
CTSH
CL
CMCSA
CMA
CAG
CXO
COP
ED
STZ
GLW
COST
COTY
CCI
CSRA
CSX
CMI
CVS
DHI
DHR
DRI
DVA
DE
DLPH
DAL
XRAY
DVN
DLR
DFS
DISCA
DISCK
DG
DLTR
D
DOV
DOW
DPS
DTE
DD
DUK
DNB
ETFC
EMN
ETN
EBAY
ECL
EIX
EW
EA
EMR
ENDP
ETR
EOG
EQT
EFX
EQIX
EQR
ESS
EL
ES
EXC
EXPE
EXPD
ESRX
EXR
XOM
FFIV
FB
FAST
FRT
FDX
FIS
FITB
FSLR
FE
FISV
FLIR
FLS
FLR
FMC
FTI
FL
F
FTV
FBHS
BEN
FCX
FTR
GPS
GRMN
GD
GE
GGP
GIS
GM
GPC
GILD
GPN
GS
GT
GWW
HAL
HBI
HOG
HAR
HRS
HIG
HAS
HCA
HCP
HP
HES
HPE
HOLX
HD
HON
HRL
HST
HPQ
HUM
HBAN
ITW
ILMN
IR
INTC
ICE
IBM
IP
I

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the collected articles.
articles.head()

There is plenty of redundancy in the articles, and I have yet to decide what to do with it.

# Stock quotes
After getting the news data we are going to need quote data. Getting historical data can become really tricky. We can use Yahoo Finance to get historical daily quotes, and for this project I think that's enough; otherwise we would have to use some proprietary data.

In [None]:
def get_quotes(symbol, start_date='1/1/2014', end_date=None):
    """Download daily quotes for `symbol` from Yahoo and tag them with the ticker.

    Parameters
    ----------
    symbol : str
        Ticker passed to the data reader.
    start_date : str
        First day to request, formatted as month/day/year.
    end_date : str, optional
        Last day to request, formatted as month/day/year. Defaults to today,
        evaluated at call time so a long-running kernel does not keep using
        the date on which this cell was executed.

    Returns
    -------
    The object returned by `web.DataReader`, with an added 'Symbol' column
    holding `symbol`.
    """
    if end_date is None:
        end_date = date.today().strftime('%m/%d/%Y')
    # NOTE(review): `web` is not imported in the visible cells; presumably
    # `pandas_datareader.data` — confirm and add it to the import cell.
    ret = web.DataReader(symbol, data_source='yahoo', start=start_date, end=end_date)
    ret['Symbol'] = symbol
    return ret

In [None]:
# Daily quotes for the sampled companies, downloaded once and then served
# from a local CSV. The same path is used for the existence check, the write
# and the read, so the cache is actually hit on later runs.
quotes_path = 'datasets/daily_quotes.csv'
if not os.path.exists(quotes_path):
    quotes = pd.concat([get_quotes(symbol) for symbol in companies.Ticker_symbol], axis=0)
    # assumes the frames returned by get_quotes are indexed by a 'Date'
    # index — TODO confirm. Turning it into a column makes the fresh
    # download look exactly like what read_csv returns below.
    quotes = quotes.reset_index()
    quotes.to_csv(quotes_path, encoding = "utf-8", index=False)
else:
    quotes = pd.read_csv(quotes_path)

quotes['Date'] = pd.to_datetime(quotes['Date'], format='%Y-%m-%d')

quotes.head()

## Plotting
### Quotes through time
Here are the stock prices for the different symbols from 2014 until last Friday.

In [None]:
# One figure per ticker showing its opening price over time.
# The ticker column is 'Symbol' (capitalised), as set by get_quotes.
for symbol in quotes['Symbol'].unique():
    symbol_quotes = quotes[quotes['Symbol'] == symbol]
    plt.figure(1, figsize=(30,5))
    plt.subplot(222)
    plt.plot('Date', 'Open', data=symbol_quotes)
    plt.yscale('linear')
    plt.title(symbol)
    plt.grid(True)
    plt.show()

### News through time
As we can see, not all companies are news friendly!

In [None]:
# One figure per ticker showing how many articles were collected per day.
# The scraped news lives in `articles`, whose columns are capitalised
# ('Symbol', 'Date', 'Article') as produced by get_article.
for symbol in articles['Symbol'].unique():
    symbol_articles = articles[articles['Symbol'] == symbol]
    daily_counts = pd.pivot_table(
        symbol_articles, index='Date', values='Article', aggfunc=np.count_nonzero
    ).reset_index()

    plt.figure(1, figsize=(30,5))
    plt.subplot(222)
    plt.plot('Date', 'Article', data=daily_counts)
    plt.yscale('linear')
    plt.title(symbol)
    plt.grid(True)
    plt.show()