In [None]:
from __future__ import division
from pandas_datareader import data as pddr
import pandas as pd
import datetime
import requests
from bs4 import BeautifulSoup
import numpy as np

%matplotlib inline

# Cap how many rows pandas prints when displaying a frame, to keep notebook output short.
pd.options.display.max_rows = 10

def get_sp_tickers():
    '''Scrape the list of S&P 500 companies and ticker symbols from Wikipedia.

    Downloads the "List of S&P 500 companies" page, takes the first table
    whose CSS class matches "sortable" or "plainrowheaders", and turns it into
    a DataFrame: the <th> cells of the first row become the column names and
    the <td> cells of every following row become one record.

    Returns:
        pandas.DataFrame with one row per table row after the header. The
        'Ticker symbol' column is cast to str.

    Raises:
        requests.HTTPError: if Wikipedia answers with an error status.
        IndexError: if no matching table is found on the page.
        KeyError: if the table has no 'Ticker symbol' column.
            NOTE(review): the column name depends on the live page layout -
            confirm it still matches.
    '''

    WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

    # A timeout keeps the notebook from hanging forever on a stalled connection,
    # and raise_for_status() stops us from parsing an error page as if it were data.
    req = requests.get(WIKI_URL, timeout=30)
    req.raise_for_status()

    soup = BeautifulSoup(req.content, 'lxml')
    table_classes = {"class": ["sortable", "plainrowheaders"]}
    wikitables = soup.find_all("table", table_classes)

    rows = wikitables[0].find_all('tr')

    # Strip surrounding whitespace: cell text commonly carries a trailing
    # newline, which would otherwise end up in column names and ticker symbols.
    headers = [cell.text.strip() for cell in rows[0].find_all('th')]

    # Build a real list (not a lazy map object) so the DataFrame constructor
    # behaves the same on Python 2 and Python 3.
    table_data = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in rows[1:]
    ]

    sp = pd.DataFrame(table_data, columns=headers)
    sp['Ticker symbol'] = sp['Ticker symbol'].astype(str)
    return sp

# Scrape the constituents table once and keep it at module level.
sp_df = get_sp_tickers()

def batch_data_pull(tickers, start_date, end_date, batch_size = 200):
    '''Fetch Yahoo stock data for a list of ticker symbols in several batches.

    The symbols are split into roughly equal groups of at most ``batch_size``
    symbols. Each group is requested through ``pddr.DataReader`` with the
    'yahoo' data source, and the per-batch results are concatenated along
    axis 2.

    Args:
        tickers: sized, list-like collection of ticker symbols (for example a
            list or a pandas Series).
        start_date: start of the date range, passed unchanged to DataReader.
        end_date: end of the date range, passed unchanged to DataReader.
        batch_size: maximum number of symbols per request (default 200).

    Returns:
        The result of ``pd.concat(batch_results, axis=2)``.
        NOTE(review): axis=2 presumes DataReader returns 3-D (Panel-like)
        objects with symbols on the last axis - confirm against the installed
        pandas / pandas_datareader versions.

    Raises:
        ValueError: if ``tickers`` is empty or ``batch_size`` is not positive.
    '''

    n_tickers = len(tickers)
    if n_tickers == 0:
        raise ValueError('tickers must contain at least one symbol')
    if batch_size < 1:
        raise ValueError('batch_size must be a positive number')

    # Round up, not to nearest: rounding down (e.g. 450 symbols / 200 -> 2)
    # would produce batches larger than batch_size. float() keeps this a true
    # division even without `from __future__ import division`.
    batches = int(np.ceil(n_tickers / float(batch_size)))
    ticker_batches = np.array_split(tickers, batches)
    data_source = 'yahoo'

    # Use pandas_datareader's DataReader to load each batch.
    raw_data = [
        pddr.DataReader(ticker_batch, data_source, start_date, end_date)
        for ticker_batch in ticker_batches
    ]

    # Concatenate once, after every batch has been fetched, instead of
    # rebuilding the combined object on each loop iteration.
    return pd.concat(raw_data, axis=2)

# Pull data for every scraped ticker symbol from 2017-06-01 to 2017-10-01, using the default batch size.
stock_data = batch_data_pull(sp_df['Ticker symbol'], '2017-06-01', '2017-10-01')

In [None]:
# Effect of after-hours trading: percentage difference between 'Open' and the
# 'Close' values shifted by -1 along the row index.
# Observed: roughly normal, mean centered on +0.04% - almost no effect, but long tailed both ways.
# NOTE(review): shift(-1) aligns each row with the *following* row's close; whether that
# is the previous trading session depends on the row order of stock_data - confirm.
shifted_close = stock_data['Close'].shift(-1)
open_minus_close = stock_data.loc['Open'] - shifted_close
ah_delta = open_minus_close / shifted_close * 100

# When the population is narrowed to the days after 5% drops, the distribution is
# pretty skewed - prices tend to drop a little bit more, with a heavy tail towards
# dropping a lot.
stacked_delta = ah_delta.stack()
#stacked_delta.loc[high_stack.set_index(['Date', 'ticker']).index].hist(bins=20)