In [116]:
# Importing necessary models
import warnings
warnings.filterwarnings('ignore')

import smtplib
import pandas as pd
import numpy as np
import datetime as dt
import pandas.stats.moments as st
import time
%matplotlib inline
from bs4 import BeautifulSoup as bs
import requests
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sqlalchemy import *
from sqlalchemy import create_engine
import calendar
import sqlite3 as sql
pd.options.display.float_format = '{:,.4f}'.format

init_notebook_mode(connected=True)

import os
main_dir = os.getcwd()

dbs_dir = 'C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Data\\DBs'

os.chdir('C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Trading\\Modules\\DataCollection')

from yahoo_query import *

dbs_dir = 'D:\\Price Data'

os.chdir(dbs_dir)

# price_engine = create_engine('sqlite:///histprices.db', echo=False)

# inspector = inspect(price_engine)


## Getting SPX Constituents and Historical Compositions

In [2]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
soup = bs(requests.get(wiki_url,'lxml').text,'lxml')
sp_const = soup.select('table[class="wikitable sortable"]')[0]
changes = soup.select('table[class="wikitable sortable"]')[1]

sp_dict = {}
sp_headers = []
for row in sp_const.find_all('th'):
    sp_headers.append(row.text.strip())
    sp_dict[row.text.strip()] = []
    
for row in sp_const.find_all('tr'):
    for i, col in enumerate(row.find_all('td')):
        col_header = sp_headers[i]
        curr_val = col.text.strip()
        if 'Date' in col_header:
            try:
                curr_val = dt.datetime.strptime(curr_val, '%Y-%m-%d')
            except:
                curr_val = np.nan
        sp_dict[col_header].append(curr_val)

sp_const = pd.DataFrame(sp_dict).set_index('Symbol')
sp_const = sp_const[['Security','Date first added','Founded','GICS Sector','GICS Sub Industry']]

changes_headers = ['Date','Ticker_Added','Security_Added',
                   'Ticker_Removed']

changes_dict = {k:[] for k in changes_headers}


rowspan = 0
for row in changes.find_all('tr'):
       
    for i, col in enumerate(row.find_all('td')[:-1]):
        curr_val = col.text.strip()
        if len(row) == 12:
            if i == 0:
                curr_val = dt.datetime.strptime(curr_val, '%B %d, %Y')
                curr_date = curr_val
            if i == 4:
                continue
            changes_dict[changes_headers[i]].append(curr_val)
            
        else:
            if i == 0:
                changes_dict[changes_headers[i]].append(curr_date)
                changes_dict[changes_headers[1]].append(curr_val)
            elif i == 3:
                continue
            else:
                changes_dict[changes_headers[i + 1]].append(curr_val)
changes = pd.DataFrame(changes_dict)    

sp_500_hist = {dt.datetime.today().date():sp_const}

curr_const = sp_const.copy()

for change_date in changes.Date.drop_duplicates():
    curr_date = change_date.date()
    curr_changes = changes[changes.Date == change_date]
    curr_drops = curr_changes.Ticker_Added.tolist()
    curr_adds = curr_changes.Ticker_Removed.tolist()
    
    curr_adds = list(filter(lambda x: x != '', curr_adds))
    curr_drops = list(filter(lambda x: x != '', curr_drops))
    
    curr_rows = pd.DataFrame({'Symbol': curr_adds}).set_index('Symbol')
    
    curr_const = curr_const[curr_const.index.isin(curr_drops) == False]
    curr_const = pd.concat([curr_const, curr_rows], axis = 0)
    
    sp_500_hist[curr_date] = curr_const
    
    
all_sp500_names = pd.concat(list(sp_500_hist.values())).reset_index()[['Symbol','Security']].drop_duplicates()

In [58]:
prices_dfs = []
profiles_dfs = []

failed_prices = []
failed_profiles = []

start_time = time.time()

start_date = dt.datetime(2000,1,1)
for ticker in all_sp500_names.Symbol:
    curr_stock = yahoo_query(ticker.replace('.','-'), start_date)
    try:
        curr_stock.hist_prices_query()
        curr_prices = curr_stock.hist_prices[['{}_adjclose'.format(ticker)]]
        curr_prices.columns = [ticker]
        prices_dfs.append(curr_prices)
    except:
        failed_prices.append(ticker)
        continue
    
    try:
        curr_stock.full_info_query()
        curr_profile = curr_stock.profile
        profiles_dfs.append(curr_profile)
    except:
        failed_profiles.append(ticker)
        continue
    
end_time = time.time()
print("Completed in {} seconds.".format(end_time - start_time))

Completed in 1081.4660942554474 seconds.


In [113]:
cleaned_dfs = [df.reset_index().drop_duplicates().set_index('index') for df in prices_dfs]

sp_prices = pd.concat(cleaned_dfs, axis = 1)

sp_profiles = pd.concat(profiles_dfs, axis = 0).reset_index().drop_duplicates().set_index('index')

In [128]:
sp_const_engine = create_engine('sqlite:///sp500cons.db', echo = False)
sp_prices.to_sql('histprices', con = sp_const_engine, 
                 if_exists='replace', index_label = 'Date')
sp_profiles.to_sql('profiles', con = sp_const_engine, 
                   if_exists='replace', index_label = 'Symbol')

spx_const_dfs_lst = []

for k,v in sp_500_hist.items():
    curr_spx_df = v[['Security','Date first added']]
    curr_spx_df.columns = ['Name','Added_Date']
    curr_spx_df['Latest_Date'] = k
    spx_const_dfs_lst.append(curr_spx_df)
    
pd.concat(spx_const_dfs_lst, axis = 0).to_sql('histComponents', con = sp_const_engine, 
                                              if_exists='replace', index_label = 'Symbol')

In [136]:
failed_prices

['BRK.B',
 'BF.B',
 'NFX',
 'SCG',
 'ESRX',
 'COL',
 'AET',
 'CA',
 'EVHC',
 'ANDV',
 'XL',
 'GGP',
 'DPS',
 'TWX',
 'MON',
 'WYN',
 'CSRA',
 'SNI',
 'BCR',
 'LVLT',
 'SPLS',
 'DD',
 'WFM',
 'RAI',
 'YHOO',
 'MJN',
 'DNB',
 'HAR',
 'LLTC',
 'STJ',
 'HOT',
 'EMC',
 'CPGX',
 'GAS',
 'TE',
 'CVC',
 'BXLT',
 'CCE',
 'ARG',
 'TWC',
 'SNDK',
 'CAM',
 'POM',
 'GMCR',
 'PCL',
 'PCP',
 'BRCM',
 'CMCSK',
 'CSC',
 'SIAL',
 'HCBK',
 'JOY',
 'HSP',
 'FDO',
 'KRFT',
 'TEG',
 'LO',
 'WIN',
 'CFN',
 'PETM',
 'SWY',
 'COV',
 'FRX',
 'BEAM',
 'JDSU',
 'BMC',
 'APOL',
 'CVH',
 'PCS',
 'ANR',
 'LXK',
 'DV',
 'SHLD',
 'PGN',
 'NVLS',
 'SVU',
 'MHS',
 'MWW',
 'JNS',
 'CEPH',
 'NSM',
 'NOVL',
 'GENZ',
 'Q',
 'KG',
 'EK',
 'MIL',
 'STR',
 'XTO',
 'BJS',
 'ACAS',
 'DJ',
 'AV',
 'QTRN',
 'BS']