In [99]:
from lxml import html
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from matplotlib.collections import LineCollection
from sklearn import cluster, covariance, manifold
import pickle
import pandas_datareader as pdr
import pandas as pd
import datetime as dt
import os

## 1. Web crawler for the s&p500 companies list
use selenium and xpath selector to scrape the s&p500 companies list,
using Options() to avoid Chrome browser pop out.

In [27]:
def save_500_list():
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    ########## no web browser pop out ############
    chrome_options = Options()
    chrome_options.add_argument("--headless")  
    browser = webdriver.Chrome(chrome_options = chrome_options)
    browser.get(url)

    tree = html.fromstring(browser.page_source)
    result = tree.xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr/td[1]/a[1]/text()')

    browser.close()
    
    return result

In [28]:
names = save_500_list()
print(names[:10])

['MMM', 'ABT', 'ABBV', 'ABMD', 'ACN', 'ATVI', 'AYI', 'ADBE', 'AMD', 'AAP']


## 2. Store the companies list as a pickle file

In [18]:
with open('s&p500_tickers.pickle','wb') as file:
    pickle.dump(names, file)

## 3. Read the companies list from 's&p500_tickers.pickle' file, wait for download.

In [71]:
start_time = dt.datetime(2015,1,1) 
end_time = dt.datetime(2016,12,31)

with open('s&p500_tickers.pickle','rb') as file:
    all_list = pickle.load(file)

In [31]:
if not os.path.exists('s&p500_data'):
    os.makedirs('s&p500_data')

## 4. Use a for loop to download first 100 cpmpanies data 

In [69]:
def download_data():
    for symbol in all_list[:100]:
        if not os.path.exists('s&p500_data/{}.csv'.format(symbol)):
            temp_data = pdr.DataReader(ticker, 'yahoo', start_time, end_time)
            temp_data.to_csv('s&p500_data/{}.csv'.format(symbol))
        else:
            continue

In [70]:
download_data()

## 5. Compile all data into one pandas dataframe

In [88]:
def compile_data():
    main_df = pd.DataFrame()
    
    for count,symbol in enumerate(all_list[:100]):
        temp_df = pd.read_csv('s&p500_data/{}.csv'.format(symbol))
        temp_df.set_index('Date',inplace=True)
        
        temp_df.rename(columns = {'Adj Close': symbol}, inplace=True)
#         print(temp_df.head())
#         break
        temp_df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], 1, inplace=True)
        
        if main_df is None:
            main_df = temp_df
        else:
            main_df = main_df.join(temp_df, how='outer')
    if count%10 == 0:
        print(count)
    return main_df

In [97]:
result_df = compile_data()
result_df.head()

Unnamed: 0_level_0,MMM,ABT,ABBV,ABMD,ACN,ATVI,AYI,ADBE,AMD,AAP,...,CPB,COF,CAH,KMX,CCL,CAT,CBOE,CBRE,CBS,CELG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-31,150.427399,41.626999,57.834881,38.060001,82.357475,19.56358,138.711304,72.699997,2.67,158.376587,...,150.427399,150.427399,150.427399,150.427399,150.427399,150.427399,150.427399,150.427399,150.427399,150.427399
2015-01-02,150.189392,41.516052,58.232586,37.310001,81.924049,19.544163,138.523163,72.339996,2.67,157.660675,...,150.189392,150.189392,150.189392,150.189392,150.189392,150.189392,150.189392,150.189392,150.189392,150.189392
2015-01-05,146.8022,41.525291,57.136696,37.07,80.540825,19.272312,135.195709,71.980003,2.66,155.582504,...,146.8022,146.8022,146.8022,146.8022,146.8022,146.8022,146.8022,146.8022,146.8022,146.8022
2015-01-06,145.236755,41.053726,56.853886,36.130001,79.959869,18.913078,133.502304,70.529999,2.63,155.47316,...,145.236755,145.236755,145.236755,145.236755,145.236755,145.236755,145.236755,145.236755,145.236755,145.236755
2015-01-07,146.289551,41.3866,59.151722,37.279999,81.638191,18.505302,135.86911,71.110001,2.58,158.814102,...,146.289551,146.289551,146.289551,146.289551,146.289551,146.289551,146.289551,146.289551,146.289551,146.289551


In [100]:
edge_model = covariance.GraphLassoCV()

In [103]:
edge_model.fit(result_df)

GraphLassoCV(alphas=4, assume_centered=False, cv=None, enet_tol=0.0001,
       max_iter=100, mode='cd', n_jobs=1, n_refinements=4, tol=0.0001,
       verbose=False)
