In [1]:
# Import libraries
import copy
import itertools
from datetime import datetime

import networkx as nx
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Relative directories used by the notebook: the input CSV files are read
# from data_path; fig_path is presumably where figures get saved (TODO confirm).
data_path = './data/'
fig_path = './figures/'

------------------------------------
**Get the companies**
------------------------------------

In [2]:
# Read data

DATE_FORMAT = "%Y-%m-%d"

# Retained only so that any other cell still referring to `mydateparser` keeps
# working; it is no longer passed to read_csv (the `date_parser=` argument is
# deprecated in pandas 2.x and calls Python-level strptime once per row).
mydateparser = lambda x: datetime.strptime(x, DATE_FORMAT)

# Price table: one row per date, one column per ticker.
# The 'Date' index is parsed in a single vectorised call with an explicit
# format, which raises on any value that does not match DATE_FORMAT.
snp = pd.read_csv(data_path + "snp_allstocks_2015_2019.csv", index_col='Date')
snp.index = pd.to_datetime(snp.index, format=DATE_FORMAT)

# Constituent metadata; the 'Symbol' and 'GICS Sector' columns are used below.
info = pd.read_csv(data_path + 'snp_info.csv', index_col=0)

# https://www.slickcharts.com/sp500
# https://datahub.io/core/s-and-p-500-companies-financials
detailed_info = pd.read_csv(data_path + 'constituents-financials.csv', index_col=0)

# 'Sector' values ordered from largest to smallest 'Market Cap'; the index
# (first CSV column) supplies the ticker symbols iterated over later.
stocks_sorted = detailed_info.sort_values('Market Cap', ascending=False)['Sector']

In [3]:
# Group the 160 largest companies (by market cap) under their GICS sector,
# preserving the market-cap ordering inside each sector's list.

# Tickers present in the market-cap ranking that are skipped on purpose --
# presumably because no matching price/metadata entry exists (TODO confirm).
excluded_tickers = {'PCLN', 'TWX', 'AET', 'MON', 'PX', 'ESRX'}

# Symbol -> sector lookup, built once instead of re-indexing `info` on every
# loop iteration.
sector_by_symbol = info.set_index('Symbol')['GICS Sector']

stocks_by_sector = {sector: [] for sector in info['GICS Sector'].unique()}

for stock in stocks_sorted.index[:160]:
    if stock in excluded_tickers:
        continue
    # The ranking spells this ticker 'BRK.B'; the lookup in `info` uses 'BRK-B'.
    stock = 'BRK-B' if stock == 'BRK.B' else stock

    sector = sector_by_symbol.loc[stock]
    stocks_by_sector[sector].append(stock)

In [4]:
# Keep the first three tickers of every sector list; the lists are already
# ordered by descending market cap, so these are the three largest companies.
stocks = {
    sector: ranked_tickers[:3]
    for sector, ranked_tickers in stocks_by_sector.items()
}

In [5]:
# Restrict the analysis to four sectors so the example stays easy to follow.
selected_sectors = (
    'Information Technology',
    'Communication Services',
    'Consumer Discretionary',
    'Financials',
)
# `tmp` is still bound (as before) in case another cell refers to it.
tmp = {sector: stocks[sector] for sector in selected_sectors}

stocks = tmp
stocks

{'Information Technology': ['AAPL', 'MSFT', 'V'],
 'Communication Services': ['GOOGL', 'GOOG', 'FB'],
 'Consumer Discretionary': ['AMZN', 'HD', 'MCD'],
 'Financials': ['JPM', 'BAC', 'WFC']}

------------------------------------
**Get the data**
------------------------------------

In [6]:
# Flatten the sector -> tickers mapping into two parallel label lists
# (one entry per ticker, in sector order).
sector_stock_pairs = [
    (sector, stock)
    for sector in stocks
    for stock in stocks[sector]
]
sector_col = [sector for sector, _ in sector_stock_pairs]
stock_col = [stock for _, stock in sector_stock_pairs]

# Empty frame: the dates of `snp` as rows, a two-level (sector, ticker)
# header as columns.
df = pd.DataFrame(index=snp.index, columns=[sector_col, stock_col])
df.head()

Unnamed: 0_level_0,Information Technology,Information Technology,Information Technology,Communication Services,Communication Services,Communication Services,Consumer Discretionary,Consumer Discretionary,Consumer Discretionary,Financials,Financials,Financials
Unnamed: 0_level_1,AAPL,MSFT,V,GOOGL,GOOG,FB,AMZN,HD,MCD,JPM,BAC,WFC
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2015-01-02,,,,,,,,,,,,
2015-01-05,,,,,,,,,,,,
2015-01-06,,,,,,,,,,,,
2015-01-07,,,,,,,,,,,,
2015-01-08,,,,,,,,,,,,


In [7]:
# Build the (sector, ticker) x date price table directly from `snp`.
#
# The frame is rebuilt from its inputs on every run, so this cell is
# idempotent: it no longer mutates a previously created `df` in place and
# then rebinds `df` to its own transpose (which made a second run operate on
# the already-transposed frame). Selecting the columns from `snp` also keeps
# the prices as floats instead of filling an all-NaN object-dtype frame.
sector_stock_pairs = [
    (sector, stock)
    for sector in stocks
    for stock in stocks[sector]
]

prices_by_date = snp[[stock for _, stock in sector_stock_pairs]].copy()
prices_by_date.columns = pd.MultiIndex.from_tuples(sector_stock_pairs)

# Rows: (sector, ticker); columns: dates.
df = prices_by_date.T
df

Unnamed: 0,Date,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-14,2015-01-15,...,2018-12-17,2018-12-18,2018-12-19,2018-12-20,2018-12-21,2018-12-24,2018-12-26,2018-12-27,2018-12-28,2018-12-31
Information Technology,AAPL,109.330002,106.25,106.260002,107.75,111.889999,112.010002,109.25,110.220001,109.800003,106.82,...,163.940002,166.070007,160.889999,156.830002,150.729996,146.830002,157.169998,156.149994,156.229996,157.740005
Information Technology,MSFT,46.759998,46.330002,45.650002,46.23,47.59,47.189999,46.599998,46.360001,45.959999,45.48,...,102.889999,103.970001,103.690002,101.510002,98.230003,94.129997,100.559998,101.18,100.389999,101.57
Information Technology,V,66.254997,64.792503,64.375,65.237503,66.112503,65.1325,65.004997,65.195,63.889999,63.282501,...,131.399994,132.660004,131.259995,128.759995,124.260002,121.730003,130.229996,132.009995,130.940002,131.940002
Communication Services,GOOGL,529.549988,519.460022,506.640015,505.149994,506.910004,500.720001,497.059998,501.799988,505.929993,504.01001,...,1025.650024,1043.410034,1035.459961,1023.580017,991.25,984.669983,1047.849976,1052.900024,1046.680054,1044.959961
Communication Services,GOOG,521.937744,511.057617,499.212799,498.357513,499.928864,493.454498,489.854309,493.464447,498.128784,499.043732,...,1016.530029,1028.709961,1023.01001,1009.409973,979.539978,976.219971,1039.459961,1043.880005,1037.079956,1035.609985
Communication Services,FB,78.449997,77.190002,76.150002,76.150002,78.18,77.739998,76.720001,76.449997,76.279999,74.050003,...,140.190002,143.660004,133.240005,133.399994,124.949997,124.059998,134.179993,134.520004,133.199997,131.089996
Consumer Discretionary,AMZN,308.519989,302.190002,295.290009,298.420013,300.459991,296.929993,291.410004,294.73999,293.269989,286.950012,...,1520.910034,1551.47998,1495.079956,1460.829956,1377.449951,1343.959961,1470.900024,1461.640015,1478.02002,1501.969971
Consumer Discretionary,HD,103.43,101.260002,100.949997,104.410004,106.720001,104.889999,104.419998,103.730003,102.639999,101.010002,...,167.970001,170.039993,167.559998,164.160004,160.479996,158.139999,168.279999,170.320007,170.220001,171.820007
Consumer Discretionary,MCD,93.260002,92.230003,92.400002,94.010002,94.360001,93.209999,93.010002,92.830002,91.540001,91.379997,...,180.789993,179.710007,179.160004,173.679993,174.149994,170.279999,174.029999,175.710007,175.559998,177.570007
Financials,JPM,62.490002,60.549999,58.98,59.07,60.389999,59.34,58.830002,58.84,56.810001,54.990002,...,99.010002,98.540001,97.290001,96.449997,94.169998,92.139999,95.959999,97.040001,96.830002,97.620003


------------------------------------
**Represent the DataFrame as single-column, multi-indexed data**
------------------------------------
Also replace the ticker symbols with generic per-sector asset names (e.g. `asset_1`, `asset_2`, `asset_3`).

In [8]:
# Represent multi-index pandas dataframe as a tensor
# (Multi-index dataframe with only one column of data)

# Dimensions are derived from the data instead of being hard-coded, so the
# index always has exactly one row per (series, date) pair of `snp`.
T = len(snp.index)                                    # number of dates
N = sum(len(tickers) for tickers in stocks.values())  # number of price series

# One (sector, generic asset name) label per price series. Assets are
# numbered from 1 within each sector: asset_1, asset_2, ...
series_labels = [
    (sector, 'asset_' + str(i + 1))
    for sector in stocks
    for i, _ in enumerate(stocks[sector])
]

# Repeat every label T times so that the three index arrays line up row by row.
sector_idx = list(itertools.chain.from_iterable([sector] * T for sector, _ in series_labels))
stock_idx = list(itertools.chain.from_iterable([asset] * T for _, asset in series_labels))

# The full date range, once per series.
time_idx = list(snp.index) * N

# Empty single-column frame; 'Price' is filled in afterwards.
df_tensor = pd.DataFrame(index=[sector_idx, stock_idx, time_idx], columns=['Price'])
df_tensor.index.names = ['Sector', 'Asset', 'Date']

In [9]:
# Fill the 'Price' column one (sector, asset) series at a time.
for sector in stocks:
    for i, stock in enumerate(stocks[sector]):
        stock_name = 'asset_' + str(i+1)
        # A single .loc call on the frame itself. The previous chained form
        # (df_tensor['Price'].loc[...] = ...) assigns through an intermediate
        # Series and does not update df_tensor under pandas Copy-on-Write.
        # The partial key (sector, stock_name) selects every date of that series.
        df_tensor.loc[(sector, stock_name), 'Price'] = snp[stock].values

  result = self._run_cell(


In [10]:
# Display the filled frame: a single 'Price' column indexed by (Sector, Asset, Date)
df_tensor

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Price
Sector,Asset,Date,Unnamed: 3_level_1
Information Technology,asset_1,2015-01-02,109.33
Information Technology,asset_1,2015-01-05,106.25
Information Technology,asset_1,2015-01-06,106.26
Information Technology,asset_1,2015-01-07,107.75
Information Technology,asset_1,2015-01-08,111.89
...,...,...,...
Financials,asset_3,2018-12-24,43.6
Financials,asset_3,2018-12-26,45.59
Financials,asset_3,2018-12-27,45.53
Financials,asset_3,2018-12-28,45.78


------------------------------------
**Tensorize the data**
------------------------------------

In [11]:
from hottbox.core import Tensor
from hottbox.pdtools import pd_to_tensor

In [12]:
# Convert the multi-indexed, single-column frame into a hottbox tensor.
# NOTE(review): presumably each index level becomes one tensor mode -- confirm
# against the hottbox pd_to_tensor documentation.
tensor = pd_to_tensor(df_tensor)

In [13]:
# Show the tensor's summary (order, number of elements, mode sizes and names)
tensor

This tensor is of order 3 and consists of 12072 elements.
Sizes and names of its modes are (4, 3, 1006) and ['Sector', 'Asset', 'Date'] respectively.