In [3]:
import pandas as pd
from pandas.tseries.offsets import BDay 
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta 
import time

#Alpha_Vantage is a stock data provider.  This library makes api calls much easier 
import alpha_vantage
from alpha_vantage.timeseries import TimeSeries  

## First, we'll read in the data output by the scraper and do any cleaning that didn't survive the "to_csv" part of the last notebook.  

In [4]:
df = pd.read_csv('AllHoldings.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390168 entries, 0 to 390167
Data columns (total 12 columns):
manager            390168 non-null object
filing_date        390168 non-null object
report_date        390168 non-null object
sh_name            390168 non-null object
sh_class           390168 non-null object
cusip              390168 non-null object
val1000            390168 non-null int64
share_count        390168 non-null int64
share_or_prin      390168 non-null object
discretion         390168 non-null object
sole_vote_amt      390168 non-null int64
shared_vote_amt    390168 non-null int64
dtypes: int64(4), object(8)
memory usage: 35.7+ MB


In [6]:
df['report_date'] = pd.to_datetime(df.report_date)

In [7]:
df['filing_date'] = pd.to_datetime(df.filing_date)

## The next step is to return only the top 20 holdings, aggregated by date and Cusip/Share.   

In [8]:
# Sums up the holdings by date/cusip and returns a flattened dataframe with report_date as the index
summed_df = df.groupby(['report_date', 'cusip', 'sh_name'])['report_date', 'cusip', 'sh_name', 'val1000'].sum().reset_index()
summed_df.set_index('report_date')

Unnamed: 0_level_0,cusip,sh_name,val1000
report_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-06-30,000360206,AAON INC,1179
2013-06-30,000361105,AAR CORP,2039
2013-06-30,000375204,ABB LTD,4802
2013-06-30,00081T108,ACCO BRANDS CORP,65
2013-06-30,000957100,ABM INDS INC,1731
2013-06-30,00101J106,THE ADT CORPORATION,12867
2013-06-30,001031103,AEP INDS INC,536
2013-06-30,00104Q107,AFC ENTERPRISES INC,4421
2013-06-30,001055102,AFLAC INC,75176
2013-06-30,001084102,AGCO CORP,22610


In [9]:
#Sets variables that will be used to loop through each report_date and select the top 20 by notional value
looper = summed_df.report_date.value_counts().index
top_df = pd.DataFrame(columns=['report_date', 'cusip', 'sh_name', 'val1000'])

In [10]:
for x in looper:
    top_df = top_df.append(summed_df[(summed_df.report_date == x)].nlargest(20, 'val1000'))

In [11]:
top_df = top_df.sort_values(['report_date', 'val1000'], ascending=[False, False]).reset_index().drop('index', axis=1)

## Now, the goal is to pull back stock tickers for all of the stocks that appear in our 420 row dataframe.  

#### This is a somewhat complicated task as stocks can undergo certain changes (which is why CUSIPS are often used as an identifier.

#### Fidelity seems to be the only free source of a cusip lookup, so I will first scrape there.  If there aren't too many NaNs after scraping Fidelity, I'll just figure out the rest of the tickers by hand.  If there are a lot, then I'll try some other automated solution.

In [10]:
# creates a list of unique cusips.
cusips = list(top_df.cusip.unique())

In [11]:
cusips[:5]

['037833100', '78462F103', '949746101', '060505104', '500754106']

In [21]:
url1 = 'https://quotes.fidelity.com/mmnet/SymLookup.phtml?reqforlookup=REQUESTFORLOOKUP&productid=mmnet&isLoggedIn=mmnet&rows=50&for=stock&by=cusip&criteria='
url2 = '&submit=Search'
ticker = []

for cusip in cusips:
    response = requests.get(url1+cusip+url2)
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        ticker.append(soup.find('td', {'align':'center'}).get_text())
    except:
        ticker.append(cusip+'Not Found')

In [12]:
#cusip_ticker = pd.DataFrame({'cusip':cusips, 'tickers':ticker})
cusip_ticker = pd.read_csv('cusip_ticker.csv')  
cusip_ticker.columns=['d', 'cusip', 'ticker']
cusip_ticker.drop('d', axis=1)
cusip_ticker.head()

Unnamed: 0,d,cusip,ticker
0,0,949746101,WFC
1,1,500754106,KHC
2,2,191216100,KO
3,3,459200101,IBM
4,4,25816109,AXP


In [13]:
cusip_ticker[cusip_ticker.ticker.str.contains('Not')]
cusip_ticker.iloc[7,2] = 'SPY' 
cusip_ticker.iloc[10,2] = 'SPY' 
cusip_ticker.iloc[14,2] = 'VRX' #Company acquired by Bausch and Lomb 
cusip_ticker.iloc[20,2] = 'AAPL' #Looks like a miskey by the 13F filer
cusip_ticker.iloc[21,2] = 'DTV' #Company acquired by AT&T
cusip_ticker.iloc[29,2] = 'AGN' #Company acquired
cusip_ticker.iloc[33,2] = 'EMC' #Company acquired
cusip_ticker.iloc[46,2] = 'QQQ' #Fidelity tool not able to handle ETFs
cusip_ticker.iloc[48,2] = 'PCLN' #Company acquired by Bausch and Lomb 
cusip_ticker.iloc[56,2] = 'FRX' #Company acquired 
cusip_ticker.iloc[57,2] = 'MDT' #Company acquired 
cusip_ticker.iloc[58,2] = 'DWDP' #Company acquired 
cusip_ticker.iloc[60,2] = 'DELL' #Company went private 
cusip_ticker.iloc[61,2] = 'YHOO' #Company acquired 
cusip_ticker.iloc[64,2] = 'FDML' #Company went private 


In [25]:
API = 'INAPOMK44YJBVQY9'
ts = TimeSeries(key='API', output_format='pandas', indexing_type='date')

In [27]:
alpha_data, alpha_meta_data = ts.get_daily_adjusted(symbol='WFC', outputsize='full')

In [28]:
alpha_data.head()

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1995-01-03,23.0,23.25,22.88,23.25,3.0971,311300.0,0.0,1.0
1995-01-04,23.12,23.38,22.75,22.88,3.0478,356800.0,0.0,1.0
1995-01-05,23.12,23.38,22.88,23.12,3.0798,766700.0,0.0,1.0
1995-01-06,23.0,23.5,23.0,23.25,3.0971,384600.0,0.0,1.0
1995-01-09,23.12,23.12,22.88,22.88,3.0478,202500.0,0.0,1.0


In [31]:
alpha_meta_data

{'1. Information': 'Daily Time Series with Splits and Dividend Events',
 '2. Symbol': 'WFC',
 '3. Last Refreshed': '2018-09-24 13:48:11',
 '4. Output Size': 'Full size',
 '5. Time Zone': 'US/Eastern'}

## The above "alpha_data" dataframe came from the alphavantage API.  Virtually all of the cusips belonging to the tickers above (cusips not found) were missing from AV.  So, I instead sourced the data from another source that has unadjusted close prices only.  Given the infrequency of my strategy's trading, the likelihood of corporate actions having a significant effect is low.

In [14]:
prices = pd.read_csv('13F_prices.txt', index_col=0, header=None, names=['date', 'ticker', 'price'])
prices.reset_index(inplace=True)

In [15]:
top_df = pd.merge(top_df, cusip_ticker, how='left', on='cusip').drop('d', axis=1)

In [16]:
top_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420 entries, 0 to 419
Data columns (total 5 columns):
report_date    420 non-null datetime64[ns]
cusip          420 non-null object
sh_name        420 non-null object
val1000        420 non-null object
ticker         420 non-null object
dtypes: datetime64[ns](1), object(4)
memory usage: 19.7+ KB


##  The dates I have are the dates when the investors held a position.  Their reports are filed 43-45 days after this date.  To make my life easier, I ignored the "filing_date" column from my 13F scraper and just use the next business day 44 days after the "report_date" col. 

In [17]:
top_df['buy_date'] = top_df['report_date'].apply(lambda x: x +  timedelta(days=44) + BDay(1))

In [18]:
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71214 entries, 0 to 71213
Data columns (total 3 columns):
date      71214 non-null object
ticker    71214 non-null object
price     71214 non-null float64
dtypes: float64(1), object(2)
memory usage: 1.6+ MB


In [19]:
prices['date'] = pd.to_datetime(prices.date)

In [20]:
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71214 entries, 0 to 71213
Data columns (total 3 columns):
date      71214 non-null datetime64[ns]
ticker    71214 non-null object
price     71214 non-null float64
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 1.6+ MB


In [21]:
top_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420 entries, 0 to 419
Data columns (total 6 columns):
report_date    420 non-null datetime64[ns]
cusip          420 non-null object
sh_name        420 non-null object
val1000        420 non-null object
ticker         420 non-null object
buy_date       420 non-null datetime64[ns]
dtypes: datetime64[ns](2), object(4)
memory usage: 23.0+ KB


In [22]:
top_df.head()

Unnamed: 0,report_date,cusip,sh_name,val1000,ticker,buy_date
0,2018-06-30,037833100,APPLE INC,54835353,AAPL,2018-08-14
1,2018-06-30,78462F103,SPDR S&P 500 ETF TR,29281034,SPY,2018-08-14
2,2018-06-30,949746101,WELLS FARGO & CO NEW,25059643,WFC,2018-08-14
3,2018-06-30,060505104,BANK AMER CORP,21612437,BAC,2018-08-14
4,2018-06-30,500754106,KRAFT HEINZ CO,20906754,KHC,2018-08-14


In [23]:
prices.head()

Unnamed: 0,date,ticker,price
0,2013-07-01,AAPL,409.220001
1,2013-07-01,AGN,125.440002
2,2013-07-01,AIG,45.09
3,2013-07-01,AMGN,97.489998
4,2013-07-01,AMZN,282.1


In [24]:
pd.merge(top_df, prices, how='left', left_on=['ticker', 'buy_date'], right_on=['ticker', 'date'])

Unnamed: 0,report_date,cusip,sh_name,val1000,ticker,buy_date,date,price
0,2018-06-30,037833100,APPLE INC,54835353,AAPL,2018-08-14,NaT,
1,2018-06-30,78462F103,SPDR S&P 500 ETF TR,29281034,SPY,2018-08-14,NaT,
2,2018-06-30,949746101,WELLS FARGO & CO NEW,25059643,WFC,2018-08-14,NaT,
3,2018-06-30,060505104,BANK AMER CORP,21612437,BAC,2018-08-14,NaT,
4,2018-06-30,500754106,KRAFT HEINZ CO,20906754,KHC,2018-08-14,NaT,
5,2018-06-30,191216100,COCA COLA CO,18764843,KO,2018-08-14,NaT,
6,2018-06-30,023135106,AMAZON COM INC,18040550,AMZN,2018-08-14,NaT,
7,2018-06-30,025816109,AMERICAN EXPRESS CO,15349606,AXP,2018-08-14,NaT,
8,2018-06-30,451100101,ICAHN ENTERPRISES LP,11826854,IEP,2018-08-14,NaT,
9,2018-06-30,594918104,MICROSOFT CORP,10012998,MSFT,2018-08-14,NaT,


### I found it weird that the above didn't work.  I spent a lot of time converting the datetime objects using various modules from pd.datetime and datetime.  Eventually, I figured out that there was whitespace in the prices.ticker column.

In [25]:
prices['ticker']=prices['ticker'].str.strip()

In [26]:
merged = pd.merge(top_df, prices, how='left', left_on=['ticker', 'buy_date'], right_on=['ticker', 'date'])

In [27]:
merged

Unnamed: 0,report_date,cusip,sh_name,val1000,ticker,buy_date,date,price
0,2018-06-30,037833100,APPLE INC,54835353,AAPL,2018-08-14,2018-08-14,209.750000
1,2018-06-30,78462F103,SPDR S&P 500 ETF TR,29281034,SPY,2018-08-14,2018-08-14,283.900000
2,2018-06-30,949746101,WELLS FARGO & CO NEW,25059643,WFC,2018-08-14,2018-08-14,58.070000
3,2018-06-30,060505104,BANK AMER CORP,21612437,BAC,2018-08-14,2018-08-14,30.790000
4,2018-06-30,500754106,KRAFT HEINZ CO,20906754,KHC,2018-08-14,2018-08-14,59.520000
5,2018-06-30,191216100,COCA COLA CO,18764843,KO,2018-08-14,2018-08-14,45.890000
6,2018-06-30,023135106,AMAZON COM INC,18040550,AMZN,2018-08-14,2018-08-14,1919.650000
7,2018-06-30,025816109,AMERICAN EXPRESS CO,15349606,AXP,2018-08-14,2018-08-14,102.180000
8,2018-06-30,451100101,ICAHN ENTERPRISES LP,11826854,IEP,2018-08-14,NaT,
9,2018-06-30,594918104,MICROSOFT CORP,10012998,MSFT,2018-08-14,2018-08-14,109.560000


##  Looks like there was some data missing from my prices file.  I'm going to see if I can solve the problem easily 

In [28]:
bad_tickers = merged[merged['price'].isna()].ticker.unique()
bad_dates = merged[merged.date.isna()].buy_date.unique()

In [29]:
bad_tickers

array(['IEP', 'QQQ', 'IWM', 'GOOG', 'PCLN', 'VWO', 'EMC', 'AGN', 'VRX',
       'DTV', 'SPY', 'DWDP'], dtype=object)

In [30]:
prices[(prices.date.isin(bad_dates))&(prices.ticker=='IEP')]

Unnamed: 0,date,ticker,price
26629,2015-05-15,IEP,91.85


In [31]:
merged[merged.ticker=='IEP']

Unnamed: 0,report_date,cusip,sh_name,val1000,ticker,buy_date,date,price
8,2018-06-30,451100101,ICAHN ENTERPRISES LP,11826854,IEP,2018-08-14,NaT,
30,2018-03-31,451100101,ICAHN ENTERPRISES LP,9007801,IEP,2018-05-15,NaT,
50,2017-12-31,451100101,ICAHN ENTERPRISES LP,8369991,IEP,2018-02-14,NaT,
68,2017-09-30,451100101,ICAHN ENTERPRISES LP,8420874,IEP,2017-11-14,NaT,
89,2017-06-30,451100101,ICAHN ENTERPRISES LP,7712502,IEP,2017-08-14,NaT,
111,2017-03-31,451100101,ICAHN ENTERPRISES LP,7201477,IEP,2017-05-15,NaT,
130,2016-12-31,451100101,ICAHN ENTERPRISES LP,7791664,IEP,2017-02-14,NaT,
152,2016-09-30,451100101,ICAHN ENTERPRISES LP,6425167,IEP,2016-11-14,NaT,
169,2016-06-30,451100101,ICAHN ENTERPRISES LP,6672168,IEP,2016-08-15,NaT,
188,2016-03-31,451100101,ICAHN ENTERPRISES LP,7422606,IEP,2016-05-16,NaT,


In [53]:
bad = merged[merged.ticker.isin(bad_tickers)&(merged.price.isna())].sort_values(by='ticker')

In [54]:
bad

Unnamed: 0,report_date,cusip,sh_name,val1000,ticker,buy_date,date,price
216,2015-12-31,G0177J108,ALLERGAN PLC,4706704,AGN,2016-02-15,NaT,
259,2015-06-30,25490A309,DIRECTV,4779892,DTV,2015-08-14,NaT,
328,2014-09-30,25490A309,DIRECTV,3697653,DTV,2014-11-14,NaT,
305,2014-12-31,25490A309,DIRECTV,3951832,DTV,2015-02-16,NaT,
287,2015-03-31,25490A309,DIRECTV,3891948,DTV,2015-05-15,NaT,
331,2014-09-30,260543103,DOW CHEM CO,3539742,DWDP,2014-11-14,NaT,
196,2016-03-31,268648102,E M C CORP MASS,3939809,EMC,2016-05-16,NaT,
176,2016-06-30,268648102,E M C CORP MASS,4264695,EMC,2016-08-15,NaT,
60,2017-12-31,02079K107,ALPHABET INC,4300285,GOOG,2018-02-14,NaT,
361,2014-03-31,451100101,ICAHN ENTERPRISES LP,10566566,IEP,2014-05-15,NaT,


## There aren't too many of these, so I'm going to try and source them from AlphaVantage, and barring that, look them up in Yahoo.  In a worst case scenario, I'll drop the ticker(s) from my dataset completely.

In [58]:
for tick in bad_tickers:
    time.sleep(10)
    data, metadata = ts.get_daily_adjusted(symbol=tick, outputsize='full')
    data['ticker'] = tick
    alpha_data.append(data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


ValueError: Thank you for using Alpha Vantage! Please visit https://www.alphavantage.co/premium/ if you would like to have a higher API call volume.

## Well, that's not going to work

In [60]:
bad=bad.reset_index()


In [63]:
bad = bad.drop('index', axis=1)

In [68]:
bad.iloc[0]

report_date    2015-06-30 00:00:00
cusip                    25490A309
sh_name                    DIRECTV
val1000                    4779892
ticker                         DTV
buy_date       2015-08-14 00:00:00
date           2015-08-14 00:00:00
price                          NaN
Name: 1, dtype: object

In [104]:
list(cusip_ticker['ticker'])


['WFC',
 'KHC',
 'KO',
 'IBM',
 'AXP',
 'AAPL',
 'IEP',
 'SPY',
 'MSFT',
 'PG',
 'SPY',
 'PSX',
 'USB',
 'APD',
 'VRX',
 'T',
 'AMGN',
 'WMT',
 'PEP',
 'PFE',
 'AAPL',
 'DTV',
 'VWO',
 'EBAY',
 'GM',
 'AIG',
 'AGN',
 'AMZN',
 'MDT',
 'AGN',
 'MRK',
 'SPY',
 'CHTR',
 'EMC',
 'JPM',
 'DVA',
 'CVX',
 'BAX',
 'BAC',
 'QQQ',
 'FB',
 'GOOGL',
 'BABA',
 'NFLX',
 'BKNG',
 'IWM',
 'QQQ',
 'GS',
 'PCLN',
 'DAL',
 'GE',
 'GOOG',
 'HOLX',
 'XOM',
 'CVI',
 'EEM',
 'FRX',
 'MDT',
 'DWDP',
 'CP',
 'DELL',
 'YHOO',
 'MCO',
 'COP',
 'FDML']

In [75]:
bad.iloc[0,7] = 304.51

## Since I'm going to add stuff by hand, it'll be easier to just do it in Excel.

In [82]:
bad.to_csv('missing.csv')

In [83]:
prices = pd.read_csv('13F_prices_w-missing.csv', index_col=0, header=None, names=['date', 'ticker', 'price'])
prices.reset_index(inplace=True)

In [84]:
prices['date'] = pd.to_datetime(prices.date)

In [98]:
prices['ticker']=prices['ticker'].str.strip()
merged = pd.merge(top_df, prices, how='left', left_on=['ticker', 'buy_date'], right_on=['ticker', 'date'])

In [118]:
merged[merged.buy_date.between('2014-11-01', '2014-11-25')]

Unnamed: 0,report_date,cusip,sh_name,val1000,ticker,buy_date,date,price
312,2014-09-30,949746101,WELLS FARGO & CO NEW,26622642,WFC,2014-11-14,2014-11-14,53.35
313,2014-09-30,191216100,COCA COLA CO,17684439,KO,2014-11-14,2014-11-14,42.73
314,2014-09-30,459200101,INTERNATIONAL BUSINESS MACHS,14514852,IBM,2014-11-14,2014-11-14,164.16
315,2014-09-30,025816109,AMERICAN EXPRESS CO,13673170,AXP,2014-11-14,2014-11-14,90.67
316,2014-09-30,451100101,ICAHN ENTERPRISES LP,11347389,IEP,2014-11-14,2014-11-14,105.31
317,2014-09-30,037833100,APPLE INC,9466057,AAPL,2014-11-14,2014-11-14,114.18
318,2014-09-30,30231G102,EXXON MOBIL CORP,6107645,XOM,2014-11-14,2014-11-14,95.09
319,2014-09-30,018490102,ALLERGAN INC,6096637,AGN,2014-11-14,2014-11-14,243.77
320,2014-09-30,742718109,PROCTER & GAMBLE CO,5527385,PG,2014-11-14,2014-11-14,2.08
321,2014-09-30,742718109,PROCTER & GAMBLE CO,5527385,PG,2014-11-14,2014-11-14,88.11


In [117]:
top_df[top_df.ticker=='YHOO']

Unnamed: 0,report_date,cusip,sh_name,val1000,ticker,buy_date
415,2013-06-30,984332106,YAHOO INC,1978203,YHOO,2013-08-14
