In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

#Alpha_Vantage is a stock data provider.  This library makes api calls much easier 
import alpha_vantage
from alpha_vantage.timeseries import TimeSeries  

## First, we'll read in the data output by the scraper and do any cleaning that didn't survive the "to_csv" part of the last notebook.  

In [2]:
df = pd.read_csv('AllHoldings.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390168 entries, 0 to 390167
Data columns (total 12 columns):
manager            390168 non-null object
filing_date        390168 non-null object
report_date        390168 non-null object
sh_name            390168 non-null object
sh_class           390168 non-null object
cusip              390168 non-null object
val1000            390168 non-null int64
share_count        390168 non-null int64
share_or_prin      390168 non-null object
discretion         390168 non-null object
sole_vote_amt      390168 non-null int64
shared_vote_amt    390168 non-null int64
dtypes: int64(4), object(8)
memory usage: 35.7+ MB


In [4]:
df['report_date'] = pd.to_datetime(df.report_date)

In [5]:
df['filing_date'] = pd.to_datetime(df.filing_date)

## The next step is to return only the top 20 holdings, aggregated by date and Cusip/Share.   

In [6]:
# Sums up the holdings by date/cusip and returns a flattened dataframe with report_date as the index
summed_df = df.groupby(['report_date', 'cusip', 'sh_name'])['report_date', 'cusip', 'sh_name', 'val1000'].sum().reset_index()
summed_df.set_index('report_date')

Unnamed: 0_level_0,cusip,sh_name,val1000
report_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-06-30,000360206,AAON INC,1179
2013-06-30,000361105,AAR CORP,2039
2013-06-30,000375204,ABB LTD,4802
2013-06-30,00081T108,ACCO BRANDS CORP,65
2013-06-30,000957100,ABM INDS INC,1731
2013-06-30,00101J106,THE ADT CORPORATION,12867
2013-06-30,001031103,AEP INDS INC,536
2013-06-30,00104Q107,AFC ENTERPRISES INC,4421
2013-06-30,001055102,AFLAC INC,75176
2013-06-30,001084102,AGCO CORP,22610


In [7]:
#Sets variables that will be used to loop through each report_date and select the top 20 by notional value
looper = summed_df.report_date.value_counts().index
top_df = pd.DataFrame(columns=['report_date', 'cusip', 'sh_name', 'val1000'])

In [8]:
for x in looper:
    top_df = top_df.append(summed_df[(summed_df.report_date == x)].nlargest(20, 'val1000'))

In [9]:
top_df.sort_values(['report_date', 'val1000'], ascending=[False, False]).reset_index().drop('index', axis=1)

Unnamed: 0,report_date,cusip,sh_name,val1000
0,2018-06-30,037833100,APPLE INC,54835353
1,2018-06-30,78462F103,SPDR S&P 500 ETF TR,29281034
2,2018-06-30,949746101,WELLS FARGO & CO NEW,25059643
3,2018-06-30,060505104,BANK AMER CORP,21612437
4,2018-06-30,500754106,KRAFT HEINZ CO,20906754
5,2018-06-30,191216100,COCA COLA CO,18764843
6,2018-06-30,023135106,AMAZON COM INC,18040550
7,2018-06-30,025816109,AMERICAN EXPRESS CO,15349606
8,2018-06-30,451100101,ICAHN ENTERPRISES LP,11826854
9,2018-06-30,594918104,MICROSOFT CORP,10012998


## Now, the goal is to pull back stock tickers for all of the stocks that appear in our 420 row dataframe.  

#### This is a somewhat complicated task as stocks can undergo certain changes (which is why CUSIPS are often used as an identifier.

#### Fidelity seems to be the only free source of a cusip lookup, so I will first scrape there.  If there aren't too many NaNs after scraping Fidelity, I'll just figure out the rest of the tickers by hand.  If there are a lot, then I'll try some other automated solution.

In [11]:
# creates a list of unique cusips.
cusips = list(top_df.cusip.unique())

In [12]:
cusips[:5]

['949746101', '500754106', '191216100', '459200101', '025816109']

In [13]:
url1 = 'https://quotes.fidelity.com/mmnet/SymLookup.phtml?reqforlookup=REQUESTFORLOOKUP&productid=mmnet&isLoggedIn=mmnet&rows=50&for=stock&by=cusip&criteria='
url2 = '&submit=Search'
ticker = []

for cusip in cusips:
    response = requests.get(url1+cusip+url2, verify=False)
    time.sleep(2)
    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        ticker.append(soup.find('td', {'align':'center'}).get_text())
    except:
        ticker.append(cusip+'Not Found')







In [14]:
cusip_ticker = pd.DataFrame({'cusip':cusips, 'tickers':ticker})

# stock_ref = pd.merge(top_df, cusip_ticker, on='cusip', how='left')    

In [15]:
cusip_ticker[cusip_ticker.tickers.str.contains('Not')]
cusip_ticker.iloc[7,1] = 'SPY' 
cusip_ticker.iloc[10,1] = 'SPY' 
cusip_ticker.iloc[14,1] = 'VRX' #Company acquired by Bausch and Lomb 
cusip_ticker.iloc[20,1] = 'AAPL' #Looks like a miskey by the 13F filer
cusip_ticker.iloc[21,1] = 'DTV' #Company acquired by AT&T
cusip_ticker.iloc[29,1] = 'AGN' #Company acquired
cusip_ticker.iloc[33,1] = 'EMC' #Company acquired
cusip_ticker.iloc[46,1] = 'QQQ' #Fidelity tool not able to handle ETFs
cusip_ticker.iloc[48,1] = 'PCLN' #Company acquired by Bausch and Lomb 
cusip_ticker.iloc[56,1] = 'FRX' #Company acquired 
cusip_ticker.iloc[57,1] = 'MDT' #Company acquired 
cusip_ticker.iloc[58,1] = 'DWDP' #Company acquired 
cusip_ticker.iloc[60,1] = 'DELL' #Company went private 
cusip_ticker.iloc[61,1] = 'YHOO' #Company acquired 
cusip_ticker.iloc[64,1] = 'FDML' #Company went private 


In [16]:
cusip_ticker

Unnamed: 0,cusip,tickers
0,949746101,WFC
1,500754106,KHC
2,191216100,KO
3,459200101,IBM
4,025816109,AXP
5,037833100,AAPL
6,451100101,IEP
7,78462F953,SPY
8,594918104,MSFT
9,742718109,PG


In [17]:
pd.merge()

Unnamed: 0,report_date,cusip,sh_name,val1000
67366,2015-09-30,949746101,WELLS FARGO & CO NEW,26344336
62204,2015-09-30,500754106,KRAFT HEINZ CO,24006343
58609,2015-09-30,191216100,COCA COLA CO,17287331
61501,2015-09-30,459200101,INTERNATIONAL BUSINESS MACHS,12265188
56899,2015-09-30,025816109,AMERICAN EXPRESS CO,11473901
57079,2015-09-30,037833100,APPLE INC,9981207
61279,2015-09-30,451100101,ICAHN ENTERPRISES LP,7677234
65227,2015-09-30,78462F953,SPDR S&P 500 ETF TRUST,6816831
63008,2015-09-30,594918104,MICROSOFT CORP,5869018
64492,2015-09-30,742718109,PROCTER & GAMBLE CO,5066588
