## Goal 1: Follow the ARK Daily Trade
## Goal 2: Analyze the Holdings of ARK ETFs

In [2]:
# General Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Datetime Library
from datetime import date

import scipy.optimize as optimization
from itertools import combinations

# Helper Functions
import acquire, explore, optimal_weights

# Environment File
import env

# Create A Progressive Bar for Loop Operation 
from tqdm.notebook import tqdm

# Warnings
import warnings
warnings.filterwarnings("ignore")

# Load the file path
tradedb = env.arktradedb

### Acquire the trading data

In [9]:
# Read the trading data on 071621

df = pd.read_csv(f"{tradedb}ARK_Trade_07162021.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 9 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   INTRA-DAY TRADE TO TOTAL FUND HOLDINGS (NOT INDIVIDUAL BASKET)  27 non-null     object 
 1   Unnamed: 1                                                      26 non-null     object 
 2   Unnamed: 2                                                      26 non-null     object 
 3   Unnamed: 3                                                      26 non-null     object 
 4   Unnamed: 4                                                      26 non-null     object 
 5   Unnamed: 5                                                      26 non-null     object 
 6   Unnamed: 6                                                      26 non-null     object 
 7   Unnamed: 7                                              

In [10]:
# Print the first 5 rows
df.head()

Unnamed: 0,INTRA-DAY TRADE TO TOTAL FUND HOLDINGS (NOT INDIVIDUAL BASKET),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,ARK offers fully transparent Exchange Traded F...,,,,,,,,
1,,,,,,,,,
2,FUND,Date,Direction,Ticker,CUSIP,Name,Shares,% of ETF,
3,ARKG,2021-07-16,Buy,SGFY,82671G100,SIGNIFY HEALTH INC,52820,0.0172,
4,ARKG,2021-07-16,Buy,VERV,92539P101,VERVE THERAPEUTICS INC,22838,0.0131,


### Prepare the daily trading data

In [11]:
# Drop the two preamble rows (labels 0 and 1: the ARK disclaimer text and an
# empty row) so that the real header row (label 2) comes first

df = df.drop([0,1])
df.head()

Unnamed: 0,INTRA-DAY TRADE TO TOTAL FUND HOLDINGS (NOT INDIVIDUAL BASKET),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
2,FUND,Date,Direction,Ticker,CUSIP,Name,Shares,% of ETF,
3,ARKG,2021-07-16,Buy,SGFY,82671G100,SIGNIFY HEALTH INC,52820,0.0172,
4,ARKG,2021-07-16,Buy,VERV,92539P101,VERVE THERAPEUTICS INC,22838,0.0131,
5,ARKG,2021-07-16,Buy,QSI,74765K105,QUANTUM-SI INC,301320,0.0404,
6,ARKG,2021-07-16,Buy,RPTX,760273102,REPARE THERAPEUTICS INC,24260,0.0087,


In [13]:
df.columns

Index(['INTRA-DAY TRADE TO TOTAL FUND HOLDINGS (NOT INDIVIDUAL BASKET)',
       'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'],
      dtype='object')

In [15]:
# Drop the last column

df = df.drop(columns = 'Unnamed: 8')
df.head()

Unnamed: 0,INTRA-DAY TRADE TO TOTAL FUND HOLDINGS (NOT INDIVIDUAL BASKET),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
2,FUND,Date,Direction,Ticker,CUSIP,Name,Shares,% of ETF
3,ARKG,2021-07-16,Buy,SGFY,82671G100,SIGNIFY HEALTH INC,52820,0.0172
4,ARKG,2021-07-16,Buy,VERV,92539P101,VERVE THERAPEUTICS INC,22838,0.0131
5,ARKG,2021-07-16,Buy,QSI,74765K105,QUANTUM-SI INC,301320,0.0404
6,ARKG,2021-07-16,Buy,RPTX,760273102,REPARE THERAPEUTICS INC,24260,0.0087


In [27]:
# Promote the embedded header row (FUND, Date, Direction, ...) to column names.
# That row is still present as data here; it is dropped in a later cell.

col_names = list(df.iloc[0].values)
df.columns = col_names
df.head()

Unnamed: 0,FUND,Date,Direction,Ticker,CUSIP,Name,Shares,% of ETF
2,FUND,Date,Direction,Ticker,CUSIP,Name,Shares,% of ETF
3,ARKG,2021-07-16,Buy,SGFY,82671G100,SIGNIFY HEALTH INC,52820,0.0172
4,ARKG,2021-07-16,Buy,VERV,92539P101,VERVE THERAPEUTICS INC,22838,0.0131
5,ARKG,2021-07-16,Buy,QSI,74765K105,QUANTUM-SI INC,301320,0.0404
6,ARKG,2021-07-16,Buy,RPTX,760273102,REPARE THERAPEUTICS INC,24260,0.0087


In [29]:
# Drop the row labelled 2: it only repeats the header that now lives in df.columns

df = df.drop([2])
df.head()

Unnamed: 0,FUND,Date,Direction,Ticker,CUSIP,Name,Shares,% of ETF
3,ARKG,2021-07-16,Buy,SGFY,82671G100,SIGNIFY HEALTH INC,52820,0.0172
4,ARKG,2021-07-16,Buy,VERV,92539P101,VERVE THERAPEUTICS INC,22838,0.0131
5,ARKG,2021-07-16,Buy,QSI,74765K105,QUANTUM-SI INC,301320,0.0404
6,ARKG,2021-07-16,Buy,RPTX,760273102,REPARE THERAPEUTICS INC,24260,0.0087
7,ARKG,2021-07-16,Sell,CDNA,14167L103,CAREDX INC,84052,0.0774


In [31]:
# Renumber the rows from 0. drop=True discards the old labels directly instead
# of materialising them as an 'index' column that then has to be removed.

df = df.reset_index(drop=True)
df.head()

Unnamed: 0,FUND,Date,Direction,Ticker,CUSIP,Name,Shares,% of ETF
0,ARKG,2021-07-16,Buy,SGFY,82671G100,SIGNIFY HEALTH INC,52820,0.0172
1,ARKG,2021-07-16,Buy,VERV,92539P101,VERVE THERAPEUTICS INC,22838,0.0131
2,ARKG,2021-07-16,Buy,QSI,74765K105,QUANTUM-SI INC,301320,0.0404
3,ARKG,2021-07-16,Buy,RPTX,760273102,REPARE THERAPEUTICS INC,24260,0.0087
4,ARKG,2021-07-16,Sell,CDNA,14167L103,CAREDX INC,84052,0.0774


### Summary of NASDAQ Stocks in US

In [None]:
# Resolve the data directory in this cell: `database` is otherwise first
# assigned much later in the notebook, so on a fresh kernel (Restart & Run All)
# the read below raised NameError.
database = env.database

# Load all the stocks from US in NASDAQ
df_nasdaqus = pd.read_csv(f"{database}NASDAQ_US_032821.csv")

# Print the concise summary of the dataframe
df_nasdaqus.info()

In [None]:
# Inspect 5 random stocks 
df_nasdaqus.sample(5)

In [None]:
# Compute the fraction of stocks that lack an IPO year.
# Counting the NaN entries directly does not depend on NaN happening to be the
# most frequent value, which is what value_counts(...).head(1) relied on.
df_nasdaqus['IPO Year'].isna().mean()

**Takeaways**
1. About 40% of the stocks' IPO years are missing in the dataframe. How come so many stocks don't have IPO years? 
2. More than 200 stocks don't have sector and industry information. 
3. 13 stocks don't have market cap.

In [None]:
df_nasdaqus['IPO Year'].value_counts().sort_index().plot(kind='bar', figsize=(13,7))
plt.show()

In [None]:
# Flag the stocks whose IPO year is 2019, 2020 or 2021 and count them
recent_ipo_years = [2019, 2020, 2021]
mask = df_nasdaqus['IPO Year'].isin(recent_ipo_years)
mask.sum()

In [None]:
df_nasdaqrecent = df_nasdaqus[mask]
df_nasdaqrecent

In [None]:
# The distribution of most recent IPO companies by sectors
pd.concat([df_nasdaqrecent.Sector.value_counts(),
           df_nasdaqrecent.Sector.value_counts(normalize=True)], axis=1)

**Takeaways**: 62% of the New IPOs belong to Finance sector.

In [None]:
# The distribution of most recent IPO companies by industries
df_nasdaqrecent.Industry.value_counts(normalize=True).head(10)

In [None]:
df_nasdaqus.sort_values(by='Market Cap', ascending=False).head(10)

In [None]:
df_nasdaqus.Sector.value_counts(normalize=True)

### Acquire the Historical Data of Major Indexes

In [None]:
%%time
# Create a list of benchmark index symbols
symbols = ['^GSPC', '^DJI', '^IXIC', '^RUT', '^NDX']

# Define the start and end date
start_date = "1995-01-01"
end_date = date.today()

# Acquire their adjusted closing price
df_marks = acquire.acquire_stock_adjclosing(symbols, start_date, end_date)

# Print the number of records
print(df_marks.shape[0])

# Create a list of benchmark index names
benchmarks = ['SP500', 'Dow30', 'Nasdaq Composite', 'Russell2000', 'NASDQA 100']

# Rename the columns as bechmark index names
df_marks.columns = benchmarks

# Inspect the tail of the dataframe
df_marks.tail()

In [None]:
# Inspect the head of the dataframe
df_marks.head()

In [None]:
df_marks['SP500'].diff(1)

In [None]:
# Print the concise summary
df_marks.info()

### Data Preparation

In [None]:
# Compute the number of the null values in each column
df_marks.isnull().sum(axis=0)

### Data Exploration

In [None]:
# Growth of the major indexes since 1995: every column is divided by its own
# first observation, so all curves start at 1

df_marks_1995 = df_marks.div(df_marks.iloc[0], axis="columns")
df_marks_1995.plot(figsize=(14, 8))
plt.show()

**Takeaways**
1. Today's stock market looks like the dot-com bubble.

In [None]:
# # Save as csv file 

# database = env.database
# df_marks_1995.to_csv(f"{database}major_indexes.csv")

In [None]:
# Plot the growth of the major indexes from a start date typed in by the user

print("Enter the date:")
start = input()

# Rebase the slice so that every index equals 1 on its first row
marks_from_start = df_marks.loc[start:]
marks_from_start.div(marks_from_start.iloc[0], axis="columns").plot(figsize=(14, 8))

# Dashed reference line at the starting level
plt.axhline(y=1, linestyle='--', alpha=0.5)
plt.show()

**Takeaways**
1. The returns on the indexes depend on when you start investing.

In [None]:
# Split the DatetimeIndex into calendar parts and attach them as new columns

calendar_parts = {
    "year": df_marks.index.year,
    "month": df_marks.index.month,
    "day": df_marks.index.day,
    "day_name": df_marks.index.day_name(),
}
df_marks = df_marks.assign(**calendar_parts)

# Sanity check
df_marks.info()

In [None]:
# Group the observation by year
grouped_year = df_marks.groupby('year')

# Print the data type
type(grouped_year)

In [None]:
# Print all the group names
grouped_year.groups.keys()

In [None]:
# Print all the group names
grouped_year.indices.keys()

**Takeaways**
1. Pandas GroupBy objects have been created.
2. The name of each group is its year.
3. There are two ways to access the group names.

In [None]:
# Construct dataframe from group with provided name
# Take year 2000 for example

df_marks2000 = grouped_year.get_group(2000)
df_marks2000

In [None]:
# Compute the relative prices for year 2000, scaled so that the first trading
# day of the year equals 10000. iloc[0] is the first row of the year; the
# previous iloc[1] used the second row as the base.
df_marks2000[benchmarks].apply(lambda i: i*10000/i.iloc[0], axis=0)

In [None]:
# Compute the relative prices for each year: within every calendar year each
# benchmark is rescaled so that its first row of that year equals 10000

# Create a list of years
years = list(grouped_year.groups.keys())

# transform() applies the rescaling group by group and returns the rows on the
# original index. This replaces the per-year loop that re-copied the growing
# result with pd.concat on every iteration.
# Assumes df_marks is in chronological order, so the row order matches the
# year-by-year concatenation.
df_marks_relative = grouped_year[benchmarks].transform(lambda i: i*10000/i.iloc[0])

# Sanity check
df_marks_relative

In [None]:
# Attach calendar parts derived from the DatetimeIndex as new columns

relative_index = df_marks_relative.index
df_marks_relative = df_marks_relative.assign(
    year=relative_index.year,
    month=relative_index.month,
    day=relative_index.day,
    day_name=relative_index.day_name(),
)

# Sanity check
df_marks_relative.info()

In [None]:
# Create a 2x2 grid of axes and flatten it so panels can be addressed by position
fig, ax = plt.subplots(2, 2, figsize=(16, 12))
ax = ax.ravel()

# Benchmarks to plot. The labels must match the column names of
# df_marks_relative exactly ('SP500', not 'S&P500'); the previous labels were
# not columns, so the lookup raised KeyError. A separate name is used so the
# `benchmarks` list needed by the earlier cells is not overwritten.
plot_benchmarks = ['SP500', 'Dow30', 'Nasdaq Composite', 'Russell2000']

# Panels 0-2: mean relative price per calendar month over a span of years,
# given as (panel title, first year, last year).
# NOTE(review): the last two windows both include 2016 -- confirm the intended years.
periods = [("Obama I", "2008", "2011"),
           ("Obama II", "2012", "2016"),
           ("Trump", "2016", "2019")]

for panel, (title, first_year, last_year) in enumerate(periods):
    monthly_mean = (df_marks_relative.loc[first_year:last_year]
                    .groupby('month')[plot_benchmarks]
                    .mean())
    # Only the first panel carries the legend
    monthly_mean.plot(ax=ax[panel], legend=(panel == 0))
    ax[panel].set_title(title)
    ax[panel].set_xticks(list(range(1, 13)))

# Panel 3: daily relative prices during 2021
df_marks_relative.loc["2021"][plot_benchmarks].plot(ax=ax[3], legend=False)
ax[3].set_title("Year 2021")

plt.show()

### Background

#### Index Fund
An <b>index fund</b> is a type of mutual fund or exchange-traded fund (ETF). It is made up of stocks or bonds and attempts to earn the same return as a particular index.

Index funds are passively managed, which means that they typically hold what's in the index (which rarely changes) to maximize returns and minimize costs.

#### What is an ETF?
An ETF is a fund that can be traded on an exchange like a stock, which means it can be bought and sold throughout the trading day (unlike mutual funds, which are priced at the end of the trading day).

An ETF is a type of security that tracks an index, sector, commodity or other asset, but which can be purchased or sold on stock exchange the same as a regular stock.

ETF share prices fluctuate all day as the ETF is bought and sold and it is different from mutual funds that only trade once a day after the market closes.

#### How is an ETF structured?
An ETF can be structured to track anything from the price of an individual commodity to a large and diverse collection of securities. 

ETFs can contain all types of investments including stocks, commodities, or bonds; some offer U.S.-only holdings, while others are international.

**Open question (TODO):** Will the structure of an ETF change over time?

#### Reference
- [Investing in Index Funds for Beginners](https://www.thebalance.com/investing-in-index-funds-for-beginners-356318)

### Actively Managed ETFs from ARK

In [None]:
%%time
# Create a list of the ARK ETF tickers of interest
arks = ['ARKK', 'ARKQ', 'ARKW', 'ARKG', 'ARKF']

# Create a list of reference tickers (three index symbols plus VOOG and TSLA)
references = ['^GSPC', '^DJI', '^IXIC', 'VOOG','TSLA']

# Define start and end date
start_date = '2014-09-30'
end_date = date.today()

# Acquire their adjusted closing prices from yahoo finance
df_arks = acquire.acquire_stock_data(arks+references, 'yahoo', start_date, end_date)

# Inspect the dataframe
print(df_arks.shape)
df_arks.tail()

In [None]:
# Make a copy of the dataframe
df = df_arks.copy()

# Print the concise summary of the copied dataframe
df.info()

**Takeaways**: Those five ETFs have different inception dates.

In [None]:
# Add a new column to compute the average of the 5 active ETFs
df = df.assign(arks_avg = df[arks].mean(axis=1))

# Sanity check
df.tail()

In [None]:
# Plot the prices of the 5 ETFs over time

df[arks].plot(figsize=(14,8))
plt.ylabel("Adjusted Closing Price")
plt.title("Change of ARK Actively Managed ETFs' Prices Over Time")
plt.show()

In [None]:
# Plot the ARK average over the full history (left) and zoomed in on the
# period since the beginning of March 2020 (right)

# Two side-by-side panels on one figure
fig_avg, (ax_full, ax_zoom) = plt.subplots(1, 2, figsize=(13, 8))

# Left panel: full history with three marked dates
df.arks_avg.plot(ax=ax_full)
ax_full.set_ylabel("Adjusted Closing Price")
for marked_date in ("2016-02-15", "2018-03-01", "2019-10-15"):
    ax_full.axvline(x=marked_date)

# Right panel: the same series with the x-axis limited to 2020-03-01 .. end_date
df.arks_avg.plot(ax=ax_zoom)
ax_zoom.set_ylabel("Adjusted Closing Price")
ax_zoom.set_xlim('2020-03-01', end_date)
plt.show()

**Takeaways**
1. The curve of the mean shows a flat-up-flat-up pattern. 
2. Since March 2020, the price of ARK's ETFs has increased about 2.5-fold on average.

In [None]:
# Rescale every column so that its most recent value (the last row) equals 10000
df_scaled = df.mul(10000).div(df.iloc[-1], axis="columns")

# Inspect the scaled dataframe
df_scaled.tail()

In [None]:
# Plot the scaled indexes and the mean of ETFs

cols = references + ['arks_avg']

df_scaled[cols].plot(figsize=(14,8))
plt.axhline(y=10000)
plt.ylabel("Scaled Indexes and Prices")
plt.show()

In [None]:
# Plot the scaled indexes and the mean of ETFs since March 2020

df_scaled[cols].plot(figsize=(14,8))
# The series are scaled so that the latest value equals 10000, so the reference
# line belongs at 10000 (as in the full-history plot), not at 1.0
plt.axhline(y=10000)
plt.xlim("2020-03-01", end_date)
plt.show()

In [None]:
# Plot pairwise relationships in the dataset

sns.pairplot(df_scaled[cols], dropna=True)
plt.show()

In [None]:
# Compute the daily returns of the ETFs

returns_d = np.log(df/df.shift(1))
returns_d.hist(figsize=(16,9), bins=100)
plt.show()

### Breaking Down ARK ETF Holdings

In [None]:
# Load the file path
database = env.database

#### ARKK

In [None]:
# Load the fund holding for ARKK
df_arkk_holdings = pd.read_csv(f"{database}/arkk_holdings.csv")

# Print the number of holdings
print("Number of Holdings:", df_arkk_holdings.shape[0])

In [None]:
# Print the top 10 holdings
df_arkk_holdings.head(10)

In [None]:
# Print the concise information
df_arkk_holdings.info()

**Takeaway**
1. Compared with the information from last December, the number of holdings changed from 48 to 59, and the weights (%) of the top 10 holdings also vary.
2. Null values are observed in all columns.
3. The date is in object format, which may need to be changed to datetime.

In [None]:
# Drop the rows with NaN value(s) and correct the wrong ticker
df_arkk_holdings = explore.prepare_etf_holdings(df_arkk_holdings)
df_arkk_holdings.tail()

In [None]:
%%time

# Define start and end date
start_date = '2014-10-31' # When the ARKK commenced operation
end_date = date.today()

# Compute the returns of each stock since its inception date
explore.holdings_sum(df_arkk_holdings, start_date, end_date)

#### ARKW

In [None]:
# Load the file path
database = env.database

# Load the fund holdings for ARKW
df_arkw_holdings = pd.read_csv(f"{database}/arkw_holdings.csv")

# Print the number of holdings
print("Number of Holdings: ", df_arkw_holdings.shape[0])

In [None]:
# Drop the rows with NaN value(s) and correct the wrong tickers
df_arkw_holdings = explore.prepare_etf_holdings(df_arkw_holdings)
df_arkw_holdings.tail()

In [None]:
%%time

# Define start and end date
start_date = '2014-09-30' # When the ARKW commenced operation
end_date = date.today()

# Compute the returns of each stock since its inception date
explore.holdings_sum(df_arkw_holdings, start_date, end_date)

#### ARKG

In [None]:
# Load the fund holdings for ARKG
df_arkg_holdings = pd.read_csv(f"{database}/arkg_holdings.csv")

# Print the number of holdings
print("Number of Holdings:", df_arkg_holdings.shape[0])

In [None]:
# Drop the rows with NaN value(s) and correct the wrong ticker
df_arkg_holdings = explore.prepare_etf_holdings(df_arkg_holdings)
df_arkg_holdings.tail()

In [None]:
%%time

# Define start and end date
start_date = '2014-10-31' # When the ARKG commenced operation
end_date = date.today()

# Compute the returns of each stock since its inception date
explore.holdings_sum(df_arkg_holdings, start_date, end_date)

## Compute Optimal Weights of ETFs
#### Vanguard Information Technology ETF (VGT)
- Seeks to track the performance of a benchmark index that measures the investment return of stocks in the information technology sector.
- 341 Holdings in total and inception date: 2004-01-26
- Top 10 Holdings: Apple, Microsoft, NVIDIA, Visa, Mastercard, Paypal, Intel, Adobe, Salesforce, Broadcom
- Expense ratio: 0.10%

#### Vanguard S&P 500 Growth Index Fund ETF Shares (VOOG)
- Invests in stocks in the S&P 500 Growth Index (233/233), composed of the growth companies in the S&P 500.
- 233 holdings in total and inception date: 2010-09-07
- Top 10 holdings(50.50%): Apple, Microsoft, Amazon, Alphabet, Facebook, Tesla, NVIDIA, Paypal, Netflix, Adobe
- Expense ratio: 0.10%

#### Vanguard Russell 1000 Growth ETF (VONG)
- Invest in stocks in the Russell 1000 Growth Index, a broadly diversified index predominantly made up of growth stocks of large U.S. Companies.
- 459 holdings and inception date: 2010-09-20
- Top 10 holdings(45.90%): Apple, Microsoft, Amazon, Alphabet, Facebook, Tesla, Visa, NVIDIA, Mastercard, PayPal
- Expense ratio: 0.08%

#### Invesco QQQ (QQQ)
- An exchange-traded fund that tracks the Nasdaq-100 index. The index includes the 100 largest non-financial companies listed on the Nasdaq based on market cap. 
- Rated the best-performing large-cap growth fund (1 of 327) based on total return over the past 15 years by Lipper, as of Dec 31, 2020.
- 103 holdings and inception date: 1999-03-10
- Top 10 holdings(51.12%): Apple, Microsoft, Amazon, Tesla, Facebook, Alphabet, Alphabet, NVIDIA, Paypal, Intel

### Summary of the Interested ETFs

In [None]:
# Summary table of the ETFs of interest: one row per ticker, one column per attribute

cols = ['full_name', 'benchmark', 'num_of_holdings', 'inception_date', 'expense_ratio(%)']

# One record per ETF, keyed by ticker (the dict order defines the row order).
# NOTE(review): the markdown summary above lists 103 holdings for QQQ while this
# table records 102 -- confirm which figure is intended.
etf_profiles = {
    'VGT': {'full_name': 'Vanguard Information Technology ETF',
            'benchmark': 'MSCI US IMI Info Technology 25/50',
            'num_of_holdings': 341,
            'inception_date': '2004-01-26',
            'expense_ratio(%)': 0.10},
    'VOOG': {'full_name': 'Vanguard S&P 500 Growth ETF',
             'benchmark': 'S&P 500 Growth Index',
             'num_of_holdings': 233,
             'inception_date': '2010-09-07',
             'expense_ratio(%)': 0.10},
    'VONG': {'full_name': 'Vanguard Russell 1000 Growth ETF',
             'benchmark': 'Russell 1000 Growth Index',
             'num_of_holdings': 459,
             'inception_date': '2010-09-20',
             'expense_ratio(%)': 0.08},
    'QQQ': {'full_name': 'Invesco QQQ',
            'benchmark': 'Nasdaq-100 Index',
            'num_of_holdings': 102,
            'inception_date': '1999-03-10',
            'expense_ratio(%)': 0.20},
    'BND': {'full_name': 'Vanguard Total Bond Market ETF',
            'benchmark': 'BloomBarc US Agg Float Adj Index',
            'num_of_holdings': 10074,
            'inception_date': '2007-04-03',
            'expense_ratio(%)': 0.035},
}

# Row labels of the dataframe
etfs = list(etf_profiles)

# Build the frame in a single step so that num_of_holdings and expense_ratio(%)
# get numeric dtypes; filling an empty frame row by row left every column as
# object dtype.
df_sum = pd.DataFrame(list(etf_profiles.values()), index=etfs, columns=cols)

df_sum

In [None]:
# Create a dataframe for the top 10 holdings for the stock ETFs as of 03/23/2021.
# Company names must be spelled identically in every column: the set
# intersections computed later compare the strings exactly, so the former
# 'PayPal' in VONG never matched 'Paypal' in the other ETFs.

df_top10 = pd.DataFrame({'VGT': ['Apple', 'Microsoft', 'NVIDIA', 'Visa', 'Mastercard', 
                                 'Paypal', 'Intel', 'Adobe', 'Salesforce', 'Broadcom'], 
                         'VOOG': ['Apple', 'Microsoft', 'Amazon', 'Alphabet', 'Facebook', 
                                  'Tesla', 'NVIDIA', 'Paypal', 'Netflix', 'Adobe'], 
                         'VONG': ['Apple', 'Microsoft', 'Amazon', 'Alphabet', 'Facebook', 
                                  'Tesla', 'Visa', 'NVIDIA', 'Mastercard', 'Paypal'], 
                         'QQQ': ['Apple', 'Microsoft', 'Amazon', 'Tesla', 'Facebook', 
                                 'Alphabet', 'Alphabet', 'NVIDIA', 'Paypal', 'Intel']})

df_top10

### Acquire Historical Data

In [None]:
etfs = ['VGT', 'VOOG', 'VONG', 'QQQ', 'BND']

# Set up the start and end dates

start_date = '2010-09-20'
end_date = date.today()

# Download the historical data of the interested stocks
df_etfs = acquire.acquire_stock_data(etfs, 'yahoo', start_date, end_date)

# Take a quick peek at the data
df_etfs.head()

In [None]:
# Keep only the rows from 2010-09-22 onwards.
# NOTE(review): the cut-off date is hard-coded; presumably the rows before it
# are the ones containing NaN values -- confirm against df_etfs.head() above.

df_etfs = df_etfs.loc["2010-09-22":]
df_etfs.head()

In [None]:
# Print the concise information
df_etfs.info()

In [None]:
# Growth of $1 invested in each ETF on 2010-09-22: every column is divided by
# its value on the first row of df_etfs

df_etfs.div(df_etfs.iloc[0], axis="columns").plot(figsize=(13, 7))
plt.show()

**Takeaways**
1. Apparently, these 5 ETFs can be divided into three groups based on the returns:
- Group 1: VGT and QQQ
- Group 2: VOOG and VONG
- Group 3: BND

In [None]:
# Turn the top 10 holdings of each of the 4 stock ETFs into a set
VGT, VOOG, VONG, QQQ = (set(df_top10[ticker]) for ticker in ('VGT', 'VOOG', 'VONG', 'QQQ'))

# Holdings shared by selected pairs of ETFs, each paired with its printed label
shared_holdings = [
    ("VONG & VOOG:", VONG & VOOG),
    ('VGT & VOOG', VGT & VOOG),
    ('QQQ & VOOG', QQQ & VOOG),
    ('VGT&QQQ', VGT & QQQ),
]

# Print each label followed by the common stocks of that pair
for label, common in shared_holdings:
    print(label)
    print(common)

In [None]:
# Enter the end date
# NOTE: this rebinds end_date from the datetime.date set in earlier cells to
# the string typed by the user

print("Enter the end date in yyyy-mm-dd:")
end_date = input()

data = df_etfs.loc[: end_date]
data.info()

In [None]:
# Growth of $1 invested on 2010-09-22, up to the end date entered above
# (e.g. 2019-12-31): every column is divided by its value on the first row

data.div(data.iloc[0], axis="columns").plot(figsize=(13, 7))
plt.show()

In [None]:
optimal_weights.possible_optimal_weight(data, etfs)