# Venture Fund Analysis

In [32]:
import os
import pandas as pd

# Set the relative path to the 'Historical_Data_13F' folder
directory = './Historical_Data_13F_Updated/venture_funds'

# List all CSV files in the 'Historical_Data_13F' directory
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]

# Check if we have any CSV files to process
if csv_files:
    # Build the full file path for the first CSV file
    first_csv_path = os.path.join(directory, csv_files[0])

    # Load the first CSV file into a DataFrame
    df = pd.read_csv(first_csv_path)

    # Print the first few rows of the DataFrame
    print(df.head())
else:
    print("No CSV files found in the directory.")


                   Stock Symbol  Type  Shares Held  Market Value  \
0        Snowflake  inc.   SNOW   NaN     32221080    7387649000   
1  Uber technologies inc   UBER   NaN     28411000    1548684000   
2   Meta platforms  inc.     FB   NaN      3753400    1105489000   
3      Expedia group inc   EXPE   NaN      2513568     432635000   
4        Microsoft corp.   MSFT   NaN      1763135     415694000   

   % of Portfolio  Previous % of Portfolio  Ranking  Change in shares  \
0         52.8279                  24.7121      1.0          22729505   
1         11.0744                  13.4062      2.0                 0   
2          7.9052                   9.4861      3.0                 0   
3          3.0937                   4.9321      4.0          -1512617   
4          2.9726                   3.1653      5.0            225000   

     % Change Change Type % Ownership Qtr first owned                  sector  \
0  239.470320    addition   63.552426         Q3 2020  INFORMATION TECH

In [33]:
# Path to the directory containing CSV files
directory = './Historical_Data_13F_Updated/venture_funds'

# Initialize an empty dictionary to store the ticker presence data
ticker_presence = {}

# Get all CSV files from the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# Iterate over the files and extract unique tickers
for csv_file in csv_files:
    # Strip the '.csv' extension for use as DataFrame column names
    clean_file_name = csv_file.replace('.csv', '')

    file_path = os.path.join(directory, csv_file)
    df = pd.read_csv(file_path)
    unique_tickers = df['Symbol'].unique()
    
    # Update the ticker presence for each file
    for ticker in unique_tickers:
        if ticker not in ticker_presence:
            # Initialize a new entry in the dictionary with the cleaned file names
            ticker_presence[ticker] = {clean_name: 0 for clean_name in [name.replace('.csv', '') for name in csv_files]}
        # Mark the presence of the ticker in the current file
        ticker_presence[ticker][clean_file_name] = 1

# Create a DataFrame from the ticker presence dictionary
one_hot_encoded_df = pd.DataFrame.from_dict(ticker_presence, orient='index')

# Reset the index to get tickers as a column instead of an index
one_hot_encoded_df.reset_index(inplace=True)
one_hot_encoded_df.rename(columns={'index': 'Ticker'}, inplace=True)

# Save the one_hot_encoded_df DataFrame to a CSV file
one_hot_encoded_df.to_csv('venture_fund_encoded_data.csv', index=False)

# Print the first few rows of the one-hot encoded DataFrame to verify
print(one_hot_encoded_df.head())

  Ticker  altimeter-2021_q1  altimeter-2019_q4  coatue-2019_q3  \
0   SNOW                  1                  0               0   
1   UBER                  1                  1               1   
2     FB                  1                  1               1   
3   EXPE                  1                  1               0   
4   MSFT                  1                  1               1   

   altimeter-2021_q2  altimeter-2021_q3  coatue-2021_q4  coatue-2019_q4  \
0                  1                  1               1               0   
1                  1                  1               1               1   
2                  1                  1               1               1   
3                  1                  1               0               1   
4                  1                  1               1               1   

   altimeter-2019_q3  coatue-2021_q1  ...  tiger_global-2021_q1  \
0                  0               1  ...                     1   
1                 

## Isolating Tickers to Pull Updated Historical Data

This being a dynamic analysis with updating information each quarter, the holdings within the funds and the historical pricing data of each financial instrument changes. Because of this, we are going to pull a list of all the unique individual tickers and run them through whatever data provider, scraper, or API can give us historical pricing. Because I do not have access to a tool like Bloomberg for example, I am going to pull them from [Yahoo Finance](http://localhost:8888/lab/tree/Documents/Github/capital-markets/Yahoo%20Finance%20API%20Data%20Pull.ipynb), which you can access using the provided link.

Once doing so, we are going to update that information, place them into the Historical Data folder, which we will then access, clean and create a new dataframe inside of this project to access.

In [34]:

# Extract the 'Ticker' column into a new DataFrame
tickers_df = pd.DataFrame(one_hot_encoded_df['Ticker'])

# Display the first few rows to verify
print(tickers_df.head())

# If you want to save this to a new CSV file:
#tickers_df.to_csv('list_of_tickers.csv', index=False)


  Ticker
0   SNOW
1   UBER
2     FB
3   EXPE
4   MSFT
