# Biotech Fund Analysis

In [50]:
import os
import pandas as pd

# Set the relative path to the 'Historical_Data_13F' folder
directory = './Historical_Data_13F_Updated/biotech_funds'

# List all CSV files in the 'Historical_Data_13F' directory
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]

# Check if we have any CSV files to process
if csv_files:
    # Build the full file path for the first CSV file
    first_csv_path = os.path.join(directory, csv_files[0])

    # Load the first CSV file into a DataFrame
    df = pd.read_csv(first_csv_path)

    # Print the first few rows of the DataFrame
    print(df.head())
else:
    print("No CSV files found in the directory.")


                          Stock Symbol Type  Shares Held  Market Value  \
0                    Seagen inc   SGEN  NaN   47269424.0  9.250154e+09   
1               Beigene ltd adr   BGNE  NaN   13301597.0  3.810109e+09   
2            Incyte corporation   INCY  NaN   31999398.0  2.871626e+09   
3    Acadia pharmaceuticals inc   ACAD  NaN   41904586.0  1.728564e+09   
4  Alexion pharmaceuticals inc.  ALXN1  NaN    8760794.0  1.002498e+09   

   % of Portfolio  Previous % of Portfolio  Ranking  Change in shares  \
0         36.9623                  35.4286      1.0               0.0   
1         15.2246                   9.7871      2.0         1524312.0   
2         11.4746                  14.6749      3.0             317.0   
3          6.9071                   8.9591      4.0               0.0   
4          4.0058                   4.3373      5.0               0.0   

    % Change Change Type  % Ownership Qtr first owned       sector  \
0   0.000000         NaN    26.407998         

## Transforming the Data

In [51]:
# Path to the directory containing CSV files
directory = './Historical_Data_13F_Updated/biotech_funds'

# Initialize an empty dictionary to store the ticker presence data
ticker_presence = {}

# Get all CSV files from the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# Iterate over the files and extract unique tickers
for csv_file in csv_files:
    # Strip the '.csv' extension for use as DataFrame column names
    clean_file_name = csv_file.replace('.csv', '')

    file_path = os.path.join(directory, csv_file)
    df = pd.read_csv(file_path)
    unique_tickers = df['Symbol'].unique()
    
    # Update the ticker presence for each file
    for ticker in unique_tickers:
        if ticker not in ticker_presence:
            # Initialize a new entry in the dictionary with the cleaned file names
            ticker_presence[ticker] = {clean_name: 0 for clean_name in [name.replace('.csv', '') for name in csv_files]}
        # Mark the presence of the ticker in the current file
        ticker_presence[ticker][clean_file_name] = 1

# Create a DataFrame from the ticker presence dictionary
one_hot_encoded_df = pd.DataFrame.from_dict(ticker_presence, orient='index')

# Reset the index to get tickers as a column instead of an index
one_hot_encoded_df.reset_index(inplace=True)
one_hot_encoded_df.rename(columns={'index': 'Ticker'}, inplace=True)

# Save the one_hot_encoded_df DataFrame to a CSV file
one_hot_encoded_df.to_csv('biotech_hedge_fund_encoded_data.csv', index=False)

# Print the first few rows of the one-hot encoded DataFrame to verify
print(one_hot_encoded_df.head())


  Ticker  baker_bros-2020_q3  baker_bros-2020_q2  baker_bros-2020_q1  \
0   SGEN                   1                   1                   1   
1   BGNE                   1                   1                   1   
2   INCY                   1                   1                   1   
3   ACAD                   1                   1                   1   
4  ALXN1                   1                   1                   1   

   baker_bros-2020_q4  orbimed-2019_q4  casdin-2021_q4  orbimed-2021_q1  \
0                   1                0               0                0   
1                   1                0               1                0   
2                   1                1               0                0   
3                   1                1               0                1   
4                   1                1               0                1   

   perceptive-2021_q4  orbimed-2021_q3  ...  perceptive-2020_q4  \
0                   0                1  ...      

## Exploratory Data Analysis

In [58]:
# df going forward is based on the most recent one hot encoded file
df = one_hot_encoded_df

# Initialize a list to keep track of stocks held by all funds
stocks_held_by_all_funds = []

# Iterate through each row of the DataFrame
for index, row in df.iterrows():
    # Assume that all funds hold the stock unless proven otherwise
    hold_by_all = True
    
    # Check if the stock is held by all funds in any of the quarters
    for fund in ['baker_bros', 'casdin_capital', 'orbimed', 'perceptive']:
        # If the stock is not held by this fund in any quarter, set hold_by_all to False
        if not any(row[fund + '-' + str(year) + '-' + quarter] == 1 for year in range(2020, 2021+1) for quarter in ['q1', 'q2', 'q3', 'q4']):
            hold_by_all = False
            break
    
    # If hold_by_all is still True, the stock is held by all funds
    if hold_by_all:
        stocks_held_by_all_funds.append(row['Ticker'])

# Display the list of stocks held by all funds
print(stocks_held_by_all_funds)


KeyError: 'baker_bros-2020-q1'