In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import missingno as msno
import seaborn as sns
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter('ignore') # NOTE(review): silences every warning for the rest of the session, deprecation notices included
from pandarallel import pandarallel
 
# Initialization: the later groupby(...).parallel_apply(...) call relies on pandarallel being initialized here.
pandarallel.initialize()

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
# Load the fund holdings (one row per fund/permno/quarter) and report dtypes and memory use.
# The CSV's saved index arrives as an 'Unnamed: 0' column; it is dropped further down.
mutual_fund_holding = pd.read_csv('mutual_fund_holding_noDrops.csv')
mutual_fund_holding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53285149 entries, 0 to 53285148
Data columns (total 7 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Unnamed: 0          int64  
 1   wficn               float64
 2   permno              int64  
 3   prim_prospectus_bm  object 
 4   quarter             object 
 5   dollar_holdings     float64
 6   shares              float64
dtypes: float64(3), int64(2), object(2)
memory usage: 2.8+ GB


In [3]:
# Order rows by fund and then by quarter label, so each fund's quarters appear in sequence.
mutual_fund_holding = mutual_fund_holding.sort_values(by=['wficn', 'quarter'])

In [4]:
# Distinct quarter labels present in the holdings (order of first appearance; sorted below).
available_quarters = mutual_fund_holding['quarter'].unique()
def sort_quarters(quarters):
    """Return the quarter labels as a list in chronological order.

    Each label is read as a four-digit year, one separator character, and a
    single quarter digit (e.g. '1990Q3'). Labels are ordered by year first and
    by the quarter digit second.
    """
    def year_then_quarter(label):
        # label[:4] is the year, label[5] the quarter digit after the separator.
        return int(label[:4]), int(label[5])

    ordered = list(quarters)
    ordered.sort(key=year_then_quarter)
    return ordered
    
# Replace the array of labels with a chronologically sorted Python list (get_past_h_quarters uses list.index on it).
available_quarters = sort_quarters(available_quarters)

def get_past_h_quarters(available_quarters, quarter_of_interest, h):
    """Return up to `h` quarters that immediately precede `quarter_of_interest`.

    `available_quarters` is searched with `.index()`, so it must be an ordered
    list. The quarter of interest itself is never part of the result. Fewer
    than `h` quarters come back when the list does not reach that far, and an
    empty list comes back when the quarter is not in `available_quarters`.
    """
    try:
        position = available_quarters.index(quarter_of_interest)
    except ValueError:
        # Unknown quarter: there is no history to return.
        return []
    else:
        # Clamp the window start so it never runs off the front of the list.
        first = position - h
        if first < 0:
            first = 0
        return available_quarters[first:position]

In [5]:
def calculate_metrics(group, h_values=(3, 7, 11, 15, 19), market_cap=None, available_quarters=None):
    """Compute the per-quarter "percent within" metrics for one fund.

    For every quarter in which the fund reports holdings, and for every
    look-back length ``h`` in ``h_values``:

    * ``percent_within_{h}`` is the number of distinct permnos held in the
      quarter, divided by the number of distinct permnos the fund held in the
      quarter or in any of the ``h`` preceding calendar quarters that are also
      "available" in the quarter. A permno counts as available if the fund
      holds it in the quarter or if ``market_cap`` lists it for the quarter.
    * ``if_past_quarter_missing_{h}`` is 1 when the fund reports holdings in
      fewer than ``h + 1`` quarters of that window, else 0.

    Parameters
    ----------
    group : pandas.DataFrame
        Holdings of a single fund with columns 'wficn', 'quarter', 'permno'.
    h_values : iterable of int
        Look-back lengths, in quarters.
    market_cap : pandas.DataFrame, optional
        Frame with 'quarter' and 'permno' columns. When omitted it is read
        from 'market_cap.pickle', as before.
    available_quarters : sequence of str, optional
        Chronologically ordered calendar of quarter labels. Defaults to
        1980Q1 through 2020Q4.

    Returns
    -------
    pandas.DataFrame
        One row per quarter of the fund, with columns 'wficn', 'quarter' and
        the two metrics for each ``h``. Empty when the fund has no quarters.
    """
    # Kept local, as is the helper below: presumably so the function is
    # self-contained when it is shipped to pandarallel worker processes.
    import pandas as pd

    def get_past_h_quarters(available_quarters, quarter_of_interest, h):
        # Up to h quarters immediately before quarter_of_interest; [] if it is unknown.
        try:
            index_of_interest = available_quarters.index(quarter_of_interest)
        except ValueError:
            return []
        start_index = max(0, index_of_interest - h)
        return available_quarters[start_index:index_of_interest]

    if market_cap is None:
        # NOTE(review): without an explicit market_cap this file is re-read on
        # every call (once per fund). Pickle files can execute code on load,
        # so only point this at a trusted file.
        market_cap = pd.read_pickle('market_cap.pickle')
    if available_quarters is None:
        available_quarters = [f"{year}Q{quarter}" for year in range(1980, 2021) for quarter in range(1, 5)]
    else:
        # .index() and slicing below need a real list.
        available_quarters = list(available_quarters)

    results = []

    # Quarters in order of first appearance (the caller's row order decides
    # the output order). Rows without a quarter label cannot be placed in a
    # window, and previously ended in a 0/0 division, so they are skipped.
    fund_quarters = group['quarter'].dropna().unique()
    if len(fund_quarters) == 0:
        return pd.DataFrame(results)

    fund_id = group['wficn'].iloc[0]

    # Build the per-quarter permno sets once instead of re-filtering the
    # frames for every (quarter, h) combination.
    held_by_quarter = {
        label: set(permnos.unique())
        for label, permnos in group.groupby('quarter', sort=False)['permno']
    }
    listed = market_cap[market_cap['quarter'].isin(fund_quarters)]
    listed_by_quarter = {
        label: set(permnos.unique())
        for label, permnos in listed.groupby('quarter', sort=False)['permno']
    }

    for quarter in fund_quarters:
        held_now = held_by_quarter[quarter]
        # Everything the fund holds now counts as available even if
        # market_cap does not list it for this quarter.
        available_now = held_now | listed_by_quarter.get(quarter, set())

        quarter_results = {'wficn': fund_id, 'quarter': quarter}

        for h in h_values:
            window = [quarter] + get_past_h_quarters(available_quarters, quarter, h)
            reported = [label for label in window if label in held_by_quarter]

            held_in_window = set()
            for label in reported:
                held_in_window |= held_by_quarter[label]

            # Past holdings only count if they are still available today.
            # held_now is always a subset, so the denominator is never zero.
            still_available = held_in_window & available_now

            quarter_results[f'percent_within_{h}'] = len(held_now) / len(still_available)
            quarter_results[f'if_past_quarter_missing_{h}'] = 1 if len(reported) < h + 1 else 0

        results.append(quarter_results)

    return pd.DataFrame(results)

In [6]:
# Run calculate_metrics once per fund (one group per wficn), using pandarallel's parallel_apply.
percent_within = mutual_fund_holding.groupby('wficn').parallel_apply(calculate_metrics)

In [7]:
# Inspect the result: indexed by wficn plus the row number within each fund's output, with wficn also present as a column.
percent_within

Unnamed: 0_level_0,Unnamed: 1_level_0,wficn,quarter,percent_within_3,if_past_quarter_missing_3,percent_within_7,if_past_quarter_missing_7,percent_within_11,if_past_quarter_missing_11,percent_within_15,if_past_quarter_missing_15,percent_within_19,if_past_quarter_missing_19
wficn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100001.0,0,100001.0,1990Q3,1.000000,1,1.000000,1,1.000000,1,1.000000,1,1.000000,1
100001.0,1,100001.0,1990Q4,1.000000,1,1.000000,1,1.000000,1,1.000000,1,1.000000,1
100001.0,2,100001.0,1991Q1,0.982759,1,0.982759,1,0.982759,1,0.982759,1,0.982759,1
100001.0,3,100001.0,1991Q2,0.950000,0,0.950000,1,0.950000,1,0.950000,1,0.950000,1
100001.0,4,100001.0,1991Q3,0.915254,0,0.915254,1,0.915254,1,0.915254,1,0.915254,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
605153.0,4,605153.0,2019Q3,0.909271,0,0.874952,1,0.874952,1,0.874952,1,0.874952,1
605153.0,5,605153.0,2019Q4,0.615907,0,0.597222,1,0.597222,1,0.597222,1,0.597222,1
605153.0,6,605153.0,2020Q1,0.625335,0,0.606083,1,0.606083,1,0.606083,1,0.606083,1
605153.0,7,605153.0,2020Q2,0.939383,0,0.901054,0,0.901054,1,0.901054,1,0.901054,1


In [10]:
# wficn exists both as an index level and as a column; drop the column copy, then turn the
# index levels into ordinary columns ('wficn' and 'level_1').
# NOTE(review): not idempotent. Re-running this cell drops the restored 'wficn' column.
percent_within = percent_within.drop(columns=['wficn']).reset_index()

In [11]:
# Persist the per-fund, per-quarter percent_within table.
percent_within.to_pickle('fund_percent_within.pickle')

In [13]:
# Summary statistics of the numeric columns, rounded to two decimals.
percent_within.describe().round(2)

Unnamed: 0,wficn,level_1,percent_within_3,if_past_quarter_missing_3,percent_within_7,if_past_quarter_missing_7,percent_within_11,if_past_quarter_missing_11,percent_within_15,if_past_quarter_missing_15,percent_within_19,if_past_quarter_missing_19
count,425646.0,425646.0,425646.0,425646.0,425646.0,425646.0,425646.0,425646.0,425646.0,425646.0,425646.0,425646.0
mean,279737.87,36.08,0.76,0.09,0.64,0.19,0.58,0.28,0.54,0.36,0.51,0.42
std,191872.47,31.33,0.17,0.29,0.21,0.39,0.23,0.45,0.24,0.48,0.24,0.49
min,100001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,103151.0,11.0,0.65,0.0,0.49,0.0,0.4,0.0,0.35,0.0,0.32,0.0
50%,200297.0,27.0,0.78,0.0,0.63,0.0,0.55,0.0,0.5,0.0,0.47,0.0
75%,500334.75,53.0,0.9,0.0,0.8,0.0,0.75,1.0,0.71,1.0,0.68,1.0
max,605153.0,162.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
# Drop the index column that came in with the CSV. errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError once the column is gone.
mutual_fund_holding = mutual_fund_holding.drop(columns='Unnamed: 0', errors='ignore')

In [19]:
# Preview the holdings table after removing the 'Unnamed: 0' column.
mutual_fund_holding

Unnamed: 0,wficn,permno,prim_prospectus_bm,quarter,dollar_holdings,shares
498786,100001.0,20482,,1990Q3,6.141531e+05,140000.0
2672471,100001.0,59176,,1990Q3,1.644114e+05,30000.0
2794338,100001.0,15667,,1990Q3,1.410000e+06,120000.0
2826690,100001.0,65859,,1990Q3,3.050000e+05,20000.0
3371917,100001.0,27051,,1990Q3,2.012500e+05,10000.0
...,...,...,...,...,...,...
53234805,605153.0,12391,,2020Q3,7.162600e+02,118.0
53237780,605153.0,14642,,2020Q3,3.444300e+02,43.0
53260447,605153.0,15597,,2020Q3,6.900000e+00,1.0
53264153,605153.0,16454,,2020Q3,3.652500e+02,25.0


In [21]:
# Keep only holdings rows that have a prospectus benchmark, ordered by fund and quarter.
mutual_fund_holding_bm = mutual_fund_holding[mutual_fund_holding.prim_prospectus_bm.notna()].sort_values(['wficn','quarter'])

In [23]:
# Quarters that have benchmark information, in order of first appearance (not chronological).
mutual_fund_holding_bm.quarter.unique()

array(['2003Q2', '2003Q3', '2003Q4', '2004Q1', '2004Q2', '2004Q3',
       '2004Q4', '2005Q1', '2005Q2', '2005Q3', '2005Q4', '2006Q1',
       '2006Q2', '2006Q3', '2006Q4', '2007Q1', '2007Q2', '2007Q3',
       '2007Q4', '2008Q1', '2008Q2', '2008Q3', '2008Q4', '2009Q1',
       '2009Q2', '2009Q3', '2009Q4', '2010Q1', '2010Q2', '2010Q3',
       '2010Q4', '2011Q1', '2011Q2', '2011Q3', '2011Q4', '2012Q1',
       '2012Q2', '2012Q3', '2012Q4', '2013Q1', '2013Q2', '2013Q3',
       '2013Q4', '2014Q1', '2014Q2', '2014Q3', '2014Q4', '2015Q1',
       '2015Q2', '2015Q3', '2015Q4', '2016Q1', '2016Q2', '2016Q3',
       '2016Q4', '2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1',
       '2018Q2', '2018Q3', '2018Q4', '2019Q1', '2019Q2', '2019Q3',
       '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2000Q2', '2000Q3',
       '2000Q4', '2001Q1', '2001Q2', '2001Q3', '2001Q4', '2002Q1',
       '2002Q2', '2002Q3', '2002Q4', '2003Q1', '1999Q4', '2000Q1',
       '1999Q3', '1997Q4', '1998Q4', '1999Q1', '1999Q2', '1994

In [26]:
# Group by 'wficn' and 'quarter', and aggregate 'permno' into a set
# Count the distinct permnos each fund holds in each quarter.
# nunique() is the built-in equivalent of len(set(x)) here (permno is int64, so there are
# no missing values to treat differently), and reset_index(name=...) names the count
# column directly, so no in-place rename is needed.
grouped_permnos = (
    mutual_fund_holding_bm
    .groupby(['wficn', 'quarter'])['permno']
    .nunique()
    .reset_index(name='permnos_num')
)

In [28]:
# Check which columns the benchmark subset carries.
mutual_fund_holding_bm.columns

Index(['wficn', 'permno', 'prim_prospectus_bm', 'quarter', 'dollar_holdings',
       'shares'],
      dtype='object')

In [31]:
# One row per distinct (fund, benchmark, quarter) combination found in the holdings.
# NOTE(review): if a fund ever has two benchmarks in one quarter, the merge on ['wficn', 'quarter'] below duplicates that fund-quarter. Worth confirming this cannot happen.
bm = mutual_fund_holding_bm[['wficn', 'prim_prospectus_bm', 'quarter']].drop_duplicates()

In [32]:
# Preview the fund / benchmark / quarter mapping.
bm

Unnamed: 0,wficn,prim_prospectus_bm,quarter
55170,100001.0,MSCI ACWI NR USD,2003Q2
104220,100001.0,MSCI ACWI NR USD,2003Q3
104672,100001.0,MSCI ACWI NR USD,2003Q4
105122,100001.0,MSCI ACWI NR USD,2004Q1
105499,100001.0,MSCI ACWI NR USD,2004Q2
...,...,...,...
5135821,605146.0,S&P 500 TR USD,2019Q3
4458713,605146.0,S&P 500 TR USD,2019Q4
9559791,605146.0,S&P 500 TR USD,2020Q1
13394578,605146.0,S&P 500 TR USD,2020Q2


In [34]:
# Attach the benchmark name to each fund-quarter count (default inner join on wficn and quarter).
# NOTE(review): not idempotent. Re-running merges bm again, and pandas would then suffix the clashing benchmark columns.
grouped_permnos = grouped_permnos.merge(bm,on=['wficn','quarter'])

In [35]:
# Preview: distinct permno count and benchmark per fund-quarter.
grouped_permnos

Unnamed: 0,wficn,quarter,permnos_num,prim_prospectus_bm
0,100001.0,2003Q2,83,MSCI ACWI NR USD
1,100001.0,2003Q3,90,MSCI ACWI NR USD
2,100001.0,2003Q4,92,MSCI ACWI NR USD
3,100001.0,2004Q1,110,MSCI ACWI NR USD
4,100001.0,2004Q2,121,MSCI ACWI NR USD
...,...,...,...,...
145452,605146.0,2019Q3,23,S&P 500 TR USD
145453,605146.0,2019Q4,23,S&P 500 TR USD
145454,605146.0,2020Q1,22,S&P 500 TR USD
145455,605146.0,2020Q2,9,S&P 500 TR USD


In [39]:
def compute_metrics_for_benchmark(benchmark_df, market_data, h_values,available_quarters):
    """Count, per benchmark and quarter, the permnos in the benchmark's recent holdings universe.

    For each benchmark in ``benchmark_df`` and each quarter in which that
    benchmark appears, and for each look-back length ``h``:

    ``permnos_bm_h{h}`` is the number of distinct permnos that (a) appear
    under the benchmark in the quarter or in any of the ``h`` preceding
    quarters of ``available_quarters``, and (b) are "available" in the quarter.
    A permno is available if it appears under the benchmark in the quarter or
    if ``market_data`` lists it for the quarter.

    Parameters
    ----------
    benchmark_df : pandas.DataFrame
        Holdings with columns 'prim_prospectus_bm', 'quarter', 'permno'.
    market_data : pandas.DataFrame
        Frame with columns 'quarter' and 'permno'.
    h_values : iterable of int
        Look-back lengths, in quarters.
    available_quarters : sequence of str
        Chronologically ordered quarter labels used to locate past quarters.

    Returns
    -------
    pandas.DataFrame
        One row per (benchmark, quarter) with columns 'benchmark', 'quarter'
        and one 'permnos_bm_h{h}' column per look-back length.
    """
    # get_past_h_quarters uses list.index and the result is concatenated with a list.
    available_quarters = list(available_quarters)

    # The market universe depends only on the quarter, so build it once
    # instead of re-scanning market_data for every benchmark-quarter pair.
    market_permnos_by_quarter = {
        label: set(permnos.unique())
        for label, permnos in market_data.groupby('quarter', sort=False)['permno']
    }

    results = []

    for benchmark in benchmark_df['prim_prospectus_bm'].unique():
        benchmark_data = benchmark_df[benchmark_df['prim_prospectus_bm'] == benchmark]

        # Distinct permnos under this benchmark, per quarter, built once per benchmark.
        held_by_quarter = {
            label: set(permnos.unique())
            for label, permnos in benchmark_data.groupby('quarter', sort=False)['permno']
        }

        for quarter in benchmark_data['quarter'].unique():
            result_entry = {
                'benchmark': benchmark,
                'quarter': quarter
            }

            current_quarter_permnos = held_by_quarter.get(quarter, set())
            # Current holdings count as available even when market_data lacks them.
            available_permnos_current_set = current_quarter_permnos | market_permnos_by_quarter.get(quarter, set())

            for h in h_values:
                window = get_past_h_quarters(available_quarters, quarter, h) + [quarter]

                # Everything held under the benchmark anywhere in the window.
                window_permnos = set()
                for label in window:
                    window_permnos |= held_by_quarter.get(label, set())

                result_entry[f'permnos_bm_h{h}'] = len(available_permnos_current_set & window_permnos)

            results.append(result_entry)

    return pd.DataFrame(results)


In [40]:
# Build the benchmark-level permno counts. `available_quarters` is the sorted quarter
# list created earlier in the notebook (the former self-assignment here did nothing).
market_cap = pd.read_pickle('market_cap.pickle')  # must provide 'quarter' and 'permno' columns
h_values = [3,7,11,15,19]  # look-back lengths, in quarters
computed_df = compute_metrics_for_benchmark(mutual_fund_holding_bm, market_cap, h_values,available_quarters)

In [50]:
# Use the same benchmark column name as grouped_permnos so the two frames can be merged on it.
computed_df = computed_df.rename(columns={'benchmark':'prim_prospectus_bm'})

In [51]:
# Preview the per-benchmark, per-quarter permno counts for each look-back length.
computed_df

Unnamed: 0,prim_prospectus_bm,quarter,permnos_bm_h3,permnos_bm_h7,permnos_bm_h11,permnos_bm_h15,permnos_bm_h19
0,MSCI ACWI NR USD,2003Q2,1348,1545,1674,1737,1737
1,MSCI ACWI NR USD,2003Q3,1273,1587,1708,1815,1815
2,MSCI ACWI NR USD,2003Q4,1312,1564,1742,1843,1865
3,MSCI ACWI NR USD,2004Q1,1356,1617,1784,1890,1926
4,MSCI ACWI NR USD,2004Q2,1527,1775,1908,2008,2063
...,...,...,...,...,...,...,...
876,MSCI EAFE NR USD,2000Q4,867,882,882,882,882
877,MSCI EAFE NR USD,1999Q3,57,57,57,57,57
878,MSCI EAFE NR USD,1999Q2,38,38,38,38,38
879,MSCI EAFE NR USD,1998Q4,24,24,24,24,24


In [79]:
# Left join: keep every fund-quarter and attach its benchmark's counts; unmatched rows get NaN counts.
merged_df = grouped_permnos.merge(computed_df,on=['prim_prospectus_bm','quarter'],how='left') 

In [80]:
# Compute percent_benchmark_h for each h (3, 7, 11, 15, 19)
# percent_benchmark_h = distinct permnos the fund holds in the quarter (permnos_num)
# divided by the benchmark-level count permnos_bm_h{h}, for each look-back length h.
# Rows that found no benchmark counts in the left merge stay NaN.
for h in [3, 7, 11, 15, 19]:
    merged_df[f'percent_benchmark_{h}'] = merged_df['permnos_num']/merged_df[f'permnos_bm_h{h}']

In [81]:
# Keep only the identifiers and the percent_benchmark ratios; the raw counts are dropped.
percent_benchmark = merged_df[['wficn','quarter','prim_prospectus_bm','percent_benchmark_3','percent_benchmark_7','percent_benchmark_11','percent_benchmark_15','percent_benchmark_19']]

In [82]:
# First rows of the benchmark-relative ratios.
percent_benchmark.head()

Unnamed: 0,wficn,quarter,prim_prospectus_bm,percent_benchmark_3,percent_benchmark_7,percent_benchmark_11,percent_benchmark_15,percent_benchmark_19
0,100001.0,2003Q2,MSCI ACWI NR USD,0.061573,0.053722,0.049582,0.047784,0.047784
1,100001.0,2003Q3,MSCI ACWI NR USD,0.070699,0.056711,0.052693,0.049587,0.049587
2,100001.0,2003Q4,MSCI ACWI NR USD,0.070122,0.058824,0.052813,0.049919,0.04933
3,100001.0,2004Q1,MSCI ACWI NR USD,0.081121,0.068027,0.061659,0.058201,0.057113
4,100001.0,2004Q2,MSCI ACWI NR USD,0.07924,0.068169,0.063417,0.060259,0.058652


In [83]:
# First rows of the within-fund metrics, to check the merge keys (wficn, quarter) before joining.
percent_within.head()

Unnamed: 0,wficn,level_1,quarter,percent_within_3,if_past_quarter_missing_3,percent_within_7,if_past_quarter_missing_7,percent_within_11,if_past_quarter_missing_11,percent_within_15,if_past_quarter_missing_15,percent_within_19,if_past_quarter_missing_19
0,100001.0,0,1990Q3,1.0,1,1.0,1,1.0,1,1.0,1,1.0,1
1,100001.0,1,1990Q4,1.0,1,1.0,1,1.0,1,1.0,1,1.0,1
2,100001.0,2,1991Q1,0.982759,1,0.982759,1,0.982759,1,0.982759,1,0.982759,1
3,100001.0,3,1991Q2,0.95,0,0.95,1,0.95,1,0.95,1,0.95,1
4,100001.0,4,1991Q3,0.915254,0,0.915254,1,0.915254,1,0.915254,1,0.915254,1


In [84]:
# Left join on fund and quarter: every percent_within row is kept; fund-quarters without
# a benchmark row get NaN in the benchmark columns.
fund_sparsity = percent_within.merge(percent_benchmark,on=['wficn','quarter'],how='left')

In [85]:
# 'level_1' is the leftover per-fund row counter from reset_index; it carries no information.
# NOTE(review): re-running this cell raises KeyError once the column is gone.
fund_sparsity = fund_sparsity.drop(columns=['level_1'])

In [90]:
# Fix the output column order: identifiers, then the within-fund metrics, then the benchmark ratios.
fund_sparsity = fund_sparsity[['wficn', 'quarter','prim_prospectus_bm', 'percent_within_3', 'if_past_quarter_missing_3',
       'percent_within_7', 'if_past_quarter_missing_7', 'percent_within_11',
       'if_past_quarter_missing_11', 'percent_within_15',
       'if_past_quarter_missing_15', 'percent_within_19',
       'if_past_quarter_missing_19', 
       'percent_benchmark_3', 'percent_benchmark_7', 'percent_benchmark_11',
       'percent_benchmark_15', 'percent_benchmark_19']]

In [92]:
# Persist the combined fund sparsity table.
fund_sparsity.to_pickle('2024_04_11_fund_sparsity.pickle')