In [6]:
import os
import pickle
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams


In [7]:
def load_pickle_files(folder_path):
    pickle_dict = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.pkl'):
            year = filename[-8:-4]
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'rb') as file:
                data = pickle.load(file)
            pickle_dict[year] = data
    return pickle_dict

In [8]:
def main(folder_path):
    graphs_by_year = load_pickle_files(folder_path)
    # Initialize an empty list to store the results
    results = []

    # Iterate through each graph in the dictionary
    for year, graph in graphs_by_year.items():
        # Compute centrality measures
        degree_centrality = nx.degree_centrality(graph)
        closeness_centrality = nx.closeness_centrality(graph)
        betweenness_centrality = nx.betweenness_centrality(graph)
        eigenvector_centrality = nx.eigenvector_centrality_numpy(graph)
        
        # Store the results for each node
        for node in graph.nodes():
            results.append({
                'Year': year,
                'Node': node,
                'Degree Centrality': degree_centrality[node],
                'Closeness Centrality': closeness_centrality[node],
                'Betweenness Centrality': betweenness_centrality[node],
                'Eigenvector Centrality': eigenvector_centrality[node]
            })

    # Convert the results list to a DataFrame
    centrality_df = pd.DataFrame(results)

    # Display the DataFrame
    print(centrality_df)

    return centrality_df
        


In [9]:
folder_path = '../Pickle Graphs/'
centrality_df = main(folder_path)

       Year     Node  Degree Centrality  Closeness Centrality  \
0      2010  1417398           0.008818              0.154232   
1      2010       20           0.000827              0.130905   
2      2010  1130464           0.003582              0.149992   
3      2010    12400           0.000276              0.127890   
4      2010   354707           0.002204              0.109409   
...     ...      ...                ...                   ...   
56653  2023  1906425           0.000218              0.000000   
56654  2023  1611282           0.000218              0.000000   
56655  2023  1972912           0.000218              0.000000   
56656  2023  1929288           0.000218              0.000000   
56657  2023  1945154           0.000218              0.000000   

       Betweenness Centrality  Eigenvector Centrality  
0                    0.018056            9.095141e-05  
1                    0.000000            1.769807e-05  
2                    0.004272            1.306589e-

In [15]:
centrality_df.rename(columns={'Node':'cik', 'Year':'year'}, inplace=True)
centrality_df['year']=centrality_df['year'].astype(int)

# Load in Financial Data
financial_data = pd.read_csv("../Financial Data/CompustatCRSP_Annual.csv")
financial_data.rename(columns={"fyear" : "year"}, inplace=True)
financial_data = financial_data[['year','bkvlps', 'epspx', 'cik']]

# Load in Sector Codes
sector_codes = pd.read_csv("../Financial Data/NAICS.csv", dtype={'naics':str})
sector_codes = sector_codes[['cik','naics']]
sector_codes = sector_codes.drop_duplicates(subset=['cik'])
# Create new columns for sector, subsector, and industry group
sector_codes['sector'] = sector_codes['naics'].str[:2]
sector_codes['subsector'] = sector_codes['naics'].str[:3]
sector_codes['industry_group'] = sector_codes['naics'].str[:4]
sector_codes.drop(['naics'],axis=1, inplace=True)
centrality_df['year']=centrality_df['year'].astype(int)

print(centrality_df)
# Merge the financial data with the centrality data
merged_df = pd.merge(financial_data, centrality_df, on=['cik', 'year'])
merged_df = pd.merge(merged_df, sector_codes, on=['cik'])

       year      cik  Degree Centrality  Closeness Centrality  \
0      2010  1417398           0.008818              0.154232   
1      2010       20           0.000827              0.130905   
2      2010  1130464           0.003582              0.149992   
3      2010    12400           0.000276              0.127890   
4      2010   354707           0.002204              0.109409   
...     ...      ...                ...                   ...   
56653  2023  1906425           0.000218              0.000000   
56654  2023  1611282           0.000218              0.000000   
56655  2023  1972912           0.000218              0.000000   
56656  2023  1929288           0.000218              0.000000   
56657  2023  1945154           0.000218              0.000000   

       Betweenness Centrality  Eigenvector Centrality  
0                    0.018056            9.095141e-05  
1                    0.000000            1.769807e-05  
2                    0.004272            1.306589e-

ValueError: You are trying to merge on float64 and object columns for key 'year'. If you wish to proceed you should use pd.concat