In [8]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("./Companies_Analyzed.csv")

# Extract the CIK column
cik_column = df['CIK']

# Initialize an empty set to store unique CIKs
unique_ciks = set()

# Iterate through the CIK column
for entry in cik_column:
    # Split the entry by semicolon and add each CIK to the set
    if pd.notna(entry):  # Check if the entry is not NaN
        ciks = entry.split(';')
        unique_ciks.update(ciks)

# Convert the set to a list (optional)
unique_ciks_list = list(unique_ciks)
print(unique_ciks_list)
unique_ciks_list = [ x for x in unique_ciks_list if x.isdigit() ]
unique_ciks_list = [ int(x) for x in unique_ciks_list ]


['1368265', '1049502', '1017303', '1448431', '799167', '1144980', '1434754', '884624', '894627', '732026', '1001385', '1753368', '944148', '1422892', '946647', ' 0001757898', '1417398', '1299969', ' 0001042618', '0001220751', '0001365129', '1286613', '0001505065', '882796', '1035092', '0001118404', '1468328', '0000944492', '1164771', '82811', '0000046675', '1078075', '1128361', '943452', '0001434621', '0001001171', '1230992', '850209', ' 0000103220', '67347', '944809', '928340', '1094038', '1129155', '1370416', '896156', ' 0000885978', '1922641', '1173420', '61004', '350868', '923601', '1010086', ' 0000816951', '1262823', '920371', '1185348', '9984', '861842', '1104657', ' 0001058371', ' 0001291000', '896264', '18349', '1116942', '29332', ' 0000896494', '1465128', '0000776734', ' 0001049130', '28917', '1055726', '39020', '824142', '750686', '1379041', '0001005284', '1130713', '1030469', '0000857121', '1369085', ' 0001493225', ' 0000912752', '1385280', '916365', '828944', '1376339', '00

In [9]:
import pandas as pd

# Load the lookup table
lookup_df = pd.read_csv("C:/Users/feldberg.dartmouth/Downloads/rawdata/cik_name_lookup.csv", sep='\t')
lookup_df = lookup_df.drop_duplicates(subset=['cw_id'])
lookup_df.drop(columns=['row_id', 'edgar_name', "match_name"], inplace = True)
lookup_df.dropna(subset=['cik', 'cw_id'], inplace=True)
lookup_df = lookup_df.astype('int32')

# Load the company relations data
relations_df = pd.read_csv('C:/Users/feldberg.dartmouth/Downloads/rawdata/company_relations.csv', sep='\t')
relations_df.drop(columns=['relation_id', 'relation_type', 'relation_origin', 'origin_id'], inplace = True)
relations_df.dropna(subset=['source_cw_id', 'target_cw_id'], inplace=True)
relations_df = relations_df.astype('int32')

# Create a dictionary for cw_id to CIK mapping
cw_to_cik = dict(zip(lookup_df['cw_id'], lookup_df['cik']))

# Replace source_cw_id and target_cw_id with corresponding CIK values
relations_df['source_cik'] = relations_df['source_cw_id'].map(cw_to_cik)
relations_df['target_cik'] = relations_df['target_cw_id'].map(cw_to_cik)

# Drop the original cw_id columns if no longer needed
relations_df.drop(columns=['source_cw_id', 'target_cw_id'], inplace=True)

filtered_relations_df = relations_df[
    relations_df['source_cik'].isin(unique_ciks_list) | 
    relations_df['target_cik'].isin(unique_ciks_list)
]

# Save the updated dataframe to a new CSV file
filtered_relations_df.to_csv('C:/Users/feldberg.dartmouth/Downloads/rawdata/updated_company_relations_bw.csv', index=False)

In [10]:
import pandas as pd

# Load the lookup table
lookup_df = pd.read_csv("C:/Users/feldberg.dartmouth/Downloads/rawdata/CIK_CompanyID.csv")
lookup_df = lookup_df.drop_duplicates(subset=['BoardID', 'CIKCode'])
lookup_df.drop(columns=['BoardName'], inplace = True)
lookup_df.dropna(subset=['CIKCode', 'BoardID'], inplace=True)
lookup_df = lookup_df.astype('int32')


# Load the company relations data
relations_df = pd.read_csv('C:/Users/feldberg.dartmouth/Downloads/rawdata/Board_Listed_Associations.csv')
relations_df = relations_df[relations_df.columns.intersection(['OverlapYearStart', 'OverlapYearEnd', 'BoardID', 'CompanyID'])]
relations_df.dropna(subset=['OverlapYearStart', 'OverlapYearEnd', 'BoardID', 'CompanyID'], inplace=True)
relations_df.replace('Curr', 2024, inplace=True)
relations_df = relations_df.astype('int32')

# Create a dictionary for cw_id to CIK mapping
BoardID_to_cik = dict(zip(lookup_df['BoardID'], lookup_df['CIKCode']))

# Replace source_cw_id and target_cw_id with corresponding CIK values
relations_df['source_cik'] = relations_df['BoardID'].map(BoardID_to_cik)
relations_df['target_cik'] = relations_df['CompanyID'].map(BoardID_to_cik)

# Drop the original cw_id columns if no longer needed
relations_df.drop(columns=['BoardID', 'CompanyID'], inplace=True)

# Filter to only data of interest
filtered_relations_df = relations_df[
    relations_df['source_cik'].isin(unique_ciks_list) | 
    relations_df['target_cik'].isin(unique_ciks_list)
]

# Save the updated dataframe to a new CSV file
filtered_relations_df.to_csv('C:/Users/feldberg.dartmouth/Downloads/rawdata/updated_company_relations_bx.csv', index=False)



In [11]:
df_bw = pd.read_csv('C:/Users/feldberg.dartmouth/Downloads/rawdata/updated_company_relations_bw.csv')
df_bx = pd.read_csv('C:/Users/feldberg.dartmouth/Downloads/rawdata/updated_company_relations_bx.csv')

def expand_years(row):
    years = range(row['OverlapYearStart'], row['OverlapYearEnd'] + 1)
    return pd.DataFrame({
        'year': years,
        'source_cik': row['source_cik'],
        'target_cik': row['target_cik']
    })

# Drop rows with missing values in the relevant columns
df_bx_cleaned = df_bx.dropna(subset=['source_cik', 'target_cik'])
df_bw_cleaned = df_bw.dropna(subset=['source_cik', 'target_cik'])

# Ensure all relevant columns are integers
df_bx_cleaned['source_cik'] = df_bx_cleaned['source_cik'].astype(int)
df_bx_cleaned['target_cik'] = df_bx_cleaned['target_cik'].astype(int)

df_bw_cleaned['source_cik'] = df_bw_cleaned['source_cik'].astype(int)
df_bw_cleaned['target_cik'] = df_bw_cleaned['target_cik'].astype(int)

# Expand df_bx_cleaned
expanded_bx_cleaned = pd.concat([expand_years(row) for _, row in df_bx_cleaned.iterrows()], ignore_index=True)
expanded_bx_cleaned['relationship'] = 'subsidiary'

# Add relationship column to df_bw_cleaned
df_bw_cleaned['relationship'] = 'interlock'

# Combine the dataframes
combined_df_cleaned = pd.concat([df_bw_cleaned[['year', 'source_cik', 'target_cik', 'relationship']], expanded_bx_cleaned], ignore_index=True)
combined_df_cleaned = combined_df_cleaned[(combined_df_cleaned['year'] >= 2010) & (combined_df_cleaned['year'] <= 2023)]
combined_df_cleaned = combined_df_cleaned.drop_duplicates()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bx_cleaned['source_cik'] = df_bx_cleaned['source_cik'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bx_cleaned['target_cik'] = df_bx_cleaned['target_cik'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bw_cleaned['source_cik'] = df_bw_cleaned['source_cik'].astype(

In [12]:
import networkx as nx
import pickle

# Create a dictionary to store the graphs for each year
graphs = {}

# Create graphs for each year
for year in combined_df_cleaned['year'].unique():
    G = nx.MultiDiGraph()
    year_data = combined_df_cleaned[combined_df_cleaned['year'] == year]
    
    for _, row in year_data.iterrows():
        G.add_edge(row['source_cik'], row['target_cik'], relationship=row['relationship'])
    
    graphs[year] = G

# Define the base path for saving the pickle files
base_path = './Pickle Graphs3/'  # Change this to your desired path

# Iterate through each year and graph in the dictionary
for year, graph in graphs.items():
    # Define the filename for each graph
    filename = f'{base_path}/graph_{year}.pkl'
    
    # Save the graph to a pickle file
    with open(filename, 'wb') as f:
        pickle.dump(graph, f)

    print(f'Graph for year {year} saved to {filename}')


Graph for year 2010 saved to ./Pickle Graphs3//graph_2010.pkl
Graph for year 2011 saved to ./Pickle Graphs3//graph_2011.pkl
Graph for year 2012 saved to ./Pickle Graphs3//graph_2012.pkl
Graph for year 2013 saved to ./Pickle Graphs3//graph_2013.pkl
Graph for year 2014 saved to ./Pickle Graphs3//graph_2014.pkl
Graph for year 2015 saved to ./Pickle Graphs3//graph_2015.pkl
Graph for year 2016 saved to ./Pickle Graphs3//graph_2016.pkl
Graph for year 2017 saved to ./Pickle Graphs3//graph_2017.pkl
Graph for year 2018 saved to ./Pickle Graphs3//graph_2018.pkl
Graph for year 2019 saved to ./Pickle Graphs3//graph_2019.pkl
Graph for year 2020 saved to ./Pickle Graphs3//graph_2020.pkl
Graph for year 2022 saved to ./Pickle Graphs3//graph_2022.pkl
Graph for year 2023 saved to ./Pickle Graphs3//graph_2023.pkl
Graph for year 2021 saved to ./Pickle Graphs3//graph_2021.pkl
