In [13]:
import pickle
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [14]:
# Load the dictionary from the file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load pickle files that this project produced itself.
with open('../Intermediate Data/belongings_by_year.pkl', 'rb') as f:
    belongings_by_year = pickle.load(f)

# Load in Financial Data: rename fyear -> year and keep only the columns used
# downstream. Built as one chain so no intermediate frame is mutated in place.
financial_data = (
    pd.read_csv("../Financial Data/CompustatCRSP_Annual.csv")
    .rename(columns={"fyear": "year"})
    .loc[:, ['year', 'bkvlps', 'epspx', 'cik']]
)

# One row per cik with the NAICS code split into its 2-, 3- and 4-character
# prefixes. naics is read as a string so the prefixes are plain string slices.
# Using .assign()/.drop() on the chain (instead of assigning columns onto a
# filtered frame and dropping in place) avoids pandas' SettingWithCopyWarning.
sector_codes = (
    pd.read_csv("../Financial Data/NAICS.csv", dtype={'naics': str})
    .loc[:, ['cik', 'naics']]
    .drop_duplicates(subset=['cik'])
    .assign(
        # Create new columns for sector, subsector, and industry group
        sector=lambda codes: codes['naics'].str[:2],
        subsector=lambda codes: codes['naics'].str[:3],
        industry_group=lambda codes: codes['naics'].str[:4],
    )
    .drop(columns=['naics'])
)
print(sector_codes)


           cik sector subsector industry_group
0         1750     42       423           4238
13        3197     33       333           3334
27     1808834     52       522           5222
41        2098     33       332           3322
55        2186     33       334           3342
...        ...    ...       ...            ...
20544  1337298     32       325           3251
20558  1896084     21       212           2122
20562  1505065     33       334           3345
20569  1536196     32       325           3254
20582  1922641     22       221           2211

[1499 rows x 4 columns]


In [17]:
# Flatten the nested {year: {cik: motifs}} structure into one record per
# (company, year). Each record is a NEW dict, so the mappings stored inside
# belongings_by_year are left untouched (the previous loop wrote the 'year'
# and 'cik' keys back into them).
# NOTE(review): assumes every `motifs` value is a dict-like mapping — confirm.
triadic_data = [
    # year is cast to float before merging on it — presumably to match the
    # dtype of financial_data['year']; verify against the CSV.
    {**motifs, 'year': float(year), 'cik': cik}
    for year, companies in belongings_by_year.items()
    for cik, motifs in companies.items()
]

triadic_df = pd.DataFrame(triadic_data)

# Merge the financial data with the triadic data (matching on company and
# year), then attach the per-company sector labels.
merged_df = pd.merge(financial_data, triadic_df, on=['cik', 'year'])
merged_df = pd.merge(merged_df, sector_codes, on=['cik'])

# Prefix the triad-type columns with "motif_" so they are easy to tell apart
# from the financial and sector columns.
TRIAD_NAMES = ["003", "012", "102", "021D", "021U", "021C", "111D", "111U", "030T", "030C", "201", "120D", "120U", "120C", "210", "300"]
rename_dict = {name: f"motif_{name}" for name in TRIAD_NAMES}
merged_df = merged_df.rename(columns=rename_dict)
merged_df.to_csv("../Intermediate Data/no_dummies_financial_triad_merged.csv")

# Create dummy variables for year only (the motif columns are left as they
# are); drop_first=True omits the first year so it acts as the baseline.
merged_df = pd.get_dummies(merged_df, columns=["year"], dtype=float, drop_first=True)

merged_df.to_csv("../Intermediate Data/financial_triad_merged.csv")

In [79]:

# Columns that are outcomes or identifiers rather than regressors.
OUTCOME_AND_ID_COLS = ['bkvlps', 'epspx', 'cik']

# sector / subsector / industry_group are string prefixes of the NAICS code.
# Left in X they make the design matrix object-dtype, which sm.OLS cannot fit,
# so they are excluded here. To add industry fixed effects, one-hot encode ONE
# of these levels explicitly (e.g. pd.get_dummies(..., columns=['sector'],
# dtype=float, drop_first=True)) rather than passing the raw labels.
NAICS_LABEL_COLS = ['sector', 'subsector', 'industry_group']

# Define the independent variables: every remaining column of merged_df
X = (
    merged_df
    .drop(columns=OUTCOME_AND_ID_COLS)
    # errors='ignore' keeps this cell working if merged_df was built without
    # the sector merge.
    .drop(columns=NAICS_LABEL_COLS, errors='ignore')
)
X = sm.add_constant(X)  # Add a constant term to the model

# Dependent variable: bkvlps
y_bkvlps = merged_df['bkvlps']

# Run the regression for bkvlps; missing='drop' discards rows with NaNs in
# either the outcome or the regressors before fitting.
model_bkvlps = sm.OLS(y_bkvlps, X, missing='drop').fit()

# Dependent variable: epspx
y_epspx = merged_df['epspx']

# Run the regression for epspx on the same design matrix
model_epspx = sm.OLS(y_epspx, X, missing='drop').fit()

In [80]:
from docx import Document
from docx.shared import Pt

def regression_to_word(models, model_names, file_name):
    """Write the text summary of each fitted model into one Word document.

    For every (model, name) pair a level-1 heading
    "Regression Results for <name>" is added, followed by the output of
    ``model.summary().as_text()``.

    Parameters
    ----------
    models : iterable
        Fitted model objects exposing ``summary().as_text()``.
    model_names : iterable of str
        Labels used in the headings, in the same order as ``models``.
    file_name : str
        Path the .docx file is saved to.

    Raises
    ------
    ValueError
        If ``models`` and ``model_names`` do not have the same length
        (previously the extra entries were silently dropped by ``zip``).
    """
    models = list(models)
    model_names = list(model_names)
    if len(models) != len(model_names):
        raise ValueError(
            f"models and model_names must have the same length "
            f"(got {len(models)} and {len(model_names)})"
        )

    doc = Document()

    for model, name in zip(models, model_names):
        doc.add_heading(f'Regression Results for {name}', level=1)
        # The summary is a fixed-width text table; write it in a monospaced
        # font so its columns stay aligned in Word.
        summary_run = doc.add_paragraph().add_run(model.summary().as_text())
        summary_run.font.name = 'Courier New'
        summary_run.font.size = Pt(8)

    doc.save(file_name)

# Pair each fitted regression (model_bkvlps, model_epspx) with the label that
# will appear in its report heading.
model_names = ['bkvlps', 'epspx']
models = [model_bkvlps, model_epspx]

# Write both regression summaries into a single Word document
regression_to_word(
    models=models,
    model_names=model_names,
    file_name='../Outputs/regression_results.docx',
)