In [None]:
import pandas as pd
# !pip install wrds
import wrds
import os
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
# !pip install fuzzywuzzy
from fuzzywuzzy import fuzz, process

In [None]:
# Load the two raw inputs: the IPO list and the 2016-2021 SPAC list.
ipo_data, spac_data = (
    pd.read_csv(csv_path)
    for csv_path in ('inputs/IPO-age(9).csv', 'inputs/SPACs2016-2021.csv')
)

In [None]:
# Check whether CUSIP can serve as a unique key in each raw dataset.
ipo_unique_cusips, spac_unique_cusips = (
    frame['CUSIP'].is_unique for frame in (ipo_data, spac_data)
)

# Report the result for each dataset (True means no repeated CUSIP values).
print("Unique CUSIP in IPO dataset:", ipo_unique_cusips)
print("Unique CUSIP in SPAC dataset:", spac_unique_cusips)

In [None]:
# Keep only the last row for every CUSIP in each dataset.
# NOTE(review): pandas treats missing CUSIPs as equal to each other here, so
# rows without a CUSIP collapse into a single row -- confirm that is intended.
ipo_data_clean = ipo_data[~ipo_data.duplicated(subset='CUSIP', keep='last')]
spac_data_clean = spac_data[~spac_data.duplicated(subset='CUSIP', keep='last')]

In [None]:
# Left-join the SPAC list onto the IPO list by CUSIP, then flag a row as a SPAC
# when the SPAC-side column 'SPAC IPO ' (note the trailing space in the column
# name) is populated.
# NOTE(review): this assumes 'SPAC IPO ' is never missing for a genuine SPAC
# row -- verify against the SPAC input file.
merged_clean_data = (
    ipo_data_clean
    .merge(spac_data_clean, how='left', on='CUSIP', suffixes=('_IPO', '_SPAC'))
    .assign(IS_SPAC=lambda frame: frame['SPAC IPO '].notna())
)

In [None]:
# Partition the merged frame on the IS_SPAC flag.
spacs_only = merged_clean_data.loc[lambda frame: frame['IS_SPAC']]
ipos_not_spacs = merged_clean_data.loc[lambda frame: ~frame['IS_SPAC']]

# Preview the first rows of each partition.
print("SPACs Identified:", spacs_only.head(), sep="\n")
print("\nIPOs Not SPACs:", ipos_not_spacs.head(), sep="\n")

# Persist both partitions without the pandas index.
for partition, out_path in (
    (spacs_only, 'inputs/spacs_identified.csv'),
    (ipos_not_spacs, 'inputs/ipos_not_spacs.csv'),
):
    partition.to_csv(out_path, index=False)

In [None]:
# Re-load the two partitions that were written to disk in the previous step.
spacs_identified, ipos_identified = (
    pd.read_csv(csv_path)
    for csv_path in ('inputs/spacs_identified.csv', 'inputs/ipos_not_spacs.csv')
)

In [None]:
# Stack the SPAC and non-SPAC partitions back into one frame.
# Both inputs are the frames re-read from CSV (spacs_identified and
# ipos_identified), so the two halves share the same CSV-parsed dtypes; the
# previous version mixed the re-read SPAC frame with the in-memory
# ipos_not_spacs frame and left ipos_identified unused.
# SPAC rows are placed first, so the default keep='first' de-duplication
# retains the SPAC row when a CUSIP appears in both halves.
combined_ipo_spacs = (
    pd.concat([spacs_identified, ipos_identified], ignore_index=True)
    .drop_duplicates(subset='CUSIP')
    .reset_index(drop=True)
)

# Write without the pandas index, matching how the partition files are saved,
# so the file does not gain a meaningless unnamed leading column.
combined_ipo_spacs.to_csv('inputs/combined_ipo_spacs.csv', index=False)

In [None]:
# Keep the identifying columns needed downstream.
# CUSIP must be included: the later step reads this file back, renames
# 'CUSIP' to 'cusip' and merges on it, which fails with a KeyError when the
# column was never written.
selected_columns = combined_ipo_spacs[
    ['CUSIP', 'IPO name', 'ticker', 'Date of IPO', 'IS_SPAC']
]

# Write without the pandas index so no unnamed index column is stored in the
# file and carried into later merges.
selected_columns.to_csv('inputs/filtered_data.csv', index=False)

In [None]:
# Location of the Stata file once unpacked. The archive is extracted into a
# directory that is itself named "....dta", hence the repeated path segment.
ccm_youngfirms = "inputs/ccm_youngfirms_2000_2018.dta/ccm_youngfirms_2000_2018.dta"

# Unpack the archive only when the Stata file is not already on disk.
if not os.path.exists(ccm_youngfirms):
    zip_path = "inputs/ccm_youngfirms_2000_2018.zip"

    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path="inputs/ccm_youngfirms_2000_2018.dta")

In [None]:
# Read the unpacked Stata file into a DataFrame.
ccm_data = pd.read_stata(filepath_or_buffer=ccm_youngfirms)

In [None]:
# Column names to pull from the CCM frame. Each name appears exactly once:
# 'at' was previously listed twice, which made the column selection return a
# duplicated 'at' column.
relevant_vars = [
    'adrr', 'curuscn', 'scf', 'src', 'apdedate', 'fdate', 'pdate', 'acominc',
    'acox', 'at', 'am', 'ao', 'aoloch', 'aox', 'ap', 'caps', 'capx', 'cb',
    'ch', 'che', 'clg', 'cogs', 'csho', 'cusip', 'cshrt', 'cstk', 'dd', 'dlc',
    'dn', 'do', 'datadate', 'dt', 'ebit', 'ebitda', 'epspi', 'fca', 'ffo', 'gdwl',
    'gp', 'ib', 'intan', 'invt', 'lt', 'lct', 'ni', 'niadj', 'np', 'pi', 'ppegt',
    'pnrsho', 'ppent', 're', 'revt', 'sale', 'seq', 'tdc', 'teq', 'tstk', 'txt',
    'wcap', 'naicsh', 'mkvalt', 'acchg', 'accrt', 'amc', 'ano', 'arce', 'cshi',
    'depc', 'derhedgl',
]

In [None]:
# Select each requested variable once, in its original order, so a repeated
# name in relevant_vars cannot produce duplicate columns.
ccm_selected = ccm_data[list(dict.fromkeys(relevant_vars))]

# Keep a column only when at least 90% of its rows are non-missing.
threshold = 0.9 * len(ccm_selected)

# The 'cusip' column is always retained, whatever its coverage, because the
# later merge with the IPO/SPAC table joins on it.
keep_column = (ccm_selected.count() >= threshold) | (ccm_selected.columns == 'cusip')
ccm_data_filtered = ccm_selected.loc[:, keep_column]

In [None]:
# Load the filtered IPO/SPAC table and prepare it for the merge on 'cusip'.
# - 'Unnamed: 0' is the column pandas creates when a CSV was saved with its
#   index; it is dropped if present (and ignored if absent) so it does not
#   leak into the merged dataset.
# - The rename returns a new frame instead of mutating in place, so re-running
#   this cell always starts from the file contents.
# NOTE(review): the merge key only matches if the CUSIP values here use the
# same format and dtype as 'cusip' in the CCM data -- confirm.
combined_data = (
    pd.read_csv('inputs/filtered_data.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .rename(columns={'CUSIP': 'cusip'})
)

In [None]:
# Attach the filtered CCM variables to every IPO/SPAC row by cusip; rows with
# no CCM match are kept (left join) with missing values in the CCM columns.
ccm_ipo_spac_merge = combined_data.merge(
    ccm_data_filtered,
    how='left',
    on='cusip',
)