In [1]:
import pandas as pd
# !pip install wrds
import wrds
import os
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
# !pip install fuzzywuzzy
from fuzzywuzzy import fuzz, process

  from pandas.core import (


In [2]:
ipo_data = pd.read_csv('inputs/IPO-age(9).csv')
spac_data = pd.read_csv('inputs/SPACs2016-2021.csv')

In [3]:
ipo_unique_cusips = ipo_data['CUSIP'].is_unique
spac_unique_cusips = spac_data['CUSIP'].is_unique


print("Unique CUSIP in IPO dataset:", ipo_unique_cusips)
print("Unique CUSIP in SPAC dataset:", spac_unique_cusips)

Unique CUSIP in IPO dataset: False
Unique CUSIP in SPAC dataset: False


In [4]:
ipo_data_clean = ipo_data.drop_duplicates(subset='CUSIP', keep='last')
spac_data_clean = spac_data.drop_duplicates(subset='CUSIP', keep='last')

In [5]:
merged_clean_data = pd.merge(ipo_data_clean, spac_data_clean, on=['CUSIP'], how='left', suffixes=('_IPO', '_SPAC'))
merged_clean_data['IS_SPAC'] = merged_clean_data['SPAC IPO '].notna()

In [6]:
spacs_only = merged_clean_data[merged_clean_data['IS_SPAC']]
ipos_not_spacs = merged_clean_data[~merged_clean_data['IS_SPAC']]

# Display some entries from each dataset
print("SPACs Identified:")
print(spacs_only.head())

print("\nIPOs Not SPACs:")
print(ipos_not_spacs.head())

spacs_only.to_csv('inputs/spacs_identified.csv', index=False)
ipos_not_spacs.to_csv('inputs/ipos_not_spacs.csv', index=False)

SPACs Identified:
       offer date                        IPO name ticker      CUSIP Rollup VC  \
12695    20160224     Silver Run Acquisition Corp  SRAQU  82811P200      0  0   
12696    20160303         Jensyn Acquisition Corp  JSYNU  47632B201      0  0   
12724    20160526       Landcadia Holdings II Inc  LCAHU  51476W206      0  0   
12736    20160707          M III Acquisition Corp  MIIIU  55378T203      0  0   
12758    20160916  Saban Capital Acquisition Corp  SCACU  78516C205      0  0   

      Dual Internet Post-issue shares CRSP perm  Founding  Unnamed: 11  \
12695    1        0          45000000     16021       -99          NaN   
12696    1        0           5213400     16012      2014          NaN   
12724    1        0          30000000     16173      2008          NaN   
12736    1        0                -9     16261       -99          NaN   
12758    1        0          28500000     16470      2016          NaN   

      Unnamed: 12 Unnamed: 13                     

In [7]:
spacs_identified = pd.read_csv('inputs/spacs_identified.csv')
ipos_identified = pd.read_csv('inputs/ipos_not_spacs.csv')

In [8]:
combined_ipo_spacs = pd.concat([spacs_identified, ipos_not_spacs])
combined_ipo_spacs = combined_ipo_spacs.drop_duplicates(subset='CUSIP')
combined_ipo_spacs.to_csv('inputs/combined_ipo_spacs.csv')

In [9]:
selected_columns = combined_ipo_spacs[['IPO name', 'ticker', 'CUSIP', 'Date of IPO', 'IS_SPAC']]
selected_columns.to_csv('inputs/filtered_data.csv')

In [10]:
ccm_youngfirms = "inputs/ccm_youngfirms_2000_2018.dta/ccm_youngfirms_2000_2018.dta"

if not os.path.exists(ccm_youngfirms):
    zip_path = "inputs/ccm_youngfirms_2000_2018.zip"

    with zipfile.ZipFile(zip_path,'r') as zip_ref:
        zip_ref.extractall("inputs/ccm_youngfirms_2000_2018.dta")

In [12]:
ccm_data = pd.read_stata(ccm_youngfirms)
ccm_data.to_csv('inputs/all_ccm_data.csv')

In [13]:
relevant_vars = ['tic','adrr', 'curuscn', 'scf', 'src', 'apdedate', 'fdate', 'pdate', 'acominc',
                 'acox', 'at', 'am', 'ao', 'aoloch', 'aox', 'ap', 'at', 'caps', 'capx', 'cb',
                 'ch', 'che', 'clg', 'cogs', 'csho', 'cusip', 'cshrt', 'cstk', 'dd', 'dlc',
                 'dn', 'do', 'datadate', 'dt', 'ebit', 'ebitda', 'epspi', 'fca', 'ffo', 'gdwl',
                 'gp', 'ib', 'intan', 'invt', 'lt', 'lct', 'ni', 'niadj', 'np', 'pi', 'ppegt',
                 'pnrsho', 'ppent', 're', 'revt', 'sale', 'seq', 'tdc', 'teq', 'tstk', 'txt',
                 'wcap', 'naicsh', 'mkvalt', 'acchg', 'accrt', 'amc', 'ano', 'arce', 'cshi',
                 'depc', 'derhedgl']

In [14]:
ccm_data_filtered =ccm_data[relevant_vars]
ccm_data_filtered.to_csv('inputs/ccm_data_filtered.csv')

In [15]:
combined_data = pd.read_csv('inputs/filtered_data.csv')
combined_data.rename(columns={'ticker' : 'tic'}, inplace=True)
combined_data.rename(columns={'IPO name' : 'conm'}, inplace=True)
combined_data

Unnamed: 0.1,Unnamed: 0,conm,tic,CUSIP,Date of IPO,IS_SPAC
0,0,Silver Run Acquisition Corp,SRAQU,82811P200,2/29/2016,True
1,1,Jensyn Acquisition Corp,JSYNU,47632B201,3/7/2016,True
2,2,Landcadia Holdings II Inc,LCAHU,51476W206,6/1/2016,True
3,3,M III Acquisition Corp,MIIIU,55378T203,7/7/2016,True
4,4,Saban Capital Acquisition Corp,SCACU,78516C205,9/21/2016,True
...,...,...,...,...,...,...
15297,15297,Aimei Health Technology Co.,AFJKU,G01341117,,False
15298,15298,INNO HOLDINGS INC.,INHD,4576JP109,,False
15299,15299,ZKH GROUP LTD,ZKH,98877R104,,False
15300,15300,Linkage Global Inc,LGCB,G5500B102,,False


In [None]:
ccm_ipo_spac_merge = pd.merge(combined_data, ccm_data_filtered, on='tic', how='left')
ccm_ipo_spac_merge.to_csv('inputs/ccm_ipo_spac.csv')

In [None]:
def get_matches(query, choices, limit=1):
    return process.extract(query, choices, limit=limit)

In [None]:
# # Prepare a list of choices from the CCM dataset
# choices = ccm_data['conm'].unique()

# # Apply fuzzy matching to each company name in the SPAC and IPO data
# combined_data['matched_name'] = combined_data['conm'].apply(lambda x: get_matches(x, choices)[0])

In [19]:
mapping_data = {
    'cusip_in_ritter':['47632B201'],
    'cusip_in_ccm' : ['47632B102']
}
mapping_table = pd.DataFrame(mapping_data)

In [20]:
ritter_data = pd.read_csv('inputs/filtered_data.csv')
ccm_data=pd.read_csv('inputs/all_ccm_data.csv')

  ccm_data=pd.read_csv('inputs/all_ccm_data.csv')


In [21]:
mapped = pd.merge(ritter_data, mapping_table, left_on='cusip', right_on='cusip_in_ritter', how='left')
final_merge_data = pd.merge(mapped, ccm_data, left_on='cusip_in_ccm', right_on='cusip', how='left')


KeyError: 'cusip'