In [1]:
import pandas as pd
# !pip install wrds
import wrds
import os
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
# !pip install fuzzywuzzy
from fuzzywuzzy import fuzz, process

  from pandas.core import (


In [2]:
ipo_data = pd.read_csv('inputs/IPO-age(9).csv')
spac_data = pd.read_csv('inputs/SPACs2016-2021.csv')

In [3]:
ipo_unique_cusips = ipo_data['CUSIP'].is_unique
spac_unique_cusips = spac_data['CUSIP'].is_unique


print("Unique CUSIP in IPO dataset:", ipo_unique_cusips)
print("Unique CUSIP in SPAC dataset:", spac_unique_cusips)

Unique CUSIP in IPO dataset: False
Unique CUSIP in SPAC dataset: False


In [4]:
ipo_data_clean = ipo_data.drop_duplicates(subset='CUSIP', keep='last')
spac_data_clean = spac_data.drop_duplicates(subset='CUSIP', keep='last')

In [5]:
merged_clean_data = pd.merge(ipo_data_clean, spac_data_clean, on=['CUSIP'], how='left', suffixes=('_IPO', '_SPAC'))
merged_clean_data['IS_SPAC'] = merged_clean_data['SPAC IPO '].notna()

In [6]:
spacs_only = merged_clean_data[merged_clean_data['IS_SPAC']]
ipos_not_spacs = merged_clean_data[~merged_clean_data['IS_SPAC']]

# Display some entries from each dataset
print("SPACs Identified:")
print(spacs_only.head())

print("\nIPOs Not SPACs:")
print(ipos_not_spacs.head())

spacs_only.to_csv('inputs/spacs_identified.csv', index=False)
ipos_not_spacs.to_csv('inputs/ipos_not_spacs.csv', index=False)

SPACs Identified:
       offer date                        IPO name ticker      CUSIP Rollup VC  \
12695    20160224     Silver Run Acquisition Corp  SRAQU  82811P200      0  0   
12696    20160303         Jensyn Acquisition Corp  JSYNU  47632B201      0  0   
12724    20160526       Landcadia Holdings II Inc  LCAHU  51476W206      0  0   
12736    20160707          M III Acquisition Corp  MIIIU  55378T203      0  0   
12758    20160916  Saban Capital Acquisition Corp  SCACU  78516C205      0  0   

      Dual Internet Post-issue shares CRSP perm  Founding  Unnamed: 11  \
12695    1        0          45000000     16021       -99          NaN   
12696    1        0           5213400     16012      2014          NaN   
12724    1        0          30000000     16173      2008          NaN   
12736    1        0                -9     16261       -99          NaN   
12758    1        0          28500000     16470      2016          NaN   

      Unnamed: 12 Unnamed: 13                     

In [7]:
spacs_identified = pd.read_csv('inputs/spacs_identified.csv')
ipos_identified = pd.read_csv('inputs/ipos_not_spacs.csv')

In [8]:
combined_ipo_spacs = pd.concat([spacs_identified, ipos_not_spacs])
combined_ipo_spacs = combined_ipo_spacs.drop_duplicates(subset='CUSIP')
combined_ipo_spacs.to_csv('inputs/combined_ipo_spacs.csv')

In [9]:
selected_columns = combined_ipo_spacs[['IPO name', 'ticker', 'CUSIP', 'Date of IPO', 'IS_SPAC']]
selected_columns.to_csv('inputs/filtered_data.csv')

In [10]:
ccm_youngfirms = "inputs/ccm_youngfirms_2000_2018.dta/ccm_youngfirms_2000_2018.dta"

if not os.path.exists(ccm_youngfirms):
    zip_path = "inputs/ccm_youngfirms_2000_2018.zip"

    with zipfile.ZipFile(zip_path,'r') as zip_ref:
        zip_ref.extractall("inputs/ccm_youngfirms_2000_2018.dta")

In [12]:
ccm_data = pd.read_stata(ccm_youngfirms)
ccm_data.to_csv('inputs/all_ccm_data.csv')

In [13]:
relevant_vars = ['tic','adrr', 'curuscn', 'scf', 'src', 'apdedate', 'fdate', 'pdate', 'acominc',
                 'acox', 'at', 'am', 'ao', 'aoloch', 'aox', 'ap', 'at', 'caps', 'capx', 'cb',
                 'ch', 'che', 'clg', 'cogs', 'csho', 'cusip', 'cshrt', 'cstk', 'dd', 'dlc',
                 'dn', 'do', 'datadate', 'dt', 'ebit', 'ebitda', 'epspi', 'fca', 'ffo', 'gdwl',
                 'gp', 'ib', 'intan', 'invt', 'lt', 'lct', 'ni', 'niadj', 'np', 'pi', 'ppegt',
                 'pnrsho', 'ppent', 're', 'revt', 'sale', 'seq', 'tdc', 'teq', 'tstk', 'txt',
                 'wcap', 'naicsh', 'mkvalt', 'acchg', 'accrt', 'amc', 'ano', 'arce', 'cshi',
                 'depc', 'derhedgl']

In [14]:
ccm_data_filtered =ccm_data[relevant_vars]
ccm_data_filtered.to_csv('inputs/ccm_data_filtered.csv')

In [15]:
combined_data = pd.read_csv('inputs/filtered_data.csv')
combined_data.rename(columns={'ticker' : 'tic'}, inplace=True)
combined_data.rename(columns={'IPO name' : 'conm'}, inplace=True)
combined_data

Unnamed: 0.1,Unnamed: 0,conm,tic,CUSIP,Date of IPO,IS_SPAC
0,0,Silver Run Acquisition Corp,SRAQU,82811P200,2/29/2016,True
1,1,Jensyn Acquisition Corp,JSYNU,47632B201,3/7/2016,True
2,2,Landcadia Holdings II Inc,LCAHU,51476W206,6/1/2016,True
3,3,M III Acquisition Corp,MIIIU,55378T203,7/7/2016,True
4,4,Saban Capital Acquisition Corp,SCACU,78516C205,9/21/2016,True
...,...,...,...,...,...,...
15297,15297,Aimei Health Technology Co.,AFJKU,G01341117,,False
15298,15298,INNO HOLDINGS INC.,INHD,4576JP109,,False
15299,15299,ZKH GROUP LTD,ZKH,98877R104,,False
15300,15300,Linkage Global Inc,LGCB,G5500B102,,False


In [None]:
ccm_ipo_spac_merge = pd.merge(combined_data, ccm_data_filtered, on='tic', how='left')
ccm_ipo_spac_merge.to_csv('inputs/ccm_ipo_spac.csv')

In [None]:
def get_matches(query, choices, limit=1):
    return process.extract(query, choices, limit=limit)

In [None]:
# # Prepare a list of choices from the CCM dataset
# choices = ccm_data['conm'].unique()

# # Apply fuzzy matching to each company name in the SPAC and IPO data
# combined_data['matched_name'] = combined_data['conm'].apply(lambda x: get_matches(x, choices)[0])

In [20]:
ritter_data = pd.read_csv('inputs/filtered_data.csv')
ccm_data=pd.read_csv('inputs/all_ccm_data.csv')

  ccm_data=pd.read_csv('inputs/all_ccm_data.csv')


In [24]:
ritter_data['cusip_truncated'] = ritter_data['CUSIP'].str[:6]
ccm_data['cusip_truncated'] = ccm_data['cusip'].str[:6]


In [29]:
merged_data = pd.merge(ritter_data, ccm_data, on='cusip_truncated', how='left')
merged_data.to_csv('inputs/ritter_ccm.csv')

In [41]:
ipo_data['offer date'] = pd.to_datetime(ipo_data['offer date'], format='%Y%m%d')

ipo_data['formatted offer date'] = ipo_data['offer date'].dt.strftime('%m/%d/%Y')

print(ipo_data[['offer date', 'formatted offer date']].head())

ipo_data = ipo_data[ipo_data['offer date'].dt.year >= 2017]

print(ipo_data.head())

ipo_data.to_csv('inputs/cleaned_ipo_years.csv')

      offer date formatted offer date
12940 2017-01-12           01/12/2017
12941 2017-01-20           01/20/2017
12942 2017-01-20           01/20/2017
12943 2017-01-26           01/26/2017
12944 2017-01-26           01/26/2017
      offer date                     IPO name ticker      cusip Rollup VC  \
12940 2017-01-12        Gores Holdings II Inc  GSHTU  382867208      0  0   
12941 2017-01-20  Fintech Acquisition Corp II  FNTEU  31810G208      0  0   
12942 2017-01-20              Keane Group Inc   FRAC  48669A108      0  0   
12943 2017-01-26               AnaptysBio Inc   ANAB  032724106      0  1   
12944 2017-01-26                    Obseva SA   OBSV  H5861P103      0  1   

      Dual Internet Post-issue shares CRSP perm  Founding  Unnamed: 11  \
12940    1        0          46875000     16631      2016          NaN   
12941    1        0          20960000     16630       -99          NaN   
12942    0        0                 .     16557      1973          NaN   
12943    0   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ipo_data['offer date'] = pd.to_datetime(ipo_data['offer date'], format='%Y%m%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ipo_data['formatted offer date'] = ipo_data['offer date'].dt.strftime('%m/%d/%Y')


In [44]:
ipo_data.rename(columns={'CUSIP' : 'cusip'}, inplace=True)
merged_ccm_ipo = pd.merge(ipo_data, ccm_data, on='cusip', how='left')
merged_ccm_ipo.drop_duplicates()
merged_ccm_ipo.to_csv('inputs/merged_ccm_ipo.csv')

In [47]:
df1 = pd.read_csv('inputs/IPO-age(9).csv')
df2 = pd.read_csv('inputs/all_ccm_data.csv')

  df2 = pd.read_csv('inputs/all_ccm_data.csv')


In [48]:
matches = df1['CUSIP'].isin(df2['cusip'])
exact_match_count = matches.sum()
print(f"Number of exact CUSIP matches: {exact_match_count}")


Number of exact CUSIP matches: 3101


In [50]:
matching_entries =df1[matches]
print(matching_entries)
matching_entries.to_csv('inputs/matching_entries.csv')

       offer date                        IPO name ticker      CUSIP Rollup VC  \
2089     19860605                         Genzyme   GENZ  372917104      0  1   
4619     19920616         Columbia Banking System   COLB  197236102      0  0   
4989     19930219     Nuveen Prem Inc Muni Fund 4    NPT  6706K4105      0  0   
5106     19930422          TCW/DW Term Trust 2003    TMT  87234U108      0  0   
5158     19930520    Nuveen CT Prem Inc Muni Fund    NTC  67060D107      0  0   
...           ...                             ...    ...        ...    ... ..   
13382    20181214             Legacy Housing Corp   LEGH  52472M101      0  0   
13678    20200603         Warner Music Group Corp    WMG  934550203      0  0   
14514    20210423                     Agiliti Inc   AGTI  00848J104      1  0   
14663    20210701                Krispy Kreme Inc   DNUT  50101L106      0  0   
14676    20210713  NorthEast Community Bancorp, I   NECB  664121100      0  0   

      Dual Internet Post-is

In [51]:
df3 = pd.read_csv('inputs/matching_entries.csv')
df4 = pd.read_csv('inputs/SPACs2016-2021.csv')

In [60]:
df3['cusip_trunc'] = df3['CUSIP'].astype(str).str[:4]
df4['cusip_trunc'] = df4['CUSIP'].astype(str).str[:4]


In [61]:
partial_matches = df3['cusip_trunc'].isin(df4['cusip_trunc'])
print(f"Number of partial CUSIP matches: {partial_matches.sum()}")

Number of partial CUSIP matches: 455


In [62]:
partial_matching = df3[partial_matches]
print(partial_matching)
partial_matching.to_csv('inputs/partial_matching.csv')

      Unnamed: 0  offer date                       IPO name ticker      CUSIP  \
15          6084    19940616         Liberty Property Trust    LRY  531172104   
19          6156    19940728    Home Properties of New York    HME  437306103   
29          6344    19941205    Apollo Group (U of Phoenix)  APOLA  037604105   
31          6376    19950110                         Diasys   DIYS  252838107   
35          6416    19950214  Globalstar Telecommunications  GSTRF  G3930H104   
...          ...         ...                            ...    ...        ...   
3070       13327    20180926            Sutro Biopharma Inc   STRO  869367102   
3083       13352    20181019      Logicbio Therapeutics Inc   LOGC  54142F102   
3088       13361    20181031    Axonics Modulation Tech Inc   AXNX  05465P101   
3089       13364    20181107         CNFinance Holdings Ltd    CNF  18979T105   
3096       13382    20181214            Legacy Housing Corp   LEGH  52472M101   

     Rollup  VC Dual Intern

In [65]:
partial_matching.rename(columns={'CUSIP' : 'cusip'}, inplace=True)

partial_matches_merge = pd.merge(partial_matching, ccm_data, on='cusip', how='left')

partial_matches_merge.to_csv('inputs/merged_ccm_matches.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_matching.rename(columns={'CUSIP' : 'cusip'}, inplace=True)
