# Any and all pacakges used for cleaning and modeling

In [1]:
import pandas as pd
# !pip install wrds
# import wrds
import os
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

## Part 1: Merging SPAC and IPO data
- Key Points: Self cleaned excel files with offering dates and firm names
- Both read in seperately and merged on the date variabe

In [2]:
spac_data = pd.read_csv("inputs/SPACs2016-2021.csv", header=0)
IPO_data = pd.read_csv("inputs/IPO_data.csv", header=0)


In [3]:
IPO_data['CUSIP'] = IPO_data['CUSIP'].apply(lambda x: str(x).zfill(9))
IPO_data

Unnamed: 0.1,Unnamed: 0,offer date,IPO name,CUSIP
0,0,01/30/1975,ROYSTER,078088610
1,1,06/09/1975,VARCO,092212610
2,2,06/10/1975,COORS ADOLPH,021701610
3,3,07/15/1975,KEYSTONE FOODS CORP,049348410
4,4,08/26/1975,C. F. BRAUN,010564710
...,...,...,...,...
15442,15442,12/14/2023,INNO HOLDINGS INC.,4576JP109
15443,15443,12/15/2023,Bayview Acquisition,000000nan
15444,15444,12/15/2023,ZKH GROUP LTD,98877R104
15445,15445,12/19/2023,Linkage Global Inc,G5500B102


In [4]:
IPO_data = IPO_data[IPO_data['CUSIP'] != '000000nan']
IPO_data

Unnamed: 0.1,Unnamed: 0,offer date,IPO name,CUSIP
0,0,01/30/1975,ROYSTER,078088610
1,1,06/09/1975,VARCO,092212610
2,2,06/10/1975,COORS ADOLPH,021701610
3,3,07/15/1975,KEYSTONE FOODS CORP,049348410
4,4,08/26/1975,C. F. BRAUN,010564710
...,...,...,...,...
15440,15440,12/01/2023,Garden Stage Ltd,G3730L107
15441,15441,12/04/2023,Aimei Health Technology Co.,G01341117
15442,15442,12/14/2023,INNO HOLDINGS INC.,4576JP109
15444,15444,12/15/2023,ZKH GROUP LTD,98877R104


In [5]:
spac_data['CUSIP'] = spac_data['CUSIP'].apply(lambda x: str(x).zfill(9))
spac_data

Unnamed: 0,SPAC IPO,Date of IPO,CUSIP
0,Aetherium Acquisition Corp.,12/29/2021,00809J200
1,Welsbach Technology Metals Acquisition Corp.,12/27/2021,950415208
2,Arogo Capital Acquisition Corp.,12/23/2021,042644203
3,Gardiner Healthcare Acquisitions Corp.,12/21/2021,365506203
4,Larkspur Health Acquisition Corp.,12/20/2021,51724W206
...,...,...,...
1009,CF Corp.,5/25/2016,000000nan
1010,KLR Energy Acquisition Corp.,3/16/2016,777385204
1011,Jensyn Acquisition Corp.,3/7/2016,47632B201
1012,Silver Run Acquisition Corp.,2/29/2016,82811P200


In [6]:
spac_data = spac_data[spac_data['CUSIP'] != '000000nan']
spac_data

Unnamed: 0,SPAC IPO,Date of IPO,CUSIP
0,Aetherium Acquisition Corp.,12/29/2021,00809J200
1,Welsbach Technology Metals Acquisition Corp.,12/27/2021,950415208
2,Arogo Capital Acquisition Corp.,12/23/2021,042644203
3,Gardiner Healthcare Acquisitions Corp.,12/21/2021,365506203
4,Larkspur Health Acquisition Corp.,12/20/2021,51724W206
...,...,...,...
1007,M III Acquisition Corp.,7/7/2016,55378T203
1008,"Landcadia Holdings, Inc.",6/1/2016,51476W206
1010,KLR Energy Acquisition Corp.,3/16/2016,777385204
1011,Jensyn Acquisition Corp.,3/7/2016,47632B201


In [18]:
# Using the 'IPO name' from IPO_data and 'SPAC IPO' from spac_data as the keys for merging
merged_IPOnSPAC = pd.merge(left=spac_data, right=IPO_data, how='left', left_on='SPAC IPO ', right_on='IPO name')

# Create an indicator column to see if there was a matching SPAC IPO entry
# 'SPAC IPO' needs to match the column name used for spac_data. Adjust accordingly if the name is different.
merged_IPOnSPAC['SPAC_IPO_Indicator'] = merged_IPOnSPAC['SPAC IPO '].notna().astype(int)

# Adjust the column name in the drop method if necessary
merged_IPOnSPAC = merged_IPOnSPAC.drop(columns=[merged_IPOnSPAC.columns[-1]])
merged_IPOnSPAC = merged_IPOnSPAC.loc[:, ['SPAC IPO ', 'Date of IPO', 'CUSIP_x','offer date', 'IPO name', 'CUSIP_y']]

# Save the merged data to a CSV file
merged_IPOnSPAC.to_csv('inputs/MergedIPO&SPAC.csv', index=False)


In [19]:
merged_IPOnSPAC

Unnamed: 0,SPAC IPO,Date of IPO,CUSIP_x,offer date,IPO name,CUSIP_y
0,Aetherium Acquisition Corp.,12/29/2021,00809J200,,,
1,Welsbach Technology Metals Acquisition Corp.,12/27/2021,950415208,,,
2,Arogo Capital Acquisition Corp.,12/23/2021,042644203,,,
3,Gardiner Healthcare Acquisitions Corp.,12/21/2021,365506203,,,
4,Larkspur Health Acquisition Corp.,12/20/2021,51724W206,,,
...,...,...,...,...,...,...
876,M III Acquisition Corp.,7/7/2016,55378T203,,,
877,"Landcadia Holdings, Inc.",6/1/2016,51476W206,,,
878,KLR Energy Acquisition Corp.,3/16/2016,777385204,,,
879,Jensyn Acquisition Corp.,3/7/2016,47632B201,,,


In [9]:
# # Convert 'offer date' from 'YYYYMMDD' to 'MM/DD/YYYY'
# IPO_data['offer date'] = pd.to_datetime(IPO_data['offer date'],format='%Y%m%d')
# IPO_data['offer date'] = IPO_data['offer date'].dt.strftime('%m/%d/%Y')

# IPO_data.to_csv("inputs/IPO_data.csv")



In [10]:
# IPO_data.rename(columns={'offer date' : 'Date of IPO'}, inplace=True)

In [11]:
#Reading in ccm data

ccm_data = pd.read_csv('inputs/cleaned_ccm.csv')
ccm_data

Unnamed: 0.1,Unnamed: 0,datadate,cusip,scf,src,apdedate,fdate,pdate,acchg,acominc,...,seq,teq,tstk,txt,wcap,naicsh,mkvalt,Seq. No.,Code,Industry
0,20283,2014-12-31,000307108,7.0,5.0,2014-12-31,2015-04-13,2015-02-25,0.000,0.000,...,97.474,95.141,0.0,2.555,63.153,622210.0,660.8841,1821.0,622210.0,Psychiatric and Substance Abuse Hospitals
1,23764,2001-12-31,000375204,7.0,53.0,,,,-63.000,-1699.000,...,2014.000,,1750.0,105.000,479.000,335311.0,,790.0,335311.0,"Power, Distribution, and Specialty Transformer..."
2,795,2014-12-31,000899104,7.0,5.0,2014-12-31,2015-04-10,2015-03-11,0.000,0.000,...,6.009,6.009,0.0,-0.552,19.890,325414.0,103.7916,506.0,325414.0,Biological Product (except Diagnostic) Manufac...
3,5229,2001-04-30,00089C107,7.0,5.0,,,,-1.785,1.413,...,104.664,,0.0,0.037,71.958,334515.0,216.1704,770.0,334515.0,Instrument Manufacturing for Measuring and Tes...
4,5683,2000-12-31,00089J102,7.0,5.0,,,,0.000,,...,4.468,,0.0,0.065,3.768,334413.0,7.9185,758.0,334413.0,Semiconductor and Related Device Manufacturing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5497,20504,2007-12-31,Y8565N300,7.0,5.0,2007-12-31,2008-05-23,,0.000,0.912,...,148.794,148.794,0.0,0.000,32.806,483111.0,550.0000,1220.0,483111.0,Deep Sea Freight Transportation
5498,14415,2004-12-31,Y8897Y230,7.0,8.0,2004-12-31,,,0.000,-0.248,...,321.809,,0.0,0.000,98.240,483111.0,452.2538,1220.0,483111.0,Deep Sea Freight Transportation
5499,2671,2014-12-31,Y8977Y100,7.0,5.0,2014-12-31,2015-04-02,2015-02-26,0.000,0.000,...,1411.000,2451.000,0.0,20.000,143.000,213111.0,1008.2829,168.0,213111.0,Drilling Oil and Gas Wells
5500,2632,2014-12-31,Y9384M101,7.0,5.0,2014-12-31,2015-05-13,,0.000,-6.000,...,283.100,835.200,0.0,16.300,46.100,424710.0,997.7975,1037.0,424710.0,Petroleum Bulk Stations and Terminals


In [12]:

merged_IPOnSPAC.rename(columns={'CUSIP' : 'cusip'}, inplace=True)
merge_df_ccm = pd.merge(left=merged_IPOnSPAC, right=ccm_data, on='cusip', how='inner')

merge_df_ccm

KeyError: 'cusip'

# Filtering Desired Variables

In [None]:
relevant_vars = ['SPAC IPO ','offer date','IPO name','adrr','curuscn','scf','src','apdedate','fdate','pdate','acominc',
                 'acox','at','am','ao','aoloch','aox','ap','at','caps','capx','cb',
                 'ch','che','clg','cogs','csho','cusip','cshrt','cstk','dd','dlc','dn','do',
                 'datadate','dt','ebit','ebtida','epspi','fca','ffo','gdwl','gp','ib','intan',
                 'invt','lt','lct','ni','niadj','np','pi','ppegt','pnrsho','ppent',
                 're','revt','sale','seq','tdc','teq','tstk','txt','wcap','naicsh',
                 'mkvalt','acchg','accrt','amc','ano','arce','cshi','depc','derhedgl']

cleaned_data = merge_df_ccm[[col for col in merge_df_ccm.columns if col in relevant_vars]]
cleaned_data.to_csv("inputs/cleaned_data.csv")

Beginning Modeling

In [None]:
KNN_data = pd.read_csv("inputs/cleaned_data.csv")
KNN_data

## KNN Modeling 

In [None]:
data_types = pd.DataFrame(KNN_data.dtypes)
#data_types.to_csv("inputs/data_types.csv")

In [None]:

date_cols = ['offer date', 'datadate', 'apdedate', 'fdate', 'pdate']
for col in date_cols:
    KNN_data[col] = pd.to_datetime(KNN_data[col], errors='coerce')

# Exclude non-numeric columns properly
exclude_cols = ['IPO name', 'cusip', 'offer date', 'datadate', 'apdedate', 'fdate', 'pdate']  # Including date columns to exclude
numeric_cols = KNN_data.columns.difference(exclude_cols + ['IPO name', 'cusip', 'SPAC IPO '])

# Convert to numeric and handle missing values only for numeric columns
KNN_data[numeric_cols] = KNN_data[numeric_cols].apply(pd.to_numeric, errors='coerce')
KNN_data[numeric_cols] = KNN_data[numeric_cols].fillna(KNN_data[numeric_cols].median())

In [None]:
null_df = KNN_data.isnull().sum()
print(len(null_df))

KNN_data = KNN_data.dropna()

In [None]:
import numpy as np

# X = KNN_data[numeric_cols]  # Removed the extra space in the column name
# y = KNN_data['SPAC IPO '].dropna()  # Ensure no extra spaces here

rng = np.random.RandomState(0)
# Split data
X_train, X_test, y_train, y_test = train_test_split(KNN_data[numeric_cols], KNN_data['SPAC IPO '].dropna(), test_size=0.3, random_state=rng)

#Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Evaluate the model
predictions = knn.predict(X_test_scaled )
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

feature_names = numeric_cols[:2]  # Replace with actual names if known
X_plot = KNN_data[feature_names]

# Scaling
scaler = StandardScaler()
X_plot_scaled = scaler.fit_transform(X_plot)

# Split data
X_train_plot, X_test_plot, y_train_plot, y_test_plot = train_test_split(X_plot_scaled, KNN_data['SPAC IPO '].dropna(), test_size=0.3, random_state=rng)

# Train KNN
knn_plot = KNeighborsClassifier(n_neighbors=5)
knn_plot.fit(X_train_plot, y_train_plot)

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

# Plot the decision boundary
x_min, x_max = X_train_plot[:, 0].min() - 1, X_train_plot[:, 0].max() + 1
y_min, y_max = X_train_plot[:, 1].min() - 1, X_train_plot[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
Z = knn_plot.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize=(10, 6))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points
plt.scatter(X_train_plot[:, 0], X_train_plot[:, 1], c=y_train_plot, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("2-Class classification (k = 5, weights = 'uniform')")
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])

plt.show()