In [6]:
import pandas as pd

# Load the dataset
file_path = 'inputs/masterMerge.csv'
df = pd.read_csv(file_path)


# Define your features and the target variable
features = ['optprcgr']
target = 'IS_SPAC'

# Ensure the target column exists
if target not in df.columns:
    raise ValueError(f"Target column '{target}' not found in the dataset.")

# Fill NaN values in 'IS_SPAC' with 0 to indicate non-SPAC companies
df[target] = df[target].fillna(0)

# Convert the target column to an integer type
df[target] = df[target].astype(int)

# Separate the features (X) and the target (y)
X = df[features]
y = df[target]

# Handle missing values in features by imputing with the mean of each column
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features to ensure all variables are on the same scale (optional but recommended)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using a confusion matrix and classification report
from sklearn.metrics import classification_report, confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=['IPO', 'SPAC'])

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

# Analyze feature importance (coefficients)
importance = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_[0]})
importance.sort_values(by='Coefficient', ascending=False, inplace=True)

print("\nFeature Importance (Coefficients):")
print(importance)


Confusion Matrix:
[[8307    1]
 [  29    0]]

Classification Report:
              precision    recall  f1-score   support

         IPO       1.00      1.00      1.00      8308
        SPAC       0.00      0.00      0.00        29

    accuracy                           1.00      8337
   macro avg       0.50      0.50      0.50      8337
weighted avg       0.99      1.00      0.99      8337


Feature Importance (Coefficients):
    Feature  Coefficient
0  optprcgr     0.069942


  df = pd.read_csv(file_path)


In [16]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Load the dataset
file_path = 'inputs/masterMerge.csv'
df = pd.read_csv(file_path)

# Drop duplicates
df.drop_duplicates(inplace=True)

# Get the total number of instances in the dataset
total_instances = len(df)

# Define your features and the target variable
features = ['adrr', 'curuscn', 'scf', 'src', 'apdedate', 'fdate', 'pdate', 'acominc',
                 'acox', 'at', 'am', 'ao', 'aoloch', 'aox', 'ap', 'at', 'caps', 'capx', 'cb',
                 'ch', 'che', 'clg', 'cogs', 'csho', 'cusip', 'cshrt', 'cstk', 'dd', 'dlc',
                 'dn', 'do', 'datadate', 'dt', 'ebit', 'ebitda', 'epspi', 'fca', 'ffo', 'gdwl',
                 'gp', 'ib', 'intan', 'invt', 'lt', 'lct', 'ni', 'niadj', 'np', 'pi', 'ppegt',
                 'pnrsho', 'ppent', 're', 'revt', 'sale', 'seq', 'tdc', 'teq', 'tstk', 'txt',
                 'wcap', 'naicsh', 'mkvalt', 'acchg', 'accrt', 'amc', 'ano', 'arce', 'cshi',
                 'depc', 'derhedgl']
target = 'IS_SPAC'

# Ensure the target column exists
if target not in df.columns:
    raise ValueError(f"Target column '{target}' not found in the dataset.")

# Fill NaN values in 'IS_SPAC' with 0 to indicate non-SPAC companies
df[target] = df[target].fillna(0)

# Convert the target column to an integer type
df[target] = df[target].astype(int)

# Convert categorical variables to numeric using one-hot encoding
df_clean = pd.get_dummies(df)

# Separate the features (X) and the target (y)
X = df_clean[features]
y = df_clean[target]

# Handle missing values in features by imputing with the mean of each column
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Train a logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X, y)

# Make predictions on the entire dataset
y_pred = model.predict(X)

# Evaluate the model using a confusion matrix
conf_matrix = confusion_matrix(y, y_pred)

print(conf_matrix)

  df = pd.read_csv(file_path)


KeyError: "['tic', 'apdedate', 'fdate', 'pdate', 'cusip', 'datadate'] not in index"