In [1]:
import pandas as pd

# Load the dataset
file_path = 'inputs/masterMerge.csv'
df = pd.read_csv(file_path)

# Drop duplicates
df.drop_duplicates(inplace=True)

# Define your features and the target variable
features = ['sale']
target = 'IS_SPAC'

# Ensure the target column exists
if target not in df.columns:
    raise ValueError(f"Target column '{target}' not found in the dataset.")

# Fill NaN values in 'IS_SPAC' with 0 to indicate non-SPAC companies
df[target] = df[target].fillna(0)

# Convert the target column to an integer type
df[target] = df[target].astype(int)

# Convert categorical variables to numeric using one-hot encoding
df_clean = pd.get_dummies(df)

# Separate the features (X) and the target (y)
X = df_clean[features]
y = df_clean[target]

# Handle missing values in features by imputing with the mean of each column
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features to ensure all variables are on the same scale (optional but recommended)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using a confusion matrix and classification report
from sklearn.metrics import classification_report, confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=['IPO', 'SPAC'])

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

# Analyze feature importance (coefficients)
importance = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_[0]})
importance.sort_values(by='Coefficient', ascending=False, inplace=True)

print("\nFeature Importance (Coefficients):")
print(importance)


  df = pd.read_csv(file_path)


Confusion Matrix:
[[8308    0]
 [  29    0]]

Classification Report:
              precision    recall  f1-score   support

         IPO       1.00      1.00      1.00      8308
        SPAC       0.00      0.00      0.00        29

    accuracy                           1.00      8337
   macro avg       0.50      0.50      0.50      8337
weighted avg       0.99      1.00      0.99      8337


Feature Importance (Coefficients):
  Feature  Coefficient
0    sale    -1.877704


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
import pandas as pd

# Load the dataset
file_path = 'inputs/masterMerge.csv'
df = pd.read_csv(file_path)

# Drop duplicates
df.drop_duplicates(inplace=True)

# Print the counts of each unique value in the 'IS_SPAC' column
print(df['IS_SPAC'].value_counts())

  df = pd.read_csv(file_path)


IS_SPAC
1.0    99
Name: count, dtype: int64
