In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


# Logistic Regression (Numeric columns only)

In [50]:

# Load the dataset
df = pd.read_csv("../outputs/processed_dataset.csv")
print("before drop na: ", df.shape)
print("df columns at start ", df.columns)
# Drop irrelevant columns
columns_to_drop = [
    'bankruptcy_prediction_split', 
    'filename',
    'bankruptcy_date_1',
    'bankruptcy_date_2',
    'bankruptcy_date_3',
    'period_of_report',
    'can_label',
    'qualified',
    'filing_date',
    'cik_year',
    'opinion_text', 
    'item_7',
    'gvkey', 
    'datadate',
    'cik',
    'company',
    'gc_list'
]
df.drop(columns=columns_to_drop, axis=1, inplace=True)
print("cols remaining: ", df.columns)
print("??? ", df['Unnamed: 0'].head(15))


before drop na:  (20140, 32)
df columns at start  Index(['Unnamed: 0', 'bankruptcy_date_1', 'label', 'bankruptcy_date_2',
       'filing_date', 'datadate', 'bankruptcy_date_3', 'opinion_text',
       'item_7', 'bankruptcy_prediction_split', 'cik', 'company',
       'period_of_report', 'cik_year', 'qualified', 'gc_list', 'can_label',
       'filename', 'gvkey', 'CashAndCashEquivalentsAtCarryingValue',
       'IncomeTaxExpenseBenefit', 'StockholdersEquity', 'Assets',
       'LiabilitiesAndStockholdersEquity', 'EntityPublicFloat',
       'NetIncomeLoss', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActivities', 'EarningsPerShareBasic',
       'RetainedEarningsAccumulatedDeficit', 'EarningsPerShareDiluted'],
      dtype='object')
cols remaining:  Index(['Unnamed: 0', 'label', 'CashAndCashEquivalentsAtCarryingValue',
       'IncomeTaxExpenseBenefit', 'StockholdersEquity', 'Assets',
       'Liabilit

In [51]:

# Define features (X) and target (y)
#    - Select all numeric columns except 'label' as features 
y = df['label'].astype(int)  # Ensure it's numeric (0 or 1)
X = df.select_dtypes(include=['float', 'int']).drop(columns=['label'], errors='ignore')
print("Columns to be used for training: ", X.columns)




# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Dealing with class imbalance: 
#   - Create SMOTE instance
sm = SMOTE(random_state=42)

# Fit SMOTE on the training data to oversample the minority class
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Create and fit the LR model
# logreg = LogisticRegression(class_weight="balanced", max_iter=50000)
# logreg.fit(X_train, y_train)
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train_res, y_train_res)

# 6. Predict on the test set
y_pred = logreg.predict(X_test)

# 7. Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Columns to be used for training:  Index(['Unnamed: 0', 'CashAndCashEquivalentsAtCarryingValue',
       'IncomeTaxExpenseBenefit', 'StockholdersEquity', 'Assets',
       'LiabilitiesAndStockholdersEquity', 'EntityPublicFloat',
       'NetIncomeLoss', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActivities', 'EarningsPerShareBasic',
       'RetainedEarningsAccumulatedDeficit', 'EarningsPerShareDiluted'],
      dtype='object')
Accuracy: 0.9220456802383317
Confusion Matrix:
[[3704  309]
 [   5   10]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      4013
           1       0.03      0.67      0.06        15

    accuracy                           0.92      4028
   macro avg       0.51      0.79      0.51      4028
weighted avg       1.00      0.92      0.96      4028



In [46]:
df = pd.read_csv("../outputs/processed_dataset.csv")
print("before drop na: ", df.shape)
label_frequency_date_1 = df["label"].sum()
cols_to_drop = ['bankruptcy_prediction_split', 'filename', 'period_of_report', 'gvkey', 'datadate', 'can_label', 'qualified', 'bankruptcy_date_1', 'bankruptcy_date_2', 'bankruptcy_date_3', 'filing_date', 'cik_year', 'opinion_text'] 
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')


print(f"Label frequency for label (before drop): {label_frequency_date_1} / {len(df)}")

print("Number of NaNs in 'item_7':", df['item_7'].isna().sum())
print("Number of empty strings in 'item_7':", (df['item_7'] == "").sum())
# df = df.dropna(subset=['item_7'])
df.dropna(inplace=True)
print("shape after drop: ", df.shape)
label_frequency_date_1 = df["label"].sum()
print(f"Label frequency for label: {label_frequency_date_1} / {len(df)}")

before drop na:  (20140, 32)
Label frequency for label (before drop): 76 / 20140
Number of NaNs in 'item_7': 129
Number of empty strings in 'item_7': 0
shape after drop:  (20011, 19)
Label frequency for label: 76 / 20011


## Logistic Regression (Numeric Cols + item_7 text)

In [None]:
# Read Dataset
df = pd.read_csv("../outputs/processed_dataset.csv")

# Drop Columns Not Needed
cols_to_drop = ['bankruptcy_prediction_split', 'filename', 'period_of_report', 'gvkey', 'datadate', 'can_label', 'qualified', 'bankruptcy_date_1', 'bankruptcy_date_2', 'bankruptcy_date_3', 'filing_date', 'cik_year', 'opinion_text'] 
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Drop rows where item_7 is NA
print("Starting Shape: ", df.shape)
df = df.dropna(subset=['item_7'])
print("After dropping NAs from item_7: ", df.shape)

# Separate Target
y = df['label'].astype(int)

# Identify Numeric and Text Columns
#    for now i use "item_7" as the single text column, and the previous numeric cols are kept the same 
numeric_cols = df.select_dtypes(include=['int', 'float']).drop(columns=['label'], errors='ignore').columns
text_col = 'item_7'  # The management discussion column

# Train-Test Split
# TODO: use 'bankruptcy prediction column' 
# - Compare that column with the old test set to make sure we havent lost too many rows.
# - Visualize which years were mostly dropped in the original dataset  (percentage of dropped columns PER YEAR)
# - Repeat for the labels
# - Find some data points that were dropped and check the manually (mostly from the most recent years) (email)
# - In the email include some examples with random datapoints. 
# - Add the auditor opinion 
# - USE METRICS: Avg Precision + recall at 100, ROC Curve 

# Notes: In the original ECL XGBoost was used 
# Future: Different models for each modality, different models for the text columns. 

X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42, stratify=y
)

# Build a ColumnTransformer
#    TF-IDF Vectorizer for the "item_7" text, StandardScaler for the numeric cols
## TODO: Remove stopwords
preprocessor = ColumnTransformer(
    transformers=[
        ("text_tfidf", TfidfVectorizer(max_features=5000), text_col),
        ("num", StandardScaler(), numeric_cols)  # scale numeric features
    ],
    remainder='drop'  # drop any columns not specified above
)

# Create a Pipeline that combines:
#    - Preprocessing (TF-IDF, StandardScaler)
#    - SMOTE (oversampling)
#    - LogisticRegression (classification)
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("clf", LogisticRegression(max_iter=1000))
])

# Fit the Pipeline on the Training Data
pipeline.fit(X_train, y_train)

# Predict on the Test Set
y_pred = pipeline.predict(X_test)

# Evaluate Results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Starting Shape:  (20140, 19)
After dropping NAs from item_7:  (20011, 19)
Accuracy: 0.9647764176867349
Confusion Matrix:
 [[3852  136]
 [   5   10]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98      3988
           1       0.07      0.67      0.12        15

    accuracy                           0.96      4003
   macro avg       0.53      0.82      0.55      4003
weighted avg       1.00      0.96      0.98      4003

