# **Phase-3 : Classification Model** 

This is the final phase where we integrate embeddings obtained from Knowledge Graph, Financial ratios, and Volatility index data to one dataframe, which is trained on logistic regression resulting in different evaluation results, for different combination of the model

This file also shows how we can combine **structerd data** like table of financial ratios, along with **unstructered data** like textual data (summary and KG in this case) to get final result of logistic regression

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

scaler = StandardScaler()

In [2]:
# manually match the names
matches = {
    "ABGSHIP": "ABG Shipyard Limited",
    "ADHUNIK": "Adhunik Metaliks Limited",
    "ANGIND": "ANG Industries Limited",
    "ASHAPURMIN": "Ashapura Minechem Limited",
    "BAFNAPH": "Bafna Pharmaceuticals Limited",
    "BHUSANSTL": "Bhushan Steel Limited",
    "CANDC": "C & C Constructions Limited",
    "EASUNREYRL": "Easun Reyrolle Limited",
    "EDL": "Empee Distilleries Limited",
    "GALLANT": "Gallantt Ispat Ltd",
    "GEMINI": "Gemini Communication Limited",
    "GUJNRECOKE": "Gujarat NRE Coke Limited",
    "INDOSOLAR": "Indosolar Limited",
    "IVRCLINFRA": "IVRCL Limited",
    "JAIHINDPRO": "Jaihind Projects Limited",
    "JENSONICOL": "Jenson & Nicholson (India) Limited",
    "JPINFRATEC": "Jaypee Infratech Limited",
    "KWALITY": "kwality limited",
    "ADVENZYM": "Advanced Enzyme Tech Ltd.",
    "AFFLE": "Affle (India) Ltd.",
    "ALEMBICLTD": "Alembic Pharmaceuticals Ltd.",
    "AMARAJABAT": "Amara Raja Batteries Ltd.",
    "ASTERDM": "Aster DM Healthcare Ltd.",
    "AVANTIFEED": "Avanti Feeds Ltd.",
    "BALRAMCHIN": "Balrampur Chini Mills Ltd.",
    "CEATLTD": "Ceat Ltd."
}

In [3]:
bankrupt_data = pd.read_excel(r'classifiaction model\bankrupt_financial_ratio_dataset_final.xlsx')
healthy_data = pd.read_excel(r'classifiaction model\healthy_financial_ratio_dataset _final.xlsx')
vix_data = pd.read_csv(r'classifiaction model\VIX_yearly_means.csv')

bankrupt_data['Bankruptcy'] = 1
healthy_data['Bankruptcy'] = 0

data = pd.concat([bankrupt_data, healthy_data], ignore_index=True)

data.rename(columns={'Feature_name': 'Year'}, inplace=True)
data = data.merge(vix_data, on='Year', how='left')

In [4]:
bankrupt_output_path = r'output\bankrupt'
healthy_output_path = r'output\healthy'
import os

bankrupt_df = pd.DataFrame(columns=bankrupt_data.columns)
healthy_df = pd.DataFrame(columns=healthy_data.columns)

In [5]:
for file in os.listdir(bankrupt_output_path):
    # print(file)
    row = bankrupt_data.loc[(bankrupt_data['Folder_name'] == matches[file.split('_')[0]]) & (bankrupt_data['Feature_name'] == int(file.split('_')[-2]))]
    bankrupt_df = pd.concat([bankrupt_df, row], ignore_index=True)

for file in os.listdir(healthy_output_path):
    # print(file)
    row = healthy_data.loc[(healthy_data['Folder_name'] == matches[file.split('_')[0]]) & (healthy_data['Feature_name'] == int(file.split('_')[-2]))]
    healthy_df = pd.concat([healthy_df, row], ignore_index=True)

  bankrupt_df = pd.concat([bankrupt_df, row], ignore_index=True)
  healthy_df = pd.concat([healthy_df, row], ignore_index=True)


In [6]:
bankrupt_df["Label"] = 1
healthy_df["Label"] = 0
import os
os.makedirs('output/classification_models', exist_ok=True)

financial_df = pd.concat([bankrupt_df, healthy_df], ignore_index=True)

overall_mean_vix = vix_data['Close '].mean()

drop_columns = ['Bankruptcy', 'path', 'Folder_name', 'Feature_name', 'Label']

def print_metrics(model_name, y_test, y_preds, y_probs):
    print(f"{model_name} Performance:")
    print(f"Test Accuracy: {accuracy_score(y_test, y_preds):.4f}")
    print(f"Precision: {precision_score(y_test, y_preds):.4f}")
    print(f"Recall: {recall_score(y_test, y_preds):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_preds):.4f}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_probs):.4f}\n")

def merge_and_train(embedding_loc, model_name):
    _embeddings = pd.read_csv(embedding_loc)
    merged_df = pd.concat([financial_df, _embeddings], axis=1)

    merged_df = merged_df.merge(vix_data, left_on='Feature_name', right_on='Year', how='left')
    merged_df['Close '] = merged_df['Close '].fillna(overall_mean_vix)
    merged_df.drop(columns=['Year'], inplace=True)

    merged_df = merged_df.sort_values(by=['Folder_name', 'Feature_name'])
    merged_df.dropna(inplace=True)
    X = merged_df.drop(columns=drop_columns)
    y = merged_df['Label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Step 5: Logistic Regression
    log_model = LogisticRegression()
    log_model.fit(X_train, y_train)
    log_preds = log_model.predict(X_test)
    log_probs = log_model.predict_proba(X_test)[:, 1]
    print_metrics("Logistic Regression", y_test, log_preds, log_probs)
    with open('output/classification_models/lr_' + model_name + '.pkl', 'wb') as f:
        pickle.dump(log_model, f)
    print("LogisticRegression model saved!")
    print()
    
    # Step 6: Random Forest
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)
    rf_preds = rf_model.predict(X_test)
    rf_probs = rf_model.predict_proba(X_test)[:, 1]
    print_metrics("Random Forest", y_test, rf_preds, rf_probs)
    with open('output/classification_models/rf_' + model_name + '.pkl', 'wb') as f:
        pickle.dump(rf_model, f)
    print("RandomForest model saved!")
    print()
    # Step 7: XGBoost
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb_model.fit(X_train, y_train)
    xgb_preds = xgb_model.predict(X_test)
    xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
    print_metrics("XGBoost", y_test, xgb_preds, xgb_probs)
    with open('output/classification_models/xgb_' + model_name + '.pkl', 'wb') as f:
        pickle.dump(xgb_model, f)
    print("XGBoost model saved!")
    print()
    

In [8]:
def prepare_final_dataset(embeddings_loc, file_name):
    _embeddings = pd.read_csv(embeddings_loc)
    merged_df = pd.concat([financial_df, _embeddings], axis=1)

    merged_df = merged_df.merge(vix_data, left_on='Feature_name', right_on='Year', how='left')
    merged_df['Close '] = merged_df['Close '].fillna(overall_mean_vix)
    merged_df.drop(columns=['Year'], inplace=True)

    merged_df = merged_df.sort_values(by=['Folder_name', 'Feature_name'])
    merged_df.dropna(inplace=True)
    merged_df.to_csv(f'output\embeddings\{file_name}_with_fr.csv')

prepare_final_dataset(r'output\embeddings\ConvE_10.csv', 'ConvE_10')
prepare_final_dataset(r'output\embeddings\ConvE_30.csv', 'ConvE_30')
prepare_final_dataset(r'output\embeddings\TransE_10.csv', 'TransE_10')
prepare_final_dataset(r'output\embeddings\TransE_30.csv', 'TransE_30')
prepare_final_dataset(r'output\embeddings\TransH_10.csv', 'TransH_10')
prepare_final_dataset(r'output\embeddings\TransH_30.csv', 'TransH_30')
prepare_final_dataset(r'output\embeddings\RGCN_10.csv', 'RGCN_10')
prepare_final_dataset(r'output\embeddings\RGCN_30.csv', 'RGCN_30')

In [145]:
print("ConvE model with 10 embeddings")
merge_and_train(r'output\embeddings\ConvE_10.csv', 'ConvE_10')
print("ConvE model with 30 embeddings")
merge_and_train(r'output\embeddings\ConvE_30.csv', 'ConvE_30')

ConvE model with 10 embeddings
Logistic Regression Performance:
Test Accuracy: 0.9474
Precision: 1.0000
Recall: 0.8889
F1 Score: 0.9412
ROC AUC: 1.0000

LogisticRegression model saved!

Random Forest Performance:
Test Accuracy: 0.9474
Precision: 1.0000
Recall: 0.8889
F1 Score: 0.9412
ROC AUC: 1.0000

RandomForest model saved!

XGBoost Performance:
Test Accuracy: 0.8947
Precision: 1.0000
Recall: 0.7778
F1 Score: 0.8750
ROC AUC: 0.8889

XGBoost model saved!

ConvE model with 30 embeddings
Logistic Regression Performance:
Test Accuracy: 0.9474
Precision: 1.0000
Recall: 0.8571
F1 Score: 0.9231
ROC AUC: 0.9881

LogisticRegression model saved!



Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

RandomForest model saved!

XGBoost Performance:
Test Accuracy: 0.8421
Precision: 0.7000
Recall: 1.0000
F1 Score: 0.8235
ROC AUC: 1.0000

XGBoost model saved!



Parameters: { "use_label_encoder" } are not used.



In [134]:
print("TransE model with 10 embeddings")
merge_and_train(r'output\embeddings\TransE_10.csv', 'TransE_10')
print("TransE model with 30 embeddings")
merge_and_train(r'output\embeddings\TransE_30.csv', 'TransE_30')

TransE model with 10 embeddings
Logistic Regression Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

LogisticRegression model saved!

Random Forest Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

RandomForest model saved!

XGBoost Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000


Parameters: { "use_label_encoder" } are not used.



F1 Score: 1.0000
ROC AUC: 1.0000

XGBoost model saved!

TransE model with 30 embeddings
Logistic Regression Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

LogisticRegression model saved!

Random Forest Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

RandomForest model saved!

XGBoost Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

XGBoost model saved!



Parameters: { "use_label_encoder" } are not used.



In [135]:
print("TransH model with 10 embeddings")
merge_and_train(r'output\embeddings\TransH_10.csv', 'TransH_10')
print("TransH model with 30 embeddings")
merge_and_train(r'output\embeddings\TransH_30.csv', 'TransH_30')

TransH model with 10 embeddings
Logistic Regression Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

LogisticRegression model saved!

Random Forest Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

RandomForest model saved!

XGBoost Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

XGBoost model saved!

TransH model with 30 embeddings


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Logistic Regression Performance:
Test Accuracy: 0.9474
Precision: 1.0000
Recall: 0.8889
F1 Score: 0.9412
ROC AUC: 0.8889

LogisticRegression model saved!

Random Forest Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

RandomForest model saved!

XGBoost Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

XGBoost model saved!



In [136]:
print("RGCN model with 10 embeddings")
merge_and_train(r'output\embeddings\RGCN_10.csv', 'RGCN_10')
print("RGCN model with 30 embeddings")
merge_and_train(r'output\embeddings\RGCN_30.csv', 'RGCN_30')

RGCN model with 10 embeddings
Logistic Regression Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

LogisticRegression model saved!

Random Forest Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

RandomForest model saved!

XGBoost Performance:
Test Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost model saved!

RGCN model with 30 embeddings
Logistic Regression Performance:
Test Accuracy: 0.8947
Precision: 1.0000
Recall: 0.8462
F1 Score: 0.9167
ROC AUC: 1.0000

LogisticRegression model saved!

Random Forest Performance:
Test Accuracy: 0.9474
Precision: 1.0000
Recall: 0.9231
F1 Score: 0.9600
ROC AUC: 0.9359

RandomForest model saved!

XGBoost Performance:
Test Accuracy: 0.9474
Precision: 1.0000
Recall: 0.9231
F1 Score: 0.9600
ROC AUC: 0.9615

XGBoost model saved!



In [137]:
# missing_values = financial_df.isnull().sum()
# for i, j in zip(financial_df.columns, missing_values):
#     if j > 0: print(i, j)

# _embeddings_10_ce = pd.read_csv(r'output\embeddings\ConvE_10.csv')
# merged_df_10_ce = pd.concat([financial_df, _embeddings_10_ce], axis=1)

# _embeddings_30_ce = pd.read_csv(r'output\embeddings\ConvE_30.csv')
# merged_df_30_ce = pd.concat([financial_df, _embeddings_30_ce], axis=1)

# merged_df_10_ce = merged_df_10_ce.merge(vix_data, left_on='Feature_name', right_on='Year', how='left')
# merged_df_10_ce['Close '] = merged_df_10_ce['Close '].fillna(overall_mean_vix)
# merged_df_10_ce.drop(columns=['Year'], inplace=True)

# merged_df_30_ce = merged_df_30_ce.merge(vix_data, left_on='Feature_name', right_on='Year', how='left')
# merged_df_30_ce['Close '] = merged_df_30_ce['Close '].fillna(overall_mean_vix)
# merged_df_30_ce.drop(columns=['Year'], inplace=True)


In [138]:
# drop_columns = ['Bankruptcy', 'path', 'Folder_name', 'Feature_name', 'Label']

In [139]:
# merged_df_10_ce = merged_df_10_ce.sort_values(by=['Folder_name', 'Feature_name'])
# merged_df_10_ce.dropna(inplace=True)
# X_10_ce = merged_df_10_ce.drop(columns=drop_columns)
# y_10_ce = merged_df_10_ce["Label"]

# X_train_10_ce, X_test_10_ce, y_train_10_ce, y_test_10_ce = train_test_split(X_10_ce, y_10_ce, test_size=0.2)

In [140]:
# def print_metrics(model_name, y_test, y_preds, y_probs):
#     print(f"{model_name} Performance:")
#     print(f"Test Accuracy: {accuracy_score(y_test, y_preds):.4f}")
#     print(f"Precision: {precision_score(y_test, y_preds):.4f}")
#     print(f"Recall: {recall_score(y_test, y_preds):.4f}")
#     print(f"F1 Score: {f1_score(y_test, y_preds):.4f}")
#     print(f"ROC AUC: {roc_auc_score(y_test, y_probs):.4f}\n")

In [141]:
# # Step 5: Logistic Regression
# log_model = LogisticRegression()
# log_model.fit(X_train_10_ce, y_train_10_ce)
# log_preds = log_model.predict(X_test_10_ce)
# log_probs = log_model.predict_proba(X_test_10_ce)[:, 1]
# print_metrics("Logistic Regression", y_test_10_ce, log_preds, log_probs)
# print()
# # Step 6: Random Forest
# rf_model = RandomForestClassifier(random_state=42)
# rf_model.fit(X_train_10_ce, y_train_10_ce)
# rf_preds = rf_model.predict(X_test_10_ce)
# rf_probs = rf_model.predict_proba(X_test_10_ce)[:, 1]
# print_metrics("Random Forest", y_test_10_ce, rf_preds, rf_probs)
# print()
# # Step 7: XGBoost
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# xgb_model.fit(X_train_10_ce, y_train_10_ce)
# xgb_preds = xgb_model.predict(X_test_10_ce)
# xgb_probs = xgb_model.predict_proba(X_test_10_ce)[:, 1]
# print_metrics("XGBoost", y_test_10_ce, xgb_preds, xgb_probs)
# print()

In [143]:
# # for 30 embeddings ce
# merged_df_30_ce = merged_df_30_ce.sort_values(by=['Folder_name', 'Feature_name'])
# merged_df_30_ce.dropna(inplace=True)
# X_30_ce = merged_df_30_ce.drop(columns=drop_columns)
# y_30_ce = merged_df_30_ce["Label"]

# X_train_30_ce, X_test_30_ce, y_train_30_ce, y_test_30_ce = train_test_split(X_30_ce, y_30_ce, test_size=0.2)

In [144]:
# # Step 5: Logistic Regression
# log_model = LogisticRegression()
# log_model.fit(X_train_30_ce, y_train_30_ce)
# log_preds = log_model.predict(X_test_30_ce)
# log_probs = log_model.predict_proba(X_test_30_ce)[:, 1]
# print_metrics("Logistic Regression", y_test_30_ce, log_preds, log_probs)
# print()
# # Step 6: Random Forest
# rf_model = RandomForestClassifier(random_state=42)
# rf_model.fit(X_train_30_ce, y_train_30_ce)
# rf_preds = rf_model.predict(X_test_30_ce)
# rf_probs = rf_model.predict_proba(X_test_30_ce)[:, 1]
# print_metrics("Random Forest", y_test_30_ce, rf_preds, rf_probs)
# print()
# # Step 7: XGBoost
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# xgb_model.fit(X_train_30_ce, y_train_30_ce)
# xgb_preds = xgb_model.predict(X_test_30_ce)
# xgb_probs = xgb_model.predict_proba(X_test_30_ce)[:, 1]
# print_metrics("XGBoost", y_test_30_ce, xgb_preds, xgb_probs)