In [2]:
# core imports

import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# import training/testing dataset

training_df = pd.read_csv(Path("../Resources/training_dataset_original.csv"))
testing_df = pd.read_csv(Path("../Resources/testing_dataset.csv"))

# split training dataset to X and y

X_train = training_df.drop(columns="status")
y_train = training_df["status"]
display(X_train.tail())

# split testing dataset to X and y

X_test = testing_df.drop(columns="status")
y_test = testing_df["status"]
display(X_test.tail())

Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses
59006,-0.224841,-0.182815,-0.182817,-0.189411,-0.191518,-0.105244,-0.213641,-0.189216,-0.202266,-0.220081,-0.218942,-0.17422,-0.20558,-0.206912,-0.08673,-0.202266,-0.217191,-0.194969
59007,-0.149498,-0.153444,-0.136048,-0.198706,-0.078807,-0.131121,-0.161011,-0.177011,-0.171751,-0.191696,-0.215157,-0.20847,-0.178749,-0.166843,-0.165335,-0.171751,-0.200379,-0.158183
59008,-0.223526,-0.179687,-0.181686,-0.203005,-0.191518,-0.127842,-0.213297,-0.184402,-0.202271,-0.219337,-0.218802,-0.193295,-0.212906,-0.20522,-0.105513,-0.202271,-0.21648,-0.192328
59009,1.004145,1.004144,0.589128,0.659983,1.504063,0.315861,1.533036,0.084743,0.930268,0.630094,0.604774,0.629126,0.563857,1.107517,0.28894,0.930268,0.760133,0.937813
59010,-0.224761,-0.182817,-0.182834,-0.192112,-0.191518,-0.113492,-0.21365,-0.188879,-0.202271,-0.218346,-0.21593,-0.177898,-0.205592,-0.195994,-0.092337,-0.202271,-0.212037,-0.194449


Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses
19666,-0.215226,-0.164916,-0.194334,-0.181122,-0.181608,-0.094862,-0.208887,-0.175621,-0.182696,-0.225049,-0.236255,-0.163658,-0.193834,-0.199539,-0.105437,-0.182696,-0.226363,-0.175619
19667,-0.212986,-0.165046,-0.19422,-0.184832,-0.185888,-0.10352,-0.210132,-0.175893,-0.182165,-0.224273,-0.229114,-0.168517,-0.191778,-0.200033,-0.406323,-0.182165,-0.223765,-0.174307
19668,-0.210084,-0.161478,-0.193038,-0.182268,-0.162754,-0.098553,-0.216656,-0.172556,-0.178827,-0.223939,-0.236561,-0.165623,-0.189595,-0.205375,-0.102255,-0.178827,-0.228833,-0.170958
19669,-0.198817,-0.084905,-0.122853,-0.142938,-0.144338,-0.087156,-0.208782,-0.15834,-0.112154,-0.17197,-0.17847,-0.140385,-0.158953,-0.17205,-0.057595,-0.112154,-0.185918,-0.10181
19670,-0.200724,-0.063058,-0.087832,-0.115193,-0.159361,-0.063184,-0.210541,-0.141537,-0.091192,-0.141662,-0.144627,-0.117248,-0.14387,-0.169791,-0.01976,-0.091192,-0.161145,-0.082975


# Model Comparision

### Logistic Regression

In [2]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18417
           1       0.38      0.01      0.01      1254

    accuracy                           0.94     19671
   macro avg       0.66      0.50      0.49     19671
weighted avg       0.90      0.94      0.91     19671



### Linear SVC

In [3]:
from sklearn.svm import LinearSVC

model = LinearSVC(tol=1e-5, max_iter=5000)
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18417
           1       0.00      0.00      0.00      1254

    accuracy                           0.94     19671
   macro avg       0.47      0.50      0.48     19671
weighted avg       0.88      0.94      0.91     19671





### MLP Classifier

In [4]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18417
           1       0.45      0.03      0.06      1254

    accuracy                           0.94     19671
   macro avg       0.70      0.51      0.51     19671
weighted avg       0.91      0.94      0.91     19671



### SVC: kernel = 'rbf'

In [5]:
from sklearn.svm import SVC

model = SVC(kernel='rbf')
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18417
           1       0.70      0.01      0.01      1254

    accuracy                           0.94     19671
   macro avg       0.82      0.50      0.49     19671
weighted avg       0.92      0.94      0.91     19671



### Analysis

SVC kernel = 'rbf' has the highest precision score on class 1 (bankrupt).
Recall value for class 1 is really low overall.

# Resampling Comparision

Since SVC had the best performance, I will use it to compare the effect of resampling.

### Random Under Sampler

In [6]:
from imblearn.under_sampling import RandomUnderSampler

sampler = RandomUnderSampler()
X_resample, y_resample = sampler.fit_resample(X_train, y_train)

model = SVC(kernel='rbf')
model.fit(X_resample, y_resample)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.21      0.34     18417
           1       0.07      0.94      0.14      1254

    accuracy                           0.26     19671
   macro avg       0.53      0.57      0.24     19671
weighted avg       0.92      0.26      0.33     19671



### ClusterCentroids

In [7]:
from imblearn.under_sampling import ClusterCentroids

sampler = ClusterCentroids()
X_resample, y_resample = sampler.fit_resample(X_train, y_train)

model = SVC(kernel='rbf')
model.fit(X_resample, y_resample)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.96      0.13      0.23     18417
           1       0.07      0.93      0.13      1254

    accuracy                           0.18     19671
   macro avg       0.52      0.53      0.18     19671
weighted avg       0.91      0.18      0.23     19671



### NearMiss

In [8]:
from imblearn.under_sampling import NearMiss

sampler = NearMiss(version=1, n_neighbors=2)
X_resample, y_resample = sampler.fit_resample(X_train, y_train)

model = SVC(kernel='rbf')
model.fit(X_resample, y_resample)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      0.31      0.47     18417
           1       0.06      0.69      0.12      1254

    accuracy                           0.34     19671
   macro avg       0.50      0.50      0.29     19671
weighted avg       0.88      0.34      0.45     19671



### SMOTETomek

In [9]:
from imblearn.combine import SMOTETomek

sampler = SMOTETomek(sampling_strategy='all')
X_resample, y_resample = sampler.fit_resample(X_train, y_train)

model = SVC(kernel='rbf')
model.fit(X_resample, y_resample)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.32      0.48     18417
           1       0.08      0.89      0.15      1254

    accuracy                           0.35     19671
   macro avg       0.53      0.60      0.31     19671
weighted avg       0.92      0.35      0.46     19671



### Analysis

Resampling process will make class 1 precision worse, but increase class 1 recall very well.

# Imbalanced Learn Ensemble Methods

### EasyEnsembleClassifier

In [10]:
from imblearn.ensemble import EasyEnsembleClassifier

model = EasyEnsembleClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91     18417
           1       0.14      0.31      0.19      1254

    accuracy                           0.83     19671
   macro avg       0.54      0.59      0.55     19671
weighted avg       0.90      0.83      0.86     19671



### RUSBoostClassifier

In [11]:
from imblearn.ensemble import RUSBoostClassifier

model = RUSBoostClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90     18417
           1       0.13      0.30      0.18      1254

    accuracy                           0.83     19671
   macro avg       0.54      0.58      0.54     19671
weighted avg       0.90      0.83      0.86     19671



### Balanced Random Forest Classifier

In [12]:
from imblearn.ensemble import BalancedBaggingClassifier

model = BalancedBaggingClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93     18417
           1       0.17      0.23      0.19      1254

    accuracy                           0.88     19671
   macro avg       0.56      0.58      0.56     19671
weighted avg       0.90      0.88      0.89     19671



### Analysis

All of the Imbalanced learn methods didn't work very well eigher.

# Decomposition

Next, I will try reducing the features' dimension

### PCA

In [13]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

model = SVC(kernel='rbf')
model.fit(X_train_pca, y_train)
pred = model.predict(X_test_pca)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18417
           1       0.00      0.00      0.00      1254

    accuracy                           0.94     19671
   macro avg       0.47      0.50      0.48     19671
weighted avg       0.88      0.94      0.91     19671



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Non-negative matrix factorization

In [22]:
from sklearn.decomposition import NMF

# NMF cannot have negative value, so add 100 and make the dataset with mean of 100
X_train_nmf = X_train + 100
X_test_nmf = X_test + 100


In [23]:

nmf = NMF(n_components=2, init='random')
nmf.fit(X_train_nmf)
X_train_nmf = nmf.transform(X_train_nmf)
X_test_nmf = nmf.transform(X_test_nmf)

model = SVC(kernel='rbf')
model.fit(X_train_nmf, y_train)
pred = model.predict(X_test_nmf)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18417
           1       0.00      0.00      0.00      1254

    accuracy                           0.94     19671
   macro avg       0.47      0.50      0.48     19671
weighted avg       0.88      0.94      0.91     19671



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Analysis

Dimensional reduction did not work to increase performance on category 1

# Combine original and resampled dataset

Since original dataset has better precision while random under sampler has better recall, I will combine both methods to extract the best result

1. Use SVC and train with `resampled` dataset
2. Predict testing dataset with the trained SVM model
3. Filter prediction and save category 0 and category 1 in separate dataframes
4. Use SVM and train with `original` dataset
5. Predict filtered category 1 dataframe with the trained SVM model
6. Combine category 0 dataframe created in `step 3` and prediction from `step 5`
7. Check if there's any improvement on overall performance 


### Step 1 and 2

In [59]:
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler

sampler = RandomUnderSampler()
X_resample, y_resample = sampler.fit_resample(X_train, y_train)
svc_step1 = SVC(kernel='rbf')
svc_step1.fit(X_resample, y_resample)
pred_step2 = svc_step1.predict(X_test)

### Step 3

In [60]:
test_df_with_pred_step2 = pd.concat([X_test, pd.Series(pred_step2)], axis=1)
test_df_with_pred_step2

Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses,0
0,-0.199594,-0.141596,-0.188851,-0.158005,-0.163863,-0.072463,-0.215556,-0.156723,-0.153357,-0.215075,-0.236445,-0.135659,-0.154967,-0.191582,-0.072065,-0.153357,-0.222118,-0.146289,1
1,-0.215443,-0.165890,-0.190679,-0.178147,-0.185888,-0.094894,-0.216647,-0.174752,-0.184221,-0.218643,-0.217145,-0.161139,-0.196437,-0.205648,-0.078922,-0.184221,-0.219449,-0.177933,1
2,-0.183456,-0.151310,-0.178568,-0.200559,-0.168932,-0.134841,-0.188529,-0.158184,-0.172934,-0.214003,-0.235042,-0.194698,-0.195208,-0.187763,-0.174652,-0.172934,-0.218323,-0.160734,1
3,-0.186240,-0.158815,-0.188724,-0.176846,-0.173672,-0.093812,-0.203040,-0.169162,-0.176748,-0.214476,-0.236561,-0.160170,-0.189313,-0.203606,-0.103152,-0.176748,-0.228071,-0.169595,1
4,0.670257,-0.105356,0.417748,0.554675,-0.185888,0.465208,0.257581,0.456596,0.133016,0.674757,0.410821,0.566478,0.692651,0.795436,0.779173,0.133016,0.537594,0.047973,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19666,-0.215226,-0.164916,-0.194334,-0.181122,-0.181608,-0.094862,-0.208887,-0.175621,-0.182696,-0.225049,-0.236255,-0.163658,-0.193834,-0.199539,-0.105437,-0.182696,-0.226363,-0.175619,1
19667,-0.212986,-0.165046,-0.194220,-0.184832,-0.185888,-0.103520,-0.210132,-0.175893,-0.182165,-0.224273,-0.229114,-0.168517,-0.191778,-0.200033,-0.406323,-0.182165,-0.223765,-0.174307,1
19668,-0.210084,-0.161478,-0.193038,-0.182268,-0.162754,-0.098553,-0.216656,-0.172556,-0.178827,-0.223939,-0.236561,-0.165623,-0.189595,-0.205375,-0.102255,-0.178827,-0.228833,-0.170958,1
19669,-0.198817,-0.084905,-0.122853,-0.142938,-0.144338,-0.087156,-0.208782,-0.158340,-0.112154,-0.171970,-0.178470,-0.140385,-0.158953,-0.172050,-0.057595,-0.112154,-0.185918,-0.101810,1


In [61]:
test_df_class_0 = test_df_with_pred_step2[test_df_with_pred_step2.iloc[:,-1] == 0]
test_df_class_0

Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses,0
4,0.670257,-0.105356,0.417748,0.554675,-0.185888,0.465208,0.257581,0.456596,0.133016,0.674757,0.410821,0.566478,0.692651,0.795436,0.779173,0.133016,0.537594,0.047973,0
9,-0.110079,-0.114938,0.334511,0.033223,-0.185888,-0.033534,-0.123687,0.098926,-0.064258,-0.107221,-0.228191,-0.079971,0.071117,-0.119874,-0.022624,-0.064258,-0.172950,-0.080083,0
14,0.399442,0.052613,0.220498,0.257076,0.693795,0.160202,0.321417,0.286253,0.124476,0.259485,0.333481,0.252652,0.278245,0.113892,0.189822,0.124476,0.146938,0.094396,0
19,0.488797,4.991222,0.677691,0.775708,0.569968,0.460194,1.026810,0.076880,3.898554,0.670409,0.330356,0.757812,0.534658,1.157671,0.691736,3.898554,0.807066,4.331250,0
21,1.047924,0.225467,0.468718,0.603794,0.302979,0.371371,0.393775,1.116533,0.481819,0.776769,0.293116,0.611498,1.023658,0.328663,-0.576107,0.481819,0.220397,0.439322,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19648,-0.100973,0.055005,-0.161668,-0.081551,-0.185888,-0.016204,0.029436,-0.077175,0.023804,-0.173908,-0.199403,-0.046391,-0.056747,-0.127488,0.017206,0.023804,-0.176377,0.042750,0
19649,-0.011176,-0.127534,-0.103964,0.038506,0.125745,0.039844,-0.190691,-0.047689,0.017711,-0.097346,-0.178377,0.088260,0.369928,-0.064368,0.113135,0.017711,-0.141888,0.013067,0
19650,5.417963,5.114976,33.583321,20.499112,0.607714,9.121122,11.991555,8.188419,9.639589,21.128849,20.003523,14.257492,19.001901,14.477723,2.647345,9.639589,22.036313,7.198537,0
19656,0.190766,0.211923,-0.014340,0.007931,0.192419,-0.069676,0.166603,-0.084891,0.162095,0.017179,0.056866,0.015576,0.011473,0.121448,-0.030540,0.162095,0.067791,0.184681,0


In [62]:
test_df_class_1 = test_df_with_pred_step2[test_df_with_pred_step2.iloc[:,-1] == 1]
test_df_class_1

Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses,0
0,-0.199594,-0.141596,-0.188851,-0.158005,-0.163863,-0.072463,-0.215556,-0.156723,-0.153357,-0.215075,-0.236445,-0.135659,-0.154967,-0.191582,-0.072065,-0.153357,-0.222118,-0.146289,1
1,-0.215443,-0.165890,-0.190679,-0.178147,-0.185888,-0.094894,-0.216647,-0.174752,-0.184221,-0.218643,-0.217145,-0.161139,-0.196437,-0.205648,-0.078922,-0.184221,-0.219449,-0.177933,1
2,-0.183456,-0.151310,-0.178568,-0.200559,-0.168932,-0.134841,-0.188529,-0.158184,-0.172934,-0.214003,-0.235042,-0.194698,-0.195208,-0.187763,-0.174652,-0.172934,-0.218323,-0.160734,1
3,-0.186240,-0.158815,-0.188724,-0.176846,-0.173672,-0.093812,-0.203040,-0.169162,-0.176748,-0.214476,-0.236561,-0.160170,-0.189313,-0.203606,-0.103152,-0.176748,-0.228071,-0.169595,1
5,-0.164565,-0.151996,-0.144394,-0.201924,-0.115710,-0.180138,-0.188624,-0.156914,-0.175169,-0.199613,-0.236561,-0.209047,-0.200835,-0.198294,-0.074859,-0.175169,-0.225907,-0.163043,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19666,-0.215226,-0.164916,-0.194334,-0.181122,-0.181608,-0.094862,-0.208887,-0.175621,-0.182696,-0.225049,-0.236255,-0.163658,-0.193834,-0.199539,-0.105437,-0.182696,-0.226363,-0.175619,1
19667,-0.212986,-0.165046,-0.194220,-0.184832,-0.185888,-0.103520,-0.210132,-0.175893,-0.182165,-0.224273,-0.229114,-0.168517,-0.191778,-0.200033,-0.406323,-0.182165,-0.223765,-0.174307,1
19668,-0.210084,-0.161478,-0.193038,-0.182268,-0.162754,-0.098553,-0.216656,-0.172556,-0.178827,-0.223939,-0.236561,-0.165623,-0.189595,-0.205375,-0.102255,-0.178827,-0.228833,-0.170958,1
19669,-0.198817,-0.084905,-0.122853,-0.142938,-0.144338,-0.087156,-0.208782,-0.158340,-0.112154,-0.171970,-0.178470,-0.140385,-0.158953,-0.172050,-0.057595,-0.112154,-0.185918,-0.101810,1


### Step 4 and 5

In [63]:
X_train_step3 = test_df_class_1.iloc[:,:-1]
y_train_step3 = test_df_class_1.iloc[:,-1]
X_train_step3

Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses
0,-0.199594,-0.141596,-0.188851,-0.158005,-0.163863,-0.072463,-0.215556,-0.156723,-0.153357,-0.215075,-0.236445,-0.135659,-0.154967,-0.191582,-0.072065,-0.153357,-0.222118,-0.146289
1,-0.215443,-0.165890,-0.190679,-0.178147,-0.185888,-0.094894,-0.216647,-0.174752,-0.184221,-0.218643,-0.217145,-0.161139,-0.196437,-0.205648,-0.078922,-0.184221,-0.219449,-0.177933
2,-0.183456,-0.151310,-0.178568,-0.200559,-0.168932,-0.134841,-0.188529,-0.158184,-0.172934,-0.214003,-0.235042,-0.194698,-0.195208,-0.187763,-0.174652,-0.172934,-0.218323,-0.160734
3,-0.186240,-0.158815,-0.188724,-0.176846,-0.173672,-0.093812,-0.203040,-0.169162,-0.176748,-0.214476,-0.236561,-0.160170,-0.189313,-0.203606,-0.103152,-0.176748,-0.228071,-0.169595
5,-0.164565,-0.151996,-0.144394,-0.201924,-0.115710,-0.180138,-0.188624,-0.156914,-0.175169,-0.199613,-0.236561,-0.209047,-0.200835,-0.198294,-0.074859,-0.175169,-0.225907,-0.163043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19666,-0.215226,-0.164916,-0.194334,-0.181122,-0.181608,-0.094862,-0.208887,-0.175621,-0.182696,-0.225049,-0.236255,-0.163658,-0.193834,-0.199539,-0.105437,-0.182696,-0.226363,-0.175619
19667,-0.212986,-0.165046,-0.194220,-0.184832,-0.185888,-0.103520,-0.210132,-0.175893,-0.182165,-0.224273,-0.229114,-0.168517,-0.191778,-0.200033,-0.406323,-0.182165,-0.223765,-0.174307
19668,-0.210084,-0.161478,-0.193038,-0.182268,-0.162754,-0.098553,-0.216656,-0.172556,-0.178827,-0.223939,-0.236561,-0.165623,-0.189595,-0.205375,-0.102255,-0.178827,-0.228833,-0.170958
19669,-0.198817,-0.084905,-0.122853,-0.142938,-0.144338,-0.087156,-0.208782,-0.158340,-0.112154,-0.171970,-0.178470,-0.140385,-0.158953,-0.172050,-0.057595,-0.112154,-0.185918,-0.101810


In [64]:
svc_step4 = SVC(kernel='rbf')
svc_step4.fit(X_train, y_train)
pred_step5 = model.predict(X_train_step3)

### Step 6

In [65]:
final_pred = pd.Series(pred_step5)
step_3 = X_train_step3.reset_index(drop=True)
pred_df = pd.concat([step_3, final_pred], axis=1)
combined_df = pd.concat([pred_df, test_df_class_0.reset_index(drop=True)])
combined_df['pred'] = combined_df[0]
combined_df.drop(columns=0, inplace=True)
combined_df

Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses,pred
0,-0.199594,-0.141596,-0.188851,-0.158005,-0.163863,-0.072463,-0.215556,-0.156723,-0.153357,-0.215075,-0.236445,-0.135659,-0.154967,-0.191582,-0.072065,-0.153357,-0.222118,-0.146289,1
1,-0.215443,-0.165890,-0.190679,-0.178147,-0.185888,-0.094894,-0.216647,-0.174752,-0.184221,-0.218643,-0.217145,-0.161139,-0.196437,-0.205648,-0.078922,-0.184221,-0.219449,-0.177933,1
2,-0.183456,-0.151310,-0.178568,-0.200559,-0.168932,-0.134841,-0.188529,-0.158184,-0.172934,-0.214003,-0.235042,-0.194698,-0.195208,-0.187763,-0.174652,-0.172934,-0.218323,-0.160734,1
3,-0.186240,-0.158815,-0.188724,-0.176846,-0.173672,-0.093812,-0.203040,-0.169162,-0.176748,-0.214476,-0.236561,-0.160170,-0.189313,-0.203606,-0.103152,-0.176748,-0.228071,-0.169595,1
4,-0.164565,-0.151996,-0.144394,-0.201924,-0.115710,-0.180138,-0.188624,-0.156914,-0.175169,-0.199613,-0.236561,-0.209047,-0.200835,-0.198294,-0.074859,-0.175169,-0.225907,-0.163043,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4165,-0.100973,0.055005,-0.161668,-0.081551,-0.185888,-0.016204,0.029436,-0.077175,0.023804,-0.173908,-0.199403,-0.046391,-0.056747,-0.127488,0.017206,0.023804,-0.176377,0.042750,0
4166,-0.011176,-0.127534,-0.103964,0.038506,0.125745,0.039844,-0.190691,-0.047689,0.017711,-0.097346,-0.178377,0.088260,0.369928,-0.064368,0.113135,0.017711,-0.141888,0.013067,0
4167,5.417963,5.114976,33.583321,20.499112,0.607714,9.121122,11.991555,8.188419,9.639589,21.128849,20.003523,14.257492,19.001901,14.477723,2.647345,9.639589,22.036313,7.198537,0
4168,0.190766,0.211923,-0.014340,0.007931,0.192419,-0.069676,0.166603,-0.084891,0.162095,0.017179,0.056866,0.015576,0.011473,0.121448,-0.030540,0.162095,0.067791,0.184681,0


### Step 7

In [66]:
report = classification_report(y_test, combined_df['pred'])
print(report)

              precision    recall  f1-score   support

           0       0.94      0.22      0.35     18417
           1       0.06      0.79      0.12      1254

    accuracy                           0.25     19671
   macro avg       0.50      0.50      0.23     19671
weighted avg       0.88      0.25      0.34     19671



# Combine original and resampled dataset

Since original dataset has better precision while random under sampler has better recall, I will combine both methods to extract the best result

1. Use SVC and train with `original` dataset
2. Predict testing dataset with the trained SVM model
3. Filter prediction and save category 0 and category 1 in separate dataframes
4. Use SVM and train with `resampled` dataset
5. Predict filtered category 1 dataframe with the trained SVM model
6. Combine category 0 dataframe created in `step 3` and prediction from `step 5`
7. Check if there's any improvement on overall performance 

In [68]:
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler

# Step1
svc_step1 = SVC(kernel='rbf')
svc_step1.fit(X_train, y_train)

# Step2
pred_step2 = svc_step1.predict(X_test)
test_df_with_pred_step2 = pd.concat([X_test, pd.Series(pred_step2)], axis=1)
test_df_class_0 = test_df_with_pred_step2[test_df_with_pred_step2.iloc[:,-1] == 0]
test_df_class_1 = test_df_with_pred_step2[test_df_with_pred_step2.iloc[:,-1] == 1]

# Step3
X_train_step3 = test_df_class_1.iloc[:,:-1]
y_train_step3 = test_df_class_1.iloc[:,-1]

# Step4
sampler = RandomUnderSampler()
X_resample, y_resample = sampler.fit_resample(X_train, y_train)
svc_step4 = SVC(kernel='rbf')
svc_step4.fit(X_resample, y_resample)

# Step5
pred_step5 = model.predict(X_train_step3)
final_pred = pd.Series(pred_step5)
step_3 = X_train_step3.reset_index(drop=True)

# Step6
pred_df = pd.concat([step_3, final_pred], axis=1)
combined_df = pd.concat([pred_df, test_df_class_0.reset_index(drop=True)])
combined_df['pred'] = combined_df[0]
combined_df.drop(columns=0, inplace=True)

# Step7
report = classification_report(y_test, combined_df['pred'])
print(report)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     18417
           1       0.00      0.00      0.00      1254

    accuracy                           0.94     19671
   macro avg       0.47      0.50      0.48     19671
weighted avg       0.88      0.94      0.91     19671

