In [11]:
import pandas as pd
import numpy as np
# Single seed used by the stratified sampling in sample_data() and as the
# default seed of make_test_train(), so repeated runs give the same split.
random_state=42
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by pandas / scikit-learn. It is placed
# before the scikit-learn and matplotlib imports, so import-time warnings are
# hidden as well. Consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

In [10]:
# Feature configuration.
# The string below lists the financial quantities the model is meant to use.
'''
Market Capitalization
Total Revenues
Total Assets
Earnings per Share
Net Operating Cash Flows
Total Debt
EBIT Margin
'''
# Model input columns.
# NOTE(review): these look like Compustat mnemonics (csho * prcc_f presumably
# gives market capitalization; revt, at, epsfi, oancf, dt and ebit presumably
# map onto the remaining items above) -- confirm against the data dictionary.
columns_selected=[
    'csho',
    'prcc_f',
    'revt',
    'at',
    'epsfi',
    'oancf',
    'dt',
    'ebit',
]

# Target column: 'stalt' is overwritten with 0.0 (healthy) / 1.0 (bankrupt)
# by sample_data().
preditor=['stalt']

# Columns read from the raw CSV files: the model inputs, then the label,
# the company status flag ('costat', filtered on 'A') and the key used for
# stratified sampling ('spcseccd').
sample_columns=columns_selected+['stalt','costat','spcseccd']

In [7]:
def sample_data(fiscal=None,freq=0.01,columns=sample_columns):
    """Build a labelled data set of healthy and bankrupt company records.

    Healthy records are read from ``healthy_accouting_stalt.csv`` and labelled
    ``stalt = 0.0``; bankrupt records are read from
    ``bankruptcy_accouting_stalt.csv`` and labelled ``stalt = 1.0``. Rows with
    a missing value in any of ``columns`` are dropped from both sets. Only
    healthy rows with ``costat == 'A'`` (active companies) are kept, and a
    fraction ``freq`` of them is drawn from every ``spcseccd`` group, so the
    healthy sample is stratified by that column. All bankrupt rows are kept.

    Arguments:
    fiscal  - currently unused; kept so existing callers keep working
    freq    - fraction of active healthy rows sampled from each 'spcseccd' group
    columns - columns to keep; must include 'stalt', 'costat' and 'spcseccd'

    Returns:
    A new DataFrame with the sampled healthy rows followed by the bankrupt
    rows, columns in the order given by ``columns`` and a fresh 0..n-1 index.
    """
    healthy=pd.read_csv('../data/accouting_data/healthy_accouting_stalt.csv')
    bankrupt=pd.read_csv('../data/accouting_data/bankruptcy_accouting_stalt.csv')
    healthy['stalt']=0.0   # label healthy
    bankrupt['stalt']=1.0  # label bankruptcy

    # Naive missing-data handling: drop every row with a NaN in a kept column.
    healthy=healthy[columns].dropna()
    bankrupt=bankrupt[columns].dropna()

    # Keep only active companies on the healthy side.
    healthy=healthy[healthy['costat']=='A']

    # Stratified sample: draw the same fraction from every 'spcseccd' group.
    # Iterating the groups (instead of groupby.apply + drop + reset_index)
    # keeps the grouping column in place and does not rely on how apply()
    # treats grouping columns. Each group is sampled with the same seed,
    # as before, so the selected rows are unchanged.
    per_group=[
        group.sample(frac=freq,random_state=random_state)
        for _,group in healthy.groupby('spcseccd')
    ]
    # No active healthy rows at all -> keep an empty frame with the same columns.
    sampled=pd.concat(per_group) if per_group else healthy.iloc[0:0]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # ignore_index=True yields the fresh RangeIndex the old
    # reset_index().drop(columns='index') produced.
    return pd.concat([sampled,bankrupt],ignore_index=True)
        
        

In [13]:
def make_test_train(df,columns=columns_selected,pred=preditor,random_state=random_state):
    """Split a labelled frame into stratified train and test sets (70% / 30%).

    Arguments:
    df           - DataFrame holding both the feature and the label columns
    columns      - feature columns used to build X
    pred         - label column(s) used to build y; also used for stratification
    random_state - seed passed to train_test_split

    Returns:
    X_train, X_test, y_train, y_test

    No missing-value handling is done here; df is used as given.
    """
    # Honour the `columns` argument (the global column list used to be
    # hard-wired here, so a caller-supplied feature list was silently ignored).
    X=df[columns]
    y=df[pred]
    X_train,X_test,y_train,y_test=train_test_split(
        X,y,test_size=0.3,random_state=random_state,stratify=y)

    return X_train,X_test,y_train,y_test

In [1]:
def _plot_roc_curve(fpr,tpr,roc_auc,
                    title='Logistic Regression ROC Curve',
                    save_path='../picture/ROC_curve_1.png'):
    """Draw one ROC curve on its own figure, try to save it, then show it."""
    fig,ax=plt.subplots()
    try:
        ax.set_title(title)
        ax.plot(fpr,tpr,'b',label='AUC=%0.2f' % roc_auc)
        ax.legend(loc='lower right')
        ax.plot([0,1],[0,1],'r--')  # chance-level diagonal
        ax.set_xlabel('False Positive rate')
        ax.set_ylabel('True Positive rate')
        try:
            fig.savefig(save_path)
        except OSError as exc:
            # A missing or unwritable output directory must not hide the plot.
            print('calc_metrics: could not save ROC curve to %r: %s' % (save_path,exc))
        plt.show()
    finally:
        # Always release the figure so repeated calls do not draw on top of
        # each other or accumulate open figures.
        plt.close(fig)


def calc_metrics(model,X_test,y_test,threshold=0.5):
    """Evaluate a binary classifier and plot its ROC curve.

    Arguments:
    model     - trained model exposing predict_proba (e.g. Logistic Regression),
                or None
    X_test    - test features passed to model.predict_proba
    y_test    - binary labels in {0,1}
    threshold - probability above which a sample is predicted as class 1

    Returns:
    (roc_auc, accuracy_score, ks) where ks is the maximum of (tpr - fpr)
    along the ROC curve. Returns (0.0, 0.0, 0.0) when model is None.

    Plotting is best effort: a plotting failure is reported on stdout and the
    metrics are still returned.
    """
    if model is None:
        return 0.0,0.0,0.0

    # Probability of the positive class (second column of predict_proba).
    predicted_prob=model.predict_proba(X_test)[:,1]
    predicted_binary=(predicted_prob>threshold).astype(int)

    fpr,tpr,_=metrics.roc_curve(y_test,predicted_prob,pos_label=1)

    # AUC from the ROC points, and the largest gap between TPR and FPR.
    roc_auc=metrics.auc(fpr,tpr)
    ks=np.max(tpr-fpr)

    # Accuracy of the thresholded predictions.
    accuracy_score=metrics.accuracy_score(y_test,predicted_binary)

    try:
        _plot_roc_curve(fpr,tpr,roc_auc)
    except Exception as exc:
        # Narrower than a bare except (KeyboardInterrupt still propagates),
        # and the failure is reported instead of being silently dropped.
        print('calc_metrics: ROC plot skipped (%s: %s)' % (type(exc).__name__,exc))

    return roc_auc,accuracy_score,ks