<h1 style="color:blue"> Chapter 6 </h1>

In [None]:
import pandas as pd
import numpy as np
import scipy 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

<h2 style="color:red"> Case 1: Detecting Patterns in Financial Statements </h2>

In [None]:
def fsa_forecast(df,Revenue=100,min_g=0.05,max_g = 0.1,min_m=0.05,
                        max_m=0.15,min_ir=0.5,max_ir=1,tax = 0.3, n_ahead=5):
    """Forecast revenue, EBIT, net investment and free cash flow from peer history.

    For every base fiscal year except the last one present in ``df``, the
    firms whose ``g``, ``m`` and ``ir`` values fall inside the given bands in
    that year are selected.  Their ``g``/``m``/``ir`` values in the base year
    and in each of the following ``n_ahead`` years are pooled, and the column
    means of the pooled values drive the forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``cocode``, ``fyear``, ``g``, ``m`` and ``ir``.
    Revenue : float
        Level to which the cumulative ``1 + g`` factors are applied.
    min_g, max_g : float
        Inclusive bounds on ``g`` in the base year.
    min_m, max_m : float
        Inclusive bounds on ``m`` in the base year.
    min_ir, max_ir : float
        Exclusive bounds on ``ir`` in the base year.
    tax : float
        Rate applied to EBIT before computing net investment and cash flow.
    n_ahead : int
        Number of years after the base year that are tracked.

    Returns
    -------
    tuple
        ``(forecast, g_mean, m_mean, ir_mean, result_g, result_m, result_ir)``.
        ``forecast`` is indexed ``0..n_ahead`` with the columns ``revenue``,
        ``ebit``, ``net_inv`` and ``fcf``; the ``*_mean`` arrays hold one
        average per horizon; the ``result_*`` frames hold the pooled values
        with columns ``"Year 0"`` .. ``"Year n_ahead"``.

    Notes
    -----
    The printed count is the number of pooled base-year rows, so a firm that
    qualifies in several base years is counted once per year.
    """
    cols = ["Year "+str(i) for i in range(n_ahead+1)]
    measures = ("g", "m", "ir")

    # int() keeps range() working when fyear was parsed as float (e.g. 2001.0).
    first_year = int(df["fyear"].min())
    last_year = int(df["fyear"].max())

    # The band test does not depend on the base year, so evaluate it once.
    in_band = ((df["g"] >= min_g) & (df["g"] <= max_g) &
               (df["m"] >= min_m) & (df["m"] <= max_m) &
               (df["ir"] > min_ir) & (df["ir"] < max_ir))

    collected = {name: [] for name in measures}
    for base_year in range(first_year, last_year):
        peer_codes = df.loc[in_band & (df["fyear"] == base_year), "cocode"]
        if peer_codes.empty:
            continue

        peer_rows = df[df["cocode"].isin(peer_codes)]
        # One positional slice per horizon: base year first, then each year ahead.
        yearly = [peer_rows[peer_rows["fyear"] == base_year + j].reset_index(drop=True)
                  for j in range(n_ahead + 1)]
        # Later years are aligned positionally to the base-year rows: padded
        # with NaN when firms drop out, truncated if there are extra rows.
        base_index = yearly[0].index

        for name in measures:
            collected[name].append(pd.DataFrame(
                {col: rows[name].reindex(base_index) for col, rows in zip(cols, yearly)},
                columns=cols))

    def _pool(frames):
        # Single concat per measure instead of growing a frame inside the loop;
        # an explicit float frame keeps the means defined (NaN) with no matches.
        if frames:
            return pd.concat(frames, ignore_index=True)
        return pd.DataFrame(columns=cols, dtype=float)

    result_g = _pool(collected["g"])
    result_m = _pool(collected["m"])
    result_ir = _pool(collected["ir"])

    g_mean = result_g.mean(axis=0).values
    m_mean = result_m.mean(axis=0).values
    ir_mean = result_ir.mean(axis=0).values

    forecast = pd.DataFrame(index=range(0,n_ahead+1))
    forecast['revenue'] = (g_mean+1).cumprod()*Revenue
    forecast['ebit'] = forecast.revenue * m_mean
    forecast['net_inv']= forecast.ebit*(1-tax) * ir_mean
    forecast['fcf'] = forecast.ebit*(1-tax) - forecast.net_inv

    print(f"There are {result_g.shape[0]} comparable companies.\n")

    return forecast,g_mean,m_mean,ir_mean,result_g, result_m, result_ir 

In [None]:
# Load the Case 1 data from the working directory.
# NOTE(review): fsa_forecast reads the columns cocode, fyear, g, m and ir - confirm fin1.csv provides them.
fin1 = pd.read_csv("fin1.csv")

In [None]:
# Run the forecast with its default parameters and display only the first
# element of the returned tuple (the forecast table).
fsa_forecast(fin1)[0]

<h2 style="color:red"> Case 2: Predicting Corporate Bankruptcy </h2>

In [None]:
from scipy.io import arff #for reading the arff file

In [None]:
# loadarff returns an object of length 2: the first item is the dataset, the
# second only states whether each variable is numeric or nominal.
data = arff.loadarff('5year.arff')
df = pd.DataFrame(data[0])

In [None]:
# Number of rows per value of the 'class' column, before it is recoded.
df.value_counts('class')

In [None]:
# Recode the label to integers: b'0' becomes 0, every other value becomes 1.
# NOTE: not idempotent - after one run no value equals b'0', so running this
# cell a second time would set every row to 1.
df['class'] = np.where(df['class']==b'0',0,1)

In [None]:
# Store the 0/1 label as a pandas categorical column.
df['class'] = df['class'].astype('category')

In [None]:
# Dimensions of the full frame, then of the frame without incomplete rows.
for frame in (df, df.dropna()):
    print(frame.shape)

In [None]:
# Missing-value count per column, largest first.
df.isnull().sum().sort_values(ascending=False)
# Attribute 37 has 2548 missing values.

In [None]:
# First drop the Attr37 column, then drop every row that still has a missing value.
df1 = df.drop('Attr37',axis=1).dropna(axis=0)
# Rows and columns that remain.
df1.shape

In [None]:
# Variant used for modelling: drop Attr27 and Attr45 as well as Attr37 before
# removing incomplete rows.
# NOTE(review): presumably these are the next most incomplete columns - confirm against the counts above.
df2 = df.drop(['Attr37','Attr27','Attr45'],axis=1).dropna(axis=0)
df2.shape

In [None]:
# Every column except the label. Index.difference returns the names sorted,
# so the feature order is alphabetical rather than the file order.
feature_columns = df2.columns.difference(['class'])

In [None]:
# Feature matrix and label vector taken from the cleaned frame.
X, y = df2[feature_columns], df2['class']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 75% / 25% train/test split with a fixed seed. No `stratify` argument is
# passed, so the class proportions of the two parts are not forced to match.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.75,random_state=13)

In [None]:
from feature_engine.selection import DropCorrelatedFeatures

In [None]:
# Correlation filter from feature_engine, configured with a threshold of 0.8.
tr = DropCorrelatedFeatures(threshold=0.8)

In [None]:
# Fit the filter on the training data only and apply it to that data.
X_train1 = tr.fit_transform(X_train,y_train)

In [None]:
# Training set dimensions before the correlation filter.
X_train.shape

In [None]:
# Training set dimensions after the correlation filter.
X_train1.shape

In [None]:
# Names of the features that survived the correlation filter; used later to
# label feature importances and selection masks.
final_features = X_train1.columns

In [None]:
# Apply the filter fitted on the training data to the test set (no refit).
X_test1 = tr.transform(X_test)

In [None]:
# X_test1 = X_test[final_features]

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE (seeded) on the filtered training data only; the test
# set is not resampled.
smote= SMOTE(random_state=13)
X_resampled, y_resampled = smote.fit_resample(X_train1, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with a fixed seed and an iteration cap of 1000.
lr = LogisticRegression(random_state=13,max_iter=1000)

In [None]:
# Show the full parameter set of the model, including the values left at their defaults.
lr.get_params()

In [None]:
# Train on the SMOTE-resampled training data.
lr.fit(X_resampled,y_resampled)

In [None]:
# Accuracy on the resampled training data and on the untouched test data.
train_acc = lr.score(X_resampled, y_resampled)
test_acc = lr.score(X_test1, y_test)
print(f'The accuracy score on the training dataset is {train_acc:.2f}')
print(f'The accuracy score on the test dataset is {test_acc:.2f}')

In [None]:
# Class predictions of the logistic regression on the test set.
y_pred = lr.predict(X_test1)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Rows are true classes, columns are predicted classes, in sorted label order (0 then 1).
confusion_matrix(y_test,y_pred)

In [None]:
# Same matrix with the label order reversed, so class 1 comes first.
confusion_matrix(y_test,y_pred,labels=[1,0])

In [None]:
from sklearn.metrics import plot_confusion_matrix, ConfusionMatrixDisplay

In [None]:
plot_confusion_matrix(lr,X_test1,y_test,labels=[1,0],
                      display_labels=['Bankrupt','Non-Bankrupt'],
                      cmap='binary');

In [None]:
# With the default label order (0, 1) the flattened 2x2 matrix reads
# tn, fp, fn, tp, where class 1 is the positive class.
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()

In [None]:
# Report the four confusion-matrix cells, one per line.
for label, count in (('True Positive', tp), ('True Negative', tn),
                     ('False Positive', fp), ('False Negative', fn)):
    print(f'{label}: {count}')


In [None]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [None]:
# Precision and recall of the current test-set predictions.
for metric_name, metric in (('Precision', precision_score), ('Recall', recall_score)):
    print(f'{metric_name} Score is: {metric(y_test,y_pred):.2f}')

In [None]:
# Score with the second predict_proba column (probability of class 1).
# Computed first so the message stays on one line: the earlier backslash
# continuation inside the string put the next line's indentation into the output.
auc_lr = roc_auc_score(y_test,lr.predict_proba(X_test1)[:, 1])
print(f'Area under the ROC curve is: {auc_lr:.2f}')

In [None]:
from sklearn.metrics import plot_roc_curve

In [None]:
plot_roc_curve(lr,X_test1,y_test);

In [None]:
from yellowbrick.classifier import DiscriminationThreshold

In [None]:
# Yellowbrick threshold visualizer around the logistic regression: a single
# trial, seeded, with argmax set to 'fscore' and the queue-rate curve excluded.
visualizer = DiscriminationThreshold(lr,
                                     n_trials=1,
                                     argmax='fscore',
                                     random_state=13,
                                     exclude = "queue_rate")

In [None]:
# Fit the visualizer on the resampled training data, then render the plot.
visualizer.fit(X_resampled, y_resampled)
visualizer.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 5-nearest-neighbours model: train on the resampled data, then report the
# test-set ROC AUC from the class-1 probabilities.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_resampled, y_resampled)
y_pred, y_prob = knn.predict(X_test1), knn.predict_proba(X_test1)
roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes: train on the resampled data, then report the test-set
# ROC AUC from the class-1 probabilities.
nb = GaussianNB().fit(X_resampled, y_resampled)
y_pred, y_prob = nb.predict(X_test1), nb.predict_proba(X_test1)
roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])

In [None]:
from sklearn.svm import NuSVC

In [None]:
# RBF-kernel NuSVC (seeded), evaluated by recall on the test set.
svm = NuSVC(random_state=13, kernel='rbf').fit(X_resampled, y_resampled)
y_pred = svm.predict(X_test1)
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Depth-5 decision tree (seeded): train on the resampled data, then report the
# test-set ROC AUC from the class-1 probabilities.
dt = DecisionTreeClassifier(max_depth=5, random_state=13).fit(X_resampled, y_resampled)
y_pred, y_prob = dt.predict(X_test1), dt.predict_proba(X_test1)
roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])

In [None]:
# Importances of the fitted tree, labelled with the retained feature names,
# largest first.
pd.Series(dt.feature_importances_,index=final_features).sort_values(ascending=False)

## Feature Importance

In [None]:
from sklearn.feature_selection import RFECV, SelectFromModel

In [None]:
# Recursive feature elimination with 5-fold cross-validation around a depth-5
# tree, then reduce the resampled training data to the selected columns.
dt = DecisionTreeClassifier(max_depth=5, random_state=13)
rfecv = RFECV(estimator=dt, cv=5).fit(X_resampled, y_resampled)
X1 = rfecv.transform(X_resampled)

In [None]:
# Dimensions after recursive feature elimination.
X1.shape

In [None]:
# Names of the features kept by RFECV (support_ is its selection mask).
final_features[rfecv.support_]

In [None]:
# Get a mask, or integer index, of the features selected

In [None]:
# Model-based selection around a depth-5 tree, then reduce the resampled
# training data to the selected columns.
dt = DecisionTreeClassifier(max_depth=5, random_state=13)
sm = SelectFromModel(estimator=dt).fit(X_resampled, y_resampled)
X1 = sm.transform(X_resampled)

In [None]:
# Dimensions after SelectFromModel.
X1.shape

In [None]:
# Names of the features kept by SelectFromModel (get_support() returns the selection mask).
final_features[sm.get_support()]

In [None]:
# List the tree's hyper-parameters, i.e. the candidates for tuning below.
dt.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid of 5 depths x 3 max_features values (15 combinations).
params = {'max_depth':[3,4,5,6,7],'max_features': [5,10,15]}

In [None]:
# Seeded like every other estimator in this notebook: the grid sets
# max_features, which makes the tree consider a random subset of features at
# each split, so an unseeded tree gives a different result on every run.
dt=DecisionTreeClassifier(random_state=13)
# Exhaustive search scored by ROC AUC; cv=None selects the library's default splitter.
grid = GridSearchCV(estimator=dt,param_grid=params,cv=None,
                    scoring='roc_auc')
grid.fit(X_resampled,y_resampled)

In [None]:
# Parameter combination with the best cross-validated ROC AUC.
grid.best_params_

In [None]:
# With the default refit=True the search has already refit the best
# configuration on all of X_resampled, so best_estimator_ is used as is.
best_model = grid.best_estimator_
# Test-set ROC AUC from the class-1 probabilities.
y_prob = best_model.predict_proba(X_test1)[:,1]
roc_auc_score(y_test,y_prob)

In [None]:
# Rows and feature count of the resampled training data; the feature count
# bounds the usable max_features values below.
X_resampled.shape

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the randomized search: depths 1-9, max_features 1-29 and
# both split criteria.
params = {'max_depth':range(1,10),'max_features': range(1,30),
         'criterion':['gini','entropy']}

In [None]:
# Seed the tree as well as the search: random_state on RandomizedSearchCV only
# fixes which parameter combinations are drawn, while max_features still makes
# each tree pick features at random.
dt=DecisionTreeClassifier(random_state=13)
# Randomized search scored by ROC AUC; cv=None selects the library's default splitter.
grid = RandomizedSearchCV(estimator=dt,param_distributions=params,cv=None,
                    scoring='roc_auc',random_state=13)
grid.fit(X_resampled,y_resampled)

In [None]:
# Best parameter combination among the sampled candidates.
grid.best_params_

In [None]:
# With the default refit=True the search has already refit the best
# configuration on all of X_resampled, so best_estimator_ is used as is.
best_model = grid.best_estimator_
# Test-set ROC AUC from the class-1 probabilities.
y_prob = best_model.predict_proba(X_test1)[:,1]
roc_auc_score(y_test,y_prob)

## Ensemble Methods

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Voting ensemble over three base models, all with default settings apart from
# the seeded, iteration-capped logistic regression.
knn, nb = KNeighborsClassifier(), GaussianNB()
log = LogisticRegression(max_iter=1000, random_state=13)
classifiers = [
    ('knn', knn),
    ('Naive Bayes', nb),
    ('Logistic', log),
]
vc = VotingClassifier(estimators=classifiers)
vc.fit(X_resampled, y_resampled)

In [None]:
from sklearn.metrics import f1_score
# F1 score of the voting ensemble on the test set.
y_pred = vc.predict(X_test1)
f1_score(y_test,y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 500 depth-5 trees (seeded), evaluated by F1 on the test set.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    random_state=13,
).fit(X_resampled, y_resampled)
y_pred = rf.predict(X_test1)
f1_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with 100 stages (seeded), evaluated by F1 on the test set.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    random_state=13,
).fit(X_resampled, y_resampled)
y_pred = gbc.predict(X_test1)
f1_score(y_true=y_test, y_pred=y_pred)

## AutoML using lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier

In [None]:
# lazypredict runner: seeded, with warnings suppressed and verbosity set to 0.
auto = LazyClassifier(ignore_warnings=True,verbose=0,random_state=13)

In [None]:
# Argument order is train features, test features, train labels, test labels.
# The cells below read the score table from the first element of the result.
models = auto.fit(X_resampled,X_test1,y_resampled,y_test)

In [None]:
# First rows of the score table, in the order returned by lazypredict.
models[0].head()

In [None]:
# Top models ranked by the 'ROC AUC' column.
models[0].sort_values(by ='ROC AUC',ascending=False).head()

In [None]:
# Top models ranked by the 'F1 Score' column.
models[0].sort_values(by ='F1 Score',ascending=False).head()