In [30]:
import pandas as pd


dft = pd.read_csv("SBA_Cleaned.csv")

dft["MIS_Status"] = dft["MIS_Status"].replace({"CHGOFF": 0,"P I F": 1})

df = dft.drop(columns=["City","Bank","BankState"])


dft.head()

Unnamed: 0,City,State,Bank,BankState,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv
0,EVANSVILLE,IN,FIFTH THIRD BANK,OH,451120,84,4,2,84,84,N,0,N,Y,1,60000,48000
1,NEW PARIS,IN,1ST SOURCE BANK,IN,722410,60,2,2,60,60,N,0,N,Y,1,40000,32000
2,BLOOMINGTON,IN,GRANT COUNTY STATE BANK,IN,621210,180,7,1,180,180,N,0,N,N,1,287000,215250
3,BROKEN ARROW,OK,1ST NATL BK & TR CO OF BROKEN,OK,0,60,2,1,60,60,N,0,N,Y,1,35000,28000
4,ORLANDO,FL,FLORIDA BUS. DEVEL CORP,FL,0,240,14,1,240,240,N,0,N,N,1,229000,229000


In [31]:
df = df.astype(object)
df["Term"] = pd.to_numeric(df["Term"], errors='coerce',downcast='integer')
df["NoEmp"] = pd.to_numeric(df["NoEmp"], errors='coerce',downcast='integer')
df["GrAppv"] = pd.to_numeric(df["GrAppv"], errors='coerce',downcast='integer')
df["SBA_Appv"] = pd.to_numeric(df["SBA_Appv"], errors='coerce',downcast='integer')
df["CreateJob"] = pd.to_numeric(df["CreateJob"], errors='coerce',downcast='integer')
df["RetainedJob"] = pd.to_numeric(df["RetainedJob"], errors='coerce',downcast='integer')
df["MIS_Status"] = pd.to_numeric(df["MIS_Status"], errors='coerce',downcast='integer')

df["NAICS"].astype(str)

df["NAICS"] = df["NAICS"].apply(lambda x: str(x)[:2])

In [32]:
from sklearn.model_selection import train_test_split



X = df.drop("MIS_Status", axis=1)
y = df["MIS_Status"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)


categ_col = list(X.select_dtypes(include=['object', 'category']).columns)

num_col = list(X.select_dtypes(include=['float64', 'int64']).columns)

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

categorical_transformer = OneHotEncoder()

numerical_transformer = MinMaxScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categ_col),
        ('num', numerical_transformer, num_col),
        
    ])


In [34]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

XG_model = XGBClassifier(objective='binary:logistic', learning_rate=0.1, n_estimators=100)

param_grid = {
    'classifier__max_depth': [3, 5, 7],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.1, 0.2, 0.3]

}
# Best parameters:  {'classifier__n_estimators': 200, 'classifier__max_depth': 7, 'classifier__learning_rate': 0.2}
# Best score:  0.8241340504943894

model_XG = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', XGBClassifier(n_estimators = 200, max_depth = 10, learning_rate = 0.2,use_label_encoder=False,eval_metric='error',scale_pos_weight=124630/583523,verbosity = 3))])

grid_search_XG = RandomizedSearchCV(model_XG,param_grid,cv=5,verbose=3)

In [35]:
import xgboost
xgboost.__version__

'1.5.0'

In [36]:
model_XG.fit(X_train,y_train)

[15:00:19] DEBUG: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/gbm/gbtree.cc:155: Using tree method: 2
[15:00:19] INFO: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/tree/updater_prune.cc:101: tree pruning end, 604 extra nodes, 0 pruned nodes, max_depth=10
[15:00:19] INFO: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/tree/updater_prune.cc:101: tree pruning end, 570 extra nodes, 0 pruned nodes, max_depth=10
[15:00:19] INFO: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/tree/updater_prune.cc:101: tree pruning end, 546 extra nodes, 0 pruned nodes, max_depth=10
[15:00:19] INFO: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/tree/updater_prune.cc:101: tree pruning end, 556 extra nodes, 0 pruned nodes, max_depth=10
[15:00:20] INFO: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/tree/updater_prune.cc:101: tree pruning end, 534 extra nodes, 0 pru

In [37]:
import pickle as pkl

with open("model_XG.pkl", "wb") as file:
    pkl.dump(model_XG, file)

In [38]:
# print("Best parameters: ", grid_search_XG.best_params_)
# print("Best score: ", grid_search_XG.best_score_)

In [39]:
y_pred = model_XG.predict(X_test)

In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: ", acc)
print("Precision: ", prec)
print("Recall: ", rec)
print("F1-score: ", f1)

Accuracy:  0.6288162495269404
Precision:  0.9063162056137292
Recall:  0.6128899582536451
F1-score:  0.731266255541197


In [41]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.28      0.70      0.40     31158
           1       0.91      0.61      0.73    145881

    accuracy                           0.63    177039
   macro avg       0.59      0.66      0.57    177039
weighted avg       0.80      0.63      0.67    177039



In [42]:
from xgboost import plot_importance

plot_importance(model_XG)



ValueError: tree must be Booster, XGBModel or dict instance