In [148]:
import pandas as pd


dft = pd.read_csv("SBA_Cleaned.csv")

dft["MIS_Status"] = dft["MIS_Status"].replace({"CHGOFF": 0,"P I F": 1})
dft["NewExist"] = dft["NewExist"].replace({1: "Y",2: "N"})
dft["UrbanRural"] = dft["NewExist"].replace({0:"U",1: "Y",2: "N"})


df = dft.drop(columns=["City","Bank","BankState","RetainedJob","SBA_Appv"])


df[df["MIS_Status"] == 0].head()

Unnamed: 0,State,NAICS,Term,NoEmp,NewExist,CreateJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv
6,NJ,0,45,45,N,0,N,N,N,N,0,600000
36,OH,0,137,2,Y,0,N,Y,N,Y,0,47000
43,IN,0,120,16,N,0,N,N,Y,N,0,200000
58,ME,236115,167,9,Y,0,N,Y,N,N,0,1350000
60,MI,713930,7,10,Y,0,N,Y,N,N,0,25000


In [149]:
df = df.astype(object)
df["Term"] = pd.to_numeric(df["Term"], errors='coerce',downcast='integer')
df["NoEmp"] = pd.to_numeric(df["NoEmp"], errors='coerce',downcast='integer')
df["GrAppv"] = pd.to_numeric(df["GrAppv"], errors='coerce',downcast='integer')
# df["SBA_Appv"] = pd.to_numeric(df["SBA_Appv"], errors='coerce',downcast='integer')
df["CreateJob"] = pd.to_numeric(df["CreateJob"], errors='coerce',downcast='integer')
# df["RetainedJob"] = pd.to_numeric(df["RetainedJob"], errors='coerce',downcast='integer')
df["MIS_Status"] = pd.to_numeric(df["MIS_Status"], errors='coerce',downcast='integer')

df["NAICS"].astype(str)

df["NAICS"] = df["NAICS"].apply(lambda x: str(x)[:2])

In [150]:
from sklearn.model_selection import train_test_split



X = df.drop("MIS_Status", axis=1)
y = df["MIS_Status"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)


categ_col = list(X.select_dtypes(include=['object', 'category']).columns)

num_col = list(X.select_dtypes(include=['float64', 'int64']).columns)

In [151]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

categorical_transformer = OneHotEncoder()

numerical_transformer = MinMaxScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categ_col),
        ('num', numerical_transformer, num_col),
        
    ])


In [162]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

# XG_model = XGBClassifier(objective='binary:logistic', learning_rate=0.1, n_estimators=100)

# param_grid = {
    # 'classifier__max_depth': [3, 5, 7],
    # 'classifier__n_estimators': [50, 100, 200],
    # 'classifier__learning_rate': [0.1, 0.2, 0.3]}

# Best parameters:  {'classifier__n_estimators': 200, 'classifier__max_depth': 7, 'classifier__learning_rate': 0.2}
# Best score:  0.8241340504943894

model_XG = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', XGBClassifier(objective='binary:logistic',tree_method = "hist",colsample_bytree = 0.7,_n_estimators = 200, max_depth = 10, learning_rate = 0.2,scale_pos_weight=124630/583523,verbosity = 3))])

# grid_search_XG = RandomizedSearchCV(model_XG,param_grid,cv=5,verbose=3)

In [163]:
model_XG.fit(X_train,y_train)



[14:07:14] INFO: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
Parameters: { "_n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[14:07:14] INFO: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[14:07:14] AllReduce: 1.9e-05s, 1 calls @ 19us

[14:07:14] MakeCuts: 3.6e-05s, 1 calls @ 36us

[14:07:14] PushRowPage: 0.024846s, 1 calls @ 24846us

[14:07:14] INFO: /tmp/abs_40obctay9q/croots/recipe/xgboost-split_1659548945886/work/src/tree/updater_prune.cc:101: tree pruning end, 39

In [164]:
import pickle as pkl

with open("model_XG.pkl", "wb") as file:
    pkl.dump(model_XG, file)

In [165]:
# print("Best parameters: ", grid_search_XG.best_params_)
# print("Best score: ", grid_search_XG.best_score_)

In [166]:
y_pred = model_XG.predict(X_test)

In [167]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: ", acc)
print("Precision: ", prec)
print("Recall: ", rec)
print("F1-score: ", f1)

Accuracy:  0.6180107208016313
Precision:  0.895262147691686
Recall:  0.6074951501566345
F1-score:  0.7238260145137643


In [168]:
from sklearn.metrics import classification_report

y_tpred = model_XG.predict(X_train)

print(classification_report(y_train, y_tpred))

              precision    recall  f1-score   support

           0       0.27      0.68      0.39    124630
           1       0.90      0.61      0.73    583523

    accuracy                           0.62    708153
   macro avg       0.59      0.64      0.56    708153
weighted avg       0.79      0.62      0.67    708153



In [170]:

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.27      0.67      0.38     31158
           1       0.90      0.61      0.72    145881

    accuracy                           0.62    177039
   macro avg       0.58      0.64      0.55    177039
weighted avg       0.78      0.62      0.66    177039



In [160]:
import numpy as np
Test = {"State": "IN",
  "NAICS": "0",
  "Term": 120,
  "NoEmp": 16,
  "NewExist": "N",
  "CreateJob": 0,
  "RetainedJob": 0,
  "FranchiseCode": "N",
  "UrbanRural": "Y",
  "RevLineCr": "Y",
  "LowDoc": "N",
  "GrAppv": 200000,
  "SBA_Appv": 150000}

data = pd.DataFrame(data=Test,index=[0])

data

# data = X_train.iloc[0]
# data = data.to_frame().T
# data
model_XG.predict(data)[0]

0