## CATBOOST

In [1]:
# !pip install catboost

Defaulting to user installation because normal site-packages is not writeable
Collecting catboost
  Downloading catboost-1.2-cp39-cp39-win_amd64.whl (101.0 MB)
     -------------------------------------- 101.0/101.0 MB 4.4 MB/s eta 0:00:00
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     -------------------------------------- 47.0/47.0 kB 391.5 kB/s eta 0:00:00
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2 graphviz-0.20.1


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from catboost import CatBoostClassifier
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [3]:
def get_summary(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    return summary

In [4]:
file_loc = "data.csv"

In [5]:
df = pd.read_csv(file_loc)

In [6]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849.0,0.0,0.0,360.0,Yes,1
1,4583.0,1508.0,128.0,360.0,Yes,0
2,3000.0,0.0,66.0,360.0,Yes,1
3,2583.0,2358.0,120.0,360.0,Yes,1
4,6000.0,0.0,141.0,360.0,Yes,1


In [7]:
print ("Total number of rows in dataset = {}".format(df.shape[0]))
print ("Total number of columns in dataset = {}".format(df.shape[1]))

Total number of rows in dataset = 614
Total number of columns in dataset = 6


In [8]:
result = get_summary(df)
result

Dataset Shape: (614, 6)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value
0,ApplicantIncome,float64,2,503,5849.0,4583.0
1,CoapplicantIncome,float64,2,287,0.0,1508.0
2,LoanAmount,float64,3,203,0.0,128.0
3,Loan_Amount_Term,float64,2,11,360.0,360.0
4,Credit_History,object,0,2,Yes,Yes
5,Loan_Status,int64,0,2,1,0


In [9]:
target_col = "Loan_Status"
X = df.loc[:, df.columns != target_col]
y = df.loc[:, target_col]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.8, 
                                                    random_state=42)

In [11]:
X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
397,3033.0,1459.0,95.0,360.0,Yes
254,16250.0,0.0,192.0,360.0,No
217,3727.0,1775.0,131.0,360.0,Yes
4,6000.0,0.0,141.0,360.0,Yes
256,6045.0,0.0,115.0,360.0,No


In [12]:
features = list(X_train.columns)

In [13]:
cat_features = ["Credit_History"]

In [16]:
model_cb = CatBoostClassifier(task_type='CPU', iterations=100, 
                              random_state = 2022, 
                              eval_metric="F1")

In [17]:
model_cb.fit(X_train, y_train, cat_features= cat_features, plot=True, 
             eval_set=(X_test, y_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.051314
0:	learn: 0.8743169	test: 0.8064047	best: 0.8064047 (0)	total: 155ms	remaining: 15.3s
1:	learn: 0.8663102	test: 0.8314607	best: 0.8314607 (1)	total: 157ms	remaining: 7.71s
2:	learn: 0.8663102	test: 0.8298172	best: 0.8314607 (1)	total: 160ms	remaining: 5.16s
3:	learn: 0.8617021	test: 0.8370787	best: 0.8370787 (3)	total: 161ms	remaining: 3.87s
4:	learn: 0.8617021	test: 0.8394366	best: 0.8394366 (4)	total: 164ms	remaining: 3.12s
5:	learn: 0.8617021	test: 0.8382560	best: 0.8394366 (4)	total: 168ms	remaining: 2.63s
6:	learn: 0.8617021	test: 0.8394366	best: 0.8394366 (4)	total: 169ms	remaining: 2.25s
7:	learn: 0.8617021	test: 0.8394366	best: 0.8394366 (4)	total: 173ms	remaining: 1.99s
8:	learn: 0.8617021	test: 0.8394366	best: 0.8394366 (4)	total: 176ms	remaining: 1.78s
9:	learn: 0.8617021	test: 0.8394366	best: 0.8394366 (4)	total: 178ms	remaining: 1.6s
10:	learn: 0.8617021	test: 0.8394366	best: 0.8394366 (4)	total: 179ms	remaining: 1.45s
11:	learn: 0.8617021	tes

<catboost.core.CatBoostClassifier at 0x23fda1a9970>

In [18]:
y_pred = model_cb.predict(X_test)

In [19]:
f1_score(y_test, y_pred)

0.8394366197183099

In [20]:
accuracy_score(y_test, y_pred)

0.7682926829268293