In [1]:
! pip install -U -q catboost

In [2]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [3]:
# Read the training, validation and test CSV files (paths are relative to the
# notebook's working directory) and bind them in one unpacking assignment.
train, validation, test = map(
    pd.read_csv,
    ("data/training.csv", "data/validation.csv", "data/test.csv"),
)

In [4]:
# Preview the first rows of the training data as loaded.
train.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Days
0,A666682,Ivy,11/05/2013 06:42:00 PM,November 2013,20604 Delorio St in Travis (TX),Stray,Injured,Cat,Intact Female,180.0,Domestic Longhair Mix,Calico,0
1,A841286,Elliott,08/25/2021 12:16:00 PM,August 2021,Austin (TX),Owner Surrender,Normal,Cat,Intact Male,90.0,Domestic Shorthair,Agouti/White,1
2,A729812,Walle,06/24/2016 11:25:00 AM,June 2016,Outside Jurisdiction,Owner Surrender,Normal,Cat,Neutered Male,1095.0,Domestic Shorthair Mix,White,1
3,A764811,Canelo,01/06/2018 04:21:00 PM,January 2018,208 Jockey Bluff Cv in Austin (TX),Public Assist,Normal,Dog,Neutered Male,365.0,Vizsla/Greyhound,Red,0
4,A678348,*Maybe,05/07/2014 04:23:00 PM,May 2014,14506 Hartsmith Dr in Austin (TX),Stray,Normal,Cat,Intact Female,28.0,Domestic Medium Hair Mix,Orange Tabby/White,0


In [5]:
def splitMonthYear(monthyears):
    """Split "<Month> <Year>" strings into separate month and year lists.

    Args:
        monthyears: Iterable of strings, each made of exactly two
            whitespace-separated tokens (e.g. "November 2013").

    Returns:
        A ``(month_list, year_list)`` tuple of equal-length lists. Months are
        lower-cased; years are kept as the original strings.

    Raises:
        ValueError: If an entry does not split into exactly two tokens.
    """
    tokenised = [entry.split() for entry in monthyears]
    # The two-name unpacking below enforces the "exactly two tokens" contract.
    month_list = [month.lower() for month, _ in tokenised]
    year_list = [year for _, year in tokenised]
    return month_list, year_list

In [6]:
# Add "Month" and "Year" columns to every split, derived from its "MonthYear"
# column.
for frame in (train, validation, test):
    frame["Month"], frame["Year"] = splitMonthYear(frame["MonthYear"])

In [7]:
# Columns that are not used as model inputs. The same list is removed from all
# three splits so they keep an identical schema.
unused_columns = ["Animal ID", "Name", "DateTime", "MonthYear", "Found Location"]

for frame in (train, validation, test):
    # inplace=True mutates the existing DataFrame objects, so the names
    # `train`, `validation` and `test` keep referring to the trimmed frames.
    frame.drop(columns=unused_columns, inplace=True)

In [8]:
# Preview the training data after the Month/Year split and the column drops.
train.head()

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Days,Month,Year
0,Stray,Injured,Cat,Intact Female,180.0,Domestic Longhair Mix,Calico,0,november,2013
1,Owner Surrender,Normal,Cat,Intact Male,90.0,Domestic Shorthair,Agouti/White,1,august,2021
2,Owner Surrender,Normal,Cat,Neutered Male,1095.0,Domestic Shorthair Mix,White,1,june,2016
3,Public Assist,Normal,Dog,Neutered Male,365.0,Vizsla/Greyhound,Red,0,january,2018
4,Stray,Normal,Cat,Intact Female,28.0,Domestic Medium Hair Mix,Orange Tabby/White,0,may,2014


In [9]:
# Target dtype for every feature column: eight categorical columns plus the
# numeric age column.
column_dtypes = {
    name: "category"
    for name in ("Intake Type", "Intake Condition", "Animal Type",
                 "Sex upon Intake", "Breed", "Color", "Month", "Year")
}
column_dtypes["Age upon Intake"] = "float"

# For each split, pull the "Days" column out as the label array and keep the
# remaining columns as features with the dtypes above. Both right-hand
# expressions are evaluated before either name is rebound, so the label is
# read from the frame that still contains "Days".
train_y, train = (
    train["Days"].values,
    train.drop(columns="Days").astype(column_dtypes),
)
validation_y, validation = (
    validation["Days"].values,
    validation.drop(columns="Days").astype(column_dtypes),
)
test_y, test = (
    test["Days"].values,
    test.drop(columns="Days").astype(column_dtypes),
)

In [10]:
# Preview the training features after the "Days" target column was removed.
train.head()

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Month,Year
0,Stray,Injured,Cat,Intact Female,180.0,Domestic Longhair Mix,Calico,november,2013
1,Owner Surrender,Normal,Cat,Intact Male,90.0,Domestic Shorthair,Agouti/White,august,2021
2,Owner Surrender,Normal,Cat,Neutered Male,1095.0,Domestic Shorthair Mix,White,june,2016
3,Public Assist,Normal,Dog,Neutered Male,365.0,Vizsla/Greyhound,Red,january,2018
4,Stray,Normal,Cat,Intact Female,28.0,Domestic Medium Hair Mix,Orange Tabby/White,may,2014


In [11]:
# Inverse-frequency class weights: weight_k = (n_0 + n_1) / n_k, so that
# weight_k * n_k is the same for both classes. Only labels 0 and 1 are counted.
# The vectorised `.sum()` replaces the builtin `sum`, which iterates over the
# NumPy array element by element in Python.
n_class_0 = (train_y == 0).sum()
n_class_1 = (train_y == 1).sum()
class_weight_0 = (n_class_0 + n_class_1) / n_class_0
class_weight_1 = (n_class_0 + n_class_1) / n_class_1

params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",  # Alternatives: Precision, Recall, F1, AUC
    "verbose": 200,  # output training process at every 200 iterations
    "random_seed": 951,
    "iterations": 2600,
    "class_weights": [class_weight_0, class_weight_1],
    "learning_rate": 0.015
}

# Not every input feature is categorical: "Age upon Intake" is a float column.
# Pick the categorical features by their `category` dtype rather than by
# hard-coded column positions, so the list stays correct if columns are added,
# removed or reordered upstream.
cat_features = train.select_dtypes(include="category").columns.tolist()
cb_classifier = CatBoostClassifier(**params)
cb_classifier.fit(
    train,
    train_y,
    eval_set=(validation, validation_y),  # data to validate on
    use_best_model=True,  # keep the iteration that scored best on eval_set
    cat_features=cat_features,
)

0:	learn: 0.7261358	test: 0.7282572	best: 0.7282572 (0)	total: 99.6ms	remaining: 4m 18s
200:	learn: 0.7615795	test: 0.7552243	best: 0.7560466 (166)	total: 8.14s	remaining: 1m 37s
400:	learn: 0.7696167	test: 0.7636710	best: 0.7648779 (347)	total: 16.3s	remaining: 1m 29s
600:	learn: 0.7742454	test: 0.7660777	best: 0.7666547 (580)	total: 24.3s	remaining: 1m 20s
800:	learn: 0.7787141	test: 0.7697708	best: 0.7697708 (800)	total: 33.6s	remaining: 1m 15s
1000:	learn: 0.7820749	test: 0.7736298	best: 0.7736298 (983)	total: 42s	remaining: 1m 7s
1200:	learn: 0.7849936	test: 0.7756854	best: 0.7761230 (1161)	total: 50.4s	remaining: 58.7s
1400:	learn: 0.7871555	test: 0.7773299	best: 0.7779333 (1356)	total: 58.8s	remaining: 50.4s
1600:	learn: 0.7889808	test: 0.7769452	best: 0.7779333 (1356)	total: 1m 7s	remaining: 41.9s
1800:	learn: 0.7907109	test: 0.7777145	best: 0.7779863 (1706)	total: 1m 15s	remaining: 33.4s
2000:	learn: 0.7922137	test: 0.7771975	best: 0.7780727 (1913)	total: 1m 23s	remaining: 25s

<catboost.core.CatBoostClassifier at 0x7f549dcaddd8>

In [14]:
# Write the fitted classifier to disk.
# NOTE(review): assumes the "catboost_info" directory already exists
# (presumably created during fit) — confirm before running on a fresh checkout.
cb_classifier.save_model("catboost_info/best_model.cbm")