In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/model_outcome_type.csv")

In [3]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,animal id,intake type,intake condition,age upon intake,outcome type,spayed/neutered,sex,breed,color
0,0,A786884,Stray,Normal,2 years,Transfer,Neutered,Male,Beagle Mix,Tricolor


In [4]:
del df["Unnamed: 0"]
del df["animal id"]

In [5]:
df.describe().T

Unnamed: 0,count,unique,top,freq
intake type,59404,5,Stray,42839
intake condition,59404,10,Normal,53444
age upon intake,59404,11,between 6 weeks and < 1 year,16804
outcome type,59120,8,Adoption,27434
spayed/neutered,59404,4,Intact,40222
sex,59404,3,Male,31395
breed,59404,2326,Pit Bull Mix,6442
color,59404,372,Black/White,6905


In [6]:
len(df)

59404

In [7]:
df["outcome type"].value_counts()

Adoption           27434
Return to Owner    15078
Transfer           14269
Euthanasia          1639
Rto-Adopt            395
Died                 248
Disposal              35
Missing               22
Name: outcome type, dtype: int64

In [8]:
df["outcome type"] = df["outcome type"].fillna("Missing")

In [9]:
# base model
27434 / 59404

0.4618207528112585

In [10]:
target = "outcome type"
X = df.drop(target, axis=1)
y = df[target]

In [11]:
# test/train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=19)

In [12]:
# adding mapper
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer, StandardScaler

In [13]:
mapper = DataFrameMapper([
    ("intake type", LabelBinarizer()),
    ("intake condition", LabelBinarizer()),
    ("age upon intake", LabelBinarizer()),
    ("spayed/neutered", LabelBinarizer()),
    ("sex", LabelBinarizer()),
    ("breed", LabelBinarizer()),
    ("color", LabelBinarizer())],df_out=True)

In [14]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [15]:
# first model to check performance
from sklearn.linear_model import LogisticRegression

In [16]:
model = LogisticRegression().fit(Z_train, y_train)
print("Log regress train score is " + str(model.score(Z_train, y_train)))
print("Log regress test score is " + str(model.score(Z_test, y_test)))



Log regress train score is 0.5981864296455907
Log regress test score is 0.5747761093529056


In [17]:
# adding a tree model
from sklearn.tree import DecisionTreeClassifier

In [18]:
model = DecisionTreeClassifier().fit(Z_train, y_train)
print("Decision Tree train score is " + str(model.score(Z_train, y_train)))
print("Decision Tree test score is " + str(model.score(Z_test, y_test)))

Decision Tree train score is 0.8590218391578569
Decision Tree test score is 0.526900545417817


In [19]:
# adding CatBoost for one more model
from catboost import CatBoostClassifier

In [20]:
model = CatBoostClassifier().fit(Z_train, y_train)
print("Cat Boost train score is " + str(model.score(Z_train, y_train)))
print("Cat Boost test score is " + str(model.score(Z_test, y_test)))

0:	learn: 2.0103161	total: 121ms	remaining: 2m 1s
1:	learn: 1.9506981	total: 196ms	remaining: 1m 37s
2:	learn: 1.8960741	total: 256ms	remaining: 1m 25s
3:	learn: 1.8478413	total: 318ms	remaining: 1m 19s
4:	learn: 1.8039965	total: 407ms	remaining: 1m 20s
5:	learn: 1.7638197	total: 476ms	remaining: 1m 18s
6:	learn: 1.7261609	total: 541ms	remaining: 1m 16s
7:	learn: 1.6918384	total: 602ms	remaining: 1m 14s
8:	learn: 1.6604303	total: 671ms	remaining: 1m 13s
9:	learn: 1.6316112	total: 730ms	remaining: 1m 12s
10:	learn: 1.6037528	total: 791ms	remaining: 1m 11s
11:	learn: 1.5784005	total: 851ms	remaining: 1m 10s
12:	learn: 1.5545336	total: 922ms	remaining: 1m 10s
13:	learn: 1.5320779	total: 984ms	remaining: 1m 9s
14:	learn: 1.5106890	total: 1.04s	remaining: 1m 8s
15:	learn: 1.4906410	total: 1.1s	remaining: 1m 7s
16:	learn: 1.4717800	total: 1.18s	remaining: 1m 8s
17:	learn: 1.4539887	total: 1.24s	remaining: 1m 7s
18:	learn: 1.4369837	total: 1.3s	remaining: 1m 7s
19:	learn: 1.4207820	total: 1.3

163:	learn: 1.0408703	total: 10.5s	remaining: 53.5s
164:	learn: 1.0407569	total: 10.6s	remaining: 53.5s
165:	learn: 1.0404978	total: 10.6s	remaining: 53.4s
166:	learn: 1.0401672	total: 10.7s	remaining: 53.3s
167:	learn: 1.0399771	total: 10.7s	remaining: 53.2s
168:	learn: 1.0398631	total: 10.8s	remaining: 53.1s
169:	learn: 1.0397175	total: 10.9s	remaining: 53s
170:	learn: 1.0394871	total: 10.9s	remaining: 52.9s
171:	learn: 1.0392993	total: 11s	remaining: 52.9s
172:	learn: 1.0391756	total: 11s	remaining: 52.8s
173:	learn: 1.0389671	total: 11.1s	remaining: 52.7s
174:	learn: 1.0387482	total: 11.2s	remaining: 52.7s
175:	learn: 1.0386525	total: 11.2s	remaining: 52.6s
176:	learn: 1.0385843	total: 11.3s	remaining: 52.5s
177:	learn: 1.0384804	total: 11.4s	remaining: 52.5s
178:	learn: 1.0382977	total: 11.4s	remaining: 52.4s
179:	learn: 1.0381187	total: 11.5s	remaining: 52.3s
180:	learn: 1.0379384	total: 11.5s	remaining: 52.2s
181:	learn: 1.0378301	total: 11.6s	remaining: 52.2s
182:	learn: 1.0375

325:	learn: 1.0258892	total: 20.4s	remaining: 42.1s
326:	learn: 1.0258561	total: 20.4s	remaining: 42.1s
327:	learn: 1.0257966	total: 20.5s	remaining: 42s
328:	learn: 1.0257602	total: 20.5s	remaining: 41.9s
329:	learn: 1.0257193	total: 20.6s	remaining: 41.8s
330:	learn: 1.0256484	total: 20.7s	remaining: 41.8s
331:	learn: 1.0256053	total: 20.7s	remaining: 41.7s
332:	learn: 1.0255587	total: 20.8s	remaining: 41.6s
333:	learn: 1.0255065	total: 20.8s	remaining: 41.6s
334:	learn: 1.0254539	total: 20.9s	remaining: 41.5s
335:	learn: 1.0254175	total: 21s	remaining: 41.4s
336:	learn: 1.0253277	total: 21s	remaining: 41.4s
337:	learn: 1.0253057	total: 21.1s	remaining: 41.3s
338:	learn: 1.0252642	total: 21.1s	remaining: 41.2s
339:	learn: 1.0252433	total: 21.2s	remaining: 41.1s
340:	learn: 1.0251746	total: 21.3s	remaining: 41.1s
341:	learn: 1.0251075	total: 21.3s	remaining: 41s
342:	learn: 1.0250759	total: 21.4s	remaining: 40.9s
343:	learn: 1.0249882	total: 21.4s	remaining: 40.9s
344:	learn: 1.024933

485:	learn: 1.0181843	total: 29.9s	remaining: 31.6s
486:	learn: 1.0181432	total: 29.9s	remaining: 31.5s
487:	learn: 1.0180960	total: 30s	remaining: 31.5s
488:	learn: 1.0180809	total: 30s	remaining: 31.4s
489:	learn: 1.0180402	total: 30.1s	remaining: 31.3s
490:	learn: 1.0180225	total: 30.2s	remaining: 31.3s
491:	learn: 1.0180094	total: 30.2s	remaining: 31.2s
492:	learn: 1.0179598	total: 30.3s	remaining: 31.1s
493:	learn: 1.0179162	total: 30.3s	remaining: 31.1s
494:	learn: 1.0179018	total: 30.4s	remaining: 31s
495:	learn: 1.0178826	total: 30.5s	remaining: 30.9s
496:	learn: 1.0178688	total: 30.5s	remaining: 30.9s
497:	learn: 1.0178044	total: 30.6s	remaining: 30.8s
498:	learn: 1.0177909	total: 30.6s	remaining: 30.8s
499:	learn: 1.0177173	total: 30.7s	remaining: 30.7s
500:	learn: 1.0176897	total: 30.7s	remaining: 30.6s
501:	learn: 1.0176470	total: 30.8s	remaining: 30.6s
502:	learn: 1.0175904	total: 30.9s	remaining: 30.5s
503:	learn: 1.0175477	total: 30.9s	remaining: 30.4s
504:	learn: 1.0175

645:	learn: 1.0135398	total: 39.3s	remaining: 21.5s
646:	learn: 1.0135227	total: 39.3s	remaining: 21.4s
647:	learn: 1.0134961	total: 39.4s	remaining: 21.4s
648:	learn: 1.0134129	total: 39.4s	remaining: 21.3s
649:	learn: 1.0133823	total: 39.5s	remaining: 21.3s
650:	learn: 1.0133792	total: 39.5s	remaining: 21.2s
651:	learn: 1.0133689	total: 39.6s	remaining: 21.1s
652:	learn: 1.0133455	total: 39.7s	remaining: 21.1s
653:	learn: 1.0133092	total: 39.7s	remaining: 21s
654:	learn: 1.0133044	total: 39.8s	remaining: 20.9s
655:	learn: 1.0132676	total: 39.8s	remaining: 20.9s
656:	learn: 1.0132352	total: 39.9s	remaining: 20.8s
657:	learn: 1.0131976	total: 40s	remaining: 20.8s
658:	learn: 1.0131811	total: 40s	remaining: 20.7s
659:	learn: 1.0131646	total: 40.1s	remaining: 20.6s
660:	learn: 1.0131229	total: 40.1s	remaining: 20.6s
661:	learn: 1.0130988	total: 40.2s	remaining: 20.5s
662:	learn: 1.0130739	total: 40.2s	remaining: 20.5s
663:	learn: 1.0130686	total: 40.3s	remaining: 20.4s
664:	learn: 1.0130

805:	learn: 1.0100250	total: 48.6s	remaining: 11.7s
806:	learn: 1.0100227	total: 48.7s	remaining: 11.6s
807:	learn: 1.0100165	total: 48.7s	remaining: 11.6s
808:	learn: 1.0100099	total: 48.8s	remaining: 11.5s
809:	learn: 1.0099983	total: 48.9s	remaining: 11.5s
810:	learn: 1.0099847	total: 48.9s	remaining: 11.4s
811:	learn: 1.0099647	total: 49s	remaining: 11.3s
812:	learn: 1.0099528	total: 49s	remaining: 11.3s
813:	learn: 1.0099343	total: 49.1s	remaining: 11.2s
814:	learn: 1.0099272	total: 49.2s	remaining: 11.2s
815:	learn: 1.0099192	total: 49.2s	remaining: 11.1s
816:	learn: 1.0098948	total: 49.3s	remaining: 11s
817:	learn: 1.0098826	total: 49.3s	remaining: 11s
818:	learn: 1.0098620	total: 49.4s	remaining: 10.9s
819:	learn: 1.0098584	total: 49.4s	remaining: 10.9s
820:	learn: 1.0098361	total: 49.5s	remaining: 10.8s
821:	learn: 1.0098255	total: 49.6s	remaining: 10.7s
822:	learn: 1.0097655	total: 49.6s	remaining: 10.7s
823:	learn: 1.0097492	total: 49.7s	remaining: 10.6s
824:	learn: 1.009738

967:	learn: 1.0072393	total: 58.2s	remaining: 1.92s
968:	learn: 1.0072189	total: 58.2s	remaining: 1.86s
969:	learn: 1.0072096	total: 58.3s	remaining: 1.8s
970:	learn: 1.0072060	total: 58.3s	remaining: 1.74s
971:	learn: 1.0072007	total: 58.4s	remaining: 1.68s
972:	learn: 1.0071986	total: 58.4s	remaining: 1.62s
973:	learn: 1.0071827	total: 58.5s	remaining: 1.56s
974:	learn: 1.0071661	total: 58.6s	remaining: 1.5s
975:	learn: 1.0071092	total: 58.6s	remaining: 1.44s
976:	learn: 1.0070691	total: 58.7s	remaining: 1.38s
977:	learn: 1.0070612	total: 58.7s	remaining: 1.32s
978:	learn: 1.0070564	total: 58.8s	remaining: 1.26s
979:	learn: 1.0070425	total: 58.9s	remaining: 1.2s
980:	learn: 1.0070346	total: 58.9s	remaining: 1.14s
981:	learn: 1.0070104	total: 59s	remaining: 1.08s
982:	learn: 1.0069864	total: 59s	remaining: 1.02s
983:	learn: 1.0069805	total: 59.1s	remaining: 961ms
984:	learn: 1.0069546	total: 59.2s	remaining: 901ms
985:	learn: 1.0069415	total: 59.2s	remaining: 841ms
986:	learn: 1.00693

In [21]:
# importing confusion matrix & classification report for better visuals
from sklearn.metrics import confusion_matrix, classification_report

In [22]:
y_pred = model.predict(Z_test)

In [23]:
confusion_matrix(y_test, y_pred)

array([[5767,    0,    0,    6,    0,  842,    0,  211],
       [  26,    0,    0,    3,    0,   13,    0,   20],
       [   3,    0,    0,    0,    0,    2,    0,    5],
       [ 193,    0,    0,   48,    0,   92,    0,   67],
       [  58,    0,    0,    1,    0,    9,    0,    8],
       [1400,    0,    0,    3,    0, 2289,    0,   83],
       [  59,    0,    0,    1,    0,   30,    0,    5],
       [2619,    0,    0,   23,    0,  495,    0,  470]])

In [24]:
print(classification_report(y_test, y_pred))

  'precision', 'predicted', average, warn_for)


                 precision    recall  f1-score   support

       Adoption       0.57      0.84      0.68      6826
           Died       0.00      0.00      0.00        62
       Disposal       0.00      0.00      0.00        10
     Euthanasia       0.56      0.12      0.20       400
        Missing       0.00      0.00      0.00        76
Return to Owner       0.61      0.61      0.61      3775
      Rto-Adopt       0.00      0.00      0.00        95
       Transfer       0.54      0.13      0.21      3607

       accuracy                           0.58     14851
      macro avg       0.29      0.21      0.21     14851
   weighted avg       0.56      0.58      0.52     14851



Logistic Regression and Cat Boost perform about the same while Decision Tree is overfitted and useable.

In [25]:
#more models 
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.utils import to_categorical

In [26]:
model = RandomForestClassifier().fit(Z_train, y_train)
print("Random Forest train score is " + str(model.score(Z_train, y_train)))
print("Random Forest test score is " + str(model.score(Z_test, y_test)))



Random Forest train score is 0.8446793706372185
Random Forest test score is 0.5320853814557942


In [27]:
## deep learning portion

In [28]:
target = "outcome type"
X = df.drop(target, axis=1)
y = df[target]

In [29]:
lb = LabelBinarizer()
y = lb.fit_transform(y)

In [30]:
y

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=19)

In [32]:
mapper = DataFrameMapper([
    ("intake type", LabelBinarizer()),
    ("intake condition", LabelBinarizer()),
    ("age upon intake", LabelBinarizer()),
    ("spayed/neutered", LabelBinarizer()),
    ("sex", LabelBinarizer()),
    ("breed", LabelBinarizer()),
    ("color", LabelBinarizer())],df_out=True)

In [33]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [34]:
model = Sequential([
    Input(shape=(Z_train.shape[1],)),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(8, activation="softmax")
])

In [35]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [36]:
checkpoint_filepath = "/tmp/checkpoint"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
filepath = checkpoint_filepath,
save_weights_only=True,
monitor="val_accuracy",
mode="max",
save_best_only=True)

In [37]:
history = model.fit(Z_train, y_train,
                   validation_data=(Z_test, y_test),
                   epochs=20, batch_size=256,
                   verbose=2, callbacks=[model_checkpoint_callback])

Epoch 1/20
175/175 - 8s - loss: 1.2454 - accuracy: 0.5424 - val_loss: 1.0614 - val_accuracy: 0.5725
Epoch 2/20
175/175 - 2s - loss: 1.0318 - accuracy: 0.5831 - val_loss: 1.0415 - val_accuracy: 0.5788
Epoch 3/20
175/175 - 2s - loss: 1.0072 - accuracy: 0.5942 - val_loss: 1.0447 - val_accuracy: 0.5797
Epoch 4/20
175/175 - 2s - loss: 0.9918 - accuracy: 0.6003 - val_loss: 1.0393 - val_accuracy: 0.5814
Epoch 5/20
175/175 - 2s - loss: 0.9763 - accuracy: 0.6081 - val_loss: 1.0426 - val_accuracy: 0.5757
Epoch 6/20
175/175 - 2s - loss: 0.9596 - accuracy: 0.6132 - val_loss: 1.0479 - val_accuracy: 0.5781
Epoch 7/20
175/175 - 2s - loss: 0.9478 - accuracy: 0.6180 - val_loss: 1.0505 - val_accuracy: 0.5759
Epoch 8/20
175/175 - 2s - loss: 0.9338 - accuracy: 0.6235 - val_loss: 1.0570 - val_accuracy: 0.5696
Epoch 9/20
175/175 - 2s - loss: 0.9219 - accuracy: 0.6286 - val_loss: 1.0658 - val_accuracy: 0.5727
Epoch 10/20
175/175 - 2s - loss: 0.9108 - accuracy: 0.6335 - val_loss: 1.0681 - val_accuracy: 0.5666