In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/model_outcome_type.csv")

In [3]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,animal id,intake type,intake condition,age upon intake,outcome type,spayed/neutered,sex,breed,color
0,0,A786884,Stray,Normal,2 years,Transfer,Neutered,Male,Beagle Mix,Tricolor


In [4]:
del df["Unnamed: 0"]
del df["animal id"]

In [5]:
df.describe().T

Unnamed: 0,count,unique,top,freq
intake type,59404,5,Stray,42839
intake condition,59404,10,Normal,53444
age upon intake,59404,11,between 6 weeks and < 1 year,16804
outcome type,59120,8,Adoption,27434
spayed/neutered,59404,4,Intact,40222
sex,59404,3,Male,31395
breed,59404,2326,Pit Bull Mix,6442
color,59404,372,Black/White,6905


In [6]:
len(df)

59404

In [7]:
df["outcome type"].value_counts()

Adoption           27434
Return to Owner    15078
Transfer           14269
Euthanasia          1639
Rto-Adopt            395
Died                 248
Disposal              35
Missing               22
Name: outcome type, dtype: int64

In [8]:
df["outcome type"] = df["outcome type"].fillna("Missing")

In [9]:
# base model
27434 / 59404

0.4618207528112585

In [10]:
target = "outcome type"
X = df.drop(target, axis=1)
y = df[target]

In [11]:
# test/train
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
# adding mapper
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer, StandardScaler

In [14]:
mapper = DataFrameMapper([
    ("intake type", LabelBinarizer()),
    ("intake condition", LabelBinarizer()),
    ("age upon intake", LabelBinarizer()),
    ("spayed/neutered", LabelBinarizer()),
    ("sex", LabelBinarizer()),
    ("breed", LabelBinarizer()),
    ("color", LabelBinarizer())],df_out=True)

In [15]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [16]:
# first model to check performance
from sklearn.linear_model import LogisticRegression

In [17]:
model = LogisticRegression().fit(Z_train, y_train)
print("Log regress train score is " + str(model.score(Z_train, y_train)))
print("Log regress test score is " + str(model.score(Z_test, y_test)))



Log regress train score is 0.5972886225394474
Log regress test score is 0.5811729849841761


In [18]:
# adding a tree model
from sklearn.tree import DecisionTreeClassifier

In [19]:
model = DecisionTreeClassifier().fit(Z_train, y_train)
print("Decision Tree train score is " + str(model.score(Z_train, y_train)))
print("Decision Tree test score is " + str(model.score(Z_test, y_test)))

Decision Tree train score is 0.8588422777366282
Decision Tree test score is 0.5220523870446434


In [20]:
# adding CatBoost for one more model
from catboost import CatBoostClassifier

In [21]:
model = CatBoostClassifier().fit(Z_train, y_train)
print("Cat Boost train score is " + str(model.score(Z_train, y_train)))
print("Cat Boost test score is " + str(model.score(Z_test, y_test)))

0:	learn: 2.0117129	total: 138ms	remaining: 2m 17s
1:	learn: 1.9514800	total: 201ms	remaining: 1m 40s
2:	learn: 1.8984502	total: 266ms	remaining: 1m 28s
3:	learn: 1.8498851	total: 327ms	remaining: 1m 21s
4:	learn: 1.8066359	total: 405ms	remaining: 1m 20s
5:	learn: 1.7660704	total: 465ms	remaining: 1m 17s
6:	learn: 1.7285818	total: 535ms	remaining: 1m 15s
7:	learn: 1.6947904	total: 591ms	remaining: 1m 13s
8:	learn: 1.6635071	total: 664ms	remaining: 1m 13s
9:	learn: 1.6337503	total: 723ms	remaining: 1m 11s
10:	learn: 1.6059663	total: 784ms	remaining: 1m 10s
11:	learn: 1.5801249	total: 845ms	remaining: 1m 9s
12:	learn: 1.5561935	total: 920ms	remaining: 1m 9s
13:	learn: 1.5337311	total: 978ms	remaining: 1m 8s
14:	learn: 1.5119530	total: 1.04s	remaining: 1m 8s
15:	learn: 1.4916542	total: 1.1s	remaining: 1m 7s
16:	learn: 1.4727404	total: 1.17s	remaining: 1m 7s
17:	learn: 1.4549995	total: 1.23s	remaining: 1m 7s
18:	learn: 1.4383447	total: 1.33s	remaining: 1m 8s
19:	learn: 1.4222688	total: 1.4

161:	learn: 1.0424633	total: 10.4s	remaining: 53.7s
162:	learn: 1.0423198	total: 10.5s	remaining: 53.7s
163:	learn: 1.0421637	total: 10.5s	remaining: 53.6s
164:	learn: 1.0420363	total: 10.6s	remaining: 53.6s
165:	learn: 1.0417782	total: 10.7s	remaining: 53.6s
166:	learn: 1.0416458	total: 10.7s	remaining: 53.6s
167:	learn: 1.0414655	total: 10.8s	remaining: 53.5s
168:	learn: 1.0413100	total: 10.9s	remaining: 53.4s
169:	learn: 1.0411834	total: 10.9s	remaining: 53.4s
170:	learn: 1.0410934	total: 11s	remaining: 53.3s
171:	learn: 1.0409726	total: 11.1s	remaining: 53.6s
172:	learn: 1.0407475	total: 11.2s	remaining: 53.7s
173:	learn: 1.0406583	total: 11.3s	remaining: 53.6s
174:	learn: 1.0405096	total: 11.4s	remaining: 53.6s
175:	learn: 1.0403291	total: 11.5s	remaining: 53.6s
176:	learn: 1.0401459	total: 11.5s	remaining: 53.5s
177:	learn: 1.0400687	total: 11.6s	remaining: 53.4s
178:	learn: 1.0398598	total: 11.6s	remaining: 53.4s
179:	learn: 1.0397531	total: 11.7s	remaining: 53.3s
180:	learn: 1.

321:	learn: 1.0281031	total: 22.8s	remaining: 48s
322:	learn: 1.0280183	total: 22.9s	remaining: 47.9s
323:	learn: 1.0279722	total: 22.9s	remaining: 47.9s
324:	learn: 1.0279093	total: 23s	remaining: 47.8s
325:	learn: 1.0278589	total: 23.1s	remaining: 47.8s
326:	learn: 1.0277401	total: 23.2s	remaining: 47.7s
327:	learn: 1.0276907	total: 23.3s	remaining: 47.7s
328:	learn: 1.0276537	total: 23.4s	remaining: 47.6s
329:	learn: 1.0276107	total: 23.4s	remaining: 47.5s
330:	learn: 1.0275575	total: 23.5s	remaining: 47.5s
331:	learn: 1.0275372	total: 23.6s	remaining: 47.4s
332:	learn: 1.0274850	total: 23.7s	remaining: 47.4s
333:	learn: 1.0274367	total: 23.7s	remaining: 47.4s
334:	learn: 1.0274033	total: 23.8s	remaining: 47.3s
335:	learn: 1.0273628	total: 23.9s	remaining: 47.2s
336:	learn: 1.0273398	total: 24s	remaining: 47.2s
337:	learn: 1.0272548	total: 24.1s	remaining: 47.1s
338:	learn: 1.0272192	total: 24.1s	remaining: 47s
339:	learn: 1.0270524	total: 24.2s	remaining: 47s
340:	learn: 1.0270395	

482:	learn: 1.0200414	total: 33.5s	remaining: 35.9s
483:	learn: 1.0200123	total: 33.6s	remaining: 35.8s
484:	learn: 1.0199573	total: 33.6s	remaining: 35.7s
485:	learn: 1.0199159	total: 33.7s	remaining: 35.7s
486:	learn: 1.0199003	total: 33.8s	remaining: 35.6s
487:	learn: 1.0198212	total: 33.9s	remaining: 35.5s
488:	learn: 1.0198067	total: 34s	remaining: 35.5s
489:	learn: 1.0197897	total: 34s	remaining: 35.4s
490:	learn: 1.0197720	total: 34.1s	remaining: 35.4s
491:	learn: 1.0197540	total: 34.2s	remaining: 35.3s
492:	learn: 1.0197053	total: 34.3s	remaining: 35.2s
493:	learn: 1.0196735	total: 34.3s	remaining: 35.2s
494:	learn: 1.0196301	total: 34.4s	remaining: 35.1s
495:	learn: 1.0195697	total: 34.5s	remaining: 35.1s
496:	learn: 1.0195622	total: 34.6s	remaining: 35s
497:	learn: 1.0195457	total: 34.6s	remaining: 34.9s
498:	learn: 1.0195288	total: 34.7s	remaining: 34.8s
499:	learn: 1.0194804	total: 34.7s	remaining: 34.7s
500:	learn: 1.0194583	total: 34.8s	remaining: 34.7s
501:	learn: 1.0194

642:	learn: 1.0154647	total: 43.4s	remaining: 24.1s
643:	learn: 1.0154358	total: 43.4s	remaining: 24s
644:	learn: 1.0154293	total: 43.5s	remaining: 23.9s
645:	learn: 1.0153975	total: 43.5s	remaining: 23.8s
646:	learn: 1.0153933	total: 43.6s	remaining: 23.8s
647:	learn: 1.0153824	total: 43.6s	remaining: 23.7s
648:	learn: 1.0153235	total: 43.7s	remaining: 23.6s
649:	learn: 1.0152765	total: 43.8s	remaining: 23.6s
650:	learn: 1.0152568	total: 43.8s	remaining: 23.5s
651:	learn: 1.0152255	total: 43.9s	remaining: 23.4s
652:	learn: 1.0151644	total: 43.9s	remaining: 23.4s
653:	learn: 1.0151477	total: 44s	remaining: 23.3s
654:	learn: 1.0151083	total: 44.1s	remaining: 23.2s
655:	learn: 1.0150768	total: 44.1s	remaining: 23.1s
656:	learn: 1.0150567	total: 44.2s	remaining: 23.1s
657:	learn: 1.0150217	total: 44.2s	remaining: 23s
658:	learn: 1.0150093	total: 44.3s	remaining: 22.9s
659:	learn: 1.0150000	total: 44.4s	remaining: 22.9s
660:	learn: 1.0149939	total: 44.4s	remaining: 22.8s
661:	learn: 1.0149

802:	learn: 1.0116802	total: 52.9s	remaining: 13s
803:	learn: 1.0116521	total: 53s	remaining: 12.9s
804:	learn: 1.0116430	total: 53s	remaining: 12.8s
805:	learn: 1.0116362	total: 53.1s	remaining: 12.8s
806:	learn: 1.0116250	total: 53.1s	remaining: 12.7s
807:	learn: 1.0115912	total: 53.2s	remaining: 12.6s
808:	learn: 1.0115688	total: 53.2s	remaining: 12.6s
809:	learn: 1.0115629	total: 53.3s	remaining: 12.5s
810:	learn: 1.0115343	total: 53.4s	remaining: 12.4s
811:	learn: 1.0115074	total: 53.4s	remaining: 12.4s
812:	learn: 1.0114875	total: 53.5s	remaining: 12.3s
813:	learn: 1.0114783	total: 53.5s	remaining: 12.2s
814:	learn: 1.0114410	total: 53.6s	remaining: 12.2s
815:	learn: 1.0114323	total: 53.7s	remaining: 12.1s
816:	learn: 1.0114260	total: 53.7s	remaining: 12s
817:	learn: 1.0114194	total: 53.8s	remaining: 12s
818:	learn: 1.0113862	total: 53.8s	remaining: 11.9s
819:	learn: 1.0113619	total: 53.9s	remaining: 11.8s
820:	learn: 1.0113509	total: 54s	remaining: 11.8s
821:	learn: 1.0113332	to

962:	learn: 1.0087875	total: 1m 2s	remaining: 2.4s
963:	learn: 1.0087816	total: 1m 2s	remaining: 2.33s
964:	learn: 1.0087719	total: 1m 2s	remaining: 2.27s
965:	learn: 1.0087652	total: 1m 2s	remaining: 2.2s
966:	learn: 1.0087544	total: 1m 2s	remaining: 2.14s
967:	learn: 1.0087191	total: 1m 2s	remaining: 2.07s
968:	learn: 1.0086990	total: 1m 2s	remaining: 2.01s
969:	learn: 1.0086290	total: 1m 2s	remaining: 1.94s
970:	learn: 1.0086091	total: 1m 2s	remaining: 1.88s
971:	learn: 1.0085901	total: 1m 2s	remaining: 1.81s
972:	learn: 1.0085514	total: 1m 3s	remaining: 1.75s
973:	learn: 1.0085403	total: 1m 3s	remaining: 1.68s
974:	learn: 1.0085305	total: 1m 3s	remaining: 1.62s
975:	learn: 1.0085244	total: 1m 3s	remaining: 1.55s
976:	learn: 1.0085025	total: 1m 3s	remaining: 1.49s
977:	learn: 1.0084896	total: 1m 3s	remaining: 1.42s
978:	learn: 1.0084835	total: 1m 3s	remaining: 1.36s
979:	learn: 1.0084699	total: 1m 3s	remaining: 1.29s
980:	learn: 1.0084550	total: 1m 3s	remaining: 1.23s
981:	learn: 1.

# Conclusion
Logistic Regression and Cat Boost perform about the same while Decision Tree is overfitted and useable.