In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv("shelter_cleaned.csv").iloc[:, 1:]

In [3]:
data.head()

Unnamed: 0,Name,Named by Shelter,Type,Breed,Color,Sex,Protected,Size,Date Of Birth,Impound Number,...,Outcome Subtype,Intake Condition,Outcome Condition,Intake Jurisdiction,Outcome Jurisdiction,Outcome Zip Code,Location,Count,Age,Before Covid
0,REMY,False,DOG,BEAGLE,BLACK/WHITE,Female,False,MED,2014-04-11,K21-039564,...,OVER THE COUNTER_WEB,UNKNOWN,PENDING,SANTA ROSA,SANTA ROSA,95407.0,"95407(38.403734, -122.736775)",1,8.0,True
1,RONDE,True,DOG,CHIHUAHUA SH,TAN,Male,True,SMALL,2020-07-24,K21-039133,...,SCAS WEB,HEALTHY,HEALTHY,SANTA ROSA,*PETALUMA,94954.0,"94954(38.245316, -122.59871)",1,1.0,True
2,SNOOP,True,CAT,DOMESTIC SH,GRAY,Male,True,SMALL,2019-08-04,K21-039232,...,SCAS WEB,HEALTHY,HEALTHY,SANTA ROSA,SANTA ROSA,95405.0,"95405(38.439152, -122.672541)",1,2.0,True
3,FIFI,False,CAT,SIAMESE,SEAL PT,Female,True,KITTN,2020-10-27,K21-037796,...,SCAS WEB,UNKNOWN,PENDING,SANTA ROSA,SANTA ROSA,95407.0,"95407(38.403734, -122.736775)",1,1.0,True
4,MERLIN,True,CAT,DOMESTIC SH,BRN TABBY,Male,True,SMALL,2018-01-16,K21-037709,...,SCAS WEB,UNKNOWN,PENDING,SANTA ROSA,COUNTY,94952.0,"94952(38.236012, -122.730241)",1,4.0,True


In [4]:
data.isna().sum()

Name                    4452
Named by Shelter           0
Type                       0
Breed                      0
Color                      0
Sex                        0
Protected                  0
Size                      33
Date Of Birth           4155
Impound Number             0
Kennel Number             15
Animal ID                  0
Intake Date                0
Outcome Date             139
Days in Shelter            0
Intake Type                0
Intake Subtype             0
Outcome Type             140
Outcome Subtype          337
Intake Condition           0
Outcome Condition        360
Intake Jurisdiction        0
Outcome Jurisdiction    2632
Outcome Zip Code        2688
Location                2688
Count                      0
Age                      118
Before Covid               0
dtype: int64

In [5]:
data.dtypes

Name                     object
Named by Shelter           bool
Type                     object
Breed                    object
Color                    object
Sex                      object
Protected                  bool
Size                     object
Date Of Birth            object
Impound Number           object
Kennel Number            object
Animal ID                object
Intake Date              object
Outcome Date             object
Days in Shelter           int64
Intake Type              object
Intake Subtype           object
Outcome Type             object
Outcome Subtype          object
Intake Condition         object
Outcome Condition        object
Intake Jurisdiction      object
Outcome Jurisdiction     object
Outcome Zip Code        float64
Location                 object
Count                     int64
Age                     float64
Before Covid               bool
dtype: object

In [6]:
data["Location"].value_counts()

95403(38.486997, -122.749134)    2654
95407(38.403734, -122.736775)    2594
95404(38.458384, -122.675588)    1409
95492(38.541541, -122.809202)    1379
95401(38.446019, -122.766748)    1357
                                 ... 
97479(43.402832, -123.204046)       1
91321(34.370818, -118.506223)       1
95351(37.624093, -120.99781)        1
75116(32.659268, -96.913533)        1
95519(40.947237, -124.073351)       1
Name: Location, Length: 619, dtype: int64

In [7]:
data = data.dropna(subset=['Age', 'Outcome Type'])

In [8]:
include = ["Named by Shelter", "Type", "Breed", "Color", "Sex", "Protected",
           "Size", "Days in Shelter", "Intake Type", "Intake Condition", "Age", "Before Covid", "Location"]

In [9]:
xtrain, xtest, ytrain, ytest = train_test_split(data[include],
                                                data["Outcome Type"], train_size = 0.8)

In [10]:
ohe = Pipeline([("other_OHE", OneHotEncoder(
    handle_unknown='ignore'))])

stdscalar = Pipeline([("stdscalar", StandardScaler())])

transformer = ColumnTransformer([
    ("ohe", ohe, ["Type", "Breed", "Color", "Sex", "Size", "Intake Type", "Intake Condition", "Location"]),
    ("standardize", stdscalar, ["Days in Shelter", "Age"])
])

dtc = Pipeline([
    ("transform", transformer),
    ("classify", DecisionTreeClassifier())
])

In [11]:
dtc.fit(xtrain, ytrain)
dtc.score(xtest, ytest)

0.839859437751004

In [12]:
#OHE pipeline for mos_ethnicity, allegation, precinct
ohe = Pipeline([("other_OHE", OneHotEncoder(
    handle_unknown='ignore'))])

#Standardize pipeline for Months Before Resolved
stdscalar = Pipeline([("stdscalar", StandardScaler())])

transformer = ColumnTransformer([
    ("ohe", ohe, ["Type", "Breed", "Color", "Sex", "Size", "Intake Type", "Intake Condition", "Location"]),
    ("standardize", stdscalar, ["Days in Shelter", "Age"])
])

knn = Pipeline([
    ("transform", transformer),
    ("classify", KNeighborsClassifier())
])

In [13]:
knn.fit(xtrain, ytrain)
knn.score(xtest, ytest)

0.7743473895582329

In [14]:
#OHE pipeline for mos_ethnicity, allegation, precinct
ohe = Pipeline([("other_OHE", OneHotEncoder(
    handle_unknown='ignore'))])

#Standardize pipeline for Months Before Resolved
stdscalar = Pipeline([("stdscalar", StandardScaler())])

transformer = ColumnTransformer([
    ("ohe", ohe, ["Type", "Breed", "Color", "Sex", "Size", "Intake Type", "Intake Condition", "Location"]),
    ("standardize", stdscalar, ["Days in Shelter", "Age"])
])

rfc = Pipeline([
    ("transform", transformer),
    ("classify", RandomForestClassifier())
])

In [15]:
rfc.fit(xtrain, ytrain)
rfc.score(xtest, ytest)

0.8629518072289156