In [139]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

## Data Collection ##


In [140]:
data1 = pd.read_csv('Austin_Animal_Center_Intakes.csv')
data2 = pd.read_csv('Austin_Animal_Center_Outcomes.csv')

data = pd.merge(data1, data2, on='Animal ID', how='inner')
print(data.columns)

Index(['Animal ID', 'Name_x', 'DateTime_x', 'MonthYear_x', 'Found Location',
       'Intake Type', 'Intake Condition', 'Animal Type_x', 'Sex upon Intake',
       'Age upon Intake', 'Breed_x', 'Color_x', 'Name_y', 'DateTime_y',
       'MonthYear_y', 'Date of Birth', 'Outcome Type', 'Outcome Subtype',
       'Animal Type_y', 'Sex upon Outcome', 'Age upon Outcome', 'Breed_y',
       'Color_y'],
      dtype='object')


## Feature Selection ##


In [141]:
data = data.drop(columns=['Name_x','MonthYear_x','MonthYear_y','Found Location','Name_y','Animal Type_y','Color_y','Breed_y','Outcome Subtype','Date of Birth', 'Age upon Outcome', 'Sex upon Intake'])
print(data.head())
data.shape

  Animal ID              DateTime_x Intake Type Intake Condition  \
0   A786884  01/03/2019 04:19:00 PM       Stray           Normal   
1   A706918  07/05/2015 12:59:00 PM       Stray           Normal   
2   A724273  04/14/2016 06:43:00 PM       Stray           Normal   
3   A665644  10/21/2013 07:59:00 AM       Stray             Sick   
4   A682524  06/29/2014 10:38:00 AM       Stray           Normal   

  Animal Type_x Age upon Intake                                Breed_x  \
0           Dog         2 years                             Beagle Mix   
1           Dog         8 years               English Springer Spaniel   
2           Dog       11 months                            Basenji Mix   
3           Cat         4 weeks                 Domestic Shorthair Mix   
4           Dog         4 years  Doberman Pinsch/Australian Cattle Dog   

       Color_x              DateTime_y     Outcome Type Sex upon Outcome  
0     Tricolor  01/08/2019 03:11:00 PM         Transfer    Neutered Mal

(178310, 11)

## Data Cleaning ##



In [142]:
data = data.dropna()
data.drop_duplicates(subset='Animal ID', keep = False, inplace = True)
print(data)

       Animal ID              DateTime_x Intake Type Intake Condition  \
0        A786884  01/03/2019 04:19:00 PM       Stray           Normal   
1        A706918  07/05/2015 12:59:00 PM       Stray           Normal   
2        A724273  04/14/2016 06:43:00 PM       Stray           Normal   
3        A665644  10/21/2013 07:59:00 AM       Stray             Sick   
4        A682524  06/29/2014 10:38:00 AM       Stray           Normal   
...          ...                     ...         ...              ...   
178299   A851458  02/12/2022 04:31:00 PM       Stray           Normal   
178300   A851391  02/11/2022 11:19:00 AM       Stray           Normal   
178301   A852148  02/25/2022 04:22:00 PM       Stray           Normal   
178302   A850397  01/24/2022 08:00:00 AM       Stray          Injured   
178303   A851459  02/12/2022 04:31:00 PM       Stray           Normal   

       Animal Type_x Age upon Intake                                Breed_x  \
0                Dog         2 years        

In [143]:
for i in range(len(data)):
    age = str(data.iloc[i,5])
    age = age.split()
    if len(age) < 2:
        data.drop(i)
        continue
    if (age[1] == 'years' or age[1] == 'year'):
        data.iloc[i,5] = int(age[0]) * 52
    elif (age[1] == 'months' or age[1] == 'month'):
        data.iloc[i,5] = int(age[0]) * 4
    else:
        data.iloc[i,5] = int(age[0])

#print(data.head())


In [144]:
data = data.loc[(data["DateTime_y"] > data["DateTime_x"])]
data.shape
print(data)

       Animal ID              DateTime_x      Intake Type Intake Condition  \
0        A786884  01/03/2019 04:19:00 PM            Stray           Normal   
2        A724273  04/14/2016 06:43:00 PM            Stray           Normal   
3        A665644  10/21/2013 07:59:00 AM            Stray             Sick   
4        A682524  06/29/2014 10:38:00 AM            Stray           Normal   
5        A743852  02/18/2017 12:46:00 PM  Owner Surrender           Normal   
...          ...                     ...              ...              ...   
178299   A851458  02/12/2022 04:31:00 PM            Stray           Normal   
178300   A851391  02/11/2022 11:19:00 AM            Stray           Normal   
178301   A852148  02/25/2022 04:22:00 PM            Stray           Normal   
178302   A850397  01/24/2022 08:00:00 AM            Stray          Injured   
178303   A851459  02/12/2022 04:31:00 PM            Stray           Normal   

       Animal Type_x Age upon Intake                           

## Data Prep ## 


In [145]:
data = data.loc[data['Sex upon Outcome'] != 'Unknown']


In [146]:
label_df = data['Outcome Type']
feature_df = data.drop(columns=['Outcome Type'])
label_df = label_df.values.ravel()
print("shape of feature data frame: ", feature_df.shape)
print("length of label data frame: " , len(label_df))

print(data.head())

for i in range(len(label_df)):
    if label_df[i] == 'Adoption' or label_df[i] == 1:
        label_df[i] = 1
    else:
        label_df[i] = 0
print(sum(label_df))
print(label_df)
feature_df.head()

shape of feature data frame:  (91948, 10)
length of label data frame:  91948
  Animal ID              DateTime_x      Intake Type Intake Condition  \
0   A786884  01/03/2019 04:19:00 PM            Stray           Normal   
2   A724273  04/14/2016 06:43:00 PM            Stray           Normal   
3   A665644  10/21/2013 07:59:00 AM            Stray             Sick   
4   A682524  06/29/2014 10:38:00 AM            Stray           Normal   
5   A743852  02/18/2017 12:46:00 PM  Owner Surrender           Normal   

  Animal Type_x Age upon Intake                                Breed_x  \
0           Dog             104                             Beagle Mix   
2           Dog              44                            Basenji Mix   
3           Cat               4                 Domestic Shorthair Mix   
4           Dog             208  Doberman Pinsch/Australian Cattle Dog   
5           Dog             104                 Labrador Retriever Mix   

       Color_x              DateTime_y 

Unnamed: 0,Animal ID,DateTime_x,Intake Type,Intake Condition,Animal Type_x,Age upon Intake,Breed_x,Color_x,DateTime_y,Sex upon Outcome
0,A786884,01/03/2019 04:19:00 PM,Stray,Normal,Dog,104,Beagle Mix,Tricolor,01/08/2019 03:11:00 PM,Neutered Male
2,A724273,04/14/2016 06:43:00 PM,Stray,Normal,Dog,44,Basenji Mix,Sable/White,04/21/2016 05:17:00 PM,Neutered Male
3,A665644,10/21/2013 07:59:00 AM,Stray,Sick,Cat,4,Domestic Shorthair Mix,Calico,10/21/2013 11:39:00 AM,Intact Female
4,A682524,06/29/2014 10:38:00 AM,Stray,Normal,Dog,208,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,07/02/2014 02:16:00 PM,Neutered Male
5,A743852,02/18/2017 12:46:00 PM,Owner Surrender,Normal,Dog,104,Labrador Retriever Mix,Chocolate,02/21/2017 05:44:00 PM,Neutered Male


In [147]:
from datetime import datetime
from datetime import timedelta

durations = []

for i in range(len(feature_df)):
    into_shelter = datetime.strptime(feature_df.iloc[i, 1], "%m/%d/%Y %H:%M:%S %p")
    out_shelter = datetime.strptime(feature_df.iloc[i, 8], "%m/%d/%Y %H:%M:%S %p")
    duration = out_shelter - into_shelter
    duration = duration.days
    durations.append(duration)
    
feature_df = feature_df.drop(columns=['DateTime_x', 'DateTime_y'])


In [148]:
feature_df['Duration_days'] = durations
feature_df.head()

Unnamed: 0,Animal ID,Intake Type,Intake Condition,Animal Type_x,Age upon Intake,Breed_x,Color_x,Sex upon Outcome,Duration_days
0,A786884,Stray,Normal,Dog,104,Beagle Mix,Tricolor,Neutered Male,4
2,A724273,Stray,Normal,Dog,44,Basenji Mix,Sable/White,Neutered Male,6
3,A665644,Stray,Sick,Cat,4,Domestic Shorthair Mix,Calico,Intact Female,0
4,A682524,Stray,Normal,Dog,208,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Neutered Male,2
5,A743852,Owner Surrender,Normal,Dog,104,Labrador Retriever Mix,Chocolate,Neutered Male,2


In [149]:
fertility = []
gender = []

feature_df = feature_df.loc[feature_df['Sex upon Outcome'] != 'Unknown']
for i in range(len(feature_df)):
    sex = feature_df.iloc[i, 7].split()
    if sex[0] == 'Intact':
        sex[0] = 0
    else:
        sex[0] = 1

    if sex[1] == 'Male':
        sex[1] = 0
    else:
        sex[1] = 1

    fertility.append(sex[0])
    gender.append(sex[1])
        
feature_df = feature_df.drop(columns=['Sex upon Outcome'])


In [150]:
feature_df['Spayed/Neutered'] = fertility
feature_df['Gender'] = gender
feature_df.head()

Unnamed: 0,Animal ID,Intake Type,Intake Condition,Animal Type_x,Age upon Intake,Breed_x,Color_x,Duration_days,Spayed/Neutered,Gender
0,A786884,Stray,Normal,Dog,104,Beagle Mix,Tricolor,4,1,0
2,A724273,Stray,Normal,Dog,44,Basenji Mix,Sable/White,6,1,0
3,A665644,Stray,Sick,Cat,4,Domestic Shorthair Mix,Calico,0,0,1
4,A682524,Stray,Normal,Dog,208,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2,1,0
5,A743852,Owner Surrender,Normal,Dog,104,Labrador Retriever Mix,Chocolate,2,1,0


In [151]:
for i in range(len(feature_df)):
    if feature_df.iloc[i, 3] == 'Dog':
        feature_df.iloc[i, 3] = 0
    elif feature_df.iloc[i, 3] == 'Cat':
        feature_df.iloc[i, 3] = 1
    else:
        feature_df.iloc[i, 3] = 2



In [152]:
feature_df.head()

Unnamed: 0,Animal ID,Intake Type,Intake Condition,Animal Type_x,Age upon Intake,Breed_x,Color_x,Duration_days,Spayed/Neutered,Gender
0,A786884,Stray,Normal,0,104,Beagle Mix,Tricolor,4,1,0
2,A724273,Stray,Normal,0,44,Basenji Mix,Sable/White,6,1,0
3,A665644,Stray,Sick,1,4,Domestic Shorthair Mix,Calico,0,0,1
4,A682524,Stray,Normal,0,208,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2,1,0
5,A743852,Owner Surrender,Normal,0,104,Labrador Retriever Mix,Chocolate,2,1,0


In [154]:

feature_df['Intake Type'] = pd.factorize(feature_df['Intake Type'])[0]
feature_df['Intake Condition'] = pd.factorize(feature_df['Intake Condition'])[0]
feature_df['Breed_x'] = pd.factorize(feature_df['Breed_x'])[0]
feature_df['Color_x'] = pd.factorize(feature_df['Color_x'])[0]

feature_df.head()

Unnamed: 0,Animal ID,Intake Type,Intake Condition,Animal Type_x,Age upon Intake,Breed_x,Color_x,Duration_days,Spayed/Neutered,Gender
0,A786884,0,0,0,104,0,0,4,1,0
2,A724273,0,0,0,44,1,1,6,1,0
3,A665644,0,1,1,4,2,2,0,0,1
4,A682524,0,0,0,208,3,3,2,1,0
5,A743852,1,0,0,104,4,4,2,1,0


In [155]:
feature_df = feature_df.drop(columns=['Animal ID'])


In [158]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
train_feat, test_feat, train_label, test_label = train_test_split(feature_df, label_df, train_size = .8, test_size=.2)
print(len(train_feat))
print(len(test_feat))

clf = DecisionTreeClassifier(criterion="entropy")
clf.fit(train_feat, train_label.astype(int))
pred = clf.predict(test_feat)
print("Accuracy = ", accuracy_score(test_label.astype(int), pred))

73558
18390
Accuracy =  0.7981511691136487


In [159]:
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(criterion="entropy")
accuracy = cross_val_score(clf,feature_df, label_df.astype(int), cv=10)
print(accuracy.mean())

0.8003109044683964
