In [8]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

## Data Collection ##


In [9]:
data1 = pd.read_csv('Austin_Animal_Center_Intakes.csv')
data2 = pd.read_csv('Austin_Animal_Center_Outcomes.csv')

data = pd.merge(data1, data2, on='Animal ID', how='inner')
print(data.columns)

Index(['Animal ID', 'Name_x', 'DateTime_x', 'MonthYear_x', 'Found Location',
       'Intake Type', 'Intake Condition', 'Animal Type_x', 'Sex upon Intake',
       'Age upon Intake', 'Breed_x', 'Color_x', 'Name_y', 'DateTime_y',
       'MonthYear_y', 'Date of Birth', 'Outcome Type', 'Outcome Subtype',
       'Animal Type_y', 'Sex upon Outcome', 'Age upon Outcome', 'Breed_y',
       'Color_y'],
      dtype='object')


## Feature Selection ##


In [10]:
data = data.drop(columns=['Name_x','MonthYear_x','MonthYear_y','Found Location','Name_y','Animal Type_y','Color_y','Breed_y','Outcome Subtype','Date of Birth'])
print(data.head())
data.shape

  Animal ID              DateTime_x Intake Type Intake Condition  \
0   A786884  01/03/2019 04:19:00 PM       Stray           Normal   
1   A706918  07/05/2015 12:59:00 PM       Stray           Normal   
2   A724273  04/14/2016 06:43:00 PM       Stray           Normal   
3   A665644  10/21/2013 07:59:00 AM       Stray             Sick   
4   A682524  06/29/2014 10:38:00 AM       Stray           Normal   

  Animal Type_x Sex upon Intake Age upon Intake  \
0           Dog   Neutered Male         2 years   
1           Dog   Spayed Female         8 years   
2           Dog     Intact Male       11 months   
3           Cat   Intact Female         4 weeks   
4           Dog   Neutered Male         4 years   

                                 Breed_x      Color_x              DateTime_y  \
0                             Beagle Mix     Tricolor  01/08/2019 03:11:00 PM   
1               English Springer Spaniel  White/Liver  07/05/2015 03:13:00 PM   
2                            Basenji Mix 

(178310, 13)

## Data Cleaning ##



In [11]:
data = data.dropna()
data.drop_duplicates(subset='Animal ID', keep = False, inplace = True)
print(data)

       Animal ID              DateTime_x Intake Type Intake Condition  \
0        A786884  01/03/2019 04:19:00 PM       Stray           Normal   
1        A706918  07/05/2015 12:59:00 PM       Stray           Normal   
2        A724273  04/14/2016 06:43:00 PM       Stray           Normal   
3        A665644  10/21/2013 07:59:00 AM       Stray             Sick   
4        A682524  06/29/2014 10:38:00 AM       Stray           Normal   
...          ...                     ...         ...              ...   
178299   A851458  02/12/2022 04:31:00 PM       Stray           Normal   
178300   A851391  02/11/2022 11:19:00 AM       Stray           Normal   
178301   A852148  02/25/2022 04:22:00 PM       Stray           Normal   
178302   A850397  01/24/2022 08:00:00 AM       Stray          Injured   
178303   A851459  02/12/2022 04:31:00 PM       Stray           Normal   

       Animal Type_x Sex upon Intake Age upon Intake  \
0                Dog   Neutered Male         2 years   
1          

In [12]:
for i in range(len(data)):
    age = str(data.iloc[i,6])
    age2 = str(data.iloc[i,12])
    age = age.split()
    age2 = age2.split()
    if len(age2) < 2 or len(age) < 2:
        data.drop(i)
        continue
    if (age[1] == 'years' or age[1] == 'year'):
        data.iloc[i,6] = int(age[0]) * 52
    elif (age[1] == 'months' or age[1] == 'month'):
        data.iloc[i,6] = int(age[0]) * 4
    else:
        data.iloc[i,6] = int(age[0])

    if (age2[1] == 'years' or age2[1] == 'year'):
        data.iloc[i,12] = int(age2[0]) * 52
    elif (age2[1] == 'months' or age2[1] == 'month'):
        data.iloc[i,12] = int(age2[0]) * 4
    else:
        data.iloc[i,12] = int(age2[0])

#print(data.head())


In [13]:
data = data.loc[(data["DateTime_y"] > data["DateTime_x"])]
data.shape
print(data)

       Animal ID              DateTime_x      Intake Type Intake Condition  \
0        A786884  01/03/2019 04:19:00 PM            Stray           Normal   
2        A724273  04/14/2016 06:43:00 PM            Stray           Normal   
3        A665644  10/21/2013 07:59:00 AM            Stray             Sick   
4        A682524  06/29/2014 10:38:00 AM            Stray           Normal   
5        A743852  02/18/2017 12:46:00 PM  Owner Surrender           Normal   
...          ...                     ...              ...              ...   
178299   A851458  02/12/2022 04:31:00 PM            Stray           Normal   
178300   A851391  02/11/2022 11:19:00 AM            Stray           Normal   
178301   A852148  02/25/2022 04:22:00 PM            Stray           Normal   
178302   A850397  01/24/2022 08:00:00 AM            Stray          Injured   
178303   A851459  02/12/2022 04:31:00 PM            Stray           Normal   

       Animal Type_x Sex upon Intake Age upon Intake  \
0      

## Data Prep ## 


In [14]:
label_df = data['Outcome Type']
feature_df = data.drop(columns=['Outcome Type'])
label_df = label_df.values.ravel()
print("shape of feature data frame: ", feature_df.shape)
print("length of label data frame: " , len(label_df))
feature_df.head()

shape of feature data frame:  (101755, 12)
length of label data frame:  101755


Unnamed: 0,Animal ID,DateTime_x,Intake Type,Intake Condition,Animal Type_x,Sex upon Intake,Age upon Intake,Breed_x,Color_x,DateTime_y,Sex upon Outcome,Age upon Outcome
0,A786884,01/03/2019 04:19:00 PM,Stray,Normal,Dog,Neutered Male,104,Beagle Mix,Tricolor,01/08/2019 03:11:00 PM,Neutered Male,104
2,A724273,04/14/2016 06:43:00 PM,Stray,Normal,Dog,Intact Male,44,Basenji Mix,Sable/White,04/21/2016 05:17:00 PM,Neutered Male,52
3,A665644,10/21/2013 07:59:00 AM,Stray,Sick,Cat,Intact Female,4,Domestic Shorthair Mix,Calico,10/21/2013 11:39:00 AM,Intact Female,4
4,A682524,06/29/2014 10:38:00 AM,Stray,Normal,Dog,Neutered Male,208,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,07/02/2014 02:16:00 PM,Neutered Male,208
5,A743852,02/18/2017 12:46:00 PM,Owner Surrender,Normal,Dog,Neutered Male,104,Labrador Retriever Mix,Chocolate,02/21/2017 05:44:00 PM,Neutered Male,104


In [29]:
# we really tried... please ignore everything underneath this line
from sklearn.preprocessing import OneHotEncoder
y = OneHotEncoder().fit_transform(feature_df).toarray()
print(y)

from sklearn.preprocessing import LabelBinarizer
y = LabelBinarizer().fit_transform(feature_df['Breed_x'])
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [34]:
from sklearn.model_selection import train_test_split

def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)

train_feat, test_feat, train_label, test_label = train_test_split(feature_df, label_df, train_size = .8, test_size=.2)


res = encode_and_bind(train_feat, 'Intake Type')
    
print(res)

       Animal ID              DateTime_x      Intake Type Intake Condition  \
45360    A717217  12/04/2015 01:11:00 PM    Public Assist             Aged   
84742    A767509  03/02/2018 01:28:00 PM            Stray           Normal   
45285    A842547  09/17/2021 01:03:00 PM            Stray           Normal   
148703   A787920  01/22/2019 02:49:00 PM  Owner Surrender           Normal   
128378   A820987  08/02/2020 02:27:00 PM            Stray           Normal   
...          ...                     ...              ...              ...   
144740   A759977  10/10/2017 05:04:00 PM            Stray           Normal   
36770    A771983  05/13/2018 11:14:00 AM  Owner Surrender           Normal   
2862     A684717  07/27/2014 03:16:00 PM  Owner Surrender           Normal   
161262   A700710  04/18/2015 02:37:00 PM            Stray           Normal   
154153   A840375  08/06/2021 03:14:00 PM            Stray           Normal   

       Animal Type_x Sex upon Intake Age upon Intake  \
45360  