In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

## Data Collection ##


In [3]:
data1 = pd.read_csv('Austin_Animal_Center_Intakes.csv')
data2 = pd.read_csv('Austin_Animal_Center_Outcomes.csv')

data = pd.merge(data1, data2, on='Animal ID', how='inner')
print(data.columns)

Index(['Animal ID', 'Name_x', 'DateTime_x', 'MonthYear_x', 'Found Location',
       'Intake Type', 'Intake Condition', 'Animal Type_x', 'Sex upon Intake',
       'Age upon Intake', 'Breed_x', 'Color_x', 'Name_y', 'DateTime_y',
       'MonthYear_y', 'Date of Birth', 'Outcome Type', 'Outcome Subtype',
       'Animal Type_y', 'Sex upon Outcome', 'Age upon Outcome', 'Breed_y',
       'Color_y'],
      dtype='object')


## Feature Selection ##


In [4]:
data = data.drop(columns=['Name_x','MonthYear_x','MonthYear_y','Found Location','Name_y','Animal Type_y','Color_y','Breed_y','Outcome Subtype','Date of Birth'])
print(data.head())
data.shape

  Animal ID              DateTime_x Intake Type Intake Condition  \
0   A786884  01/03/2019 04:19:00 PM       Stray           Normal   
1   A706918  07/05/2015 12:59:00 PM       Stray           Normal   
2   A724273  04/14/2016 06:43:00 PM       Stray           Normal   
3   A665644  10/21/2013 07:59:00 AM       Stray             Sick   
4   A682524  06/29/2014 10:38:00 AM       Stray           Normal   

  Animal Type_x Sex upon Intake Age upon Intake  \
0           Dog   Neutered Male         2 years   
1           Dog   Spayed Female         8 years   
2           Dog     Intact Male       11 months   
3           Cat   Intact Female         4 weeks   
4           Dog   Neutered Male         4 years   

                                 Breed_x      Color_x              DateTime_y  \
0                             Beagle Mix     Tricolor  01/08/2019 03:11:00 PM   
1               English Springer Spaniel  White/Liver  07/05/2015 03:13:00 PM   
2                            Basenji Mix 

(178310, 13)

## Data Cleaning ##



In [5]:
data = data.dropna()
data.drop_duplicates(subset='Animal ID', keep = False, inplace = True)
print(data)

       Animal ID              DateTime_x Intake Type Intake Condition  \
0        A786884  01/03/2019 04:19:00 PM       Stray           Normal   
1        A706918  07/05/2015 12:59:00 PM       Stray           Normal   
2        A724273  04/14/2016 06:43:00 PM       Stray           Normal   
3        A665644  10/21/2013 07:59:00 AM       Stray             Sick   
4        A682524  06/29/2014 10:38:00 AM       Stray           Normal   
...          ...                     ...         ...              ...   
178299   A851458  02/12/2022 04:31:00 PM       Stray           Normal   
178300   A851391  02/11/2022 11:19:00 AM       Stray           Normal   
178301   A852148  02/25/2022 04:22:00 PM       Stray           Normal   
178302   A850397  01/24/2022 08:00:00 AM       Stray          Injured   
178303   A851459  02/12/2022 04:31:00 PM       Stray           Normal   

       Animal Type_x Sex upon Intake Age upon Intake  \
0                Dog   Neutered Male         2 years   
1          

In [6]:
for i in range(len(data)):
    age = str(data.iloc[i,6])
    age2 = str(data.iloc[i,12])
    age = age.split()
    age2 = age2.split()
    if len(age2) < 2 or len(age) < 2:
        data.drop(i)
        continue
    if (age[1] == 'years' or age[1] == 'year'):
        data.iloc[i,6] = int(age[0]) * 52
    elif (age[1] == 'months' or age[1] == 'month'):
        data.iloc[i,6] = int(age[0]) * 4
    else:
        data.iloc[i,6] = int(age[0])

    if (age2[1] == 'years' or age2[1] == 'year'):
        data.iloc[i,12] = int(age2[0]) * 52
    elif (age2[1] == 'months' or age2[1] == 'month'):
        data.iloc[i,12] = int(age2[0]) * 4
    else:
        data.iloc[i,12] = int(age2[0])

#print(data.head())


       Animal ID              DateTime_x    Intake Type Intake Condition  \
0        A786884  01/03/2019 04:19:00 PM          Stray           Normal   
1        A706918  07/05/2015 12:59:00 PM          Stray           Normal   
2        A724273  04/14/2016 06:43:00 PM          Stray           Normal   
3        A665644  10/21/2013 07:59:00 AM          Stray             Sick   
4        A682524  06/29/2014 10:38:00 AM          Stray           Normal   
...          ...                     ...            ...              ...   
178305   A672109  09/28/2016 06:25:00 PM  Public Assist           Normal   
178306   A672109  02/04/2014 12:25:00 PM          Stray           Normal   
178307   A672109  02/04/2014 12:25:00 PM          Stray           Normal   
178308   A845912  11/06/2021 03:15:00 PM          Stray         Neonatal   
178309   A845912  04/13/2022 01:47:00 PM          Stray           Normal   

       Animal Type_x Sex upon Intake Age upon Intake  \
0                Dog   Neutered

In [7]:
data = data.loc[(data["DateTime_y"] > data["DateTime_x"])]
data.shape
print(data)

       Animal ID              DateTime_x      Intake Type Intake Condition  \
0        A786884  01/03/2019 04:19:00 PM            Stray           Normal   
2        A724273  04/14/2016 06:43:00 PM            Stray           Normal   
3        A665644  10/21/2013 07:59:00 AM            Stray             Sick   
4        A682524  06/29/2014 10:38:00 AM            Stray           Normal   
5        A743852  02/18/2017 12:46:00 PM  Owner Surrender           Normal   
...          ...                     ...              ...              ...   
178299   A851458  02/12/2022 04:31:00 PM            Stray           Normal   
178300   A851391  02/11/2022 11:19:00 AM            Stray           Normal   
178301   A852148  02/25/2022 04:22:00 PM            Stray           Normal   
178302   A850397  01/24/2022 08:00:00 AM            Stray          Injured   
178303   A851459  02/12/2022 04:31:00 PM            Stray           Normal   

       Animal Type_x Sex upon Intake Age upon Intake  \
0      

(102555, 13)