# Shelter Intake & Outcome Analysis
## Notebook 1: Data Cleaning

This notebook focuses on loading, inspecting, and cleaning the Austin Animal Center intake and outcome dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv(r'..\data\aac_intakes_outcomes.csv')
df.head()

Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,outcome_month,outcome_year,outcome_monthyear,outcome_weekday,outcome_hour,outcome_number,dob_year,dob_month,dob_monthyear,age_upon_intake,animal_id_intake,animal_type,breed,color,found_location,intake_condition,intake_type,sex_upon_intake,count,age_upon_intake_(days),age_upon_intake_(years),age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days
0,10 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,3650,10.0,"(7.5, 10.0]",2017-12-07 14:07:00,12,2017,2017-12,Thursday,0,1.0,2007,7,2017-12,10 years,A006100,Dog,Spinone Italiano Mix,Yellow/White,Colony Creek And Hunters Trace in Austin (TX),Normal,Stray,Neutered Male,1,3650,10.0,"(7.5, 10.0]",2017-12-07 00:00:00,12,2017,2017-12,Thursday,14,1.0,0 days 14:07:00.000000000,0.588194
1,7 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,2555,7.0,"(5.0, 7.5]",2014-12-20 16:35:00,12,2014,2014-12,Saturday,16,2.0,2007,7,2014-12,7 years,A006100,Dog,Spinone Italiano Mix,Yellow/White,8700 Research Blvd in Austin (TX),Normal,Public Assist,Neutered Male,1,2555,7.0,"(5.0, 7.5]",2014-12-19 10:21:00,12,2014,2014-12,Friday,10,2.0,1 days 06:14:00.000000000,1.259722
2,6 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,2190,6.0,"(5.0, 7.5]",2014-03-08 17:10:00,3,2014,2014-03,Saturday,17,3.0,2007,7,2014-03,6 years,A006100,Dog,Spinone Italiano Mix,Yellow/White,8700 Research in Austin (TX),Normal,Public Assist,Neutered Male,1,2190,6.0,"(5.0, 7.5]",2014-03-07 14:26:00,3,2014,2014-03,Friday,14,3.0,1 days 02:44:00.000000000,1.113889
3,10 years,A047759,2004-04-02 00:00:00,Partner,Transfer,Neutered Male,3650,10.0,"(7.5, 10.0]",2014-04-07 15:12:00,4,2014,2014-04,Monday,15,1.0,2004,4,2014-04,10 years,A047759,Dog,Dachshund,Tricolor,Austin (TX),Normal,Owner Surrender,Neutered Male,1,3650,10.0,"(7.5, 10.0]",2014-04-02 15:55:00,4,2014,2014-04,Wednesday,15,1.0,4 days 23:17:00.000000000,4.970139
4,16 years,A134067,1997-10-16 00:00:00,,Return to Owner,Neutered Male,5840,16.0,"(15.0, 17.5]",2013-11-16 11:54:00,11,2013,2013-11,Saturday,11,1.0,1997,10,2013-11,16 years,A134067,Dog,Shetland Sheepdog,Brown/White,12034 Research Blvd in Austin (TX),Injured,Public Assist,Neutered Male,1,5840,16.0,"(15.0, 17.5]",2013-11-16 09:02:00,11,2013,2013-11,Saturday,9,1.0,0 days 02:52:00.000000000,0.119444


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79672 entries, 0 to 79671
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age_upon_outcome            79672 non-null  object 
 1   animal_id_outcome           79672 non-null  object 
 2   date_of_birth               79672 non-null  object 
 3   outcome_subtype             36348 non-null  object 
 4   outcome_type                79662 non-null  object 
 5   sex_upon_outcome            79671 non-null  object 
 6   age_upon_outcome_(days)     79672 non-null  int64  
 7   age_upon_outcome_(years)    79672 non-null  float64
 8   age_upon_outcome_age_group  79672 non-null  object 
 9   outcome_datetime            79672 non-null  object 
 10  outcome_month               79672 non-null  int64  
 11  outcome_year                79672 non-null  int64  
 12  outcome_monthyear           79672 non-null  object 
 13  outcome_weekday             796

In [4]:
df.describe(include='all')

Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,outcome_month,outcome_year,outcome_monthyear,outcome_weekday,outcome_hour,outcome_number,dob_year,dob_month,dob_monthyear,age_upon_intake,animal_id_intake,animal_type,breed,color,found_location,intake_condition,intake_type,sex_upon_intake,count,age_upon_intake_(days),age_upon_intake_(years),age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days
count,79672,79672,79672,36348,79662,79671,79672.0,79672.0,79672,79672,79672.0,79672.0,79672,79672,79672.0,79672.0,79672.0,79672.0,79672,79672,79672,79672,79672,79672,79672,79672,79672,79671,79672.0,79672.0,79672.0,79672,79672,79672.0,79672.0,79672,79672,79672.0,79672.0,79672,79672.0
unique,46,71961,5923,19,9,5,,,10,65686,,,55,7,,,,,55,46,71961,4,2155,529,36576,8,5,5,,,,10,56747,,,54,7,,,29319,
top,1 year,A721033,2014-05-05 00:00:00,Partner,Adoption,Neutered Male,,,"(-0.025, 2.5]",2016-04-18 00:00:00,,,2014-07,Saturday,,,,,2014-07,1 year,A721033,Dog,Domestic Shorthair Mix,Black/White,Austin (TX),Normal,Stray,Intact Male,,,,"(-0.025, 2.5]",2016-09-23 12:00:00,,,2015-06,Saturday,,,0 days 00:14:00.000000000,
freq,14750,13,112,19840,33594,28293,,,59412,39,,,2072,12848,,,,,2072,14580,13,45366,23423,8270,14311,70056,55935,25317,,,,59593,64,,,2188,12037,,,77,
mean,,,,,,,782.046127,2.142592,,,6.655425,2015.472563,,,14.297306,1.12682,2013.25487,6.31031,,,,,,,,,,,1.0,769.341701,2.107785,,,6.584032,2015.436101,,,13.487022,1.12682,,16.757116
std,,,,,,,1058.528519,2.900078,,,3.414284,1.305944,,,3.774317,0.456057,3.216517,3.289077,,,,,,,,,,,0.0,1056.00904,2.893175,,,3.366579,1.303157,,,3.121173,0.456057,,41.679359
min,,,,,,,0.0,0.0,,,1.0,2013.0,,,0.0,1.0,1991.0,1.0,,,,,,,,,,,1.0,0.0,0.0,,,1.0,2013.0,,,0.0,1.0,,0.0
25%,,,,,,,90.0,0.246575,,,4.0,2014.0,,,12.0,1.0,2012.0,4.0,,,,,,,,,,,1.0,60.0,0.164384,,,4.0,2014.0,,,11.0,1.0,,1.102083
50%,,,,,,,365.0,1.0,,,7.0,2015.0,,,15.0,1.0,2014.0,6.0,,,,,,,,,,,1.0,365.0,1.0,,,7.0,2015.0,,,13.0,1.0,,4.987153
75%,,,,,,,1095.0,3.0,,,10.0,2017.0,,,17.0,1.0,2015.0,9.0,,,,,,,,,,,1.0,1095.0,3.0,,,10.0,2017.0,,,16.0,1.0,,13.610764


In [5]:
df.isna().sum()

age_upon_outcome                  0
animal_id_outcome                 0
date_of_birth                     0
outcome_subtype               43324
outcome_type                     10
sex_upon_outcome                  1
age_upon_outcome_(days)           0
age_upon_outcome_(years)          0
age_upon_outcome_age_group        0
outcome_datetime                  0
outcome_month                     0
outcome_year                      0
outcome_monthyear                 0
outcome_weekday                   0
outcome_hour                      0
outcome_number                    0
dob_year                          0
dob_month                         0
dob_monthyear                     0
age_upon_intake                   0
animal_id_intake                  0
animal_type                       0
breed                             0
color                             0
found_location                    0
intake_condition                  0
intake_type                       0
sex_upon_intake             

In [6]:
df = pd.read_csv('../data/aac_intakes_outcomes.csv')
df.head()

Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,outcome_month,outcome_year,outcome_monthyear,outcome_weekday,outcome_hour,outcome_number,dob_year,dob_month,dob_monthyear,age_upon_intake,animal_id_intake,animal_type,breed,color,found_location,intake_condition,intake_type,sex_upon_intake,count,age_upon_intake_(days),age_upon_intake_(years),age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days
0,10 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,3650,10.0,"(7.5, 10.0]",2017-12-07 14:07:00,12,2017,2017-12,Thursday,0,1.0,2007,7,2017-12,10 years,A006100,Dog,Spinone Italiano Mix,Yellow/White,Colony Creek And Hunters Trace in Austin (TX),Normal,Stray,Neutered Male,1,3650,10.0,"(7.5, 10.0]",2017-12-07 00:00:00,12,2017,2017-12,Thursday,14,1.0,0 days 14:07:00.000000000,0.588194
1,7 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,2555,7.0,"(5.0, 7.5]",2014-12-20 16:35:00,12,2014,2014-12,Saturday,16,2.0,2007,7,2014-12,7 years,A006100,Dog,Spinone Italiano Mix,Yellow/White,8700 Research Blvd in Austin (TX),Normal,Public Assist,Neutered Male,1,2555,7.0,"(5.0, 7.5]",2014-12-19 10:21:00,12,2014,2014-12,Friday,10,2.0,1 days 06:14:00.000000000,1.259722
2,6 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,2190,6.0,"(5.0, 7.5]",2014-03-08 17:10:00,3,2014,2014-03,Saturday,17,3.0,2007,7,2014-03,6 years,A006100,Dog,Spinone Italiano Mix,Yellow/White,8700 Research in Austin (TX),Normal,Public Assist,Neutered Male,1,2190,6.0,"(5.0, 7.5]",2014-03-07 14:26:00,3,2014,2014-03,Friday,14,3.0,1 days 02:44:00.000000000,1.113889
3,10 years,A047759,2004-04-02 00:00:00,Partner,Transfer,Neutered Male,3650,10.0,"(7.5, 10.0]",2014-04-07 15:12:00,4,2014,2014-04,Monday,15,1.0,2004,4,2014-04,10 years,A047759,Dog,Dachshund,Tricolor,Austin (TX),Normal,Owner Surrender,Neutered Male,1,3650,10.0,"(7.5, 10.0]",2014-04-02 15:55:00,4,2014,2014-04,Wednesday,15,1.0,4 days 23:17:00.000000000,4.970139
4,16 years,A134067,1997-10-16 00:00:00,,Return to Owner,Neutered Male,5840,16.0,"(15.0, 17.5]",2013-11-16 11:54:00,11,2013,2013-11,Saturday,11,1.0,1997,10,2013-11,16 years,A134067,Dog,Shetland Sheepdog,Brown/White,12034 Research Blvd in Austin (TX),Injured,Public Assist,Neutered Male,1,5840,16.0,"(15.0, 17.5]",2013-11-16 09:02:00,11,2013,2013-11,Saturday,9,1.0,0 days 02:52:00.000000000,0.119444


In [7]:
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('-', '_')
)
df.columns

Index(['age_upon_outcome', 'animal_id_outcome', 'date_of_birth',
       'outcome_subtype', 'outcome_type', 'sex_upon_outcome',
       'age_upon_outcome_(days)', 'age_upon_outcome_(years)',
       'age_upon_outcome_age_group', 'outcome_datetime', 'outcome_month',
       'outcome_year', 'outcome_monthyear', 'outcome_weekday', 'outcome_hour',
       'outcome_number', 'dob_year', 'dob_month', 'dob_monthyear',
       'age_upon_intake', 'animal_id_intake', 'animal_type', 'breed', 'color',
       'found_location', 'intake_condition', 'intake_type', 'sex_upon_intake',
       'count', 'age_upon_intake_(days)', 'age_upon_intake_(years)',
       'age_upon_intake_age_group', 'intake_datetime', 'intake_month',
       'intake_year', 'intake_monthyear', 'intake_weekday', 'intake_hour',
       'intake_number', 'time_in_shelter', 'time_in_shelter_days'],
      dtype='object')

In [8]:
df.duplicated().sum()

np.int64(35)

In [9]:
df = df.drop_duplicates()

In [10]:
for col in df.select_dtypes(include='object'):
    print(col, df[col].unique()[:20])
    print

age_upon_outcome ['10 years' '7 years' '6 years' '16 years' '15 years' '18 years'
 '14 years' '17 years' '13 years' '19 years' '12 years' '20 years'
 '11 years' '9 years' '5 years' '8 years' '3 years' '4 years' '1 year'
 '11 months']
animal_id_outcome ['A006100' 'A047759' 'A134067' 'A141142' 'A163459' 'A165752' 'A178569'
 'A189592' 'A191351' 'A197810' 'A200922' 'A208755' 'A210457' 'A212672'
 'A214991' 'A215368' 'A218622' 'A218624' 'A221327' 'A221448']
date_of_birth ['2007-07-09 00:00:00' '2004-04-02 00:00:00' '1997-10-16 00:00:00'
 '1998-06-01 00:00:00' '1999-10-19 00:00:00' '1999-08-18 00:00:00'
 '1999-03-17 00:00:00' '1997-08-01 00:00:00' '1999-08-21 00:00:00'
 '2000-01-21 00:00:00' '1997-10-03 00:00:00' '2000-04-27 00:00:00'
 '1999-06-01 00:00:00' '2000-03-05 00:00:00' '1994-03-01 00:00:00'
 '2001-01-17 00:00:00' '1998-10-19 00:00:00' '1999-05-02 00:00:00'
 '2000-11-29 00:00:00' '2000-06-17 00:00:00']
outcome_subtype [nan 'Partner' 'Foster' 'Suffering' 'Medical' 'Behavior' 'In Kenne

In [11]:
for col in df.select_dtypes(include='object'):
    df[col] = df[col].str.strip().str.lower()

In [12]:
df.sample(10)

Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,outcome_month,outcome_year,outcome_monthyear,outcome_weekday,outcome_hour,outcome_number,dob_year,dob_month,dob_monthyear,age_upon_intake,animal_id_intake,animal_type,breed,color,found_location,intake_condition,intake_type,sex_upon_intake,count,age_upon_intake_(days),age_upon_intake_(years),age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days
64568,3 months,a746153,2017-01-18 00:00:00,,return to owner,spayed female,90,0.246575,"(-0.025, 2.5]",2017-04-30 17:44:00,4,2017,2017-04,sunday,17,2.0,2017,1,2017-04,3 months,a746153,dog,german shepherd/black mouth cur,red/white,airport blvd and mlk blvd in austin (tx),normal,stray,spayed female,1,90,0.246575,"(-0.025, 2.5]",2017-04-27 12:37:00,4,2017,2017-04,thursday,12,2.0,3 days 05:07:00.000000000,3.213194
5475,10 years,a665962,2004-10-25 00:00:00,,return to owner,spayed female,3650,10.0,"(7.5, 10.0]",2014-10-24 17:25:00,10,2014,2014-10,friday,17,1.0,2004,10,2014-10,10 years,a665962,dog,labrador retriever mix,black/white,11629 manchaca road in austin (tx),normal,stray,spayed female,1,3650,10.0,"(7.5, 10.0]",2014-10-24 11:33:00,10,2014,2014-10,friday,11,1.0,0 days 05:52:00.000000000,0.244444
60232,1 year,a739629,2015-12-05 00:00:00,partner,transfer,neutered male,365,1.0,"(-0.025, 2.5]",2016-12-09 18:05:00,12,2016,2016-12,friday,18,1.0,2015,12,2016-12,1 year,a739629,dog,dachshund,black/blue merle,9405 meadow vale in austin (tx),normal,stray,intact male,1,365,1.0,"(-0.025, 2.5]",2016-12-05 13:36:00,12,2016,2016-12,monday,13,1.0,4 days 04:29:00.000000000,4.186806
41906,1 month,a714860,2015-09-14 00:00:00,in kennel,died,intact male,30,0.082192,"(-0.025, 2.5]",2015-11-03 15:59:00,11,2015,2015-11,tuesday,15,1.0,2015,9,2015-11,1 month,a714860,cat,domestic shorthair mix,orange tabby,1723 pine knoll in austin (tx),normal,stray,intact male,1,30,0.082192,"(-0.025, 2.5]",2015-10-29 09:12:00,10,2015,2015-10,thursday,9,1.0,5 days 06:47:00.000000000,5.282639
30708,2 years,a700483,2013-04-14 00:00:00,,adoption,spayed female,730,2.0,"(-0.025, 2.5]",2015-04-24 15:28:00,4,2015,2015-04,friday,15,1.0,2013,4,2015-04,2 years,a700483,cat,domestic shorthair mix,black/white,12034 research in austin (tx),normal,stray,intact female,1,730,2.0,"(-0.025, 2.5]",2015-04-14 13:31:00,4,2015,2015-04,tuesday,13,1.0,10 days 01:57:00.000000000,10.08125
56373,2 years,a734534,2014-09-08 00:00:00,rabies risk,euthanasia,unknown,730,2.0,"(-0.025, 2.5]",2016-09-08 14:01:00,9,2016,2016-09,thursday,14,1.0,2014,9,2016-09,2 years,a734534,other,bat mix,brown,501 w 3rd st in austin (tx),normal,wildlife,unknown,1,730,2.0,"(-0.025, 2.5]",2016-09-08 12:56:00,9,2016,2016-09,thursday,12,1.0,0 days 01:05:00.000000000,0.045139
51945,2 months,a728823,2016-04-10 00:00:00,,adoption,spayed female,60,0.164384,"(-0.025, 2.5]",2016-06-13 17:10:00,6,2016,2016-06,monday,17,1.0,2016,4,2016-06,1 month,a728823,dog,german shepherd mix,black/tan,tara dr & salt springs in austin (tx),normal,stray,intact female,1,30,0.082192,"(-0.025, 2.5]",2016-06-09 08:24:00,6,2016,2016-06,thursday,8,1.0,4 days 08:46:00.000000000,4.365278
31019,2 years,a700891,2013-04-21 00:00:00,partner,transfer,unknown,730,2.0,"(-0.025, 2.5]",2015-04-21 12:26:00,4,2015,2015-04,tuesday,12,1.0,2013,4,2015-04,2 years,a700891,cat,domestic shorthair mix,blue/white,15208 fm 969 in austin (tx),nursing,stray,unknown,1,730,2.0,"(-0.025, 2.5]",2015-04-21 11:04:00,4,2015,2015-04,tuesday,11,1.0,0 days 01:22:00.000000000,0.056944
7027,2 years,a668210,2012-05-30 00:00:00,aggressive,euthanasia,intact female,730,2.0,"(-0.025, 2.5]",2014-08-31 15:20:00,8,2014,2014-08,sunday,15,1.0,2012,5,2014-08,2 years,a668210,dog,pit bull mix,tan,1614 elmira in austin (tx),injured,stray,intact female,1,730,2.0,"(-0.025, 2.5]",2014-08-24 09:39:00,8,2014,2014-08,sunday,9,1.0,7 days 05:41:00.000000000,7.236806
10408,8 months,a673277,2013-06-21 00:00:00,scrp,transfer,unknown,240,0.657534,"(-0.025, 2.5]",2014-02-22 19:08:00,2,2014,2014-02,saturday,19,1.0,2013,6,2014-02,8 months,a673277,cat,domestic shorthair mix,blue tabby,8805 north plaza dr in austin (tx),normal,stray,unknown,1,240,0.657534,"(-0.025, 2.5]",2014-02-21 11:45:00,2,2014,2014-02,friday,11,1.0,1 days 07:23:00.000000000,1.307639


# Key Derived Features for Analysis
These will be the features this project will rely on to answer questions regarding seasonality patterns, adoption trends, length-of-stay analysis, and age-based outcomes. 


In [13]:
#Length of Stay
df['time_in_shelter_days']

0        0.588194
1        1.259722
2        1.113889
3        4.970139
4        0.119444
           ...   
79667    0.077083
79668    0.053472
79669    0.047917
79670    1.762500
79671    0.813889
Name: time_in_shelter_days, Length: 79637, dtype: float64

In [14]:
#Age at Intake
df['age_upon_intake_(years)']

0        10.000000
1         7.000000
2         6.000000
3        10.000000
4        16.000000
           ...    
79667     0.038356
79668     2.000000
79669     1.000000
79670     0.410959
79671    10.000000
Name: age_upon_intake_(years), Length: 79637, dtype: float64

In [15]:
#Intake Year and Month
df['intake_year']
df['intake_month']

0        12
1        12
2         3
3         4
4        11
         ..
79667     3
79668     3
79669     3
79670     3
79671     3
Name: intake_month, Length: 79637, dtype: int64

In [16]:
#Outcome Year and Month
df['outcome_year']
df['outcome_month']

0        12
1        12
2         3
3         4
4        11
         ..
79667     3
79668     3
79669     3
79670     3
79671     3
Name: outcome_month, Length: 79637, dtype: int64

In [17]:
# grouped outcome categories
positive = ['adoption', 'return to owner', 'transfer', 'relocate']
negative = ['euthanasia', 'died']
neutral = ['missing']

def categorize_outcome(x):
    if x in positive:
        return 'positive'
    if x in negative:
        return 'negative'
    return 'neutral'

df['outcome_category'] = df['outcome_type'].apply(categorize_outcome)

In [18]:
# Fill missing values
df.isna().sum()

age_upon_outcome                  0
animal_id_outcome                 0
date_of_birth                     0
outcome_subtype               43304
outcome_type                     10
sex_upon_outcome                  1
age_upon_outcome_(days)           0
age_upon_outcome_(years)          0
age_upon_outcome_age_group        0
outcome_datetime                  0
outcome_month                     0
outcome_year                      0
outcome_monthyear                 0
outcome_weekday                   0
outcome_hour                      0
outcome_number                    0
dob_year                          0
dob_month                         0
dob_monthyear                     0
age_upon_intake                   0
animal_id_intake                  0
animal_type                       0
breed                             0
color                             0
found_location                    0
intake_condition                  0
intake_type                       0
sex_upon_intake             

In [19]:
df['outcome_type'] = df['outcome_type'].fillna('unknown')
df['sex_upon_outcome'] = df['sex_upon_outcome'].fillna('unknown')
df['sex_upon_intake'] = df['sex_upon_intake'].fillna('unknown')
df.drop(columns=['outcome_subtype'], inplace=True)


In [20]:
# Convert string data into 'days' numerical data

def convert_age_to_days(age_str):
    if pd.isna(age_str):
        return np.nan
    if isinstance(age_str, (int, float)):
        return age_str

    parts = str(age_str).split()
    if len(parts) != 2:
        return np.nan

    value, unit = parts
    try:
        value = float(value)
    except:
        return np.nan

    unit = unit.lower()

    if 'year' in unit:
        return value * 365
    if 'month' in unit:
        return value * 30
    if 'week' in unit:
        return value * 7
    if 'day' in unit:
        return value

    return np.nan


In [21]:
df['age_upon_intake'] = df['age_upon_intake'].apply(convert_age_to_days)
df['age_upon_intake']

0        3650.0
1        2555.0
2        2190.0
3        3650.0
4        5840.0
          ...  
79667      14.0
79668     730.0
79669     365.0
79670     150.0
79671    3650.0
Name: age_upon_intake, Length: 79637, dtype: float64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79637 entries, 0 to 79671
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age_upon_outcome            79637 non-null  object 
 1   animal_id_outcome           79637 non-null  object 
 2   date_of_birth               79637 non-null  object 
 3   outcome_type                79637 non-null  object 
 4   sex_upon_outcome            79637 non-null  object 
 5   age_upon_outcome_(days)     79637 non-null  int64  
 6   age_upon_outcome_(years)    79637 non-null  float64
 7   age_upon_outcome_age_group  79637 non-null  object 
 8   outcome_datetime            79637 non-null  object 
 9   outcome_month               79637 non-null  int64  
 10  outcome_year                79637 non-null  int64  
 11  outcome_monthyear           79637 non-null  object 
 12  outcome_weekday             79637 non-null  object 
 13  outcome_hour                79637 no

In [23]:
df.isna().sum()

age_upon_outcome              0
animal_id_outcome             0
date_of_birth                 0
outcome_type                  0
sex_upon_outcome              0
age_upon_outcome_(days)       0
age_upon_outcome_(years)      0
age_upon_outcome_age_group    0
outcome_datetime              0
outcome_month                 0
outcome_year                  0
outcome_monthyear             0
outcome_weekday               0
outcome_hour                  0
outcome_number                0
dob_year                      0
dob_month                     0
dob_monthyear                 0
age_upon_intake               0
animal_id_intake              0
animal_type                   0
breed                         0
color                         0
found_location                0
intake_condition              0
intake_type                   0
sex_upon_intake               0
count                         0
age_upon_intake_(days)        0
age_upon_intake_(years)       0
age_upon_intake_age_group     0
intake_d

In [24]:
df.head()

Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,outcome_month,outcome_year,outcome_monthyear,outcome_weekday,outcome_hour,outcome_number,dob_year,dob_month,dob_monthyear,age_upon_intake,animal_id_intake,animal_type,breed,color,found_location,intake_condition,intake_type,sex_upon_intake,count,age_upon_intake_(days),age_upon_intake_(years),age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days,outcome_category
0,10 years,a006100,2007-07-09 00:00:00,return to owner,neutered male,3650,10.0,"(7.5, 10.0]",2017-12-07 14:07:00,12,2017,2017-12,thursday,0,1.0,2007,7,2017-12,3650.0,a006100,dog,spinone italiano mix,yellow/white,colony creek and hunters trace in austin (tx),normal,stray,neutered male,1,3650,10.0,"(7.5, 10.0]",2017-12-07 00:00:00,12,2017,2017-12,thursday,14,1.0,0 days 14:07:00.000000000,0.588194,positive
1,7 years,a006100,2007-07-09 00:00:00,return to owner,neutered male,2555,7.0,"(5.0, 7.5]",2014-12-20 16:35:00,12,2014,2014-12,saturday,16,2.0,2007,7,2014-12,2555.0,a006100,dog,spinone italiano mix,yellow/white,8700 research blvd in austin (tx),normal,public assist,neutered male,1,2555,7.0,"(5.0, 7.5]",2014-12-19 10:21:00,12,2014,2014-12,friday,10,2.0,1 days 06:14:00.000000000,1.259722,positive
2,6 years,a006100,2007-07-09 00:00:00,return to owner,neutered male,2190,6.0,"(5.0, 7.5]",2014-03-08 17:10:00,3,2014,2014-03,saturday,17,3.0,2007,7,2014-03,2190.0,a006100,dog,spinone italiano mix,yellow/white,8700 research in austin (tx),normal,public assist,neutered male,1,2190,6.0,"(5.0, 7.5]",2014-03-07 14:26:00,3,2014,2014-03,friday,14,3.0,1 days 02:44:00.000000000,1.113889,positive
3,10 years,a047759,2004-04-02 00:00:00,transfer,neutered male,3650,10.0,"(7.5, 10.0]",2014-04-07 15:12:00,4,2014,2014-04,monday,15,1.0,2004,4,2014-04,3650.0,a047759,dog,dachshund,tricolor,austin (tx),normal,owner surrender,neutered male,1,3650,10.0,"(7.5, 10.0]",2014-04-02 15:55:00,4,2014,2014-04,wednesday,15,1.0,4 days 23:17:00.000000000,4.970139,positive
4,16 years,a134067,1997-10-16 00:00:00,return to owner,neutered male,5840,16.0,"(15.0, 17.5]",2013-11-16 11:54:00,11,2013,2013-11,saturday,11,1.0,1997,10,2013-11,5840.0,a134067,dog,shetland sheepdog,brown/white,12034 research blvd in austin (tx),injured,public assist,neutered male,1,5840,16.0,"(15.0, 17.5]",2013-11-16 09:02:00,11,2013,2013-11,saturday,9,1.0,0 days 02:52:00.000000000,0.119444,positive


In [25]:
df.to_csv('../processed/aac_cleaned.csv', index=False)