In [1]:
import pandas as pd
import numpy as np
from pydataset import data
import os
import env
import acquire as a
import prepare as p
import wrangle as w
from sklearn.model_selection import train_test_split

#Stats imports
from scipy import stats
from scipy.stats import pearsonr, spearmanr, ttest_1samp, ttest_ind, chi2_contingency
import math


#visual imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Scalers
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# modeling method
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

import requests


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Intakes csv
intake_df = pd.read_csv('aac_intakes.csv')
intake_df.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A665644,,10/21/2013 07:59:00 AM,October 2013,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico
1,A665739,*Alana,10/22/2013 11:11:00 AM,October 2013,Austin (TX),Stray,Normal,Cat,Intact Female,1 month,Domestic Medium Hair Mix,Black
2,A665763,,10/22/2013 03:10:00 PM,October 2013,E Riverside Dr/Royal Crest Dr in Austin (TX),Stray,Normal,Dog,Intact Male,4 months,Cairn Terrier Mix,Tan/White
3,A379998,Disciple,10/23/2013 11:42:00 AM,October 2013,51St And Grover in Austin (TX),Stray,Normal,Dog,Intact Male,10 years,Pit Bull,Black
4,A634503,Otter,10/01/2013 02:49:00 PM,October 2013,Manor (TX),Owner Surrender,Normal,Dog,Spayed Female,2 years,Norfolk Terrier Mix,Tan


In [3]:
# Outcomes csv
outcome_df = pd.read_csv('aac_outcomes.csv')
outcome_df.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,02/13/2016 05:59:00 PM,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,,03/18/2014 11:47:00 AM,Mar 2014,03/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


In [4]:
# Intake/Outcome shape
intake_df.shape, outcome_df.shape

((148133, 12), (148339, 12))

In [5]:
intake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148133 entries, 0 to 148132
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Animal ID         148133 non-null  object
 1   Name              105132 non-null  object
 2   DateTime          148133 non-null  object
 3   MonthYear         148133 non-null  object
 4   Found Location    148133 non-null  object
 5   Intake Type       148133 non-null  object
 6   Intake Condition  148133 non-null  object
 7   Animal Type       148133 non-null  object
 8   Sex upon Intake   148131 non-null  object
 9   Age upon Intake   148132 non-null  object
 10  Breed             148133 non-null  object
 11  Color             148133 non-null  object
dtypes: object(12)
memory usage: 13.6+ MB


In [6]:
outcome_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148339 entries, 0 to 148338
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Animal ID         148339 non-null  object
 1   Name              105360 non-null  object
 2   DateTime          148339 non-null  object
 3   MonthYear         148339 non-null  object
 4   Date of Birth     148339 non-null  object
 5   Outcome Type      148315 non-null  object
 6   Outcome Subtype   67903 non-null   object
 7   Animal Type       148339 non-null  object
 8   Sex upon Outcome  148336 non-null  object
 9   Age upon Outcome  148279 non-null  object
 10  Breed             148339 non-null  object
 11  Color             148339 non-null  object
dtypes: object(12)
memory usage: 13.6+ MB


In [7]:
# Normalize columns
outcome_df.columns = outcome_df.columns.str.replace(' ','_')
intake_df.columns = intake_df.columns.str.replace(' ','_')

# Lower-case columns
outcome_df.columns = outcome_df.columns.str.lower()
intake_df.columns = intake_df.columns.str.lower()

**Outcome Type is the TARGET**

In [8]:
# Outcome Type Value Counts
outcome_df.outcome_type.value_counts()

Adoption           69093
Transfer           42695
Return to Owner    23808
Euthanasia          9572
Died                1384
Rto-Adopt            993
Disposal             667
Missing               75
Relocate              26
Stolen                 2
Name: outcome_type, dtype: int64

In [9]:
# Intake nulls
intake_df.isna().sum()

animal_id               0
name                43001
datetime                0
monthyear               0
found_location          0
intake_type             0
intake_condition        0
animal_type             0
sex_upon_intake         2
age_upon_intake         1
breed                   0
color                   0
dtype: int64

In [10]:
# Outcome nulls
outcome_df.isna().sum()

animal_id               0
name                42979
datetime                0
monthyear               0
date_of_birth           0
outcome_type           24
outcome_subtype     80436
animal_type             0
sex_upon_outcome        3
age_upon_outcome       60
breed                   0
color                   0
dtype: int64

In [11]:
outcome_df.outcome_subtype.value_counts()

Partner                35333
Foster                 13888
Rabies Risk             4249
Suffering               3682
SCRP                    3211
Snr                     3194
Out State                931
In Kennel                723
Aggressive               584
Offsite                  493
In Foster                349
Medical                  339
At Vet                   311
Behavior                 161
Field                    153
Enroute                   94
Court/Investigation       52
Underage                  37
In Surgery                30
Customer S                18
Possible Theft            16
Prc                       13
Emergency                 12
Barn                      12
In State                  12
Emer                       6
Name: outcome_subtype, dtype: int64

**Drop Outcome Subtype. Only interested in actual outcomes and more than half of data is nulls**

In [12]:
# dropping subtype
outcome_df = outcome_df.drop(['outcome_subtype'],axis=1)
outcome_df.head(3)

Unnamed: 0,animal_id,name,datetime,monthyear,date_of_birth,outcome_type,animal_type,sex_upon_outcome,age_upon_outcome,breed,color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,Other,Unknown,1 year,Raccoon,Gray


In [13]:
# Drop name columns
intake_df = intake_df.drop('name',axis=1)
outcome_df = outcome_df.drop('name',axis=1)

In [14]:
# Merged Data frames
combined = intake_df.merge(outcome_df, on='animal_id')
combined.head(3)

Unnamed: 0,animal_id,datetime_x,monthyear_x,found_location,intake_type,intake_condition,animal_type_x,sex_upon_intake,age_upon_intake,breed_x,color_x,datetime_y,monthyear_y,date_of_birth,outcome_type,animal_type_y,sex_upon_outcome,age_upon_outcome,breed_y,color_y
0,A665644,10/21/2013 07:59:00 AM,October 2013,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico,10/21/2013 11:39:00 AM,Oct 2013,09/21/2013,Transfer,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico
1,A665739,10/22/2013 11:11:00 AM,October 2013,Austin (TX),Stray,Normal,Cat,Intact Female,1 month,Domestic Medium Hair Mix,Black,12/20/2013 05:36:00 PM,Dec 2013,09/16/2013,Adoption,Cat,Spayed Female,3 months,Domestic Medium Hair Mix,Black
2,A665763,10/22/2013 03:10:00 PM,October 2013,E Riverside Dr/Royal Crest Dr in Austin (TX),Stray,Normal,Dog,Intact Male,4 months,Cairn Terrier Mix,Tan/White,10/26/2013 06:32:00 PM,Oct 2013,06/22/2013,Adoption,Dog,Neutered Male,4 months,Cairn Terrier Mix,Tan/White


In [15]:
combined.shape

(190609, 20)

In [16]:
# Check if ids match
intake_df[intake_df['animal_id'] == 'A665644']

Unnamed: 0,animal_id,datetime,monthyear,found_location,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,color
0,A665644,10/21/2013 07:59:00 AM,October 2013,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico


In [17]:
# Check if ids match
outcome_df[outcome_df['animal_id'] == 'A665644']

Unnamed: 0,animal_id,datetime,monthyear,date_of_birth,outcome_type,animal_type,sex_upon_outcome,age_upon_outcome,breed,color
58227,A665644,10/21/2013 11:39:00 AM,Oct 2013,09/21/2013,Transfer,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico


In [18]:
# Rename columns
combined = combined.rename(columns={'datetime_x': 'date_intake', 'breed_y':'breed',
                        'datetime_y':'date_outcome','animal_type_x':'animal_type','color_y':'color'})
combined.head(3)

Unnamed: 0,animal_id,date_intake,monthyear_x,found_location,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed_x,color_x,date_outcome,monthyear_y,date_of_birth,outcome_type,animal_type_y,sex_upon_outcome,age_upon_outcome,breed,color
0,A665644,10/21/2013 07:59:00 AM,October 2013,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico,10/21/2013 11:39:00 AM,Oct 2013,09/21/2013,Transfer,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico
1,A665739,10/22/2013 11:11:00 AM,October 2013,Austin (TX),Stray,Normal,Cat,Intact Female,1 month,Domestic Medium Hair Mix,Black,12/20/2013 05:36:00 PM,Dec 2013,09/16/2013,Adoption,Cat,Spayed Female,3 months,Domestic Medium Hair Mix,Black
2,A665763,10/22/2013 03:10:00 PM,October 2013,E Riverside Dr/Royal Crest Dr in Austin (TX),Stray,Normal,Dog,Intact Male,4 months,Cairn Terrier Mix,Tan/White,10/26/2013 06:32:00 PM,Oct 2013,06/22/2013,Adoption,Dog,Neutered Male,4 months,Cairn Terrier Mix,Tan/White


In [19]:
# Drop columns: breed_x, animal_type_y, monthyear columns, found_location
combined = combined.drop(['breed_x','animal_type_y','color_x', 'monthyear_x','monthyear_y', 'found_location'], axis=1)
combined.head(3)

Unnamed: 0,animal_id,date_intake,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,date_outcome,date_of_birth,outcome_type,sex_upon_outcome,age_upon_outcome,breed,color
0,A665644,10/21/2013 07:59:00 AM,Stray,Sick,Cat,Intact Female,4 weeks,10/21/2013 11:39:00 AM,09/21/2013,Transfer,Intact Female,4 weeks,Domestic Shorthair Mix,Calico
1,A665739,10/22/2013 11:11:00 AM,Stray,Normal,Cat,Intact Female,1 month,12/20/2013 05:36:00 PM,09/16/2013,Adoption,Spayed Female,3 months,Domestic Medium Hair Mix,Black
2,A665763,10/22/2013 03:10:00 PM,Stray,Normal,Dog,Intact Male,4 months,10/26/2013 06:32:00 PM,06/22/2013,Adoption,Neutered Male,4 months,Cairn Terrier Mix,Tan/White


In [20]:
combined.isna().sum()

animal_id            0
date_intake          0
intake_type          0
intake_condition     0
animal_type          0
sex_upon_intake      2
age_upon_intake      1
date_outcome         0
date_of_birth        0
outcome_type        35
sex_upon_outcome     6
age_upon_outcome    63
breed                0
color                0
dtype: int64

In [21]:
# drop nulls, only 107 out of 190k
combined = combined.dropna()
combined.shape

(190511, 14)

In [22]:
combined.animal_type.unique()

array(['Cat', 'Dog', 'Other', 'Bird', 'Livestock'], dtype=object)

In [23]:
# covert date into datetime dtype
combined['date_intake'] = pd.to_datetime(combined.date_intake, infer_datetime_format=True)
combined['date_outcome'] = pd.to_datetime(combined.date_outcome, infer_datetime_format=True)

In [24]:
# Split combined to only be for cats and dogs.
combined_cat = combined[combined['animal_type'] == 'Cat' ]
combined_dog = combined[combined['animal_type'] == 'Dog' ]
combined_cat.shape, combined_dog.shape

((62056, 14), (119868, 14))

In [25]:
combined.outcome_type.unique()

array(['Transfer', 'Adoption', 'Return to Owner', 'Euthanasia',
       'Disposal', 'Died', 'Missing', 'Rto-Adopt', 'Relocate', 'Stolen'],
      dtype=object)

In [26]:
# Value counts Cat
combined_cat.outcome_type.value_counts()

Adoption           31463
Transfer           23969
Return to Owner     3103
Euthanasia          2294
Died                 832
Rto-Adopt            246
Disposal             102
Missing               42
Relocate               5
Name: outcome_type, dtype: int64

In [27]:
# Value counts Dog
combined_dog.outcome_type.value_counts()

Adoption           57470
Return to Owner    36689
Transfer           21335
Euthanasia          2304
Rto-Adopt           1585
Died                 356
Disposal              66
Missing               55
Stolen                 8
Name: outcome_type, dtype: int64

In [28]:
combined.head()

Unnamed: 0,animal_id,date_intake,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,date_outcome,date_of_birth,outcome_type,sex_upon_outcome,age_upon_outcome,breed,color
0,A665644,2013-10-21 07:59:00,Stray,Sick,Cat,Intact Female,4 weeks,2013-10-21 11:39:00,09/21/2013,Transfer,Intact Female,4 weeks,Domestic Shorthair Mix,Calico
1,A665739,2013-10-22 11:11:00,Stray,Normal,Cat,Intact Female,1 month,2013-12-20 05:36:00,09/16/2013,Adoption,Spayed Female,3 months,Domestic Medium Hair Mix,Black
2,A665763,2013-10-22 03:10:00,Stray,Normal,Dog,Intact Male,4 months,2013-10-26 06:32:00,06/22/2013,Adoption,Neutered Male,4 months,Cairn Terrier Mix,Tan/White
3,A379998,2013-10-23 11:42:00,Stray,Normal,Dog,Intact Male,10 years,2013-10-25 12:53:00,02/21/2003,Return to Owner,Intact Male,10 years,Pit Bull,Black
4,A634503,2013-10-01 02:49:00,Owner Surrender,Normal,Dog,Spayed Female,2 years,2013-10-02 12:40:00,08/11/2011,Adoption,Spayed Female,2 years,Norfolk Terrier Mix,Tan
