# Exploratory Data Analysis

In [1]:
import pandas as pd
url = "https://data.austintexas.gov/api/views/9t4d-g238/rows.csv?accessType=DOWNLOAD"

df = pd.read_csv(url)

In [2]:
df.head()

Unnamed: 0,Animal ID,Date of Birth,Name,DateTime,MonthYear,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A668305,2012-12-01,,2013-12-02T00:00:00-05:00,12-2013,Transfer,Partner,Other,Unknown,1 year,Turtle Mix,Brown/Yellow
1,A673335,2012-02-22,,2014-02-22T00:00:00-05:00,02-2014,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon,Black/Gray
2,A675999,2013-04-03,,2014-04-07T00:00:00-05:00,04-2014,Transfer,Partner,Other,Unknown,1 year,Turtle Mix,Green
3,A679066,2014-04-16,,2014-05-16T00:00:00-05:00,05-2014,,,Other,Unknown,4 weeks,Rabbit Sh,Brown
4,A680855,2014-05-25,,2014-06-10T00:00:00-05:00,06-2014,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173775 entries, 0 to 173774
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Animal ID         173775 non-null  object
 1   Date of Birth     173775 non-null  object
 2   Name              123991 non-null  object
 3   DateTime          173775 non-null  object
 4   MonthYear         173775 non-null  object
 5   Outcome Type      173729 non-null  object
 6   Outcome Subtype   79660 non-null   object
 7   Animal Type       173775 non-null  object
 8   Sex upon Outcome  173774 non-null  object
 9   Age upon Outcome  173766 non-null  object
 10  Breed             173775 non-null  object
 11  Color             173775 non-null  object
dtypes: object(12)
memory usage: 15.9+ MB


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173775 entries, 0 to 173774
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   animal_id         173775 non-null  object        
 1   date_of_birth     173775 non-null  datetime64[ns]
 2   name              123991 non-null  object        
 3   datetime          173775 non-null  object        
 4   month_year        173775 non-null  datetime64[ns]
 5   outcome_type      173729 non-null  category      
 6   outcome_subtype   173729 non-null  category      
 7   animal_type       173775 non-null  category      
 8   sex_upon_outcome  173774 non-null  category      
 9   age_upon_outcome  173766 non-null  object        
 10  breed             173775 non-null  object        
 11  color             173775 non-null  object        
dtypes: category(4), datetime64[ns](2), object(6)
memory usage: 11.3+ MB


### Nulls

In [7]:
round((df.isnull().mean() * 100), 2)

animal_id            0.00
date_of_birth        0.00
name                28.65
datetime             0.00
month_year           0.00
outcome_type         0.03
outcome_subtype      0.03
animal_type          0.00
sex_upon_outcome     0.00
age_upon_outcome     0.01
breed                0.00
color                0.00
dtype: float64

### Unique Values

In [11]:
df['Outcome Type'].unique()

Index(['Adoption', 'Died', 'Disposal', 'Euthanasia', 'Lost', 'Missing',
       'Relocate', 'Return to Owner', 'Rto-Adopt', 'Stolen', 'Transfer'],
      dtype='object')

In [13]:
df['Outcome Subtype'].unique()

Index(['Adoption', 'Died', 'Disposal', 'Euthanasia', 'Lost', 'Missing',
       'Relocate', 'Return to Owner', 'Rto-Adopt', 'Stolen', 'Transfer'],
      dtype='object')

In [14]:
df['Animal Type'].unique()

Index(['Bird', 'Cat', 'Dog', 'Livestock', 'Other'], dtype='object')

In [15]:
df['Sex upon Outcome'].unique()

Index(['Intact Female', 'Intact Male', 'Neutered Male', 'Spayed Female',
       'Unknown'],
      dtype='object')

In [None]:
 df.nunique()

### Duplicates

In [None]:
df.duplicated().sum()

### Breed & Color Columns

In [19]:
df['animal_type'].value_counts()

animal_type
Dog          94505
Cat          69399
Other         8960
Bird           877
Livestock       34
Name: count, dtype: int64

In [23]:
df[df['animal_type'] == "Livestock"]['breed'].unique()

array(['Pig Mix', 'Goat Mix', 'Miniature', 'Potbelly Pig Mix', 'Pig',
       'Sheep Mix', 'Pygmy', 'Goat', 'Potbelly Pig', 'Emu', 'Goat/Pygmy'],
      dtype=object)

In [24]:
df[df['animal_type'] == "Livestock"]['color'].unique()

array(['Pink', 'Brown', 'White', 'Liver/Cream', 'Black/White', 'Black',
       'Gray', 'Gray/Black', 'White/Black', 'Tan/Black', 'Tan',
       'Black/Brown', 'Tan/White', 'Tricolor/Brown'], dtype=object)

### Data Dictionary


| Column            | Data Type | Description |
|-------------------|-----------|-------------|
| animal_id         | Object| unique identifier for animal
| date_of_birth  | Object | date of birth of animal
| name        | Object |name of animal       
datetime         | Object| timestamp of outcome          
month_year       | Object| month and year of outcome             
outcome_type      | Object| type of outcome        
outcome_subtype      | Object| subtype of outcome   
animal_type        | Object| animal type          
sex_upon_outcome    | Object| sex and neutered status of animal         
age_upon_outcome   | Object| age of animal at time of outcome         
breed            | Object| breed of animal     
color               | Object| color of animal   

### Rename Columns and Change Datatypes

In [4]:
df.rename(columns={ "Animal ID": "animal_id",
        "Date of Birth": "date_of_birth",
        "Name": "name",
        "DateTime": "datetime",
        "MonthYear": "month_year",
        "Outcome Type": "outcome_type",
        "Outcome Subtype": "outcome_subtype",
        "Animal Type": "animal_type",
        "Sex upon Outcome": "sex_upon_outcome",
        "Age upon Outcome": "age_upon_outcome",
        "Breed": "breed",
        "Color": "color"}, inplace=True)

In [5]:
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df['month_year'] = pd.to_datetime(df['month_year'], format="%m-%Y")
df['outcome_type'] = df['outcome_type'].astype('category')
df['outcome_subtype'] = df['outcome_type'].astype('category')
df['animal_type'] = df['animal_type'].astype('category')
df['sex_upon_outcome'] = df['sex_upon_outcome'].astype('category')

## Cleaning