# Libraries

In [1]:
import pandas as pd
import numpy as np
import re

# Data

## Intake data

### Data url

In [2]:
intake_url = "https://data.austintexas.gov/resource/wter-evkm.csv?$limit=500000"

### Data pull

In [3]:
df_intakes = pd.read_csv(intake_url)

### Data preview

In [4]:
df_intakes.head()

Unnamed: 0,animal_id,name,datetime,datetime2,found_location,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,color
0,A521520,Nina,2013-10-01T07:51:00.000,2013-10-01T07:51:00.000,Norht Ec in Austin (TX),Stray,Normal,Dog,Spayed Female,7 years,Border Terrier/Border Collie,White/Tan
1,A664235,,2013-10-01T08:33:00.000,2013-10-01T08:33:00.000,Abia in Austin (TX),Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White
2,A664236,,2013-10-01T08:33:00.000,2013-10-01T08:33:00.000,Abia in Austin (TX),Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White
3,A664237,,2013-10-01T08:33:00.000,2013-10-01T08:33:00.000,Abia in Austin (TX),Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White
4,A664233,Stevie,2013-10-01T08:53:00.000,2013-10-01T08:53:00.000,7405 Springtime in Austin (TX),Stray,Injured,Dog,Intact Female,3 years,Pit Bull Mix,Blue/White


## Outcome data

### Data url

In [5]:
outcome_url = "https://data.austintexas.gov/resource/9t4d-g238.csv?$limit=500000"

### Data pull

In [6]:
df_outcomes = pd.read_csv(outcome_url)

### Data preview

In [7]:
df_outcomes.head(20)

Unnamed: 0,animal_id,date_of_birth,name,datetime,monthyear,outcome_type,outcome_subtype,animal_type,sex_upon_outcome,age_upon_outcome,breed,color
0,A668305,2012-12-01,,2013-12-02T00:00:00-05:00,12-2013,Transfer,Partner,Other,Unknown,1 year,Turtle Mix,Brown/Yellow
1,A673335,2012-02-22,,2014-02-22T00:00:00-05:00,02-2014,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon,Black/Gray
2,A675999,2013-04-03,,2014-04-07T00:00:00-05:00,04-2014,Transfer,Partner,Other,Unknown,1 year,Turtle Mix,Green
3,A679066,2014-04-16,,2014-05-16T00:00:00-05:00,05-2014,,,Other,Unknown,4 weeks,Rabbit Sh,Brown
4,A680855,2014-05-25,,2014-06-10T00:00:00-05:00,06-2014,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
5,A680857,2014-05-25,,2014-06-10T00:00:00-05:00,06-2014,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
6,A680858,2014-05-25,,2014-06-10T00:00:00-05:00,06-2014,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
7,A680859,2014-05-25,,2014-06-10T00:00:00-05:00,06-2014,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
8,A680860,2014-05-25,,2014-06-10T00:00:00-05:00,06-2014,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
9,A680861,2014-05-25,,2014-06-10T00:00:00-05:00,06-2014,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black


# EDA

## Missing values in intakes

In [8]:
# Check for missing values in df_outcomes
missing_values = df_intakes.isnull().sum()
print("Missing values in df_intakes:\n", missing_values)

Missing values in df_intakes:
 animal_id               0
name                49991
datetime                0
datetime2               0
found_location          0
intake_type             0
intake_condition        0
animal_type             0
sex_upon_intake         1
age_upon_intake         0
breed                   0
color                   0
dtype: int64


## Missing values in outcomes

In [9]:
# Check for missing values in df_outcomes
missing_values = df_outcomes.isnull().sum()
print("Missing values in df_outcomes:\n", missing_values)

Missing values in df_outcomes:
 animal_id               0
date_of_birth           0
name                49784
datetime                0
monthyear               0
outcome_type           46
outcome_subtype     94115
animal_type             0
sex_upon_outcome        1
age_upon_outcome        9
breed                   0
color                   0
dtype: int64


## Unique values in color

In [10]:
df_intakes['color'].nunique()

661

In [11]:
df_outcomes['color'].nunique()

663

High cardinality exists here

## Unique values in found_location

In [12]:
df_intakes['found_location'].nunique()

70183

High cardinality exists here

## Unique values in breed

In [13]:
df_intakes['breed'].nunique()

3006

In [14]:
df_outcomes['breed'].nunique()

3010

High cardinality here too

# Preprocessing

## Fix datetime column

### Intakes

In [15]:
df_intakes['datetime'] = pd.to_datetime(df_intakes['datetime'])

In [16]:
df_intakes['monthyear'] = pd.to_datetime(df_intakes['datetime2']).dt.strftime('%m-%Y')

In [17]:
df_intakes = df_intakes.drop(columns=['datetime2'])

### Outcomes

In [18]:
df_outcomes['date_of_birth'] = pd.to_datetime(df_outcomes['date_of_birth'])
df_outcomes['monthyear'] = pd.to_datetime(df_outcomes['monthyear'], format='%m-%Y')

In [19]:
df_outcomes['datetime'] = pd.to_datetime(df_outcomes['datetime'], format='mixed', utc=True)

## Dropping null values from both dataframes

For **df_intakes**, missing names can be handled later in a binary variable, while, variables like *sex_upon_intake* will need to be omitted from the analysis.

For **df_outcomes**, missing values in age and sex will be omitted. 

In [20]:
df_intakes = df_intakes.dropna(subset=['sex_upon_intake'])

In [21]:
df_outcomes = df_outcomes.dropna(subset=['outcome_type', 'sex_upon_outcome', 'age_upon_outcome'])

## Age variable fix

In [22]:
def convert_age_to_days(age_str, name_str=None):
    """Convert age string to numeric days"""
    if pd.isna(age_str):
        return None
    
    age_str = age_str.lower()
    
    # Handle "0 years" case - check if newborn based on name containing "grams"
    if "0 year" in age_str:
        if name_str and "grams" in str(name_str).lower():
            return 0  # Newborn
        else:
            return None  # Missing age data
    
    total_days = 0
    
    # Find all number-unit pairs
    patterns = [
        (r'(\d+)\s*year', 365),
        (r'(\d+)\s*month', 30),
        (r'(\d+)\s*week', 7),
        (r'(\d+)\s*day', 1)
    ]
    
    for pattern, multiplier in patterns:
        matches = re.findall(pattern, age_str)
        for match in matches:
            total_days += int(match) * multiplier
    
    return total_days if total_days > 0 else None

def convert_age_to_years(age_str, name_str=None):
    """Convert age string to numeric years (decimal)"""
    days = convert_age_to_days(age_str, name_str)
    return days / 365 if days is not None else None

In [23]:
df_intakes['age_upon_intake_days'] = df_intakes.apply(lambda row: convert_age_to_days(row['age_upon_intake'], row.get('name')), axis=1)
df_intakes['age_upon_intake_years'] = df_intakes.apply(lambda row: convert_age_to_years(row['age_upon_intake'], row.get('name')), axis=1)

df_outcomes['age_upon_outcome_days'] = df_outcomes.apply(lambda row: convert_age_to_days(row['age_upon_outcome'], row.get('name')), axis=1)
df_outcomes['age_upon_outcome_years'] = df_outcomes.apply(lambda row: convert_age_to_years(row['age_upon_outcome'], row.get('name')), axis=1)


In [24]:
print("Sample conversions:")
test_ages = ["7 years", "2 weeks", "5 months", "10 days"]
for age in test_ages:
    print(f"{age} -> {convert_age_to_days(age)} days, {convert_age_to_years(age):.2f} years")

Sample conversions:
7 years -> 2555 days, 7.00 years
2 weeks -> 14 days, 0.04 years
5 months -> 150 days, 0.41 years
10 days -> 10 days, 0.03 years


In [25]:
df_outcomes.isna().sum()

animal_id                     0
date_of_birth                 0
name                      49759
datetime                      0
monthyear                     0
outcome_type                  0
outcome_subtype           94070
animal_type                   0
sex_upon_outcome              0
age_upon_outcome              0
breed                         0
color                         0
age_upon_outcome_days      2658
age_upon_outcome_years     2658
dtype: int64

## Breed variable fix

## Has name column

## Combining both dataframes into one

## Duplicate case fix

## Is fixed column

## Keeping only Dogs and Cats

## Fix found_location column to extract feature

## Create target variable **Is_Adopted**

## Drop *outcome_subtype* 