# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from missingno import matrix
from fastai.tabular.core import add_datepart
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans, AffinityPropagation, SpectralClustering
from kmodes.kprototypes import KPrototypes
from scipy.cluster.hierarchy import dendrogram, linkage
import gower
import warnings
warnings.filterwarnings('ignore')

# Data pull

In [3]:
# Load the shelter export. sep=None together with the python engine makes
# pandas detect the field delimiter instead of assuming a comma.
df = pd.read_csv("longbeach_animal_shelter_complete.csv", sep=None, engine='python')

# Data transformation

## Removing whitespaces

In [None]:
# Clean the header names. The byte-order mark is removed first: it does not
# count as whitespace for strip(), so a BOM sitting in front of padding would
# otherwise leave that padding in the column name.
df.columns = df.columns.str.replace('\ufeff', '', regex=False).str.strip()

## Date time conversion

In [None]:
# Parse *dob*, *intake_date* and *outcome_date* as datetimes;
# values that cannot be parsed are coerced to NaT
for date_col in ('dob', 'intake_date', 'outcome_date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

## Adding time to outcome variable

In [None]:
# Time from intake to outcome. Subtracting two datetime columns yields a
# timedelta column, not an integer number of days.
# NOTE(review): append .dt.days if a numeric day count is expected downstream.
df['time_to_outcome'] = df['outcome_date'] - df['intake_date']

## Dealing with duplicates

In this context, duplicate animal IDs reflect returning animals. This analysis keeps a running visit count for each animal, records the time between visits, and flags animals that are on their third or later visit.

In [None]:
# Order rows by animal and then by intake date; the per-animal cumcount, diff
# and shift computed in the following cells depend on this row order.
df = df.sort_values(['animal_id', 'intake_date'])

In [None]:
# Number each animal's rows 1, 2, 3, ... in the current row order
# (cumcount is zero-based, hence the + 1)
df['visit_count'] = df.groupby('animal_id').cumcount() + 1

In [None]:
# Binary flags derived from the visit number
df['is_return_visit'] = df['visit_count'].gt(1).astype(int)        # any visit after the first
df['is_frequent_returner'] = df['visit_count'].gt(2).astype(int)   # third visit or later

In [None]:
# Gap in days to the same animal's previous row; -1 where there is no gap to
# compute (e.g. the animal's first visit)
df['days_since_last_visit'] = (
    df.groupby('animal_id')['intake_date']
    .diff()
    .dt.days
    .fillna(-1)
)

# Outcome recorded on the same animal's previous row; 'First Visit' where none exists
df['previous_outcome_type'] = (
    df.groupby('animal_id')['outcome_type']
    .shift(1)
    .fillna('First Visit')
)

## Date of birth imputation

In [None]:
def impute_dob_statistical(df):
    """Impute missing DOBs from the median intake age of similar animals.

    For every row whose ``dob`` is missing, donors (rows whose ``dob`` was
    known on entry) are looked up at progressively broader levels:

    1. same ``animal_type``, ``intake_type`` and ``sex``
    2. same ``animal_type`` and ``intake_type``
    3. same ``animal_type``

    A level is accepted as soon as it offers at least 5 donors; the broadest
    level is used with however many donors it has. The imputed DOB is the
    row's ``intake_date`` minus the donors' median age at intake (whole days).

    Args:
        df: DataFrame with ``dob`` and ``intake_date`` datetime columns and
            ``animal_type``, ``intake_type`` and ``sex`` columns.

    Returns:
        A copy of ``df`` with ``dob`` filled where a donor median was available
        and with an ``age_at_intake_days`` column populated for the rows whose
        DOB was known on entry. The input frame is not modified.
    """
    df_imputed = df.copy()

    # Age at intake can only be computed where the DOB is known.
    known_mask = df_imputed['dob'].notna()
    if 'age_at_intake_days' not in df_imputed.columns:
        df_imputed['age_at_intake_days'] = float('nan')
    if known_mask.any():
        df_imputed.loc[known_mask, 'age_at_intake_days'] = (
            df_imputed.loc[known_mask, 'intake_date'] - df_imputed.loc[known_mask, 'dob']
        ).dt.days

    # Donor pool is frozen before any imputation so that freshly imputed rows
    # never count as donors and the result does not depend on row order.
    grouping_levels = (
        ('animal_type', 'intake_type', 'sex'),
        ('animal_type', 'intake_type'),
        ('animal_type',),
    )
    min_donors = 5
    donors = df_imputed.loc[
        known_mask, ['animal_type', 'intake_type', 'sex', 'age_at_intake_days']
    ]

    def _median_age_for(traits):
        """Median donor age for the narrowest level with enough donors (NaN if none)."""
        pool = donors.iloc[0:0]
        for columns in grouping_levels:
            matches = pd.Series(True, index=donors.index)
            for column in columns:
                # A missing trait compares unequal to everything, so it never matches.
                matches &= donors[column] == traits[column]
            pool = donors[matches]
            if len(pool) >= min_donors:
                break
        return pool['age_at_intake_days'].median()

    # Rows sharing the same traits share the same median, so compute it once.
    median_cache = {}
    missing_mask = df_imputed['dob'].isna()
    imputed_count = 0

    for idx in df_imputed[missing_mask].index:
        animal_info = df_imputed.loc[idx]
        traits = {column: animal_info[column] for column in grouping_levels[0]}
        cache_key = tuple(traits.values())
        if cache_key not in median_cache:
            median_cache[cache_key] = _median_age_for(traits)
        median_age_days = median_cache[cache_key]

        intake_date = animal_info['intake_date']
        # Without a donor median or an intake date there is nothing to anchor the DOB to.
        if pd.notna(median_age_days) and pd.notna(intake_date):
            df_imputed.loc[idx, 'dob'] = intake_date - pd.Timedelta(days=int(median_age_days))
            imputed_count += 1

    print(f"Strategy 1: Imputed DOB for {imputed_count} animals using statistical method")
    return df_imputed

def impute_dob_domain_knowledge(df, random_state=None):
    """Impute missing DOBs by drawing a plausible age from shelter heuristics.

    For each row with a missing ``dob`` an age in days is drawn from a normal
    distribution chosen by ``intake_type`` (and ``animal_type`` /
    ``intake_cond`` where relevant), clamped to the range 1..5475 days
    (about 15 years), and subtracted from the row's ``intake_date``. Rows
    without an ``intake_date`` are left untouched because there is no date to
    anchor the DOB to.

    Args:
        df: DataFrame with ``dob``, ``intake_date``, ``intake_type``,
            ``animal_type`` and ``intake_cond`` columns.
        random_state: Optional seed. ``None`` (default) draws from numpy's
            global random state; any other value seeds a private generator
            so the imputation is reproducible.

    Returns:
        A copy of ``df`` with ``dob`` filled in; the input is not modified.
    """
    df_imputed = df.copy()
    max_age_days = 5475  # Max 15 years

    # Global numpy RNG by default; a private seeded generator on request.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def _age_distribution(animal_info):
        """Return (mean, std) of the age in days assumed for this intake profile."""
        intake_type = animal_info['intake_type']
        is_cat = animal_info['animal_type'] == 'CAT'

        if intake_type == 'WILDLIFE':
            # Wildlife often comes in as injured adults or orphaned babies;
            # a weight-related intake condition is taken as a sign of a baby.
            if 'WEIGHT' in str(animal_info['intake_cond']).upper():
                return 30, 15        # 1 month ± 2 weeks
            return 365, 180          # 1 year ± 6 months

        if intake_type == 'STRAY':
            # Strays are often young adults who got lost.
            # Cats: 1.5 years ± 1 year; every other species: 2 years ± 1 year.
            return (548, 365) if is_cat else (730, 365)

        if intake_type == 'OWNER SURRENDER':
            # Owner surrenders are often older animals.
            # Cats: 3 years ± 2 years; every other species: 4 years ± 3 years.
            return (1095, 730) if is_cat else (1460, 1095)

        # Default for all other intake types
        return 365, 180              # 1 year ± 6 months

    missing_mask = df_imputed['dob'].isna()
    imputed_count = 0

    for idx in df_imputed[missing_mask].index:
        animal_info = df_imputed.loc[idx]
        intake_date = animal_info['intake_date']
        if pd.isna(intake_date):
            continue

        mean_days, std_days = _age_distribution(animal_info)
        estimated_age_days = rng.normal(mean_days, std_days)

        # Clamp: at least one day old (never born on/after intake), at most 15 years.
        estimated_age_days = min(max(1, estimated_age_days), max_age_days)

        # Whole days only; int() truncates the fractional part of the draw.
        df_imputed.loc[idx, 'dob'] = intake_date - pd.Timedelta(days=int(estimated_age_days))
        imputed_count += 1

    print(f"Strategy 2: Imputed DOB for {imputed_count} animals using domain knowledge")
    return df_imputed

def impute_dob_hybrid(df):
    """Run the statistical imputation, then the domain-knowledge fallback.

    The statistical pass is applied to a copy of ``df``. If any ``dob`` values
    are still missing afterwards, the domain-knowledge pass is applied to the
    result. Finally ``age_at_intake_days`` is recomputed for every row as
    ``intake_date - dob`` in whole days.

    Returns the imputed copy; ``df`` itself is left unchanged.
    """
    result = df.copy()

    print("Hybrid DOB Imputation Strategy:")
    print("=" * 40)

    # Pass 1: median age of similar animals
    result = impute_dob_statistical(result)

    # Pass 2: heuristic ages, only when pass 1 left gaps
    still_missing = result['dob'].isna().sum()
    if still_missing > 0:
        print(f"Applying domain knowledge to {still_missing} remaining missing DOBs")
        result = impute_dob_domain_knowledge(result)

    # Age at intake for every row, now that DOBs have been filled in
    age_at_intake = result['intake_date'] - result['dob']
    result['age_at_intake_days'] = age_at_intake.dt.days

    return result

In [None]:
# Fill missing DOBs (statistical pass, then domain-knowledge pass) and
# compute age_at_intake_days for every row
df = impute_dob_hybrid(df)

## Deleting values where DOB is after intake date
This measure is undertaken to ensure logical consistency.

In [None]:
# Drop only the rows where the DOB is strictly after the intake date.
# A comparison involving a missing date evaluates to False, so rows with a
# missing dob or intake_date are kept.
df = df[~(df['dob'] > df['intake_date'])]

## Replace null values of secondary color to 'None'

This step can help classify animals with more colours/patterns.

In [None]:
# Use the literal string 'None' as the placeholder for a missing secondary
# colour; the has_secondary_color flag built later compares against this value.
df['secondary_color'] = df['secondary_color'].fillna('None')

## Dropping columns *reason*, *geopoint*, *was_outcome_alive*, *animal_id*

These features either convey no useful information or contain too many null values to impute.

In [None]:
# columns= already identifies the axis, so no axis argument is needed
df = df.drop(
    columns=[
        'reason',
        'geopoint',
        'was_outcome_alive',
        'animal_id',
    ]
)

## Making a binary column for name of animal

A binary variable with a name indicator is better for classification ML models instead of the name.

In [None]:
# has_name is 1 when animal_name is present and 0 when it is missing
df['has_name'] = df['animal_name'].notna().astype(int)

## Dropping animal_name column

In [None]:
# The raw name is no longer needed once has_name has been derived from it
df = df.drop(columns=['animal_name'])

## Dropping null values for outcome_type & intake_subtype

The target variable nulls are small in number and would be better off excluded from the analysis. 

Knowing more about the intake reason will help the analysis become robust.

In [None]:
# Keep only the rows where both intake_subtype and outcome_type are present
df = df[df[['intake_subtype', 'outcome_type']].notna().all(axis=1)]

## Dropping the *crossing* feature

In [None]:
# Remove the crossing column from the working frame
df = df.drop(columns=['crossing'])

## Dropping *intake_is_dead* column

Can drop this column since no information is being conveyed through only 1 possible outcome here.

In [None]:
# columns= already identifies the axis, so no axis argument is needed
df = df.drop(columns='intake_is_dead')

## Intake month feature creation

In [None]:
# Calendar month (1-12) of the intake date.
# NOTE(review): this column is dropped again in the "Dropping redundant date
# columns" cell below without being used in between — confirm it is needed.
df['intake_month'] = df['intake_date'].dt.month

## Adding *is_fixed* column

In [None]:
# 1 when the sex value mentions "Spayed" or "Neutered" (case-insensitive),
# 0 otherwise; missing sex values count as not fixed (na=False)
df['is_fixed'] = df['sex'].str.contains('Spayed|Neutered', case=False, na=False).astype(int)

## Making changes to the *sex* column

Since the sex column also encodes the fix status, we reduce it to just male, female, and unknown. We do that by mapping "Neutered" to male and "Spayed" to female.

In [None]:
# Collapse the fix status back into the biological sex. The dict form of
# replace() matches whole cell values only, so just the exact values
# 'Neutered' and 'Spayed' are rewritten.
# NOTE(review): assumes the column holds exactly these spellings — confirm
# with df['sex'].unique().
df['sex'] = df['sex'].replace({'Neutered': 'Male', 'Spayed': 'Female'})

## Datetime feature engineering using fastai

In [None]:

# For each date column: re-parse it as timezone-aware UTC, then let fastai's
# add_datepart explode it into date‐part columns (+ an “Elapsed” column);
# drop=True removes the original datetime column afterwards.
for date_col in ('dob', 'intake_date', 'outcome_date'):
    df[date_col] = pd.to_datetime(df[date_col], utc=True)
    add_datepart(df, date_col, drop=True)

## Dropping redundant date columns

Some date columns created above don't add meaningful information to the analysis and are therefore dropped.

In [None]:
# The same ten generated date parts are removed for each of the three date
# prefixes, together with the manually created intake_month column.
df = df.drop(
    columns=['intake_month'] + [
        f"{prefix}{part}"
        for prefix in ('intake_', 'outcome_', 'dob')
        for part in (
            'Year', 'Week', 'Day', 'Dayofyear',
            'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start',
        )
    ]
)

## Dealing with features that have high cardinality

### *primary_color*

In [None]:
def extract_base_color_and_pattern(color):
    """Reduce a raw colour label to a coarse base colour plus a pattern flag.

    The label is stringified, upper-cased and stripped. The pattern flag is 1
    when the label contains any pattern keyword. The base colour is
    'Multicolor' for a fixed set of exact labels; otherwise it is the colour
    of the first rule (in priority order) with a fragment contained in the
    label, or 'Other' when no rule matches.

    Returns:
        (base_color, has_pattern) where has_pattern is 0 or 1.
    """
    label = str(color).upper().strip()

    # Pattern detection is independent of the base colour
    pattern_flag = 0
    for marker in ('TABBY', 'BRINDLE', 'MERLE', 'PT', 'POINT', 'SMOKE', 'TIGER', 'LYNX'):
        if marker in label:
            pattern_flag = 1
            break

    # These labels must match exactly, not as substrings
    if label in ('TRICOLOR', 'CALICO', 'CALICO DIL', 'CALICO PT', 'CALICO TAB'):
        return 'Multicolor', pattern_flag

    # Ordered rules: the first rule with any fragment found in the label wins.
    # Abbreviations written with a trailing space only match in front of
    # another word, because the label itself has been stripped.
    color_rules = (
        (('BLACK', 'BLK'), 'Black'),
        (('WHITE',), 'White'),
        (('BROWN', 'BRN', 'BR '), 'Brown'),
        (('GRAY', 'GREY'), 'Gray'),
        (('RED', 'RD '), 'Red'),
        (('BLUE', 'BL ', 'BC '), 'Blue'),
        (('CREAM', 'CRM', 'CR '), 'Cream'),
        (('TAN',), 'Tan'),
        (('YELLOW',), 'Yellow'),
        (('ORANGE', 'ORG'), 'Orange'),
        (('GOLD',), 'Gold'),
        (('SILVER', 'SLVR', 'SL '), 'Silver'),
        (('CHOCOLATE', 'CHOC', 'CH '), 'Chocolate'),
        (('BUFF',), 'Buff'),
        (('BLONDE',), 'Blonde'),
        (('APRICOT',), 'Apricot'),
        (('TORTIE', 'TORBI'), 'Tortoiseshell'),
        (('SABLE',), 'Sable'),
        (('WHEAT',), 'Wheat'),
        (('FAWN',), 'Fawn'),
        (('SEAL',), 'Seal'),
        (('LILAC',), 'Lilac'),
        (('LIVER',), 'Liver'),
        (('PINK',), 'Pink'),
        (('GREEN',), 'Green'),
        (('FLAME',), 'Flame'),
        (('PEACH',), 'Peach'),
        (('RUDDY',), 'Ruddy'),
        (('DAPPLE',), 'Dapple'),
        (('SNOWSHOE',), 'Snowshoe'),
        (('TICK',), 'Tick'),
        (('UNKNOWN',), 'Unknown'),
    )
    for fragments, base_color in color_rules:
        if any(fragment in label for fragment in fragments):
            return base_color, pattern_flag

    return 'Other', pattern_flag

# Derive (base colour, pattern flag) for every row, then split the pairs
# into two columns
print("Extracting base colors and patterns...")
primary_color_info = df['primary_color'].apply(extract_base_color_and_pattern)
df['primary_base_color'] = [base for base, _flag in primary_color_info]
df['has_pattern'] = [flag for _base, flag in primary_color_info]

# Summary of the new columns
base_color_cardinality = df['primary_base_color'].nunique()
print("Primary color transformation results:")
print(f"primary_base_color unique values: {base_color_cardinality}")
print("Base color distribution:")
print(df['primary_base_color'].value_counts().head(10))
print("\nPattern distribution:")
print(df['has_pattern'].value_counts())

### *secondary_color*

In [None]:
# 1 when the secondary colour is anything other than the 'None' placeholder
df['has_secondary_color'] = df['secondary_color'].ne('None').astype(int)

secondary_share = df['has_secondary_color'].mean() * 100
print("Secondary color transformation results:")
print("has_secondary_color distribution:")
print(df['has_secondary_color'].value_counts())
print(f"Percentage with secondary color: {secondary_share:.1f}%")

# Banner announcing the next transformation step
print("\n" + "=" * 60)
print("TRANSFORMING ANIMAL TYPE")
print("=" * 60)

### *animal_type*

In [None]:
def categorize_animal_type(animal_type):
    """Collapse an animal type into 'Cat', 'Dog' or 'Other'.

    Matching is done on the stringified, upper-cased, stripped value;
    anything that is not exactly CAT or DOG is reported as 'Other'.
    """
    normalized = str(animal_type).upper().strip()
    named_groups = {'CAT': 'Cat', 'DOG': 'Dog'}
    # Every remaining type (rabbit, reptile, bird, ...) shares one bucket
    return named_groups.get(normalized, 'Other')

# Add the grouped animal type next to the original column
df['animal_type_grouped'] = df['animal_type'].apply(categorize_animal_type)

# Show how each original type maps onto the groups, with totals
type_crosstab = pd.crosstab(
    df['animal_type'],
    df['animal_type_grouped'],
    margins=True,
)
print("Animal type transformation results:")
print("Original vs. Grouped:")
print(type_crosstab)

group_sizes = df['animal_type_grouped'].value_counts()
print("\nGrouped animal type distribution:")
print(group_sizes)

### *outcome_type*

In [None]:
def categorize_outcome_type(outcome_type):
    """Collapse an outcome type into five named categories plus 'Other'.

    Matching is done on the stringified, upper-cased, stripped value.
    """
    normalized = str(outcome_type).upper().strip()
    named_outcomes = {
        'RESCUE': 'Rescue',
        'ADOPTION': 'Adoption',
        'EUTHANASIA': 'Euthanasia',
        'TRANSFER': 'Transfer',
        'RETURN TO OWNER': 'Return to Owner',
    }
    # Any outcome not listed above falls into the catch-all bucket
    return named_outcomes.get(normalized, 'Other')

# Add the grouped outcome next to the original column
df['outcome_type_grouped'] = df['outcome_type'].apply(categorize_outcome_type)

# Report which group each original outcome landed in and how many rows it has
print("Outcome type transformation results:")
print("Original vs. Grouped mapping:")
original_counts = df['outcome_type'].value_counts()
outcome_mapping = df.groupby('outcome_type')['outcome_type_grouped'].first().sort_values()
for original, grouped in outcome_mapping.items():
    print(f"  {original} → {grouped} ({original_counts[original]:,} records)")

# Size and share of each group
print("\nGrouped outcome type distribution:")
total_rows = len(df)
grouped_counts = df['outcome_type_grouped'].value_counts()
for outcome, count in grouped_counts.items():
    print(f"  {outcome}: {count:,} ({(count / total_rows) * 100:.1f}%)")

# ML Models

## Tree-based

## Classification