# Libraries

In [1]:
import requests
import pandas as pd
import json
from typing import Optional, Dict, Any, List

## No column display limit

In [2]:
pd.set_option('display.max_columns', None)

# Data extraction

## API setup

In [3]:
class LongBeachAnimalShelterAPI:
    """
    A Python client for extracting data from the Long Beach Animal Shelter API
    using the Opendatasoft Explore API v2.1
    """
    
    def __init__(self):
        self.base_url = "https://longbeach.opendatasoft.com/api/explore/v2.1"
        self.dataset_id = "animal-shelter-intakes-and-outcomes"
        
    def get_all_records(self, 
                       select: Optional[str] = None,
                       where: Optional[str] = None,
                       order_by: Optional[str] = None) -> pd.DataFrame:
        """
        Extract all records from the animal shelter dataset without limits.
        
        Args:
            select: Fields to select (default: all fields)
            where: Filter conditions using ODSQL syntax
            order_by: Order by clause
            
        Returns:
            pandas.DataFrame with all records
        """
        
        all_records = []
        offset = 0
        limit = 100  # Maximum allowed per request
        
        while True:
            # Build API request
            url = f"{self.base_url}/catalog/datasets/{self.dataset_id}/records"
            
            params = {
                'limit': limit,
                'offset': offset
            }
            
            # Add optional parameters
            if select:
                params['select'] = select
            if where:
                params['where'] = where
            if order_by:
                params['order_by'] = order_by
                
            try:
                response = requests.get(url, params=params)
                response.raise_for_status()
                
                data = response.json()
                
                # Extract records from response
                records = data.get('results', [])
                
                if not records:
                    break
                    
                all_records.extend(records)
                
                # Check if we've retrieved all records
                total_count = data.get('total_count', 0)
                if len(all_records) >= total_count:
                    break
                    
                # Move to next batch
                offset += limit
                
                print(f"Retrieved {len(all_records)} of {total_count} records...")
                
            except requests.RequestException as e:
                print(f"Error fetching data: {e}")
                break
                
        # Convert to DataFrame
        df = pd.DataFrame(all_records)
        print(f"Successfully retrieved {len(df)} total records")
        
        return df
    
    def get_dataset_info(self) -> Dict[str, Any]:
        """
        Get metadata about the dataset including field information.
        
        Returns:
            Dictionary with dataset metadata
        """
        url = f"{self.base_url}/catalog/datasets/{self.dataset_id}"
        
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response.json()
            
        except requests.RequestException as e:
            print(f"Error fetching dataset info: {e}")
            return {}
    
    def export_to_csv(self, 
                     filename: str = "animal_shelter_data.csv",
                     select: Optional[str] = None,
                     where: Optional[str] = None) -> bool:
        """
        Export data directly to CSV using the API's export endpoint.
        
        Args:
            filename: Output CSV filename
            select: Fields to select
            where: Filter conditions
            
        Returns:
            True if successful, False otherwise
        """
        url = f"{self.base_url}/catalog/datasets/{self.dataset_id}/exports/csv"
        
        params = {'limit': -1}  # -1 means no limit for exports
        
        if select:
            params['select'] = select
        if where:
            params['where'] = where
            
        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            
            with open(filename, 'wb') as f:
                f.write(response.content)
                
            print(f"Data exported to {filename}")
            return True
            
        except requests.RequestException as e:
            print(f"Error exporting data: {e}")
            return False

## Run

In [4]:
if __name__ == "__main__":
    
    # Initialize the API client
    api = LongBeachAnimalShelterAPI()
    
    # Example 1: Get all records
    print("Fetching all animal shelter records...")
    df = api.get_all_records()
    print(f"Retrieved {len(df)} records")
    print(f"Columns: {list(df.columns)}")
    
    
    # Example 5: Export directly to CSV
    print("\nExporting all data to CSV...")
    api.export_to_csv("longbeach_animal_shelter_complete.csv")
    
    
    # Display sample data
    if not df.empty:
        print("\nSample data:")
        print(df.head())

Fetching all animal shelter records...
Retrieved 100 of 31444 records...
Retrieved 200 of 31444 records...
Retrieved 300 of 31444 records...
Retrieved 400 of 31444 records...
Retrieved 500 of 31444 records...
Retrieved 600 of 31444 records...
Retrieved 700 of 31444 records...
Retrieved 800 of 31444 records...
Retrieved 900 of 31444 records...
Retrieved 1000 of 31444 records...
Retrieved 1100 of 31444 records...
Retrieved 1200 of 31444 records...
Retrieved 1300 of 31444 records...
Retrieved 1400 of 31444 records...
Retrieved 1500 of 31444 records...
Retrieved 1600 of 31444 records...
Retrieved 1700 of 31444 records...
Retrieved 1800 of 31444 records...
Retrieved 1900 of 31444 records...
Retrieved 2000 of 31444 records...
Retrieved 2100 of 31444 records...
Retrieved 2200 of 31444 records...
Retrieved 2300 of 31444 records...
Retrieved 2400 of 31444 records...
Retrieved 2500 of 31444 records...
Retrieved 2600 of 31444 records...
Retrieved 2700 of 31444 records...
Retrieved 2800 of 31444 r

In [5]:
df = pd.read_csv("longbeach_animal_shelter_complete.csv", sep=None, engine='python')

## Data info

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31444 entries, 0 to 31443
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ﻿animal_id         31444 non-null  object 
 1   animal_name        18516 non-null  object 
 2   animal_type        31444 non-null  object 
 3   primary_color      31444 non-null  object 
 4   secondary_color    14932 non-null  object 
 5   sex                31444 non-null  object 
 6   dob                27502 non-null  object 
 7   intake_date        31444 non-null  object 
 8   intake_cond        31444 non-null  object 
 9   intake_type        31444 non-null  object 
 10  intake_subtype     31036 non-null  object 
 11  reason             2016 non-null   object 
 12  outcome_date       31190 non-null  object 
 13  crossing           31444 non-null  object 
 14  jurisdiction       31443 non-null  object 
 15  outcome_type       31181 non-null  object 
 16  outcome_subtype    278

## Data preview

In [7]:
df.head()

Unnamed: 0,﻿animal_id,animal_name,animal_type,primary_color,secondary_color,sex,dob,intake_date,intake_cond,intake_type,intake_subtype,reason,outcome_date,crossing,jurisdiction,outcome_type,outcome_subtype,latitude,longitude,intake_is_dead,outcome_is_dead,was_outcome_alive,geopoint
0,A708149,,REPTILE,BROWN,GREEN,Unknown,,2023-10-03,NORMAL,STRAY,FIELD,,2023-10-03,"`600 BLK E HARCOURT, LB 90805",LONG BEACH,RESCUE,OTHER RESC,33.867999,-118.200931,Alive on Intake,False,1,"33.8679994, -118.2009307"
1,A639310,,BIRD,WHITE,GRAY,Unknown,,2020-02-02,ILL SEVERE,WILDLIFE,FIELD,,2020-02-02,"0 BLK TEMPLE AVE, LONG BEACH, CA 90803",LONG BEACH,TRANSFER,LBAH,33.76246,-118.159678,Alive on Intake,False,1,"33.7624598, -118.1596777"
2,A618968,*MORGAN,CAT,BLACK,WHITE,Female,2014-12-18,2018-12-18,INJURED SEVERE,STRAY,FIELD,,2019-01-13,"0 BLK W ZANE ST, LONG BEACH, CA 90805",LONG BEACH,RESCUE,LITTLELION,33.849501,-118.194905,Alive on Intake,False,1,"33.8495009, -118.1949053"
3,A730385,*BRANDON,RABBIT,BLACK,WHITE,Neutered,2023-04-19,2024-10-18,NORMAL,STRAY,OTC,,2024-11-15,00 AQUARIUM WAY LONG BEACH CA 90802,LONG BEACH,ADOPTION,WEB,33.763986,-118.19441,Alive on Intake,False,1,"33.7639859, -118.1944096"
4,A595225,,DOG,BLUE MERLE,WHITE,Male,2015-08-12,2017-08-12,NORMAL,STRAY,FIELD,,2017-08-12,"00 BLK 1ST ST, SEAL BEACH,, CA 90740",SEAL BEACH,TRANSFER,SBACC,33.743036,-118.113624,Alive on Intake,False,1,"33.7430361, -118.1136241"


# EDA

## Null values

In [9]:
# Percentage of null values in each column
df.isnull().mean()*100

animal_id            0.000000
animal_name          41.114362
animal_type           0.000000
primary_color         0.000000
secondary_color      52.512403
sex                   0.000000
dob                  12.536573
intake_date           0.000000
intake_cond           0.000000
intake_type           0.000000
intake_subtype        1.297545
reason               93.588602
outcome_date          0.807785
crossing              0.000000
jurisdiction          0.003180
outcome_type          0.836408
outcome_subtype      11.525251
latitude              0.000000
longitude             0.000000
intake_is_dead        0.000000
outcome_is_dead       0.000000
was_outcome_alive     0.000000
geopoint              0.000000
dtype: float64

1. Drop the following columns: *reason*
2. Add a *has_name* column, then drop *animal_name*
3. Replace null values in *secondary_color* to "None" so that data for animals with more colours is preserved
4. 

Change